# DP: Changes for the Linaro 6-2017.03 release.

MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"

LANG=C git diff --no-renames 4b7882c54dabbb54686cb577f2a2cf28e93e743b..630c5507bb37d2caaef60a6f0773e4c820d76fe0 \
 | egrep -v '^(diff|index) ' \
 | filterdiff --strip=1 --addoldprefix=a/src/ --addnewprefix=b/src/ \
 | sed 's,a/src//dev/null,/dev/null,'

--- a/src/contrib/compare_tests
+++ b/src/contrib/compare_tests
@@ -107,8 +107,8 @@ elif [ -d "$1" -o -d "$2" ] ; then
 	usage "Must specify either two directories or two files"
 fi
 
-sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
-sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
+sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
+sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
 before=$tmp1
 now=$tmp2
 
--- a/src/contrib/dg-extract-results.py
+++ b/src/contrib/dg-extract-results.py
@@ -134,6 +134,7 @@ class Prog:
     self.end_line = None
     # Known summary types.
     self.count_names = [
+      '# of DejaGnu errors\t\t',
       '# of expected passes\t\t',
       '# of unexpected failures\t',
       '# of unexpected successes\t',
@@ -245,6 +246,10 @@ class Prog:
           segment = Segment (filename, file.tell())
           variation.header = segment
 
+        # Parse the rest of the summary (the '# of ' lines).
+        if len (variation.counts) == 0:
+          variation.counts = self.zero_counts()
+
         # Parse up until the first line of the summary.
         if num_variations == 1:
           end = '\t\t=== ' + tool.name + ' Summary ===\n'
@@ -291,6 +296,11 @@ class Prog:
               harness.results.append ((key, line))
               if not first_key and sort_logs:
                 first_key = key
+            if line.startswith ('ERROR: (DejaGnu)'):
+              for i in range (len (self.count_names)):
+                if 'DejaGnu errors' in self.count_names[i]:
+                  variation.counts[i] += 1
+                  break
 
           # 'Using ...' lines are only interesting in a header.  Splitting
           # the test up into parallel runs leads to more 'Using ...' lines
@@ -309,9 +319,6 @@ class Prog:
             segment.lines -= final_using
           harness.add_segment (first_key, segment)
 
-        # Parse the rest of the summary (the '# of ' lines).
-        if len (variation.counts) == 0:
-          variation.counts = self.zero_counts()
         while True:
           before = file.tell()
           line = file.readline()
--- a/src/contrib/dg-extract-results.sh
+++ b/src/contrib/dg-extract-results.sh
@@ -369,10 +369,11 @@ EOF
 BEGIN {
   variant="$VAR"
   tool="$TOOL"
-  passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0;
+  passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0;
   curvar=""; insummary=0
 }
 /^Running target / { curvar = \$3; next }
+/^ERROR: \(DejaGnu\)/ { if (variant == curvar) dgerrorcnt += 1 }
 /^# of / { if (variant == curvar) insummary = 1 }
 /^# of expected passes/ { if (insummary == 1) passcnt += \$5; next; }
 /^# of unexpected successes/ { if (insummary == 1) xpasscnt += \$5; next; }
@@ -390,6 +391,7 @@ BEGIN {
 { next }
 END {
   printf ("\t\t=== %s Summary for %s ===\n\n", tool, variant)
+  if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
   if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
   if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
   if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
@@ -419,8 +421,9 @@ TOTAL_AWK=${TMP}/total.awk
 cat << EOF > $TOTAL_AWK
 BEGIN {
   tool="$TOOL"
-  passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0
+  passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0
 }
+/^# of DejaGnu errors/ { dgerrorcnt += \$5 }
 /^# of expected passes/ { passcnt += \$5 }
 /^# of unexpected failures/ { failcnt += \$5 }
 /^# of unexpected successes/ { xpasscnt += \$5 }
@@ -431,7 +434,8 @@ BEGIN {
 /^# of unresolved testcases/ { unrescnt += \$5 }
 /^# of unsupported tests/ { unsupcnt += \$5 }
 END {
-  printf ("\n\t\t=== %s Summary ===\n\n", tool)
+  printf ("\n\t\t=== %s MySummary ===\n\n", tool)
+  if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
   if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
   if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
   if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
--- /dev/null
+++ b/src/gcc/LINARO-VERSION
@@ -0,0 +1 @@
+Snapshot 6.3-2017.03
--- a/src/gcc/Makefile.in
+++ b/src/gcc/Makefile.in
@@ -832,10 +832,12 @@ BASEVER     := $(srcdir)/BASE-VER  # 4.x.y
 DEVPHASE    := $(srcdir)/DEV-PHASE # experimental, prerelease, ""
 DATESTAMP   := $(srcdir)/DATESTAMP # YYYYMMDD or empty
 REVISION    := $(srcdir)/REVISION  # [BRANCH revision XXXXXX]
+LINAROVER   := $(srcdir)/LINARO-VERSION # M.x-YYYY.MM[-S][~dev]
 
 BASEVER_c   := $(shell cat $(BASEVER))
 DEVPHASE_c  := $(shell cat $(DEVPHASE))
 DATESTAMP_c := $(shell cat $(DATESTAMP))
+LINAROVER_c := $(shell cat $(LINAROVER))
 
 ifeq (,$(wildcard $(REVISION)))
 REVISION_c  :=
@@ -862,6 +864,7 @@ DATESTAMP_s := \
   "\"$(if $(DEVPHASE_c)$(filter-out 0,$(PATCHLEVEL_c)), $(DATESTAMP_c))\""
 PKGVERSION_s:= "\"@PKGVERSION@\""
 BUGURL_s    := "\"@REPORT_BUGS_TO@\""
+LINAROVER_s := "\"$(LINAROVER_c)\""
 
 PKGVERSION  := @PKGVERSION@
 BUGURL_TEXI := @REPORT_BUGS_TEXI@
@@ -2701,8 +2704,9 @@ PREPROCESSOR_DEFINES = \
   -DSTANDARD_EXEC_PREFIX=\"$(libdir)/gcc/\" \
   @TARGET_SYSTEM_ROOT_DEFINE@
 
-CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s)
-cppbuiltin.o: $(BASEVER)
+CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s) \
+	-DLINAROVER=$(LINAROVER_s)
+cppbuiltin.o: $(BASEVER) $(LINAROVER)
 
 CFLAGS-cppdefault.o += $(PREPROCESSOR_DEFINES)
--- a/src/gcc/ada/gcc-interface/misc.c
+++ b/src/gcc/ada/gcc-interface/misc.c
@@ -255,8 +255,7 @@ static bool
 gnat_post_options (const char **pfilename ATTRIBUTE_UNUSED)
 {
   /* Excess precision other than "fast" requires front-end support.  */
-  if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-      && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
+  if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
     sorry ("-fexcess-precision=standard for Ada");
   flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
 
--- a/src/gcc/builtins.c
+++ b/src/gcc/builtins.c
@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "target.h"
 #include "rtl.h"
 #include "tree.h"
+#include "memmodel.h"
 #include "gimple.h"
 #include "predict.h"
 #include "tm_p.h"
--- a/src/gcc/c-family/c-common.c
+++ b/src/gcc/c-family/c-common.c
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "target.h"
 #include "function.h"
 #include "tree.h"
+#include "memmodel.h"
 #include "c-common.h"
 #include "gimple-expr.h"
 #include "tm_p.h"
--- a/src/gcc/c-family/c-opts.c
+++ b/src/gcc/c-family/c-opts.c
@@ -772,8 +772,7 @@ c_common_post_options (const char **pfilename)
      support.  */
   if (c_dialect_cxx ())
     {
-      if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-	  && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
+      if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
 	sorry ("-fexcess-precision=standard for C++");
       flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
     }
--- a/src/gcc/calls.c
+++ b/src/gcc/calls.c
@@ -194,10 +194,19 @@ prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value,
 	    && targetm.small_register_classes_for_mode_p (FUNCTION_MODE))
 	   ? force_not_mem (memory_address (FUNCTION_MODE, funexp))
 	   : memory_address (FUNCTION_MODE, funexp));
-  else if (! sibcallp)
+  else
     {
-      if (!NO_FUNCTION_CSE && optimize && ! flag_no_function_cse)
-	funexp = force_reg (Pmode, funexp);
+      /* funexp could be a SYMBOL_REF representing a function pointer which is
+	 of ptr_mode.  In this case, it should be converted into address mode
+	 to be a valid address for memory rtx pattern.  See PR 64971.  */
+      if (GET_MODE (funexp) != Pmode)
+	funexp = convert_memory_address (Pmode, funexp);
+
+      if (! sibcallp)
+	{
+	  if (!NO_FUNCTION_CSE && optimize && ! flag_no_function_cse)
+	    funexp = force_reg (Pmode, funexp);
+	}
     }
 
   if (static_chain_value != 0
--- a/src/gcc/cfg.c
+++ b/src/gcc/cfg.c
@@ -1064,7 +1064,7 @@ free_original_copy_tables (void)
   delete bb_copy;
   bb_copy = NULL;
   delete bb_original;
-  bb_copy = NULL;
+  bb_original = NULL;
   delete loop_copy;
   loop_copy = NULL;
   delete original_copy_bb_pool;
--- a/src/gcc/common/config/arm/arm-common.c
+++ b/src/gcc/common/config/arm/arm-common.c
@@ -97,6 +97,49 @@ arm_rewrite_mcpu (int argc, const char **argv)
   return arm_rewrite_selected_cpu (argv[argc - 1]);
 }
 
+struct arm_arch_core_flag
+{
+  const char *const name;
+  const arm_feature_set flags;
+};
+
+static const struct arm_arch_core_flag arm_arch_core_flags[] =
+{
+#undef ARM_CORE
+#define ARM_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
+  {NAME, FLAGS},
+#include "config/arm/arm-cores.def"
+#undef ARM_CORE
+#undef ARM_ARCH
+#define ARM_ARCH(NAME, CORE, ARCH, FLAGS) \
+  {NAME, FLAGS},
+#include "config/arm/arm-arches.def"
+#undef ARM_ARCH
+};
+
+/* Called by the driver to check whether the target denoted by current
+   command line options is a Thumb-only target.  ARGV is an array of
+   -march and -mcpu values (i.e. it contains the rhs after the equal
+   sign) and we use the last one of them to make a decision.  The
+   number of elements in ARGV is given in ARGC.  */
+const char *
+arm_target_thumb_only (int argc, const char **argv)
+{
+  unsigned int opt;
+
+  if (argc)
+    {
+      for (opt = 0; opt < (ARRAY_SIZE (arm_arch_core_flags)); opt++)
+	if ((strcmp (argv[argc - 1], arm_arch_core_flags[opt].name) == 0)
+	    && !ARM_FSET_HAS_CPU1(arm_arch_core_flags[opt].flags, FL_NOTM))
+	  return "-mthumb";
+
+      return NULL;
+    }
+  else
+    return NULL;
+}
+
 #undef ARM_CPU_NAME_LENGTH
--- a/src/gcc/config.gcc
+++ b/src/gcc/config.gcc
@@ -307,7 +307,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_neon.h arm_acle.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
@@ -327,7 +327,7 @@ arc*-*-*)
 arm*-*-*)
 	cpu_type=arm
 	extra_objs="arm-builtins.o aarch-common.o"
-	extra_headers="mmintrin.h arm_neon.h arm_acle.h"
+	extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h arm_cmse.h"
 	target_type_format_char='%'
 	c_target_objs="arm-c.o"
 	cxx_target_objs="arm-c.o"
@@ -1500,7 +1500,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
 	extra_options="${extra_options} linux-android.opt"
 	# Assume modern glibc if not targeting Android nor uclibc.
 	case ${target} in
-	*-*-*android*|*-*-*uclibc*)
+	*-*-*android*|*-*-*uclibc*|*-*-*musl*)
 	  ;;
 	*)
 	  default_gnu_indirect_function=yes
@@ -1569,7 +1569,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
 	extra_options="${extra_options} linux-android.opt"
 	# Assume modern glibc if not targeting Android nor uclibc.
 	case ${target} in
-	*-*-*android*|*-*-*uclibc*)
+	*-*-*android*|*-*-*uclibc*|*-*-*musl*)
 	  ;;
 	*)
 	  default_gnu_indirect_function=yes
@@ -3811,38 +3811,51 @@ case "${target}" in
 	# Add extra multilibs
 	if test "x$with_multilib_list" != x; then
 		arm_multilibs=`echo $with_multilib_list | sed -e 's/,/ /g'`
-		for arm_multilib in ${arm_multilibs}; do
-			case ${arm_multilib} in
-			aprofile)
+		case ${arm_multilibs} in
+		aprofile)
 				# Note that arm/t-aprofile is a
 				# stand-alone make file fragment to be
 				# used only with itself.  We do not
 				# specifically use the
 				# TM_MULTILIB_OPTION framework because
 				# this shorthand is more
-				# pragmatic. Additionally it is only
-				# designed to work without any
-				# with-cpu, with-arch with-mode
-				# with-fpu or with-float options.
-				if test "x$with_arch" != x \
-				    || test "x$with_cpu" != x \
-				    || test "x$with_float" != x \
-				    || test "x$with_fpu" != x \
-				    || test "x$with_mode" != x ; then
-				    echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=aprofile" 1>&2
-				    exit 1
-				fi
-				tmake_file="${tmake_file} arm/t-aprofile"
-				break
-				;;
-			default)
-				;;
-			*)
-				echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
-				exit 1
-				;;
-			esac
-		done
+				# pragmatic.
+				tmake_profile_file="arm/t-aprofile"
+				;;
+		rmprofile)
+				# Note that arm/t-rmprofile is a
+				# stand-alone make file fragment to be
+				# used only with itself.  We do not
+				# specifically use the
+				# TM_MULTILIB_OPTION framework because
+				# this shorthand is more
+				# pragmatic.
+				tmake_profile_file="arm/t-rmprofile"
+				;;
+		default)
+				;;
+		*)
+				echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
+				exit 1
+				;;
+		esac
+
+		if test "x${tmake_profile_file}" != x ; then
+			# arm/t-aprofile and arm/t-rmprofile are only
+			# designed to work without any with-cpu,
+			# with-arch, with-mode, with-fpu or with-float
+			# options.
+ if test "x$with_arch" != x \ + || test "x$with_cpu" != x \ + || test "x$with_float" != x \ + || test "x$with_fpu" != x \ + || test "x$with_mode" != x ; then + echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=${with_multilib_list}" 1>&2 + exit 1 + fi + + tmake_file="${tmake_file} ${tmake_profile_file}" + fi fi ;; --- a/src/gcc/config/aarch64/aarch64-arches.def +++ b/src/gcc/config/aarch64/aarch64-arches.def @@ -32,4 +32,6 @@ AARCH64_ARCH("armv8-a", generic, 8A, 8, AARCH64_FL_FOR_ARCH8) AARCH64_ARCH("armv8.1-a", generic, 8_1A, 8, AARCH64_FL_FOR_ARCH8_1) +AARCH64_ARCH("armv8.2-a", generic, 8_2A, 8, AARCH64_FL_FOR_ARCH8_2) +AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) --- a/src/gcc/config/aarch64/aarch64-builtins.c +++ b/src/gcc/config/aarch64/aarch64-builtins.c @@ -62,6 +62,7 @@ #define si_UP SImode #define sf_UP SFmode #define hi_UP HImode +#define hf_UP HFmode #define qi_UP QImode #define UP(X) X##_UP @@ -139,6 +140,10 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned }; #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_none, qualifier_none }; +#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_poly, qualifier_poly, qualifier_poly }; #define TYPES_BINOPP (aarch64_types_binopp_qualifiers) @@ -164,6 +169,10 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_none, qualifier_immediate }; +#define TYPES_GETREGP (aarch64_types_binop_imm_p_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_immediate }; #define TYPES_GETREG (aarch64_types_binop_imm_qualifiers) @@ -173,16 +182,29 @@ aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_none, qualifier_immediate }; #define TYPES_SHIFTIMM_USS (aarch64_types_shift_to_unsigned_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_fcvt_from_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_unsigned, qualifier_immediate }; +#define TYPES_FCVTIMM_SUS (aarch64_types_fcvt_from_unsigned_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate }; #define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers) static enum aarch64_type_qualifiers -aarch64_types_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate }; -#define TYPES_SETREG (aarch64_types_ternop_imm_qualifiers) -#define TYPES_SHIFTINSERT (aarch64_types_ternop_imm_qualifiers) -#define TYPES_SHIFTACC (aarch64_types_ternop_imm_qualifiers) +aarch64_types_ternop_s_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_poly, qualifier_immediate}; +#define TYPES_SETREGP (aarch64_types_ternop_s_imm_p_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { 
qualifier_none, qualifier_none, qualifier_none, qualifier_immediate}; +#define TYPES_SETREG (aarch64_types_ternop_s_imm_qualifiers) +#define TYPES_SHIFTINSERT (aarch64_types_ternop_s_imm_qualifiers) +#define TYPES_SHIFTACC (aarch64_types_ternop_s_imm_qualifiers) + +static enum aarch64_type_qualifiers +aarch64_types_ternop_p_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_immediate}; +#define TYPES_SHIFTINSERTP (aarch64_types_ternop_p_imm_qualifiers) static enum aarch64_type_qualifiers aarch64_types_unsigned_shiftacc_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -197,6 +219,11 @@ aarch64_types_combine_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define TYPES_COMBINE (aarch64_types_combine_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_combine_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_poly }; +#define TYPES_COMBINEP (aarch64_types_combine_p_qualifiers) + +static enum aarch64_type_qualifiers aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_const_pointer_map_mode }; #define TYPES_LOAD1 (aarch64_types_load1_qualifiers) @@ -229,6 +256,10 @@ aarch64_types_bsl_u_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_map_mode | qualifier_pointer to build a pointer to the element type of the vector. */ static enum aarch64_type_qualifiers +aarch64_types_store1_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_void, qualifier_pointer_map_mode, qualifier_poly }; +#define TYPES_STORE1P (aarch64_types_store1_p_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer_map_mode, qualifier_none }; #define TYPES_STORE1 (aarch64_types_store1_qualifiers) @@ -753,16 +784,16 @@ aarch64_init_simd_builtins (void) if (qualifiers & qualifier_unsigned) { - type_signature[arg_num] = 'u'; + type_signature[op_num] = 'u'; print_type_signature_p = true; } else if (qualifiers & qualifier_poly) { - type_signature[arg_num] = 'p'; + type_signature[op_num] = 'p'; print_type_signature_p = true; } else - type_signature[arg_num] = 's'; + type_signature[op_num] = 's'; /* Skip an internal operand for vget_{low, high}. */ if (qualifiers & qualifier_internal) --- a/src/gcc/config/aarch64/aarch64-c.c +++ b/src/gcc/config/aarch64/aarch64-c.c @@ -95,6 +95,11 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) else cpp_undef (pfile, "__ARM_FP"); + aarch64_def_or_undef (TARGET_FP_F16INST, + "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", pfile); + aarch64_def_or_undef (TARGET_SIMD_F16INST, + "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", pfile); + aarch64_def_or_undef (TARGET_SIMD, "__ARM_FEATURE_NUMERIC_MAXMIN", pfile); aarch64_def_or_undef (TARGET_SIMD, "__ARM_NEON", pfile); --- a/src/gcc/config/aarch64/aarch64-cores.def +++ b/src/gcc/config/aarch64/aarch64-cores.def @@ -40,17 +40,33 @@ /* V8 Architecture Processors. */ +/* ARM ('A') cores. 
*/ AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, "0x41", "0xd04") AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03") AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07") AARCH64_CORE("cortex-a72", cortexa72, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08") +AARCH64_CORE("cortex-a73", cortexa73, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09") + +/* Samsung ('S') cores. */ AARCH64_CORE("exynos-m1", exynosm1, exynosm1, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001") -AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x51", "0x800") + +/* Qualcomm ('Q') cores. */ +AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx, "0x51", "0x800") + +/* Cavium ('C') cores. */ AARCH64_CORE("thunderx", thunderx, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, "0x43", "0x0a1") + +/* APM ('P') cores. */ AARCH64_CORE("xgene1", xgene1, xgene1, 8A, AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000") +/* V8.1 Architecture Processors. */ + +/* Broadcom ('B') cores. */ +AARCH64_CORE("vulcan", vulcan, cortexa57, 8_1A, AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, vulcan, "0x42", "0x516") + /* V8 big.LITTLE implementations. */ AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07.0xd03") AARCH64_CORE("cortex-a72.cortex-a53", cortexa72cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08.0xd03") - +AARCH64_CORE("cortex-a73.cortex-a35", cortexa73cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09.0xd04") +AARCH64_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09.0xd03") --- a/src/gcc/config/aarch64/aarch64-cost-tables.h +++ b/src/gcc/config/aarch64/aarch64-cost-tables.h @@ -127,6 +127,108 @@ const struct cpu_cost_table thunderx_extra_costs = } }; +const struct cpu_cost_table vulcan_extra_costs = +{ + /* ALU */ + { + 0, /* Arith. */ + 0, /* Logical. */ + 0, /* Shift. */ + 0, /* Shift_reg. */ + COSTS_N_INSNS (1), /* Arith_shift. */ + COSTS_N_INSNS (1), /* Arith_shift_reg. */ + COSTS_N_INSNS (1), /* Log_shift. */ + COSTS_N_INSNS (1), /* Log_shift_reg. */ + 0, /* Extend. */ + COSTS_N_INSNS (1), /* Extend_arith. */ + 0, /* Bfi. */ + 0, /* Bfx. */ + COSTS_N_INSNS (3), /* Clz. */ + 0, /* Rev. */ + 0, /* Non_exec. */ + true /* Non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (4), /* Simple. */ + COSTS_N_INSNS (4), /* Flag_setting. */ + COSTS_N_INSNS (4), /* Extend. */ + COSTS_N_INSNS (5), /* Add. */ + COSTS_N_INSNS (5), /* Extend_add. */ + COSTS_N_INSNS (18) /* Idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (4), /* Simple. */ + 0, /* Flag_setting. */ + COSTS_N_INSNS (4), /* Extend. */ + COSTS_N_INSNS (5), /* Add. */ + COSTS_N_INSNS (5), /* Extend_add. */ + COSTS_N_INSNS (26) /* Idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (4), /* Load. */ + COSTS_N_INSNS (4), /* Load_sign_extend. */ + COSTS_N_INSNS (5), /* Ldrd. */ + COSTS_N_INSNS (4), /* Ldm_1st. */ + 1, /* Ldm_regs_per_insn_1st. 
*/ + 1, /* Ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (4), /* Loadf. */ + COSTS_N_INSNS (4), /* Loadd. */ + COSTS_N_INSNS (4), /* Load_unaligned. */ + 0, /* Store. */ + 0, /* Strd. */ + 0, /* Stm_1st. */ + 1, /* Stm_regs_per_insn_1st. */ + 1, /* Stm_regs_per_insn_subsequent. */ + 0, /* Storef. */ + 0, /* Stored. */ + 0, /* Store_unaligned. */ + COSTS_N_INSNS (1), /* Loadv. */ + COSTS_N_INSNS (1) /* Storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (4), /* Div. */ + COSTS_N_INSNS (1), /* Mult. */ + COSTS_N_INSNS (1), /* Mult_addsub. */ + COSTS_N_INSNS (1), /* Fma. */ + COSTS_N_INSNS (1), /* Addsub. */ + COSTS_N_INSNS (1), /* Fpconst. */ + COSTS_N_INSNS (1), /* Neg. */ + COSTS_N_INSNS (1), /* Compare. */ + COSTS_N_INSNS (2), /* Widen. */ + COSTS_N_INSNS (2), /* Narrow. */ + COSTS_N_INSNS (2), /* Toint. */ + COSTS_N_INSNS (2), /* Fromint. */ + COSTS_N_INSNS (2) /* Roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (6), /* Div. */ + COSTS_N_INSNS (1), /* Mult. */ + COSTS_N_INSNS (1), /* Mult_addsub. */ + COSTS_N_INSNS (1), /* Fma. */ + COSTS_N_INSNS (1), /* Addsub. */ + COSTS_N_INSNS (1), /* Fpconst. */ + COSTS_N_INSNS (1), /* Neg. */ + COSTS_N_INSNS (1), /* Compare. */ + COSTS_N_INSNS (2), /* Widen. */ + COSTS_N_INSNS (2), /* Narrow. */ + COSTS_N_INSNS (2), /* Toint. */ + COSTS_N_INSNS (2), /* Fromint. */ + COSTS_N_INSNS (2) /* Roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (1) /* Alu. */ + } +}; #endif --- a/src/gcc/config/aarch64/aarch64-elf.h +++ b/src/gcc/config/aarch64/aarch64-elf.h @@ -25,15 +25,6 @@ #define ASM_OUTPUT_LABELREF(FILE, NAME) \ aarch64_asm_output_labelref (FILE, NAME) -#define ASM_OUTPUT_DEF(FILE, NAME1, NAME2) \ - do \ - { \ - assemble_name (FILE, NAME1); \ - fputs (" = ", FILE); \ - assemble_name (FILE, NAME2); \ - fputc ('\n', FILE); \ - } while (0) - #define TEXT_SECTION_ASM_OP "\t.text" #define DATA_SECTION_ASM_OP "\t.data" #define BSS_SECTION_ASM_OP "\t.bss" --- a/src/gcc/config/aarch64/aarch64-modes.def +++ b/src/gcc/config/aarch64/aarch64-modes.def @@ -21,8 +21,6 @@ CC_MODE (CCFP); CC_MODE (CCFPE); CC_MODE (CC_SWP); -CC_MODE (CC_ZESWP); /* zero-extend LHS (but swap to make it RHS). */ -CC_MODE (CC_SESWP); /* sign-extend LHS (but swap to make it RHS). */ CC_MODE (CC_NZ); /* Only N and Z bits of condition flags are valid. */ CC_MODE (CC_Z); /* Only Z bit of condition flags is valid. */ CC_MODE (CC_C); /* Only C bit of condition flags is valid. */ --- a/src/gcc/config/aarch64/aarch64-option-extensions.def +++ b/src/gcc/config/aarch64/aarch64-option-extensions.def @@ -39,8 +39,8 @@ that are required. Their order is not important. */ /* Enabling "fp" just enables "fp". - Disabling "fp" also disables "simd", "crypto". */ -AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO, "fp") + Disabling "fp" also disables "simd", "crypto" and "fp16". */ +AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO | AARCH64_FL_F16, "fp") /* Enabling "simd" also enables "fp". Disabling "simd" also disables "crypto". */ @@ -55,3 +55,7 @@ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, "crc32") /* Enabling or disabling "lse" only changes "lse". */ AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, "atomics") + +/* Enabling "fp16" also enables "fp". + Disabling "fp16" just disables "fp16". */ +AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, 0, "fp16") --- /dev/null +++ b/src/gcc/config/aarch64/aarch64-passes.def @@ -0,0 +1,21 @@ +/* AArch64-specific passes declarations. 
+ Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering); --- a/src/gcc/config/aarch64/aarch64-protos.h +++ b/src/gcc/config/aarch64/aarch64-protos.h @@ -178,6 +178,25 @@ struct cpu_branch_cost const int unpredictable; /* Unpredictable branch or optimizing for speed. */ }; +/* Control approximate alternatives to certain FP operators. */ +#define AARCH64_APPROX_MODE(MODE) \ + ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \ + ? (1 << ((MODE) - MIN_MODE_FLOAT)) \ + : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \ + ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \ + + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \ + : (0)) +#define AARCH64_APPROX_NONE (0) +#define AARCH64_APPROX_ALL (-1) + +/* Allowed modes for approximations. */ +struct cpu_approx_modes +{ + const unsigned int division; /* Division. */ + const unsigned int sqrt; /* Square root. */ + const unsigned int recip_sqrt; /* Reciprocal square root. */ +}; + struct tune_params { const struct cpu_cost_table *insn_extra_cost; @@ -185,6 +204,7 @@ struct tune_params const struct cpu_regmove_cost *regmove_cost; const struct cpu_vector_cost *vec_costs; const struct cpu_branch_cost *branch_costs; + const struct cpu_approx_modes *approx_modes; int memmov_cost; int issue_rate; unsigned int fusible_ops; @@ -282,14 +302,14 @@ int aarch64_get_condition_code (rtx); bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode); int aarch64_branch_cost (bool, bool); enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx); -bool aarch64_cannot_change_mode_class (machine_mode, - machine_mode, - enum reg_class); bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT); bool aarch64_constant_address_p (rtx); +bool aarch64_emit_approx_div (rtx, rtx, rtx); +bool aarch64_emit_approx_sqrt (rtx, rtx, bool); bool aarch64_expand_movmem (rtx *); bool aarch64_float_const_zero_rtx_p (rtx); bool aarch64_function_arg_regno_p (unsigned); +bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs); bool aarch64_gen_movmemqi (rtx *); bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *); bool aarch64_is_extend_from_extract (machine_mode, rtx, rtx); @@ -298,6 +318,7 @@ bool aarch64_is_noplt_call_p (rtx); bool aarch64_label_mentioned_p (rtx); void aarch64_declare_function_name (FILE *, const char*, tree); bool aarch64_legitimate_pic_operand_p (rtx); +bool aarch64_mask_and_shift_for_ubfiz_p (machine_mode, rtx, rtx); bool aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2); bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx); @@ -320,6 +341,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode); bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool); bool aarch64_simd_valid_immediate (rtx, machine_mode, bool, struct simd_immediate_info *); +bool aarch64_split_dimode_const_store (rtx, rtx); bool 
aarch64_symbolic_address_p (rtx); bool aarch64_uimm12_shift (HOST_WIDE_INT); bool aarch64_use_return_insn_p (void); @@ -335,11 +357,9 @@ machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned, machine_mode); int aarch64_hard_regno_mode_ok (unsigned, machine_mode); int aarch64_hard_regno_nregs (unsigned, machine_mode); -int aarch64_simd_attr_length_move (rtx_insn *); int aarch64_uxt_size (int, HOST_WIDE_INT); int aarch64_vec_fpconst_pow_of_2 (rtx); rtx aarch64_eh_return_handler_rtx (void); -rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int); rtx aarch64_mask_from_zextract_ops (rtx, rtx); const char *aarch64_output_move_struct (rtx *operands); rtx aarch64_return_addr (int, rtx); @@ -352,7 +372,6 @@ unsigned aarch64_dbx_register_number (unsigned); unsigned aarch64_trampoline_size (void); void aarch64_asm_output_labelref (FILE *, const char *); void aarch64_cpu_cpp_builtins (cpp_reader *); -void aarch64_elf_asm_named_section (const char *, unsigned, tree); const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *); const char * aarch64_output_probe_stack_range (rtx, rtx); void aarch64_err_no_fpadvsimd (machine_mode, const char *); @@ -369,7 +388,6 @@ void aarch64_register_pragmas (void); void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); void aarch64_save_restore_target_globals (tree); -void aarch64_emit_approx_rsqrt (rtx, rtx); /* Initialize builtins for SIMD intrinsics. */ void init_aarch64_simd_builtins (void); @@ -436,7 +454,6 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode); bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode); -extern bool aarch64_nopcrelative_literal_loads; extern void aarch64_asm_output_pool_epilogue (FILE *, const char *, tree, HOST_WIDE_INT); @@ -450,4 +467,6 @@ enum aarch64_parse_opt_result aarch64_parse_extension (const char *, std::string aarch64_get_extension_string_for_isa_flags (unsigned long, unsigned long); +rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt); + #endif /* GCC_AARCH64_PROTOS_H */ --- a/src/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/src/gcc/config/aarch64/aarch64-simd-builtins.def @@ -40,9 +40,10 @@ 10 - CODE_FOR_. */ BUILTIN_VDC (COMBINE, combine, 0) + VAR1 (COMBINEP, combine, 0, di) BUILTIN_VB (BINOP, pmul, 0) - BUILTIN_VALLF (BINOP, fmulx, 0) - BUILTIN_VDQF_DF (UNOP, sqrt, 2) + BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0) + BUILTIN_VHSDF_DF (UNOP, sqrt, 2) BUILTIN_VD_BHSI (BINOP, addp, 0) VAR1 (UNOP, addp, 0, di) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2) @@ -68,14 +69,23 @@ BUILTIN_VDC (GETREG, get_dregoi, 0) BUILTIN_VDC (GETREG, get_dregci, 0) BUILTIN_VDC (GETREG, get_dregxi, 0) + VAR1 (GETREGP, get_dregoi, 0, di) + VAR1 (GETREGP, get_dregci, 0, di) + VAR1 (GETREGP, get_dregxi, 0, di) /* Implemented by aarch64_get_qreg. */ BUILTIN_VQ (GETREG, get_qregoi, 0) BUILTIN_VQ (GETREG, get_qregci, 0) BUILTIN_VQ (GETREG, get_qregxi, 0) + VAR1 (GETREGP, get_qregoi, 0, v2di) + VAR1 (GETREGP, get_qregci, 0, v2di) + VAR1 (GETREGP, get_qregxi, 0, v2di) /* Implemented by aarch64_set_qreg. */ BUILTIN_VQ (SETREG, set_qregoi, 0) BUILTIN_VQ (SETREG, set_qregci, 0) BUILTIN_VQ (SETREG, set_qregxi, 0) + VAR1 (SETREGP, set_qregoi, 0, v2di) + VAR1 (SETREGP, set_qregci, 0, v2di) + VAR1 (SETREGP, set_qregxi, 0, v2di) /* Implemented by aarch64_ld. 
*/ BUILTIN_VDC (LOADSTRUCT, ld2, 0) BUILTIN_VDC (LOADSTRUCT, ld3, 0) @@ -224,6 +234,7 @@ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0) BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0) BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0) + VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di) BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0) /* Implemented by aarch64_qshl_n. */ BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0) @@ -234,105 +245,145 @@ BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) /* Implemented by reduc__scal_ (producing scalar). */ - BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) - BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) + BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10) + BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10) BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) - BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10) - BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10) + BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10) + BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10) - /* Implemented by 3. + /* Implemented by 3. smax variants map to fmaxnm, smax_nan variants map to fmax. */ BUILTIN_VDQ_BHSI (BINOP, smax, 3) BUILTIN_VDQ_BHSI (BINOP, smin, 3) BUILTIN_VDQ_BHSI (BINOP, umax, 3) BUILTIN_VDQ_BHSI (BINOP, umin, 3) - BUILTIN_VDQF (BINOP, smax_nan, 3) - BUILTIN_VDQF (BINOP, smin_nan, 3) + BUILTIN_VHSDF_DF (BINOP, smax_nan, 3) + BUILTIN_VHSDF_DF (BINOP, smin_nan, 3) - /* Implemented by 3. */ - BUILTIN_VDQF (BINOP, fmax, 3) - BUILTIN_VDQF (BINOP, fmin, 3) + /* Implemented by 3. */ + BUILTIN_VHSDF_HSDF (BINOP, fmax, 3) + BUILTIN_VHSDF_HSDF (BINOP, fmin, 3) /* Implemented by aarch64_p. */ BUILTIN_VDQ_BHSI (BINOP, smaxp, 0) BUILTIN_VDQ_BHSI (BINOP, sminp, 0) BUILTIN_VDQ_BHSI (BINOP, umaxp, 0) BUILTIN_VDQ_BHSI (BINOP, uminp, 0) - BUILTIN_VDQF (BINOP, smaxp, 0) - BUILTIN_VDQF (BINOP, sminp, 0) - BUILTIN_VDQF (BINOP, smax_nanp, 0) - BUILTIN_VDQF (BINOP, smin_nanp, 0) + BUILTIN_VHSDF (BINOP, smaxp, 0) + BUILTIN_VHSDF (BINOP, sminp, 0) + BUILTIN_VHSDF (BINOP, smax_nanp, 0) + BUILTIN_VHSDF (BINOP, smin_nanp, 0) /* Implemented by 2. */ - BUILTIN_VDQF (UNOP, btrunc, 2) - BUILTIN_VDQF (UNOP, ceil, 2) - BUILTIN_VDQF (UNOP, floor, 2) - BUILTIN_VDQF (UNOP, nearbyint, 2) - BUILTIN_VDQF (UNOP, rint, 2) - BUILTIN_VDQF (UNOP, round, 2) - BUILTIN_VDQF_DF (UNOP, frintn, 2) + BUILTIN_VHSDF (UNOP, btrunc, 2) + BUILTIN_VHSDF (UNOP, ceil, 2) + BUILTIN_VHSDF (UNOP, floor, 2) + BUILTIN_VHSDF (UNOP, nearbyint, 2) + BUILTIN_VHSDF (UNOP, rint, 2) + BUILTIN_VHSDF (UNOP, round, 2) + BUILTIN_VHSDF_DF (UNOP, frintn, 2) + + VAR1 (UNOP, btrunc, 2, hf) + VAR1 (UNOP, ceil, 2, hf) + VAR1 (UNOP, floor, 2, hf) + VAR1 (UNOP, frintn, 2, hf) + VAR1 (UNOP, nearbyint, 2, hf) + VAR1 (UNOP, rint, 2, hf) + VAR1 (UNOP, round, 2, hf) /* Implemented by l2. */ + VAR1 (UNOP, lbtruncv4hf, 2, v4hi) + VAR1 (UNOP, lbtruncv8hf, 2, v8hi) VAR1 (UNOP, lbtruncv2sf, 2, v2si) VAR1 (UNOP, lbtruncv4sf, 2, v4si) VAR1 (UNOP, lbtruncv2df, 2, v2di) + VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi) + VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi) VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si) VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si) VAR1 (UNOPUS, lbtruncuv2df, 2, v2di) + VAR1 (UNOP, lroundv4hf, 2, v4hi) + VAR1 (UNOP, lroundv8hf, 2, v8hi) VAR1 (UNOP, lroundv2sf, 2, v2si) VAR1 (UNOP, lroundv4sf, 2, v4si) VAR1 (UNOP, lroundv2df, 2, v2di) - /* Implemented by l2. */ + /* Implemented by l2. 
*/ + BUILTIN_GPI_I16 (UNOP, lroundhf, 2) VAR1 (UNOP, lroundsf, 2, si) VAR1 (UNOP, lrounddf, 2, di) + VAR1 (UNOPUS, lrounduv4hf, 2, v4hi) + VAR1 (UNOPUS, lrounduv8hf, 2, v8hi) VAR1 (UNOPUS, lrounduv2sf, 2, v2si) VAR1 (UNOPUS, lrounduv4sf, 2, v4si) VAR1 (UNOPUS, lrounduv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2) VAR1 (UNOPUS, lroundusf, 2, si) VAR1 (UNOPUS, lroundudf, 2, di) + VAR1 (UNOP, lceilv4hf, 2, v4hi) + VAR1 (UNOP, lceilv8hf, 2, v8hi) VAR1 (UNOP, lceilv2sf, 2, v2si) VAR1 (UNOP, lceilv4sf, 2, v4si) VAR1 (UNOP, lceilv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOP, lceilhf, 2) + VAR1 (UNOPUS, lceiluv4hf, 2, v4hi) + VAR1 (UNOPUS, lceiluv8hf, 2, v8hi) VAR1 (UNOPUS, lceiluv2sf, 2, v2si) VAR1 (UNOPUS, lceiluv4sf, 2, v4si) VAR1 (UNOPUS, lceiluv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2) VAR1 (UNOPUS, lceilusf, 2, si) VAR1 (UNOPUS, lceiludf, 2, di) + VAR1 (UNOP, lfloorv4hf, 2, v4hi) + VAR1 (UNOP, lfloorv8hf, 2, v8hi) VAR1 (UNOP, lfloorv2sf, 2, v2si) VAR1 (UNOP, lfloorv4sf, 2, v4si) VAR1 (UNOP, lfloorv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOP, lfloorhf, 2) + VAR1 (UNOPUS, lflooruv4hf, 2, v4hi) + VAR1 (UNOPUS, lflooruv8hf, 2, v8hi) VAR1 (UNOPUS, lflooruv2sf, 2, v2si) VAR1 (UNOPUS, lflooruv4sf, 2, v4si) VAR1 (UNOPUS, lflooruv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2) VAR1 (UNOPUS, lfloorusf, 2, si) VAR1 (UNOPUS, lfloorudf, 2, di) + VAR1 (UNOP, lfrintnv4hf, 2, v4hi) + VAR1 (UNOP, lfrintnv8hf, 2, v8hi) VAR1 (UNOP, lfrintnv2sf, 2, v2si) VAR1 (UNOP, lfrintnv4sf, 2, v4si) VAR1 (UNOP, lfrintnv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2) VAR1 (UNOP, lfrintnsf, 2, si) VAR1 (UNOP, lfrintndf, 2, di) + VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi) + VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi) VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si) VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si) VAR1 (UNOPUS, lfrintnuv2df, 2, v2di) + BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2) VAR1 (UNOPUS, lfrintnusf, 2, si) VAR1 (UNOPUS, lfrintnudf, 2, di) /* Implemented by 2. */ + VAR1 (UNOP, floatv4hi, 2, v4hf) + VAR1 (UNOP, floatv8hi, 2, v8hf) VAR1 (UNOP, floatv2si, 2, v2sf) VAR1 (UNOP, floatv4si, 2, v4sf) VAR1 (UNOP, floatv2di, 2, v2df) + VAR1 (UNOP, floatunsv4hi, 2, v4hf) + VAR1 (UNOP, floatunsv8hi, 2, v8hf) VAR1 (UNOP, floatunsv2si, 2, v2sf) VAR1 (UNOP, floatunsv4si, 2, v4sf) VAR1 (UNOP, floatunsv2di, 2, v2df) @@ -352,19 +403,19 @@ /* Implemented by aarch64_frecp. */ - BUILTIN_GPF (UNOP, frecpe, 0) - BUILTIN_GPF (BINOP, frecps, 0) - BUILTIN_GPF (UNOP, frecpx, 0) + BUILTIN_GPF_F16 (UNOP, frecpe, 0) + BUILTIN_GPF_F16 (UNOP, frecpx, 0) BUILTIN_VDQ_SI (UNOP, urecpe, 0) - BUILTIN_VDQF (UNOP, frecpe, 0) - BUILTIN_VDQF (BINOP, frecps, 0) + BUILTIN_VHSDF (UNOP, frecpe, 0) + BUILTIN_VHSDF_HSDF (BINOP, frecps, 0) /* Implemented by a mixture of abs2 patterns. Note the DImode builtin is only ever used for the int64x1_t intrinsic, there is no scalar version. */ BUILTIN_VSDQ_I_DI (UNOP, abs, 0) - BUILTIN_VDQF (UNOP, abs, 2) + BUILTIN_VHSDF (UNOP, abs, 2) + VAR1 (UNOP, abs, 2, hf) BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10) VAR1 (BINOP, float_truncate_hi_, 0, v4sf) @@ -376,15 +427,22 @@ /* Implemented by aarch64_ld1. */ BUILTIN_VALL_F16 (LOAD1, ld1, 0) + VAR1(STORE1P, ld1, 0, v2di) /* Implemented by aarch64_st1. */ BUILTIN_VALL_F16 (STORE1, st1, 0) + VAR1(STORE1P, st1, 0, v2di) /* Implemented by fma4. */ - BUILTIN_VDQF (TERNOP, fma, 4) + BUILTIN_VHSDF (TERNOP, fma, 4) + VAR1 (TERNOP, fma, 4, hf) + /* Implemented by fnma4. */ + BUILTIN_VHSDF (TERNOP, fnma, 4) + VAR1 (TERNOP, fnma, 4, hf) /* Implemented by aarch64_simd_bsl. 
*/ BUILTIN_VDQQH (BSL_P, simd_bsl, 0) + VAR2 (BSL_P, simd_bsl,0, di, v2di) BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0) BUILTIN_VALLDIF (BSL_S, simd_bsl, 0) @@ -436,7 +494,7 @@ VAR1 (TERNOP, qtbx4, 0, v8qi) VAR1 (TERNOP, qtbx4, 0, v16qi) - /* Builtins for ARMv8.1 Adv.SIMD instructions. */ + /* Builtins for ARMv8.1-A Adv.SIMD instructions. */ /* Implemented by aarch64_sqrdmlh. */ BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0) @@ -449,3 +507,60 @@ /* Implemented by aarch64_sqrdmlh_laneq. */ BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0) BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0) + + /* Implemented by <*><*>3. */ + BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3) + BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3) + BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3) + BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3) + VAR1 (SHIFTIMM, scvtfsi, 3, hf) + VAR1 (SHIFTIMM, scvtfdi, 3, hf) + VAR1 (FCVTIMM_SUS, ucvtfsi, 3, hf) + VAR1 (FCVTIMM_SUS, ucvtfdi, 3, hf) + BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3) + BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3) + + /* Implemented by aarch64_rsqrte. */ + BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0) + + /* Implemented by aarch64_rsqrts. */ + BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0) + + /* Implemented by fabd3. */ + BUILTIN_VHSDF_HSDF (BINOP, fabd, 3) + + /* Implemented by aarch64_faddp. */ + BUILTIN_VHSDF (BINOP, faddp, 0) + + /* Implemented by aarch64_cm. */ + BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0) + + /* Implemented by neg2. */ + BUILTIN_VHSDF_HSDF (UNOP, neg, 2) + + /* Implemented by aarch64_fac. */ + BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0) + + /* Implemented by sqrt2. */ + VAR1 (UNOP, sqrt, 2, hf) + + /* Implemented by hf2. 
*/ + VAR1 (UNOP, floatdi, 2, hf) + VAR1 (UNOP, floatsi, 2, hf) + VAR1 (UNOP, floathi, 2, hf) + VAR1 (UNOPUS, floatunsdi, 2, hf) + VAR1 (UNOPUS, floatunssi, 2, hf) + VAR1 (UNOPUS, floatunshi, 2, hf) + BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2) + BUILTIN_GPI (UNOP, fix_truncsf, 2) + BUILTIN_GPI (UNOP, fix_truncdf, 2) + BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2) + BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2) + BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2) \ No newline at end of file --- a/src/gcc/config/aarch64/aarch64-simd.md +++ b/src/gcc/config/aarch64/aarch64-simd.md @@ -351,7 +351,7 @@ operands[2] = GEN_INT (ENDIAN_LANE_N (mode, INTVAL (operands[2]))); return "mul\\t%0., %3., %1.[%2]"; } - [(set_attr "type" "neon_mul__scalar")] + [(set_attr "type" "neon_mul__scalar")] ) (define_insn "*aarch64_mul3_elt_" @@ -371,33 +371,33 @@ [(set_attr "type" "neon_mul__scalar")] ) -(define_insn "*aarch64_mul3_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (mult:V2DF - (vec_duplicate:V2DF - (match_operand:DF 2 "register_operand" "w")) - (match_operand:V2DF 1 "register_operand" "w")))] +(define_insn "*aarch64_mul3_elt_from_dup" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (mult:VMUL + (vec_duplicate:VMUL + (match_operand: 1 "register_operand" "")) + (match_operand:VMUL 2 "register_operand" "w")))] "TARGET_SIMD" - "fmul\\t%0.2d, %1.2d, %2.d[0]" - [(set_attr "type" "neon_fp_mul_d_scalar_q")] + "mul\t%0., %2., %1.[0]"; + [(set_attr "type" "neon_mul__scalar")] ) -(define_insn "aarch64_rsqrte_2" - [(set (match_operand:VALLF 0 "register_operand" "=w") - (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] +(define_insn "aarch64_rsqrte" + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")] UNSPEC_RSQRTE))] "TARGET_SIMD" "frsqrte\\t%0, %1" - [(set_attr "type" "neon_fp_rsqrte_")]) + [(set_attr "type" "neon_fp_rsqrte_")]) -(define_insn "aarch64_rsqrts_3" - [(set (match_operand:VALLF 0 "register_operand" "=w") - (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") - (match_operand:VALLF 2 "register_operand" "w")] - UNSPEC_RSQRTS))] +(define_insn "aarch64_rsqrts" + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w") + (match_operand:VHSDF_HSDF 2 "register_operand" "w")] + UNSPEC_RSQRTS))] "TARGET_SIMD" "frsqrts\\t%0, %1, %2" - [(set_attr "type" "neon_fp_rsqrts_")]) + [(set_attr "type" "neon_fp_rsqrts_")]) (define_expand "rsqrt2" [(set (match_operand:VALLF 0 "register_operand" "=w") @@ -405,7 +405,7 @@ UNSPEC_RSQRT))] "TARGET_SIMD" { - aarch64_emit_approx_rsqrt (operands[0], operands[1]); + aarch64_emit_approx_sqrt (operands[0], operands[1], true); DONE; }) @@ -474,24 +474,15 @@ [(set_attr "type" "neon_arith_acc")] ) -(define_insn "fabd_3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (abs:VDQF (minus:VDQF - (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w"))))] - "TARGET_SIMD" - "fabd\t%0., %1., %2." 
- [(set_attr "type" "neon_fp_abd_")] -) - -(define_insn "*fabd_scalar3" - [(set (match_operand:GPF 0 "register_operand" "=w") - (abs:GPF (minus:GPF - (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w"))))] +(define_insn "fabd3" + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (abs:VHSDF_HSDF + (minus:VHSDF_HSDF + (match_operand:VHSDF_HSDF 1 "register_operand" "w") + (match_operand:VHSDF_HSDF 2 "register_operand" "w"))))] "TARGET_SIMD" - "fabd\t%0, %1, %2" - [(set_attr "type" "neon_fp_abd_")] + "fabd\t%0, %1, %2" + [(set_attr "type" "neon_fp_abd_")] ) (define_insn "and3" @@ -555,6 +546,49 @@ [(set_attr "type" "neon_from_gp, neon_ins, neon_load1_1reg")] ) +(define_insn "*aarch64_simd_vec_copy_lane" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_merge:VALL + (vec_duplicate:VALL + (vec_select: + (match_operand:VALL 3 "register_operand" "w") + (parallel + [(match_operand:SI 4 "immediate_operand" "i")]))) + (match_operand:VALL 1 "register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_SIMD" + { + int elt = ENDIAN_LANE_N (mode, exact_log2 (INTVAL (operands[2]))); + operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); + operands[4] = GEN_INT (ENDIAN_LANE_N (mode, INTVAL (operands[4]))); + + return "ins\t%0.[%p2], %3.[%4]"; + } + [(set_attr "type" "neon_ins")] +) + +(define_insn "*aarch64_simd_vec_copy_lane_" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_merge:VALL + (vec_duplicate:VALL + (vec_select: + (match_operand: 3 "register_operand" "w") + (parallel + [(match_operand:SI 4 "immediate_operand" "i")]))) + (match_operand:VALL 1 "register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_SIMD" + { + int elt = ENDIAN_LANE_N (mode, exact_log2 (INTVAL (operands[2]))); + operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); + operands[4] = GEN_INT (ENDIAN_LANE_N (mode, + INTVAL (operands[4]))); + + return "ins\t%0.[%p2], %3.[%4]"; + } + [(set_attr "type" "neon_ins")] +) + (define_insn "aarch64_simd_lshr" [(set (match_operand:VDQ_I 0 "register_operand" "=w") (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") @@ -1071,10 +1105,10 @@ ;; Pairwise FP Max/Min operations. (define_insn "aarch64_p" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")] - FMAXMINV))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + FMAXMINV))] "TARGET_SIMD" "p\t%0., %1., %2." [(set_attr "type" "neon_minmax")] @@ -1483,65 +1517,77 @@ ;; FP arithmetic operations. (define_insn "add3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (plus:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] "TARGET_SIMD" "fadd\\t%0., %1., %2." 
- [(set_attr "type" "neon_fp_addsub_")] + [(set_attr "type" "neon_fp_addsub_")] ) (define_insn "sub3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (minus:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (minus:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] "TARGET_SIMD" "fsub\\t%0., %1., %2." - [(set_attr "type" "neon_fp_addsub_")] + [(set_attr "type" "neon_fp_addsub_")] ) (define_insn "mul3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (mult:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (mult:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] "TARGET_SIMD" "fmul\\t%0., %1., %2." - [(set_attr "type" "neon_fp_mul_")] + [(set_attr "type" "neon_fp_mul_")] ) -(define_insn "div3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (div:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")))] +(define_expand "div3" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] + "TARGET_SIMD" +{ + if (aarch64_emit_approx_div (operands[0], operands[1], operands[2])) + DONE; + + operands[1] = force_reg (mode, operands[1]); +}) + +(define_insn "*div3" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] "TARGET_SIMD" "fdiv\\t%0., %1., %2." - [(set_attr "type" "neon_fp_div_")] + [(set_attr "type" "neon_fp_div_")] ) (define_insn "neg2" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (neg:VDQF (match_operand:VDQF 1 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] "TARGET_SIMD" "fneg\\t%0., %1." - [(set_attr "type" "neon_fp_neg_")] + [(set_attr "type" "neon_fp_neg_")] ) (define_insn "abs2" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (abs:VDQF (match_operand:VDQF 1 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] "TARGET_SIMD" "fabs\\t%0., %1." - [(set_attr "type" "neon_fp_abs_")] + [(set_attr "type" "neon_fp_abs_")] ) (define_insn "fma4" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (fma:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w") - (match_operand:VDQF 3 "register_operand" "0")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w") + (match_operand:VHSDF 3 "register_operand" "0")))] "TARGET_SIMD" "fmla\\t%0., %1., %2." 
- [(set_attr "type" "neon_fp_mla_")] + [(set_attr "type" "neon_fp_mla_")] ) (define_insn "*aarch64_fma4_elt" @@ -1579,16 +1625,16 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 2 "register_operand" "w") - (match_operand:V2DF 3 "register_operand" "0")))] +(define_insn "*aarch64_fma4_elt_from_dup" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (vec_duplicate:VMUL + (match_operand: 1 "register_operand" "w")) + (match_operand:VMUL 2 "register_operand" "w") + (match_operand:VMUL 3 "register_operand" "0")))] "TARGET_SIMD" - "fmla\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] + "fmla\t%0., %2., %1.[0]" + [(set_attr "type" "neon_mla__scalar")] ) (define_insn "*aarch64_fma4_elt_to_64v2df" @@ -1608,15 +1654,15 @@ ) (define_insn "fnma4" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (fma:VDQF - (match_operand:VDQF 1 "register_operand" "w") - (neg:VDQF - (match_operand:VDQF 2 "register_operand" "w")) - (match_operand:VDQF 3 "register_operand" "0")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (fma:VHSDF + (match_operand:VHSDF 1 "register_operand" "w") + (neg:VHSDF + (match_operand:VHSDF 2 "register_operand" "w")) + (match_operand:VHSDF 3 "register_operand" "0")))] "TARGET_SIMD" - "fmls\\t%0., %1., %2." - [(set_attr "type" "neon_fp_mla_")] + "fmls\\t%0., %1., %2." + [(set_attr "type" "neon_fp_mla_")] ) (define_insn "*aarch64_fnma4_elt" @@ -1656,17 +1702,17 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fnma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (neg:V2DF - (match_operand:V2DF 2 "register_operand" "w")) - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 3 "register_operand" "0")))] +(define_insn "*aarch64_fnma4_elt_from_dup" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (neg:VMUL + (match_operand:VMUL 2 "register_operand" "w")) + (vec_duplicate:VMUL + (match_operand: 1 "register_operand" "w")) + (match_operand:VMUL 3 "register_operand" "0")))] "TARGET_SIMD" - "fmls\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] + "fmls\t%0., %2., %1.[0]" + [(set_attr "type" "neon_mla__scalar")] ) (define_insn "*aarch64_fnma4_elt_to_64v2df" @@ -1689,24 +1735,50 @@ ;; Vector versions of the floating-point frint patterns. ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn. (define_insn "2" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")] - FRINT))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")] + FRINT))] "TARGET_SIMD" "frint\\t%0., %1." - [(set_attr "type" "neon_fp_round_")] + [(set_attr "type" "neon_fp_round_")] ) ;; Vector versions of the fcvt standard patterns. ;; Expands to lbtrunc, lround, lceil, lfloor -(define_insn "l2" +(define_insn "l2" [(set (match_operand: 0 "register_operand" "=w") (FIXUORS: (unspec: - [(match_operand:VDQF 1 "register_operand" "w")] + [(match_operand:VHSDF 1 "register_operand" "w")] FCVT)))] "TARGET_SIMD" "fcvt\\t%0., %1." - [(set_attr "type" "neon_fp_to_int_")] + [(set_attr "type" "neon_fp_to_int_")] +) + +;; HF Scalar variants of related SIMD instructions. 
+(define_insn "lhfhi2" + [(set (match_operand:HI 0 "register_operand" "=w") + (FIXUORS:HI (unspec:HF [(match_operand:HF 1 "register_operand" "w")] + FCVT)))] + "TARGET_SIMD_F16INST" + "fcvt\t%h0, %h1" + [(set_attr "type" "neon_fp_to_int_s")] +) + +(define_insn "_trunchfhi2" + [(set (match_operand:HI 0 "register_operand" "=w") + (FIXUORS:HI (match_operand:HF 1 "register_operand" "w")))] + "TARGET_SIMD_F16INST" + "fcvtz\t%h0, %h1" + [(set_attr "type" "neon_fp_to_int_s")] +) + +(define_insn "hihf2" + [(set (match_operand:HF 0 "register_operand" "=w") + (FLOATUORS:HF (match_operand:HI 1 "register_operand" "w")))] + "TARGET_SIMD_F16INST" + "cvtf\t%h0, %h1" + [(set_attr "type" "neon_int_to_fp_s")] ) (define_insn "*aarch64_fcvt2_mult" @@ -1729,36 +1801,36 @@ [(set_attr "type" "neon_fp_to_int_")] ) -(define_expand "2" +(define_expand "2" [(set (match_operand: 0 "register_operand") (FIXUORS: (unspec: - [(match_operand:VDQF 1 "register_operand")] - UNSPEC_FRINTZ)))] + [(match_operand:VHSDF 1 "register_operand")] + UNSPEC_FRINTZ)))] "TARGET_SIMD" {}) -(define_expand "2" +(define_expand "2" [(set (match_operand: 0 "register_operand") (FIXUORS: (unspec: - [(match_operand:VDQF 1 "register_operand")] - UNSPEC_FRINTZ)))] + [(match_operand:VHSDF 1 "register_operand")] + UNSPEC_FRINTZ)))] "TARGET_SIMD" {}) -(define_expand "ftrunc2" - [(set (match_operand:VDQF 0 "register_operand") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand")] - UNSPEC_FRINTZ))] +(define_expand "ftrunc2" + [(set (match_operand:VHSDF 0 "register_operand") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")] + UNSPEC_FRINTZ))] "TARGET_SIMD" {}) -(define_insn "2" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (FLOATUORS:VDQF +(define_insn "2" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FLOATUORS:VHSDF (match_operand: 1 "register_operand" "w")))] "TARGET_SIMD" "cvtf\\t%0., %1." - [(set_attr "type" "neon_int_to_fp_")] + [(set_attr "type" "neon_int_to_fp_")] ) ;; Conversions between vectors of floats and doubles. @@ -1778,6 +1850,30 @@ [(set_attr "type" "neon_fp_cvt_widen_s")] ) +;; Convert between fixed-point and floating-point (vector modes) + +(define_insn "3" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: + [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_F2FIXED))] + "TARGET_SIMD" + "\t%0, %1, #%2" + [(set_attr "type" "neon_fp_to_int_")] +) + +(define_insn "3" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: + [(match_operand:VDQ_HSDI 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_FIXED2F))] + "TARGET_SIMD" + "\t%0, %1, #%2" + [(set_attr "type" "neon_int_to_fp_")] +) + ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns ;; is inconsistent with vector ordering elsewhere in the compiler, in that ;; the meaning of HI and LO changes depending on the target endianness. @@ -1934,33 +2030,25 @@ ;; NaNs. (define_insn "3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (FMAXMIN:VDQF (match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF (match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")))] "TARGET_SIMD" "fnm\\t%0., %1., %2." - [(set_attr "type" "neon_fp_minmax_")] + [(set_attr "type" "neon_fp_minmax_")] ) +;; Vector forms for fmax, fmin, fmaxnm, fminnm. 
+;; fmaxnm and fminnm are used for the fmax3 standard pattern names, +;; which implement the IEEE fmax ()/fmin () functions. (define_insn "3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")] - FMAXMIN_UNS))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + FMAXMIN_UNS))] "TARGET_SIMD" "\\t%0., %1., %2." - [(set_attr "type" "neon_fp_minmax_")] -) - -;; Auto-vectorized forms for the IEEE-754 fmax()/fmin() functions -(define_insn "3" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") - (match_operand:VDQF 2 "register_operand" "w")] - FMAXMIN))] - "TARGET_SIMD" - "\\t%0., %1., %2." - [(set_attr "type" "neon_fp_minmax_")] + [(set_attr "type" "neon_fp_minmax_")] ) ;; 'across lanes' add. @@ -1979,17 +2067,14 @@ } ) -(define_expand "reduc_plus_scal_" - [(match_operand: 0 "register_operand" "=w") - (match_operand:V2F 1 "register_operand" "w")] - "TARGET_SIMD" - { - rtx elt = GEN_INT (ENDIAN_LANE_N (mode, 0)); - rtx scratch = gen_reg_rtx (mode); - emit_insn (gen_aarch64_reduc_plus_internal (scratch, operands[1])); - emit_insn (gen_aarch64_get_lane (operands[0], scratch, elt)); - DONE; - } +(define_insn "aarch64_faddp" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + UNSPEC_FADDV))] + "TARGET_SIMD" + "faddp\t%0., %1., %2." + [(set_attr "type" "neon_fp_reduc_add_")] ) (define_insn "aarch64_reduc_plus_internal" @@ -2010,24 +2095,15 @@ [(set_attr "type" "neon_reduc_add")] ) -(define_insn "aarch64_reduc_plus_internal" - [(set (match_operand:V2F 0 "register_operand" "=w") - (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")] +(define_insn "reduc_plus_scal_" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand:V2F 1 "register_operand" "w")] UNSPEC_FADDV))] "TARGET_SIMD" "faddp\\t%0, %1." [(set_attr "type" "neon_fp_reduc_add_")] ) -(define_insn "aarch64_addpv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=w") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")] - UNSPEC_FADDV))] - "TARGET_SIMD" - "faddp\\t%0.4s, %1.4s, %1.4s" - [(set_attr "type" "neon_fp_reduc_add_s_q")] -) - (define_expand "reduc_plus_scal_v4sf" [(set (match_operand:SF 0 "register_operand") (unspec:V4SF [(match_operand:V4SF 1 "register_operand")] @@ -2036,8 +2112,8 @@ { rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0)); rtx scratch = gen_reg_rtx (V4SFmode); - emit_insn (gen_aarch64_addpv4sf (scratch, operands[1])); - emit_insn (gen_aarch64_addpv4sf (scratch, scratch)); + emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1])); + emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch)); emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt)); DONE; }) @@ -2072,8 +2148,8 @@ ;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code. (This is FP smax/smin). 
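The comment above is the crux of this hunk: the unified pattern now emits fmaxnm/fminnm for the fmax/fmin standard pattern names, matching the ISO C requirement that fmax ()/fmin () ignore a single quiet NaN operand, something a bare RTL smax/smin does not guarantee. A small host-side illustration using only the standard library:

#include <math.h>
#include <stdio.h>

int main (void)
{
  double q = nan ("");
  /* fmax/fmin return the non-NaN operand when exactly one operand is
     a quiet NaN; this is the fmaxnm/fminnm behaviour.  */
  printf ("%g %g\n", fmax (q, 2.0), fmin (q, 2.0));  /* prints: 2 2 */
  return 0;
}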
(define_expand "reduc__scal_" [(match_operand: 0 "register_operand") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand")] - FMAXMINV)] + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")] + FMAXMINV)] "TARGET_SIMD" { rtx elt = GEN_INT (ENDIAN_LANE_N (mode, 0)); @@ -2120,12 +2196,12 @@ ) (define_insn "aarch64_reduc__internal" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")] - FMAXMINV))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")] + FMAXMINV))] "TARGET_SIMD" "\\t%0, %1." - [(set_attr "type" "neon_fp_reduc_minmax_")] + [(set_attr "type" "neon_fp_reduc_minmax_")] ) ;; aarch64_simd_bsl may compile to any of bsl/bif/bit depending on register @@ -2635,7 +2711,7 @@ (define_insn "*aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VD_BHSI 1 "general_operand" "w,r,m") + (match_operand:VD_BHSI 1 "general_operand" "w,?r,m") (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ @@ -2651,7 +2727,7 @@ [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz") - (match_operand:VD_BHSI 1 "general_operand" "w,r,m")))] + (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ mov\\t%0.8b, %1.8b @@ -2994,13 +3070,14 @@ ;; fmulx. (define_insn "aarch64_fmulx" - [(set (match_operand:VALLF 0 "register_operand" "=w") - (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") - (match_operand:VALLF 2 "register_operand" "w")] - UNSPEC_FMULX))] + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (unspec:VHSDF_HSDF + [(match_operand:VHSDF_HSDF 1 "register_operand" "w") + (match_operand:VHSDF_HSDF 2 "register_operand" "w")] + UNSPEC_FMULX))] "TARGET_SIMD" "fmulx\t%0, %1, %2" - [(set_attr "type" "neon_fp_mul_")] + [(set_attr "type" "neon_fp_mul_")] ) ;; vmulxq_lane_f32, and vmulx_laneq_f32 @@ -3042,20 +3119,18 @@ [(set_attr "type" "neon_fp_mul_")] ) -;; vmulxq_lane_f64 +;; vmulxq_lane -(define_insn "*aarch64_mulx_elt_to_64v2df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (unspec:V2DF - [(match_operand:V2DF 1 "register_operand" "w") - (vec_duplicate:V2DF - (match_operand:DF 2 "register_operand" "w"))] +(define_insn "*aarch64_mulx_elt_from_dup" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF + [(match_operand:VHSDF 1 "register_operand" "w") + (vec_duplicate:VHSDF + (match_operand: 2 "register_operand" "w"))] UNSPEC_FMULX))] "TARGET_SIMD" - { - return "fmulx\t%0.2d, %1.2d, %2.d[0]"; - } - [(set_attr "type" "neon_fp_mul_d_scalar_q")] + "fmulx\t%0., %1., %2.[0]"; + [(set_attr "type" "neon_mul__scalar")] ) ;; vmulxs_lane_f32, vmulxs_laneq_f32 @@ -3937,15 +4012,12 @@ "aarch64_simd_shift_imm_bitsize_" "i")] VSHLL))] "TARGET_SIMD" - "* - int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; - if (INTVAL (operands[2]) == bit_width) { - return \"shll\\t%0., %1., %2\"; + if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (mode)) + return "shll\\t%0., %1., %2"; + else + return "shll\\t%0., %1., %2"; } - else { - return \"shll\\t%0., %1., %2\"; - }" [(set_attr "type" "neon_shift_imm_long")] ) @@ -3957,15 +4029,12 @@ (match_operand:SI 2 "immediate_operand" "i")] VSHLL))] "TARGET_SIMD" - "* - int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; - if (INTVAL (operands[2]) == bit_width) { - return \"shll2\\t%0., %1., 
%2\"; + if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (mode)) + return "shll2\\t%0., %1., %2"; + else + return "shll2\\t%0., %1., %2"; } - else { - return \"shll2\\t%0., %1., %2\"; - }" [(set_attr "type" "neon_shift_imm_long")] ) @@ -4246,30 +4315,32 @@ [(set (match_operand: 0 "register_operand" "=w,w") (neg: (COMPARISONS: - (match_operand:VALLF 1 "register_operand" "w,w") - (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz") + (match_operand:VHSDF_HSDF 1 "register_operand" "w,w") + (match_operand:VHSDF_HSDF 2 "aarch64_simd_reg_or_zero" "w,YDz") )))] "TARGET_SIMD" "@ fcm\t%0, %, % fcm\t%0, %1, 0" - [(set_attr "type" "neon_fp_compare_")] + [(set_attr "type" "neon_fp_compare_")] ) ;; fac(ge|gt) ;; Note we can also handle what would be fac(le|lt) by ;; generating fac(ge|gt). -(define_insn "*aarch64_fac" +(define_insn "aarch64_fac" [(set (match_operand: 0 "register_operand" "=w") (neg: (FAC_COMPARISONS: - (abs:VALLF (match_operand:VALLF 1 "register_operand" "w")) - (abs:VALLF (match_operand:VALLF 2 "register_operand" "w")) + (abs:VHSDF_HSDF + (match_operand:VHSDF_HSDF 1 "register_operand" "w")) + (abs:VHSDF_HSDF + (match_operand:VHSDF_HSDF 2 "register_operand" "w")) )))] "TARGET_SIMD" "fac\t%0, %, %" - [(set_attr "type" "neon_fp_compare_")] + [(set_attr "type" "neon_fp_compare_")] ) ;; addp @@ -4297,12 +4368,21 @@ ;; sqrt -(define_insn "sqrt2" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))] +(define_expand "sqrt2" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] + "TARGET_SIMD" +{ + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) + DONE; +}) + +(define_insn "*sqrt2" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] "TARGET_SIMD" "fsqrt\\t%0., %1." - [(set_attr "type" "neon_fp_sqrt_")] + [(set_attr "type" "neon_fp_sqrt_")] ) ;; Patterns for vector struct loads and stores. @@ -4652,7 +4732,7 @@ ld1\\t{%S0.16b - %0.16b}, %1" [(set_attr "type" "multiple,neon_store_reg_q,\ neon_load_reg_q") - (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))] + (set_attr "length" ",4,4")] ) (define_insn "aarch64_be_ld1" @@ -4685,7 +4765,7 @@ stp\\t%q1, %R1, %0 ldp\\t%q0, %R0, %1" [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q") - (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))] + (set_attr "length" "8,4,4")] ) (define_insn "*aarch64_be_movci" @@ -4696,7 +4776,7 @@ || register_operand (operands[1], CImode))" "#" [(set_attr "type" "multiple") - (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))] + (set_attr "length" "12,4,4")] ) (define_insn "*aarch64_be_movxi" @@ -4707,7 +4787,7 @@ || register_operand (operands[1], XImode))" "#" [(set_attr "type" "multiple") - (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))] + (set_attr "length" "16,4,4")] ) (define_split @@ -4787,7 +4867,7 @@ DONE; }) -(define_insn "aarch64_ld2_dreg" +(define_insn "aarch64_ld2_dreg_le" [(set (match_operand:OI 0 "register_operand" "=w") (subreg:OI (vec_concat: @@ -4800,12 +4880,30 @@ (unspec:VD [(match_dup 1)] UNSPEC_LD2) (vec_duplicate:VD (const_int 0)))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" "ld2\\t{%S0. 
- %T0.}, %1" [(set_attr "type" "neon_load2_2reg")] ) -(define_insn "aarch64_ld2_dreg" +(define_insn "aarch64_ld2_dreg_be" + [(set (match_operand:OI 0 "register_operand" "=w") + (subreg:OI + (vec_concat: + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD2)) + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD2))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "ld2\\t{%S0. - %T0.}, %1" + [(set_attr "type" "neon_load2_2reg")] +) + +(define_insn "aarch64_ld2_dreg_le" [(set (match_operand:OI 0 "register_operand" "=w") (subreg:OI (vec_concat: @@ -4818,12 +4916,30 @@ (unspec:DX [(match_dup 1)] UNSPEC_LD2) (const_int 0))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" "ld1\\t{%S0.1d - %T0.1d}, %1" [(set_attr "type" "neon_load1_2reg")] ) -(define_insn "aarch64_ld3_dreg" +(define_insn "aarch64_ld2_dreg_be" + [(set (match_operand:OI 0 "register_operand" "=w") + (subreg:OI + (vec_concat: + (vec_concat: + (const_int 0) + (unspec:DX + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD2)) + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD2))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "ld1\\t{%S0.1d - %T0.1d}, %1" + [(set_attr "type" "neon_load1_2reg")] +) + +(define_insn "aarch64_ld3_dreg_le" [(set (match_operand:CI 0 "register_operand" "=w") (subreg:CI (vec_concat: @@ -4841,12 +4957,35 @@ (unspec:VD [(match_dup 1)] UNSPEC_LD3) (vec_duplicate:VD (const_int 0)))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "ld3\\t{%S0. - %U0.}, %1" + [(set_attr "type" "neon_load3_3reg")] +) + +(define_insn "aarch64_ld3_dreg_be" + [(set (match_operand:CI 0 "register_operand" "=w") + (subreg:CI + (vec_concat: + (vec_concat: + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD3)) + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD3))) + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD3))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "ld3\\t{%S0. 
- %U0.}, %1" [(set_attr "type" "neon_load3_3reg")] ) -(define_insn "aarch64_ld3_dreg" +(define_insn "aarch64_ld3_dreg_le" [(set (match_operand:CI 0 "register_operand" "=w") (subreg:CI (vec_concat: @@ -4864,12 +5003,35 @@ (unspec:DX [(match_dup 1)] UNSPEC_LD3) (const_int 0))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" "ld1\\t{%S0.1d - %U0.1d}, %1" [(set_attr "type" "neon_load1_3reg")] ) -(define_insn "aarch64_ld4_dreg" +(define_insn "aarch64_ld3_dreg_be" + [(set (match_operand:CI 0 "register_operand" "=w") + (subreg:CI + (vec_concat: + (vec_concat: + (vec_concat: + (const_int 0) + (unspec:DX + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD3)) + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD3))) + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD3))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "ld1\\t{%S0.1d - %U0.1d}, %1" + [(set_attr "type" "neon_load1_3reg")] +) + +(define_insn "aarch64_ld4_dreg_le" [(set (match_operand:XI 0 "register_operand" "=w") (subreg:XI (vec_concat: @@ -4880,9 +5042,9 @@ UNSPEC_LD4) (vec_duplicate:VD (const_int 0))) (vec_concat: - (unspec:VD [(match_dup 1)] + (unspec:VD [(match_dup 1)] UNSPEC_LD4) - (vec_duplicate:VD (const_int 0)))) + (vec_duplicate:VD (const_int 0)))) (vec_concat: (vec_concat: (unspec:VD [(match_dup 1)] @@ -4892,12 +5054,40 @@ (unspec:VD [(match_dup 1)] UNSPEC_LD4) (vec_duplicate:VD (const_int 0))))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "ld4\\t{%S0. - %V0.}, %1" + [(set_attr "type" "neon_load4_4reg")] +) + +(define_insn "aarch64_ld4_dreg_be" + [(set (match_operand:XI 0 "register_operand" "=w") + (subreg:XI + (vec_concat: + (vec_concat: + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD4)) + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD4))) + (vec_concat: + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD4)) + (vec_concat: + (vec_duplicate:VD (const_int 0)) + (unspec:VD [(match_dup 1)] + UNSPEC_LD4)))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "ld4\\t{%S0. 
- %V0.}, %1" [(set_attr "type" "neon_load4_4reg")] ) -(define_insn "aarch64_ld4_dreg" +(define_insn "aarch64_ld4_dreg_le" [(set (match_operand:XI 0 "register_operand" "=w") (subreg:XI (vec_concat: @@ -4910,7 +5100,7 @@ (vec_concat: (unspec:DX [(match_dup 1)] UNSPEC_LD4) - (const_int 0))) + (const_int 0))) (vec_concat: (vec_concat: (unspec:DX [(match_dup 1)] @@ -4920,7 +5110,35 @@ (unspec:DX [(match_dup 1)] UNSPEC_LD4) (const_int 0)))) 0))] - "TARGET_SIMD" + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "ld1\\t{%S0.1d - %V0.1d}, %1" + [(set_attr "type" "neon_load1_4reg")] +) + +(define_insn "aarch64_ld4_dreg_be" + [(set (match_operand:XI 0 "register_operand" "=w") + (subreg:XI + (vec_concat: + (vec_concat: + (vec_concat: + (const_int 0) + (unspec:DX + [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD4)) + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD4))) + (vec_concat: + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD4)) + (vec_concat: + (const_int 0) + (unspec:DX [(match_dup 1)] + UNSPEC_LD4)))) 0))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "ld1\\t{%S0.1d - %V0.1d}, %1" [(set_attr "type" "neon_load1_4reg")] ) @@ -4934,7 +5152,12 @@ rtx mem = gen_rtx_MEM (BLKmode, operands[1]); set_mem_size (mem, * 8); - emit_insn (gen_aarch64_ld_dreg (operands[0], mem)); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_ld_dreg_be (operands[0], + mem)); + else + emit_insn (gen_aarch64_ld_dreg_le (operands[0], + mem)); DONE; }) @@ -5160,10 +5383,10 @@ ) (define_insn "aarch64_" - [(set (match_operand:VALL 0 "register_operand" "=w") - (unspec:VALL [(match_operand:VALL 1 "register_operand" "w") - (match_operand:VALL 2 "register_operand" "w")] - PERMUTE))] + [(set (match_operand:VALL_F16 0 "register_operand" "=w") + (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") + (match_operand:VALL_F16 2 "register_operand" "w")] + PERMUTE))] "TARGET_SIMD" "\\t%0., %1., %2." [(set_attr "type" "neon_permute")] @@ -5171,11 +5394,11 @@ ;; Note immediate (third) operand is lane index not byte index. (define_insn "aarch64_ext" - [(set (match_operand:VALL 0 "register_operand" "=w") - (unspec:VALL [(match_operand:VALL 1 "register_operand" "w") - (match_operand:VALL 2 "register_operand" "w") - (match_operand:SI 3 "immediate_operand" "i")] - UNSPEC_EXT))] + [(set (match_operand:VALL_F16 0 "register_operand" "=w") + (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") + (match_operand:VALL_F16 2 "register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_EXT))] "TARGET_SIMD" { operands[3] = GEN_INT (INTVAL (operands[3]) @@ -5186,8 +5409,8 @@ ) (define_insn "aarch64_rev" - [(set (match_operand:VALL 0 "register_operand" "=w") - (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")] + [(set (match_operand:VALL_F16 0 "register_operand" "=w") + (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")] REVERSE))] "TARGET_SIMD" "rev\\t%0., %1." @@ -5354,31 +5577,32 @@ ) (define_insn "aarch64_frecpe" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")] - UNSPEC_FRECPE))] + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")] + UNSPEC_FRECPE))] "TARGET_SIMD" "frecpe\\t%0., %1." 
- [(set_attr "type" "neon_fp_recpe_")] + [(set_attr "type" "neon_fp_recpe_")] ) (define_insn "aarch64_frecp" - [(set (match_operand:GPF 0 "register_operand" "=w") - (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")] - FRECP))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")] + FRECP))] "TARGET_SIMD" "frecp\\t%0, %1" - [(set_attr "type" "neon_fp_recp_")] + [(set_attr "type" "neon_fp_recp_")] ) (define_insn "aarch64_frecps" - [(set (match_operand:VALLF 0 "register_operand" "=w") - (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") - (match_operand:VALLF 2 "register_operand" "w")] - UNSPEC_FRECPS))] + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (unspec:VHSDF_HSDF + [(match_operand:VHSDF_HSDF 1 "register_operand" "w") + (match_operand:VHSDF_HSDF 2 "register_operand" "w")] + UNSPEC_FRECPS))] "TARGET_SIMD" "frecps\\t%0, %1, %2" - [(set_attr "type" "neon_fp_recps_")] + [(set_attr "type" "neon_fp_recps_")] ) (define_insn "aarch64_urecpe" @@ -5414,13 +5638,25 @@ [(set_attr "type" "crypto_aese")] ) +;; When AES/AESMC fusion is enabled we want the register allocation to +;; look like: +;; AESE Vn, _ +;; AESMC Vn, Vn +;; So prefer to tie operand 1 to operand 0 when fusing. + (define_insn "aarch64_crypto_aesv16qi" - [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")] + [(set (match_operand:V16QI 0 "register_operand" "=w,w") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")] CRYPTO_AESMC))] "TARGET_SIMD && TARGET_CRYPTO" "aes\\t%0.16b, %1.16b" - [(set_attr "type" "crypto_aesmc")] + [(set_attr "type" "crypto_aesmc") + (set_attr_alternative "enabled" + [(if_then_else (match_test + "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)") + (const_string "yes" ) + (const_string "no")) + (const_string "yes")])] ) ;; sha1 @@ -5435,6 +5671,26 @@ [(set_attr "type" "crypto_sha1_fast")] ) +(define_insn "aarch64_crypto_sha1hv4si" + [(set (match_operand:SI 0 "register_operand" "=w") + (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w") + (parallel [(const_int 0)]))] + UNSPEC_SHA1H))] + "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN" + "sha1h\\t%s0, %s1" + [(set_attr "type" "crypto_sha1_fast")] +) + +(define_insn "aarch64_be_crypto_sha1hv4si" + [(set (match_operand:SI 0 "register_operand" "=w") + (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w") + (parallel [(const_int 3)]))] + UNSPEC_SHA1H))] + "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN" + "sha1h\\t%s0, %s1" + [(set_attr "type" "crypto_sha1_fast")] +) + (define_insn "aarch64_crypto_sha1su1v4si" [(set (match_operand:V4SI 0 "register_operand" "=w") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") --- a/src/gcc/config/aarch64/aarch64-tune.md +++ b/src/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,cortexa57cortexa53,cortexa72cortexa53" + "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,exynosm1,qdf24xx,thunderx,xgene1,vulcan,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) --- a/src/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/src/gcc/config/aarch64/aarch64-tuning-flags.def @@ -29,5 +29,8 @@ AARCH64_TUNE_ to give an enum name. 
*/ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) -AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT) +/* Don't create non-8 byte aligned load/store pair. That is if the +two load/stores are not at least 8 byte aligned don't create load/store +pairs. */ +AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) --- a/src/gcc/config/aarch64/aarch64.c +++ b/src/gcc/config/aarch64/aarch64.c @@ -26,6 +26,7 @@ #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "cfghooks.h" #include "cfgloop.h" @@ -61,7 +62,6 @@ #include "rtl-iter.h" #include "tm-constrs.h" #include "sched-int.h" -#include "cortex-a57-fma-steering.h" #include "target-globals.h" #include "common/common-target.h" @@ -141,6 +141,10 @@ static bool aarch64_vector_mode_supported_p (machine_mode); static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, const unsigned char *sel); static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); +static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, + const_tree type, + int misalignment, + bool is_packed); /* Major revision number of the ARM Architecture implemented by the target. */ unsigned aarch64_architecture_version; @@ -152,7 +156,7 @@ enum aarch64_processor aarch64_tune = cortexa53; unsigned long aarch64_tune_flags = 0; /* Global flag for PC relative loads. */ -bool aarch64_nopcrelative_literal_loads; +bool aarch64_pcrelative_literal_loads; /* Support for command line parsing of boolean flags in the tuning structures. */ @@ -250,6 +254,38 @@ static const struct cpu_addrcost_table xgene1_addrcost_table = 0, /* imm_offset */ }; +static const struct cpu_addrcost_table qdf24xx_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + +static const struct cpu_addrcost_table vulcan_addrcost_table = +{ + { + 0, /* hi */ + 0, /* si */ + 0, /* di */ + 2, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 2, /* register_offset */ + 3, /* register_sextend */ + 3, /* register_zextend */ + 0, /* imm_offset */ +}; + static const struct cpu_regmove_cost generic_regmove_cost = { 1, /* GP2GP */ @@ -308,6 +344,24 @@ static const struct cpu_regmove_cost xgene1_regmove_cost = 2 /* FP2FP */ }; +static const struct cpu_regmove_cost qdf24xx_regmove_cost = +{ + 2, /* GP2GP */ + /* Avoid the use of int<->fp moves for spilling. */ + 6, /* GP2FP */ + 6, /* FP2GP */ + 4 /* FP2FP */ +}; + +static const struct cpu_regmove_cost vulcan_regmove_cost = +{ + 1, /* GP2GP */ + /* Avoid the use of int<->fp moves for spilling. */ + 8, /* GP2FP */ + 8, /* FP2GP */ + 4 /* FP2FP */ +}; + /* Generic costs for vector insn classes. */ static const struct cpu_vector_cost generic_vector_cost = { @@ -326,18 +380,36 @@ static const struct cpu_vector_cost generic_vector_cost = 1 /* cond_not_taken_branch_cost */ }; +/* ThunderX costs for vector insn classes. 
*/ +static const struct cpu_vector_cost thunderx_vector_cost = +{ + 1, /* scalar_stmt_cost */ + 3, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 4, /* vec_stmt_cost */ + 4, /* vec_permute_cost */ + 2, /* vec_to_scalar_cost */ + 2, /* scalar_to_vec_cost */ + 3, /* vec_align_load_cost */ + 10, /* vec_unalign_load_cost */ + 10, /* vec_unalign_store_cost */ + 1, /* vec_store_cost */ + 3, /* cond_taken_branch_cost */ + 3 /* cond_not_taken_branch_cost */ +}; + /* Generic costs for vector insn classes. */ static const struct cpu_vector_cost cortexa57_vector_cost = { 1, /* scalar_stmt_cost */ 4, /* scalar_load_cost */ 1, /* scalar_store_cost */ - 3, /* vec_stmt_cost */ + 2, /* vec_stmt_cost */ 3, /* vec_permute_cost */ 8, /* vec_to_scalar_cost */ 8, /* scalar_to_vec_cost */ - 5, /* vec_align_load_cost */ - 5, /* vec_unalign_load_cost */ + 4, /* vec_align_load_cost */ + 4, /* vec_unalign_load_cost */ 1, /* vec_unalign_store_cost */ 1, /* vec_store_cost */ 1, /* cond_taken_branch_cost */ @@ -379,6 +451,24 @@ static const struct cpu_vector_cost xgene1_vector_cost = 1 /* cond_not_taken_branch_cost */ }; +/* Costs for vector insn classes for Vulcan. */ +static const struct cpu_vector_cost vulcan_vector_cost = +{ + 6, /* scalar_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 6, /* vec_stmt_cost */ + 3, /* vec_permute_cost */ + 6, /* vec_to_scalar_cost */ + 5, /* scalar_to_vec_cost */ + 8, /* vec_align_load_cost */ + 8, /* vec_unalign_load_cost */ + 4, /* vec_unalign_store_cost */ + 4, /* vec_store_cost */ + 2, /* cond_taken_branch_cost */ + 1 /* cond_not_taken_branch_cost */ +}; + /* Generic costs for branch instructions. */ static const struct cpu_branch_cost generic_branch_cost = { @@ -393,6 +483,37 @@ static const struct cpu_branch_cost cortexa57_branch_cost = 3 /* Unpredictable. */ }; +/* Branch costs for Vulcan. */ +static const struct cpu_branch_cost vulcan_branch_cost = +{ + 1, /* Predictable. */ + 3 /* Unpredictable. */ +}; + +/* Generic approximation modes. */ +static const cpu_approx_modes generic_approx_modes = +{ + AARCH64_APPROX_NONE, /* division */ + AARCH64_APPROX_NONE, /* sqrt */ + AARCH64_APPROX_NONE /* recip_sqrt */ +}; + +/* Approximation modes for Exynos M1. */ +static const cpu_approx_modes exynosm1_approx_modes = +{ + AARCH64_APPROX_NONE, /* division */ + AARCH64_APPROX_ALL, /* sqrt */ + AARCH64_APPROX_ALL /* recip_sqrt */ +}; + +/* Approximation modes for X-Gene 1. */ +static const cpu_approx_modes xgene1_approx_modes = +{ + AARCH64_APPROX_NONE, /* division */ + AARCH64_APPROX_NONE, /* sqrt */ + AARCH64_APPROX_ALL /* recip_sqrt */ +}; + static const struct tune_params generic_tunings = { &cortexa57_extra_costs, @@ -400,6 +521,7 @@ static const struct tune_params generic_tunings = &generic_regmove_cost, &generic_vector_cost, &generic_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -423,14 +545,15 @@ static const struct tune_params cortexa35_tunings = &generic_addrcost_table, &cortexa53_regmove_cost, &generic_vector_cost, - &generic_branch_cost, + &cortexa57_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 1, /* issue_rate */ - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ - 8, /* function_align. */ + 16, /* function_align. */ 8, /* jump_align. */ - 4, /* loop_align. */ + 8, /* loop_align. */ 2, /* int_reassoc_width. 
*/ 4, /* fp_reassoc_width. */ 1, /* vec_reassoc_width. */ @@ -448,14 +571,15 @@ static const struct tune_params cortexa53_tunings = &generic_addrcost_table, &cortexa53_regmove_cost, &generic_vector_cost, - &generic_branch_cost, + &cortexa57_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ - 8, /* function_align. */ + 16, /* function_align. */ 8, /* jump_align. */ - 4, /* loop_align. */ + 8, /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ 1, /* vec_reassoc_width. */ @@ -474,13 +598,14 @@ static const struct tune_params cortexa57_tunings = &cortexa57_regmove_cost, &cortexa57_vector_cost, &cortexa57_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 16, /* function_align. */ 8, /* jump_align. */ - 4, /* loop_align. */ + 8, /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ 1, /* vec_reassoc_width. */ @@ -498,14 +623,15 @@ static const struct tune_params cortexa72_tunings = &cortexa57_addrcost_table, &cortexa57_regmove_cost, &cortexa57_vector_cost, - &generic_branch_cost, + &cortexa57_branch_cost, + &generic_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 16, /* function_align. */ 8, /* jump_align. */ - 4, /* loop_align. */ + 8, /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ 1, /* vec_reassoc_width. */ @@ -513,7 +639,33 @@ static const struct tune_params cortexa72_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ - tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ +}; + +static const struct tune_params cortexa73_tunings = +{ + &cortexa57_extra_costs, + &cortexa57_addrcost_table, + &cortexa57_regmove_cost, + &cortexa57_vector_cost, + &cortexa57_branch_cost, + &generic_approx_modes, + 4, /* memmov_cost. */ + 2, /* issue_rate. */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD + | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ + 16, /* function_align. */ + 8, /* jump_align. */ + 8, /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + 0, /* cache_line_size. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -524,6 +676,7 @@ static const struct tune_params exynosm1_tunings = &exynosm1_regmove_cost, &exynosm1_vector_cost, &generic_branch_cost, + &exynosm1_approx_modes, 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ @@ -538,7 +691,7 @@ static const struct tune_params exynosm1_tunings = 48, /* max_case_values. */ 64, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. 
*/ }; static const struct tune_params thunderx_tunings = @@ -546,8 +699,9 @@ static const struct tune_params thunderx_tunings = &thunderx_extra_costs, &generic_addrcost_table, &thunderx_regmove_cost, - &generic_vector_cost, + &thunderx_vector_cost, &generic_branch_cost, + &generic_approx_modes, 6, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ @@ -562,7 +716,7 @@ static const struct tune_params thunderx_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */ }; static const struct tune_params xgene1_tunings = @@ -572,6 +726,7 @@ static const struct tune_params xgene1_tunings = &xgene1_regmove_cost, &xgene1_vector_cost, &generic_branch_cost, + &xgene1_approx_modes, 6, /* memmov_cost */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -586,7 +741,58 @@ static const struct tune_params xgene1_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ +}; + +static const struct tune_params qdf24xx_tunings = +{ + &qdf24xx_extra_costs, + &qdf24xx_addrcost_table, + &qdf24xx_regmove_cost, + &generic_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + 4, /* memmov_cost */ + 4, /* issue_rate */ + (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD + | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ + 16, /* function_align. */ + 8, /* jump_align. */ + 16, /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + 64, /* cache_line_size. */ + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ +}; + +static const struct tune_params vulcan_tunings = +{ + &vulcan_extra_costs, + &vulcan_addrcost_table, + &vulcan_regmove_cost, + &vulcan_vector_cost, + &vulcan_branch_cost, + &generic_approx_modes, + 4, /* memmov_cost. */ + 4, /* issue_rate. */ + AARCH64_FUSE_NOTHING, /* fuseable_ops. */ + 16, /* function_align. */ + 8, /* jump_align. */ + 16, /* loop_align. */ + 3, /* int_reassoc_width. */ + 2, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + 64, /* cache_line_size. */ + tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; /* Support for fine-grained override of the tuning structures. */ @@ -663,16 +869,6 @@ struct aarch64_option_extension const unsigned long flags_off; }; -/* ISA extensions in AArch64. 
*/ -static const struct aarch64_option_extension all_extensions[] = -{ -#define AARCH64_OPT_EXTENSION(NAME, X, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \ - {NAME, FLAGS_ON, FLAGS_OFF}, -#include "aarch64-option-extensions.def" -#undef AARCH64_OPT_EXTENSION - {NULL, 0, 0} -}; - typedef enum aarch64_cond_code { AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL, @@ -1110,7 +1306,8 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s)); if (mode != GET_MODE (gp_rtx)) - gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0); + gp_rtx = gen_lowpart (mode, gp_rtx); + } if (mode == ptr_mode) @@ -1186,10 +1383,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, case SYMBOL_SMALL_TLSGD: { rtx_insn *insns; - rtx result = gen_rtx_REG (Pmode, R0_REGNUM); + machine_mode mode = GET_MODE (dest); + rtx result = gen_rtx_REG (mode, R0_REGNUM); start_sequence (); - aarch64_emit_call_insn (gen_tlsgd_small (result, imm)); + if (TARGET_ILP32) + aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm)); + else + aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm)); insns = get_insns (); end_sequence (); @@ -1703,7 +1904,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) we need to expand the literal pool access carefully. This is something that needs to be done in a number of places, so could well live as a separate function. */ - if (aarch64_nopcrelative_literal_loads) + if (!aarch64_pcrelative_literal_loads) { gcc_assert (can_create_pseudo_p ()); base = gen_reg_rtx (ptr_mode); @@ -1766,6 +1967,88 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest)); } +/* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a + temporary value if necessary. FRAME_RELATED_P should be true if + the RTX_FRAME_RELATED flag should be set and CFA adjustments added + to the generated instructions. If SCRATCHREG is known to hold + abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the + immediate again. + + Since this function may be used to adjust the stack pointer, we must + ensure that it cannot cause transient stack deallocation (for example + by first incrementing SP and then decrementing when adjusting by a + large immediate). */ + +static void +aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg, + HOST_WIDE_INT delta, bool frame_related_p, + bool emit_move_imm) +{ + HOST_WIDE_INT mdelta = abs_hwi (delta); + rtx this_rtx = gen_rtx_REG (mode, regnum); + rtx_insn *insn; + + if (!mdelta) + return; + + /* Single instruction adjustment. */ + if (aarch64_uimm12_shift (mdelta)) + { + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; + return; + } + + /* Emit 2 additions/subtractions if the adjustment is less than 24 bits. + Only do this if mdelta is not a 16-bit move as adjusting using a move + is better. */ + if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode)) + { + HOST_WIDE_INT low_off = mdelta & 0xfff; + + low_off = delta < 0 ? -low_off : low_off; + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; + return; + } + + /* Emit a move immediate if required and an addition/subtraction. 
*/ + rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); + if (emit_move_imm) + aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode); + insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx) + : gen_add2_insn (this_rtx, scratch_rtx)); + if (frame_related_p) + { + RTX_FRAME_RELATED_P (insn) = frame_related_p; + rtx adj = plus_constant (mode, this_rtx, delta); + add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj)); + } +} + +static inline void +aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, + HOST_WIDE_INT delta) +{ + aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true); +} + +static inline void +aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm) +{ + aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta, + true, emit_move_imm); +} + +static inline void +aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p) +{ + aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta, + frame_related_p, true); +} + static bool aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, tree exp ATTRIBUTE_UNUSED) @@ -2494,7 +2777,7 @@ static void aarch64_layout_frame (void) { HOST_WIDE_INT offset = 0; - int regno; + int regno, last_fp_reg = INVALID_REGNUM; if (reload_completed && cfun->machine->frame.laid_out) return; @@ -2502,8 +2785,8 @@ aarch64_layout_frame (void) #define SLOT_NOT_REQUIRED (-2) #define SLOT_REQUIRED (-1) - cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER; - cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER; + cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; + cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; /* First mark all the registers that really need to be saved... */ for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) @@ -2528,7 +2811,10 @@ aarch64_layout_frame (void) for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) - cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; + { + cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; + last_fp_reg = regno; + } if (frame_pointer_needed) { @@ -2537,7 +2823,6 @@ aarch64_layout_frame (void) cfun->machine->frame.wb_candidate1 = R29_REGNUM; cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD; cfun->machine->frame.wb_candidate2 = R30_REGNUM; - cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD; offset += 2 * UNITS_PER_WORD; } @@ -2546,35 +2831,46 @@ aarch64_layout_frame (void) if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) { cfun->machine->frame.reg_offset[regno] = offset; - if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER) + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) cfun->machine->frame.wb_candidate1 = regno; - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER) + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM) cfun->machine->frame.wb_candidate2 = regno; offset += UNITS_PER_WORD; } + HOST_WIDE_INT max_int_offset = offset; + offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); + bool has_align_gap = offset != max_int_offset; + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) { + /* If there is an alignment gap between integer and fp callee-saves, + allocate the last fp register to it if possible. 
*/ + if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0) + { + cfun->machine->frame.reg_offset[regno] = max_int_offset; + break; + } + cfun->machine->frame.reg_offset[regno] = offset; - if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER) + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) cfun->machine->frame.wb_candidate1 = regno; - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) cfun->machine->frame.wb_candidate2 = regno; offset += UNITS_PER_WORD; } - cfun->machine->frame.padding0 = - (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset); offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); cfun->machine->frame.saved_regs_size = offset; + HOST_WIDE_INT varargs_and_saved_regs_size + = offset + cfun->machine->frame.saved_varargs_size; + cfun->machine->frame.hard_fp_offset - = ROUND_UP (cfun->machine->frame.saved_varargs_size - + get_frame_size () - + cfun->machine->frame.saved_regs_size, + = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (), STACK_BOUNDARY / BITS_PER_UNIT); cfun->machine->frame.frame_size @@ -2582,15 +2878,92 @@ aarch64_layout_frame (void) + crtl->outgoing_args_size, STACK_BOUNDARY / BITS_PER_UNIT); + cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size; + + cfun->machine->frame.initial_adjust = 0; + cfun->machine->frame.final_adjust = 0; + cfun->machine->frame.callee_adjust = 0; + cfun->machine->frame.callee_offset = 0; + + HOST_WIDE_INT max_push_offset = 0; + if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM) + max_push_offset = 512; + else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + + if (cfun->machine->frame.frame_size < max_push_offset + && crtl->outgoing_args_size == 0) + { + /* Simple, small frame with no outgoing arguments: + stp reg1, reg2, [sp, -frame_size]! + stp reg3, reg4, [sp, 16] */ + cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size; + } + else if ((crtl->outgoing_args_size + + cfun->machine->frame.saved_regs_size < 512) + && !(cfun->calls_alloca + && cfun->machine->frame.hard_fp_offset < max_push_offset)) + { + /* Frame with small outgoing arguments: + sub sp, sp, frame_size + stp reg1, reg2, [sp, outgoing_args_size] + stp reg3, reg4, [sp, outgoing_args_size + 16] */ + cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size; + cfun->machine->frame.callee_offset + = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset; + } + else if (cfun->machine->frame.hard_fp_offset < max_push_offset) + { + /* Frame with large outgoing arguments but a small local area: + stp reg1, reg2, [sp, -hard_fp_offset]! + stp reg3, reg4, [sp, 16] + sub sp, sp, outgoing_args_size */ + cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset; + cfun->machine->frame.final_adjust + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; + } + else if (!frame_pointer_needed + && varargs_and_saved_regs_size < max_push_offset) + { + /* Frame with large local area and outgoing arguments (this pushes the + callee-saves first, followed by the locals and outgoing area): + stp reg1, reg2, [sp, -varargs_and_saved_regs_size]! 
+ stp reg3, reg4, [sp, 16] + sub sp, sp, frame_size - varargs_and_saved_regs_size */ + cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size; + cfun->machine->frame.final_adjust + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; + cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust; + cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset; + } + else + { + /* Frame with large local area and outgoing arguments using frame pointer: + sub sp, sp, hard_fp_offset + stp x29, x30, [sp, 0] + add x29, sp, 0 + stp reg3, reg4, [sp, 16] + sub sp, sp, outgoing_args_size */ + cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset; + cfun->machine->frame.final_adjust + = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust; + } + cfun->machine->frame.laid_out = true; } +/* Return true if the register REGNO is saved on entry to + the current function. */ + static bool aarch64_register_saved_on_entry (int regno) { return cfun->machine->frame.reg_offset[regno] >= 0; } +/* Return the next register up from REGNO up to LIMIT for the callee + to save. */ + static unsigned aarch64_next_callee_save (unsigned regno, unsigned limit) { @@ -2599,6 +2972,9 @@ aarch64_next_callee_save (unsigned regno, unsigned limit) return regno; } +/* Push the register number REGNO of mode MODE to the stack with write-back + adjusting the stack by ADJUSTMENT. */ + static void aarch64_pushwb_single_reg (machine_mode mode, unsigned regno, HOST_WIDE_INT adjustment) @@ -2615,6 +2991,10 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno, RTX_FRAME_RELATED_P (insn) = 1; } +/* Generate and return an instruction to store the pair of registers + REG and REG2 of mode MODE to location BASE with write-back adjusting + the stack location BASE by ADJUSTMENT. */ + static rtx aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, HOST_WIDE_INT adjustment) @@ -2634,11 +3014,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, } } +/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the + stack pointer by ADJUSTMENT. */ + static void -aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1, - unsigned regno2, HOST_WIDE_INT adjustment) +aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) { rtx_insn *insn; + machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode; + + if (regno2 == INVALID_REGNUM) + return aarch64_pushwb_single_reg (mode, regno1, adjustment); + rtx reg1 = gen_rtx_REG (mode, regno1); rtx reg2 = gen_rtx_REG (mode, regno2); @@ -2649,6 +3036,9 @@ aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1, RTX_FRAME_RELATED_P (insn) = 1; } +/* Load the pair of register REG, REG2 of mode MODE from stack location BASE, + adjusting it by ADJUSTMENT afterwards. */ + static rtx aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, HOST_WIDE_INT adjustment) @@ -2666,6 +3056,37 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, } } +/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it + afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes + into CFI_OPS. */ + +static void +aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, + rtx *cfi_ops) +{ + machine_mode mode = (regno1 <= R30_REGNUM) ? 
DImode : DFmode; + rtx reg1 = gen_rtx_REG (mode, regno1); + + *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); + + if (regno2 == INVALID_REGNUM) + { + rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment); + mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); + emit_move_insn (reg1, gen_rtx_MEM (mode, mem)); + } + else + { + rtx reg2 = gen_rtx_REG (mode, regno2); + *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); + emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1, + reg2, adjustment)); + } +} + +/* Generate and return a store pair instruction of mode MODE to store + register REG1 to MEM1 and register REG2 to MEM2. */ + static rtx aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, rtx reg2) @@ -2683,6 +3104,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, } } +/* Generate and regurn a load pair isntruction of mode MODE to load register + REG1 from MEM1 and register REG2 from MEM2. */ + static rtx aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, rtx mem2) @@ -2700,6 +3124,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, } } +/* Emit code to save the callee-saved registers from register number START + to LIMIT to the stack at the location starting at offset START_OFFSET, + skipping any write-back candidates if SKIP_WB is true. */ static void aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset, @@ -2758,6 +3185,11 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset, } } +/* Emit code to restore the callee registers of mode MODE from register + number START up to and including LIMIT. Restore from the stack offset + START_OFFSET, skipping any write-back candidates if SKIP_WB is true. + Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */ + static void aarch64_restore_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset, unsigned start, @@ -2852,23 +3284,16 @@ aarch64_restore_callee_saves (machine_mode mode, void aarch64_expand_prologue (void) { - /* sub sp, sp, # - stp {fp, lr}, [sp, # - 16] - add fp, sp, # - hardfp_offset - stp {cs_reg}, [fp, #-16] etc. - - sub sp, sp, - */ - HOST_WIDE_INT frame_size, offset; - HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */ - HOST_WIDE_INT hard_fp_offset; - rtx_insn *insn; - aarch64_layout_frame (); - offset = frame_size = cfun->machine->frame.frame_size; - hard_fp_offset = cfun->machine->frame.hard_fp_offset; - fp_offset = frame_size - hard_fp_offset; + HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size; + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust; + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust; + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset; + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + rtx_insn *insn; if (flag_stack_usage_info) current_function_static_stack_size = frame_size; @@ -2885,129 +3310,28 @@ aarch64_expand_prologue (void) aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size); } - /* Store pairs and load pairs have a range only -512 to 504. */ - if (offset >= 512) - { - /* When the frame has a large size, an initial decrease is done on - the stack pointer to jump over the callee-allocated save area for - register varargs, the local variable area and/or the callee-saved - register area. 
This will allow the pre-index write-back - store pair instructions to be used for setting up the stack frame - efficiently. */ - offset = hard_fp_offset; - if (offset >= 512) - offset = cfun->machine->frame.saved_regs_size; + aarch64_sub_sp (IP0_REGNUM, initial_adjust, true); - frame_size -= (offset + crtl->outgoing_args_size); - fp_offset = 0; + if (callee_adjust != 0) + aarch64_push_regs (reg1, reg2, callee_adjust); - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (-frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - - add_reg_note (insn, REG_CFA_ADJUST_CFA, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -frame_size))); - RTX_FRAME_RELATED_P (insn) = 1; - } - else if (frame_size > 0) - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; - - if (hi_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-hi_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - } - if (lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-lo_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - } - } - } - else - frame_size = -1; - - if (offset > 0) + if (frame_pointer_needed) { - bool skip_wb = false; - - if (frame_pointer_needed) - { - skip_wb = true; - - if (fp_offset) - { - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, - GEN_INT (-offset))); - RTX_FRAME_RELATED_P (insn) = 1; - - aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM, - R30_REGNUM, false); - } - else - aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset); - - /* Set up frame pointer to point to the location of the - previous frame pointer on the stack. */ - insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx, - stack_pointer_rtx, - GEN_INT (fp_offset))); - RTX_FRAME_RELATED_P (insn) = 1; - emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); - } - else - { - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; - - if (fp_offset - || reg1 == FIRST_PSEUDO_REGISTER - || (reg2 == FIRST_PSEUDO_REGISTER - && offset >= 256)) - { - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, - GEN_INT (-offset))); - RTX_FRAME_RELATED_P (insn) = 1; - } - else - { - machine_mode mode1 = (reg1 <= R30_REGNUM) ? 
DImode : DFmode; - - skip_wb = true; - - if (reg2 == FIRST_PSEUDO_REGISTER) - aarch64_pushwb_single_reg (mode1, reg1, offset); - else - aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset); - } - } - - aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM, - skip_wb); - aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM, - skip_wb); + if (callee_adjust == 0) + aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM, + R30_REGNUM, false); + insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx, + stack_pointer_rtx, + GEN_INT (callee_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); } - /* when offset >= 512, - sub sp, sp, # */ - if (frame_size > -1) - { - if (crtl->outgoing_args_size > 0) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, - GEN_INT (- crtl->outgoing_args_size))); - RTX_FRAME_RELATED_P (insn) = 1; - } - } + aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, + callee_adjust != 0 || frame_pointer_needed); + aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || frame_pointer_needed); + aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed); } /* Return TRUE if we can use a simple_return insn. @@ -3030,151 +3354,80 @@ aarch64_use_return_insn_p (void) return cfun->machine->frame.frame_size == 0; } -/* Generate the epilogue instructions for returning from a function. */ +/* Generate the epilogue instructions for returning from a function. + This is almost exactly the reverse of the prolog sequence, except + that we need to insert barriers to avoid scheduling loads that read + from a deallocated stack, and we optimize the unwind records by + emitting them all together if possible. */ void aarch64_expand_epilogue (bool for_sibcall) { - HOST_WIDE_INT frame_size, offset; - HOST_WIDE_INT fp_offset; - HOST_WIDE_INT hard_fp_offset; - rtx_insn *insn; - /* We need to add memory barrier to prevent read from deallocated stack. */ - bool need_barrier_p = (get_frame_size () != 0 - || cfun->machine->frame.saved_varargs_size - || crtl->calls_eh_return); - aarch64_layout_frame (); - offset = frame_size = cfun->machine->frame.frame_size; - hard_fp_offset = cfun->machine->frame.hard_fp_offset; - fp_offset = frame_size - hard_fp_offset; + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust; + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust; + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset; + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + rtx cfi_ops = NULL; + rtx_insn *insn; - /* Store pairs and load pairs have a range only -512 to 504. */ - if (offset >= 512) - { - offset = hard_fp_offset; - if (offset >= 512) - offset = cfun->machine->frame.saved_regs_size; + /* We need to add memory barrier to prevent read from deallocated stack. */ + bool need_barrier_p = (get_frame_size () + + cfun->machine->frame.saved_varargs_size) != 0; - frame_size -= (offset + crtl->outgoing_args_size); - fp_offset = 0; - if (!frame_pointer_needed && crtl->outgoing_args_size > 0) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, - GEN_INT (crtl->outgoing_args_size))); - RTX_FRAME_RELATED_P (insn) = 1; - } + /* Emit a barrier to prevent loads from a deallocated stack. 
*/ + if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca + || crtl->calls_eh_return) + { + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); + need_barrier_p = false; } - else - frame_size = -1; - /* If there were outgoing arguments or we've done dynamic stack - allocation, then restore the stack pointer from the frame - pointer. This is at most one insn and more efficient than using - GCC's internal mechanism. */ - if (frame_pointer_needed - && (crtl->outgoing_args_size || cfun->calls_alloca)) + /* Restore the stack pointer from the frame pointer if it may not + be the same as the stack pointer. */ + if (frame_pointer_needed && (final_adjust || cfun->calls_alloca)) { - if (cfun->calls_alloca) - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); - insn = emit_insn (gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx, - GEN_INT (0))); - offset = offset - fp_offset; - } - - if (offset > 0) - { - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; - bool skip_wb = true; - rtx cfi_ops = NULL; - - if (frame_pointer_needed) - fp_offset = 0; - else if (fp_offset - || reg1 == FIRST_PSEUDO_REGISTER - || (reg2 == FIRST_PSEUDO_REGISTER - && offset >= 256)) - skip_wb = false; - - aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM, - skip_wb, &cfi_ops); - aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM, - skip_wb, &cfi_ops); - - if (need_barrier_p) - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); - - if (skip_wb) - { - machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode; - rtx rreg1 = gen_rtx_REG (mode1, reg1); - - cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops); - if (reg2 == FIRST_PSEUDO_REGISTER) - { - rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset); - mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); - mem = gen_rtx_MEM (mode1, mem); - insn = emit_move_insn (rreg1, mem); - } - else - { - rtx rreg2 = gen_rtx_REG (mode1, reg2); - - cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops); - insn = emit_insn (aarch64_gen_loadwb_pair - (mode1, stack_pointer_rtx, rreg1, - rreg2, offset)); - } - } - else - { - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, - GEN_INT (offset))); - } - - /* Reset the CFA to be SP + FRAME_SIZE. */ - rtx new_cfa = stack_pointer_rtx; - if (frame_size > 0) - new_cfa = plus_constant (Pmode, new_cfa, frame_size); - cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); - REG_NOTES (insn) = cfi_ops; - RTX_FRAME_RELATED_P (insn) = 1; + GEN_INT (-callee_offset))); + /* If writeback is used when restoring callee-saves, the CFA + is restored on the instruction doing the writeback. 
*/ + RTX_FRAME_RELATED_P (insn) = callee_adjust == 0; } + else + aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM)); - if (frame_size > 0) - { - if (need_barrier_p) - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); + aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, + callee_adjust != 0, &cfi_ops); + aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - } - else - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; + if (need_barrier_p) + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); - if (hi_ofs && lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (hi_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - frame_size = lo_ofs; - } - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (frame_size))); - } + if (callee_adjust != 0) + aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops); + + if (callee_adjust != 0 || initial_adjust > 65536) + { + /* Emit delayed restores and set the CFA to be SP + initial_adjust. */ + insn = get_last_insn (); + rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust); + REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); + RTX_FRAME_RELATED_P (insn) = 1; + cfi_ops = NULL; + } + + aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM)); - /* Reset the CFA to be SP + 0. */ - add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx); + if (cfi_ops) + { + /* Emit delayed restores and reset the CFA to be SP. */ + insn = get_last_insn (); + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops); + REG_NOTES (insn) = cfi_ops; RTX_FRAME_RELATED_P (insn) = 1; } @@ -3230,122 +3483,6 @@ aarch64_eh_return_handler_rtx (void) return tmp; } -/* Possibly output code to build up a constant in a register. For - the benefit of the costs infrastructure, returns the number of - instructions which would be emitted. GENERATE inhibits or - enables code generation. */ - -static int -aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate) -{ - int insns = 0; - - if (aarch64_bitmask_imm (val, DImode)) - { - if (generate) - emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val)); - insns = 1; - } - else - { - int i; - int ncount = 0; - int zcount = 0; - HOST_WIDE_INT valp = val >> 16; - HOST_WIDE_INT valm; - HOST_WIDE_INT tval; - - for (i = 16; i < 64; i += 16) - { - valm = (valp & 0xffff); - - if (valm != 0) - ++ zcount; - - if (valm != 0xffff) - ++ ncount; - - valp >>= 16; - } - - /* zcount contains the number of additional MOVK instructions - required if the constant is built up with an initial MOVZ instruction, - while ncount is the number of MOVK instructions required if starting - with a MOVN instruction. Choose the sequence that yields the fewest - number of instructions, preferring MOVZ instructions when they are both - the same. 
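
The choice being deleted here (superseded by aarch64_internal_mov_immediate, which the thunk code further down now calls) can be modelled outside the compiler. A minimal standalone C sketch of the counting heuristic; the helper name and the test value are illustrative only, not part of the patch:

    #include <stdint.h>

    /* Count the MOVK instructions needed to fix up the upper three
       16-bit chunks of VAL after an initial MOVZ (chunks start as
       zeros) or MOVN (chunks start as all-ones), mirroring the
       zcount/ncount computation in the deleted aarch64_build_constant.  */
    static int
    movk_count (uint64_t val, int from_movn)
    {
      int count = 0;
      for (int i = 16; i < 64; i += 16)
        {
          uint64_t chunk = (val >> i) & 0xffff;
          if (chunk != (from_movn ? 0xffffu : 0u))
            count++;
        }
      return count;
    }

For val == 0x0000ffffffff0000, movk_count returns 2 for a MOVZ start but only 1 for a MOVN start, so the deleted code would begin with MOVN and build the constant in two instructions.
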
*/ - if (ncount < zcount) - { - if (generate) - emit_move_insn (gen_rtx_REG (Pmode, regnum), - GEN_INT (val | ~(HOST_WIDE_INT) 0xffff)); - tval = 0xffff; - insns++; - } - else - { - if (generate) - emit_move_insn (gen_rtx_REG (Pmode, regnum), - GEN_INT (val & 0xffff)); - tval = 0; - insns++; - } - - val >>= 16; - - for (i = 16; i < 64; i += 16) - { - if ((val & 0xffff) != tval) - { - if (generate) - emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum), - GEN_INT (i), - GEN_INT (val & 0xffff))); - insns++; - } - val >>= 16; - } - } - return insns; -} - -static void -aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta) -{ - HOST_WIDE_INT mdelta = delta; - rtx this_rtx = gen_rtx_REG (Pmode, regnum); - rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg); - - if (mdelta < 0) - mdelta = -mdelta; - - if (mdelta >= 4096 * 4096) - { - (void) aarch64_build_constant (scratchreg, delta, true); - emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx)); - } - else if (mdelta > 0) - { - if (mdelta >= 4096) - { - emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096))); - rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12)); - if (delta < 0) - emit_insn (gen_rtx_SET (this_rtx, - gen_rtx_MINUS (Pmode, this_rtx, shift))); - else - emit_insn (gen_rtx_SET (this_rtx, - gen_rtx_PLUS (Pmode, this_rtx, shift))); - } - if (mdelta % 4096 != 0) - { - scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096)); - emit_insn (gen_rtx_SET (this_rtx, - gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx))); - } - } -} - /* Output code to add DELTA to the first argument, and then jump to FUNCTION. Used for C++ multiple inheritance. */ static void @@ -3366,7 +3503,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, emit_note (NOTE_INSN_PROLOGUE_END); if (vcall_offset == 0) - aarch64_add_constant (this_regno, IP1_REGNUM, delta); + aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta); else { gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0); @@ -3382,7 +3519,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx, plus_constant (Pmode, this_rtx, delta)); else - aarch64_add_constant (this_regno, IP1_REGNUM, delta); + aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta); } if (Pmode == ptr_mode) @@ -3396,7 +3533,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, addr = plus_constant (Pmode, temp0, vcall_offset); else { - (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true); + aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true, + Pmode); addr = gen_rtx_PLUS (Pmode, temp0, temp1); } @@ -3575,7 +3713,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) return aarch64_tls_referenced_p (x); } -/* Implement TARGET_CASE_VALUES_THRESHOLD. */ +/* Implement TARGET_CASE_VALUES_THRESHOLD. + The expansion for a table switch is quite expensive due to the number + of instructions, the table lookup and hard to predict indirect jump. + When optimizing for speed, and -O3 enabled, use the per-core tuning if + set, otherwise use tables for > 16 cases as a tradeoff between size and + performance. When optimizing for size, use the default setting. */ static unsigned int aarch64_case_values_threshold (void) @@ -3586,7 +3729,7 @@ aarch64_case_values_threshold (void) && selected_cpu->tune->max_case_values != 0) return selected_cpu->tune->max_case_values; else - return default_case_values_threshold (); + return optimize_size ? 
default_case_values_threshold () : 17; } /* Return true if register REGNO is a valid index register. @@ -3921,9 +4064,11 @@ aarch64_classify_address (struct aarch64_address_info *info, X,X: 7-bit signed scaled offset Q: 9-bit signed offset We conservatively require an offset representable in either mode. - */ + When performing the check for pairs of X registers i.e. LDP/STP + pass down DImode since that is the natural size of the LDP/STP + instruction memory accesses. */ if (mode == TImode || mode == TFmode) - return (aarch64_offset_7bit_signed_scaled_p (mode, offset) + return (aarch64_offset_7bit_signed_scaled_p (DImode, offset) && offset_9bit_signed_unscaled_p (mode, offset)); /* A 7bit offset check because OImode will emit a ldp/stp @@ -4031,7 +4176,7 @@ aarch64_classify_address (struct aarch64_address_info *info, return ((GET_CODE (sym) == LABEL_REF || (GET_CODE (sym) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (sym) - && !aarch64_nopcrelative_literal_loads))); + && aarch64_pcrelative_literal_loads))); } return false; @@ -4125,6 +4270,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x, return aarch64_classify_address (&addr, x, mode, outer_code, strict_p); } +/* Split an out-of-range address displacement into a base and offset. + Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise + to increase opportunities for sharing the base address of different sizes. + For TI/TFmode and unaligned accesses use a 256-byte range. */ +static bool +aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode) +{ + HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff; + + if (mode == TImode || mode == TFmode || + (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0) + mask = 0xff; + + *off = GEN_INT (INTVAL (*disp) & ~mask); + *disp = GEN_INT (INTVAL (*disp) & mask); + return true; +} + /* Return TRUE if rtx X is immediate constant 0.0 */ bool aarch64_float_const_zero_rtx_p (rtx x) @@ -4198,6 +4361,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y) && (GET_MODE (x) == HImode || GET_MODE (x) == QImode)) return CC_NZmode; + /* Similarly, comparisons of zero_extends from shorter modes can + be performed using an ANDS with an immediate mask. */ + if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND + && (GET_MODE (x) == SImode || GET_MODE (x) == DImode) + && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode) + && (code == EQ || code == NE)) + return CC_NZmode; + if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode) && y == const0_rtx && (code == EQ || code == NE || code == LT || code == GE) @@ -4225,14 +4396,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y) && GET_CODE (x) == NEG) return CC_Zmode; - /* A compare of a mode narrower than SI mode against zero can be done - by extending the value in the comparison. */ - if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode) - && y == const0_rtx) - /* Only use sign-extension if we really need it. */ - return ((code == GT || code == GE || code == LE || code == LT) - ? CC_SESWPmode : CC_ZESWPmode); - /* A test for unsigned overflow. 
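
The ZERO_EXTEND case added to aarch64_select_cc_mode above targets comparisons such as the one below; with CC_NZmode they can be carried out by a single ANDS/TST on an immediate mask rather than a separate extend followed by a compare. The assembly in the comment is what one would roughly expect, not a quoted compiler output:

    /* Test a zero-extended sub-word value against zero.  */
    int
    low_byte_is_zero (unsigned int x)
    {
      return (unsigned char) x == 0;   /* roughly: tst w0, 255; cset w0, eq */
    }
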
*/ if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode) && code == NE @@ -4301,8 +4464,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code) break; case CC_SWPmode: - case CC_ZESWPmode: - case CC_SESWPmode: switch (comp_code) { case NE: return AARCH64_NE; @@ -4957,7 +5118,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode) if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) { rtx base = XEXP (x, 0); - rtx offset_rtx XEXP (x, 1); + rtx offset_rtx = XEXP (x, 1); HOST_WIDE_INT offset = INTVAL (offset_rtx); if (GET_CODE (base) == PLUS) @@ -5015,120 +5176,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode) return x; } -/* Try a machine-dependent way of reloading an illegitimate address - operand. If we find one, push the reload and return the new rtx. */ - -rtx -aarch64_legitimize_reload_address (rtx *x_p, - machine_mode mode, - int opnum, int type, - int ind_levels ATTRIBUTE_UNUSED) -{ - rtx x = *x_p; - - /* Do not allow mem (plus (reg, const)) if vector struct mode. */ - if (aarch64_vect_struct_mode_p (mode) - && GET_CODE (x) == PLUS - && REG_P (XEXP (x, 0)) - && CONST_INT_P (XEXP (x, 1))) - { - rtx orig_rtx = x; - x = copy_rtx (x); - push_reload (orig_rtx, NULL_RTX, x_p, NULL, - BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, - opnum, (enum reload_type) type); - return x; - } - - /* We must recognize output that we have already generated ourselves. */ - if (GET_CODE (x) == PLUS - && GET_CODE (XEXP (x, 0)) == PLUS - && REG_P (XEXP (XEXP (x, 0), 0)) - && CONST_INT_P (XEXP (XEXP (x, 0), 1)) - && CONST_INT_P (XEXP (x, 1))) - { - push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL, - BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, - opnum, (enum reload_type) type); - return x; - } - - /* We wish to handle large displacements off a base register by splitting - the addend across an add and the mem insn. This can cut the number of - extra insns needed from 3 to 1. It is only useful for load/store of a - single register with 12 bit offset field. */ - if (GET_CODE (x) == PLUS - && REG_P (XEXP (x, 0)) - && CONST_INT_P (XEXP (x, 1)) - && HARD_REGISTER_P (XEXP (x, 0)) - && mode != TImode - && mode != TFmode - && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true)) - { - HOST_WIDE_INT val = INTVAL (XEXP (x, 1)); - HOST_WIDE_INT low = val & 0xfff; - HOST_WIDE_INT high = val - low; - HOST_WIDE_INT offs; - rtx cst; - machine_mode xmode = GET_MODE (x); - - /* In ILP32, xmode can be either DImode or SImode. */ - gcc_assert (xmode == DImode || xmode == SImode); - - /* Reload non-zero BLKmode offsets. This is because we cannot ascertain - BLKmode alignment. */ - if (GET_MODE_SIZE (mode) == 0) - return NULL_RTX; - - offs = low % GET_MODE_SIZE (mode); - - /* Align misaligned offset by adjusting high part to compensate. */ - if (offs != 0) - { - if (aarch64_uimm12_shift (high + offs)) - { - /* Align down. */ - low = low - offs; - high = high + offs; - } - else - { - /* Align up. */ - offs = GET_MODE_SIZE (mode) - offs; - low = low + offs; - high = high + (low & 0x1000) - offs; - low &= 0xfff; - } - } - - /* Check for overflow. */ - if (high + low != val) - return NULL_RTX; - - cst = GEN_INT (high); - if (!aarch64_uimm12_shift (high)) - cst = force_const_mem (xmode, cst); - - /* Reload high part into base reg, leaving the low part - in the mem instruction. 
- Note that replacing this gen_rtx_PLUS with plus_constant is - wrong in this case because we rely on the - (plus (plus reg c1) c2) structure being preserved so that - XEXP (*p, 0) in push_reload below uses the correct term. */ - x = gen_rtx_PLUS (xmode, - gen_rtx_PLUS (xmode, XEXP (x, 0), cst), - GEN_INT (low)); - - push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL, - BASE_REG_CLASS, xmode, VOIDmode, 0, 0, - opnum, (enum reload_type) type); - return x; - } - - return NULL_RTX; -} - - /* Return the reload icode required for a constant pool in mode. */ static enum insn_code aarch64_constant_pool_reload_icode (machine_mode mode) @@ -5186,7 +5233,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x) && (SCALAR_FLOAT_MODE_P (GET_MODE (x)) || targetm.vector_mode_supported_p (GET_MODE (x))) - && aarch64_nopcrelative_literal_loads) + && !aarch64_pcrelative_literal_loads) { sri->icode = aarch64_constant_pool_reload_icode (mode); return NO_REGS; @@ -5260,18 +5307,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) if (to == HARD_FRAME_POINTER_REGNUM) { if (from == ARG_POINTER_REGNUM) - return cfun->machine->frame.frame_size - crtl->outgoing_args_size; + return cfun->machine->frame.hard_fp_offset; if (from == FRAME_POINTER_REGNUM) - return (cfun->machine->frame.hard_fp_offset - - cfun->machine->frame.saved_varargs_size); + return cfun->machine->frame.hard_fp_offset + - cfun->machine->frame.locals_offset; } if (to == STACK_POINTER_REGNUM) { if (from == FRAME_POINTER_REGNUM) - return (cfun->machine->frame.frame_size - - cfun->machine->frame.saved_varargs_size); + return cfun->machine->frame.frame_size + - cfun->machine->frame.locals_offset; } return cfun->machine->frame.frame_size; @@ -5418,7 +5465,10 @@ aarch64_elf_asm_constructor (rtx symbol, int priority) else { section *s; - char buf[18]; + /* While priority is known to be in range [0, 65535], so 18 bytes + would be enough, the compiler might not know that. To avoid + -Wformat-truncation false positive, use a larger size. */ + char buf[23]; snprintf (buf, sizeof (buf), ".init_array.%.5u", priority); s = get_section (buf, SECTION_WRITE, NULL); switch_to_section (s); @@ -5435,7 +5485,10 @@ aarch64_elf_asm_destructor (rtx symbol, int priority) else { section *s; - char buf[18]; + /* While priority is known to be in range [0, 65535], so 18 bytes + would be enough, the compiler might not know that. To avoid + -Wformat-truncation false positive, use a larger size. */ + char buf[23]; snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority); s = get_section (buf, SECTION_WRITE, NULL); switch_to_section (s); @@ -5520,7 +5573,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask) static inline bool aarch64_can_use_per_function_literal_pools_p (void) { - return (!aarch64_nopcrelative_literal_loads + return (aarch64_pcrelative_literal_loads || aarch64_cmodel == AARCH64_CMODEL_LARGE); } @@ -6139,6 +6192,19 @@ aarch64_extend_bitfield_pattern_p (rtx x) return op; } +/* Return true if the mask and a shift amount from an RTX of the form + (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of + mode MODE. See the *andim_ashift_bfiz pattern. 
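
The predicate described above can be restated in plain C. This standalone sketch mirrors the three tests performed by the function below; the helper is hypothetical, not part of GCC:

    #include <stdbool.h>
    #include <stdint.h>

    /* (x << shift) & mask can become a UBFIZ when mask >> shift is a
       contiguous run of set bits starting at bit 0 (adding 1 then
       yields a power of two) and mask has no bits set below shift.  */
    static bool
    ubfiz_ok (uint64_t mask, unsigned int shift, unsigned int bitsize)
    {
      uint64_t run = (mask >> shift) + 1;

      return shift < bitsize
             && run != 0 && (run & (run - 1)) == 0
             && (mask & ((UINT64_C (1) << shift) - 1)) == 0;
    }

For mask == 0xff0 and shift == 4, run is 0x100, a power of two, and no mask bits lie below bit 4, so the shift-and-AND maps to ubfiz x0, x1, 4, 8.
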
*/ + +bool +aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt) +{ + return CONST_INT_P (mask) && CONST_INT_P (shft_amnt) + && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode) + && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0 + && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0; +} + /* Calculate the cost of calculating X, storing it in *COST. Result is true if the total cost of the operation has now been calculated. */ static bool @@ -6404,10 +6470,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, /* TODO: A write to the CC flags possibly costs extra, this needs encoding in the cost tables. */ - /* CC_ZESWPmode supports zero extend for free. */ - if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND) - op0 = XEXP (op0, 0); - mode = GET_MODE (op0); /* ANDS. */ if (GET_CODE (op0) == AND) @@ -6717,17 +6779,31 @@ cost_plus: if (GET_MODE_CLASS (mode) == MODE_INT) { - /* We possibly get the immediate for free, this is not - modelled. */ - if (CONST_INT_P (op1) - && aarch64_bitmask_imm (INTVAL (op1), mode)) + if (CONST_INT_P (op1)) { - *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); + /* We have a mask + shift version of a UBFIZ + i.e. the *andim_ashift_bfiz pattern. */ + if (GET_CODE (op0) == ASHIFT + && aarch64_mask_and_shift_for_ubfiz_p (mode, op1, + XEXP (op0, 1))) + { + *cost += rtx_cost (XEXP (op0, 0), mode, + (enum rtx_code) code, 0, speed); + if (speed) + *cost += extra_cost->alu.bfx; - if (speed) - *cost += extra_cost->alu.logical; + return true; + } + else if (aarch64_bitmask_imm (INTVAL (op1), mode)) + { + /* We possibly get the immediate for free, this is not + modelled. */ + *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); + if (speed) + *cost += extra_cost->alu.logical; - return true; + return true; + } } else { @@ -6831,11 +6907,12 @@ cost_plus: { int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed); - if (!op_cost && speed) - /* MOV. */ - *cost += extra_cost->alu.extend; - else - /* Free, the cost is that of the SI mode operation. */ + /* If OP_COST is non-zero, then the cost of the zero extend + is effectively the cost of the inner operation. Otherwise + we have a MOV instruction and we take the cost from the MOV + itself. This is true independently of whether we are + optimizing for space or time. */ + if (op_cost) *cost = op_cost; return true; @@ -6865,8 +6942,8 @@ cost_plus: } else { - /* UXTB/UXTH. */ - *cost += extra_cost->alu.extend; + /* We generate an AND instead of UXTB/UXTH. */ + *cost += extra_cost->alu.logical; } } return false; @@ -7349,7 +7426,8 @@ cost_plus: break; } - if (dump_file && (dump_flags & TDF_DETAILS)) + if (dump_file + && flag_aarch64_verbose_cost) fprintf (dump_file, "\nFailed to cost RTX. Assuming default cost.\n"); @@ -7365,7 +7443,8 @@ aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer, { bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed); - if (dump_file && (dump_flags & TDF_DETAILS)) + if (dump_file + && flag_aarch64_verbose_cost) { print_rtl_single (dump_file, x); fprintf (dump_file, "\n%s cost: %d (%s)\n", @@ -7445,12 +7524,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, to optimize 1.0/sqrt. 
*/ static bool -use_rsqrt_p (void) +use_rsqrt_p (machine_mode mode) { return (!flag_trapping_math && flag_unsafe_math_optimizations - && ((aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_APPROX_RSQRT) + && ((aarch64_tune_params.approx_modes->recip_sqrt + & AARCH64_APPROX_MODE (mode)) || flag_mrecip_low_precision_sqrt)); } @@ -7460,89 +7539,225 @@ use_rsqrt_p (void) static tree aarch64_builtin_reciprocal (tree fndecl) { - if (!use_rsqrt_p ()) + machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl)); + + if (!use_rsqrt_p (mode)) return NULL_TREE; return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); } typedef rtx (*rsqrte_type) (rtx, rtx); -/* Select reciprocal square root initial estimate - insn depending on machine mode. */ +/* Select reciprocal square root initial estimate insn depending on machine + mode. */ -rsqrte_type +static rsqrte_type get_rsqrte_type (machine_mode mode) { switch (mode) { - case DFmode: return gen_aarch64_rsqrte_df2; - case SFmode: return gen_aarch64_rsqrte_sf2; - case V2DFmode: return gen_aarch64_rsqrte_v2df2; - case V2SFmode: return gen_aarch64_rsqrte_v2sf2; - case V4SFmode: return gen_aarch64_rsqrte_v4sf2; + case DFmode: return gen_aarch64_rsqrtedf; + case SFmode: return gen_aarch64_rsqrtesf; + case V2DFmode: return gen_aarch64_rsqrtev2df; + case V2SFmode: return gen_aarch64_rsqrtev2sf; + case V4SFmode: return gen_aarch64_rsqrtev4sf; default: gcc_unreachable (); } } typedef rtx (*rsqrts_type) (rtx, rtx, rtx); -/* Select reciprocal square root Newton-Raphson step - insn depending on machine mode. */ +/* Select reciprocal square root series step insn depending on machine mode. */ -rsqrts_type +static rsqrts_type get_rsqrts_type (machine_mode mode) { switch (mode) { - case DFmode: return gen_aarch64_rsqrts_df3; - case SFmode: return gen_aarch64_rsqrts_sf3; - case V2DFmode: return gen_aarch64_rsqrts_v2df3; - case V2SFmode: return gen_aarch64_rsqrts_v2sf3; - case V4SFmode: return gen_aarch64_rsqrts_v4sf3; + case DFmode: return gen_aarch64_rsqrtsdf; + case SFmode: return gen_aarch64_rsqrtssf; + case V2DFmode: return gen_aarch64_rsqrtsv2df; + case V2SFmode: return gen_aarch64_rsqrtsv2sf; + case V4SFmode: return gen_aarch64_rsqrtsv4sf; default: gcc_unreachable (); } } -/* Emit instruction sequence to compute the reciprocal square root using the - Newton-Raphson series. Iterate over the series twice for SF - and thrice for DF. */ +/* Emit instruction sequence to compute either the approximate square root + or its approximate reciprocal, depending on the flag RECP, and return + whether the sequence was emitted or not. 
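
Numerically, the emitted sequence is Newton-Raphson on x ~= 1/sqrt(s): each FRSQRTS step computes (3 - s*x*x) / 2 and is multiplied into the running estimate. A scalar C model of the loop below; the table-driven FRSQRTE seed is replaced here by a crude software estimate, so the model is illustrative only:

    #include <math.h>

    /* Model the FRSQRTE/FRSQRTS iteration for positive finite S.
       Two iterations correspond to SF, three to DF; the low-precision
       options drop one iteration.  */
    static double
    model_rsqrt (double s, int iterations)
    {
      int e;
      frexp (s, &e);                   /* s = m * 2^e with m in [0.5, 1) */
      double x = ldexp (1.0, -e / 2);  /* stand-in for the frsqrte seed */

      while (iterations--)
        x = x * (3.0 - s * x * x) / 2.0;  /* the frsqrts step */

      return x;   /* sqrt(s) is then s * x, as the sequence below computes */
    }
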
*/ -void -aarch64_emit_approx_rsqrt (rtx dst, rtx src) +bool +aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) { - machine_mode mode = GET_MODE (src); - gcc_assert ( - mode == SFmode || mode == V2SFmode || mode == V4SFmode - || mode == DFmode || mode == V2DFmode); + machine_mode mode = GET_MODE (dst); + + if (GET_MODE_INNER (mode) == HFmode) + return false; - rtx xsrc = gen_reg_rtx (mode); - emit_move_insn (xsrc, src); - rtx x0 = gen_reg_rtx (mode); + machine_mode mmsk = mode_for_vector + (int_mode_for_mode (GET_MODE_INNER (mode)), + GET_MODE_NUNITS (mode)); + bool use_approx_sqrt_p = (!recp + && (flag_mlow_precision_sqrt + || (aarch64_tune_params.approx_modes->sqrt + & AARCH64_APPROX_MODE (mode)))); + bool use_approx_rsqrt_p = (recp + && (flag_mrecip_low_precision_sqrt + || (aarch64_tune_params.approx_modes->recip_sqrt + & AARCH64_APPROX_MODE (mode)))); + + if (!flag_finite_math_only + || flag_trapping_math + || !flag_unsafe_math_optimizations + || !(use_approx_sqrt_p || use_approx_rsqrt_p) + || optimize_function_for_size_p (cfun)) + return false; - emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); + rtx xmsk = gen_reg_rtx (mmsk); + if (!recp) + /* When calculating the approximate square root, compare the argument with + 0.0 and create a mask. */ + emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src, + CONST0_RTX (mode))))); - bool double_mode = (mode == DFmode || mode == V2DFmode); + /* Estimate the approximate reciprocal square root. */ + rtx xdst = gen_reg_rtx (mode); + emit_insn ((*get_rsqrte_type (mode)) (xdst, src)); - int iterations = double_mode ? 3 : 2; + /* Iterate over the series twice for SF and thrice for DF. */ + int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; - /* Optionally iterate over the series one less time than otherwise. */ - if (flag_mrecip_low_precision_sqrt) + /* Optionally iterate over the series once less for faster performance + while sacrificing the accuracy. */ + if ((recp && flag_mrecip_low_precision_sqrt) + || (!recp && flag_mlow_precision_sqrt)) iterations--; - for (int i = 0; i < iterations; ++i) + /* Iterate over the series to calculate the approximate reciprocal square + root. */ + rtx x1 = gen_reg_rtx (mode); + while (iterations--) { - rtx x1 = gen_reg_rtx (mode); rtx x2 = gen_reg_rtx (mode); - rtx x3 = gen_reg_rtx (mode); - emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); + emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst)); + + emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2)); + + if (iterations > 0) + emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1)); + } + + if (!recp) + { + /* Qualify the approximate reciprocal square root when the argument is + 0.0 by squashing the intermediary result to 0.0. */ + rtx xtmp = gen_reg_rtx (mmsk); + emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), + gen_rtx_SUBREG (mmsk, xdst, 0))); + emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); + + /* Calculate the approximate square root. */ + emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src)); + } + + /* Finalize the approximation. */ + emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1)); + + return true; +} + +typedef rtx (*recpe_type) (rtx, rtx); + +/* Select reciprocal initial estimate insn depending on machine mode. 
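
The division expansion that follows has the same shape: FRECPE supplies an estimate x ~= 1/d and each FRECPS step computes 2 - d*x, which is multiplied into the estimate. A scalar C model under the same caveats (software seed standing in for the hardware table, positive finite d, illustrative only):

    #include <math.h>

    static double
    model_div (double n, double d, int iterations)
    {
      int e;
      frexp (d, &e);               /* d = m * 2^e with m in [0.5, 1) */
      double x = ldexp (1.0, -e);  /* seed chosen so d * x lies in [0.5, 1) */

      while (iterations--)
        x = x * (2.0 - d * x);     /* the frecps step */

      return n * x;  /* the sequence below skips this multiply when n == 1.0 */
    }
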
*/ + +static recpe_type +get_recpe_type (machine_mode mode) +{ + switch (mode) + { + case SFmode: return (gen_aarch64_frecpesf); + case V2SFmode: return (gen_aarch64_frecpev2sf); + case V4SFmode: return (gen_aarch64_frecpev4sf); + case DFmode: return (gen_aarch64_frecpedf); + case V2DFmode: return (gen_aarch64_frecpev2df); + default: gcc_unreachable (); + } +} + +typedef rtx (*recps_type) (rtx, rtx, rtx); + +/* Select reciprocal series step insn depending on machine mode. */ + +static recps_type +get_recps_type (machine_mode mode) +{ + switch (mode) + { + case SFmode: return (gen_aarch64_frecpssf); + case V2SFmode: return (gen_aarch64_frecpsv2sf); + case V4SFmode: return (gen_aarch64_frecpsv4sf); + case DFmode: return (gen_aarch64_frecpsdf); + case V2DFmode: return (gen_aarch64_frecpsv2df); + default: gcc_unreachable (); + } +} + +/* Emit the instruction sequence to compute the approximation for the division + of NUM by DEN in QUO and return whether the sequence was emitted or not. */ + +bool +aarch64_emit_approx_div (rtx quo, rtx num, rtx den) +{ + machine_mode mode = GET_MODE (quo); - emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2)); + if (GET_MODE_INNER (mode) == HFmode) + return false; + + bool use_approx_division_p = (flag_mlow_precision_div + || (aarch64_tune_params.approx_modes->division + & AARCH64_APPROX_MODE (mode))); + + if (!flag_finite_math_only + || flag_trapping_math + || !flag_unsafe_math_optimizations + || optimize_function_for_size_p (cfun) + || !use_approx_division_p) + return false; + + /* Estimate the approximate reciprocal. */ + rtx xrcp = gen_reg_rtx (mode); + emit_insn ((*get_recpe_type (mode)) (xrcp, den)); + + /* Iterate over the series twice for SF and thrice for DF. */ + int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; + + /* Optionally iterate over the series once less for faster performance, + while sacrificing the accuracy. */ + if (flag_mlow_precision_div) + iterations--; - emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); - x0 = x1; + /* Iterate over the series to calculate the approximate reciprocal. */ + rtx xtmp = gen_reg_rtx (mode); + while (iterations--) + { + emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den)); + + if (iterations > 0) + emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp)); + } + + if (num != CONST1_RTX (mode)) + { + /* As the approximate reciprocal of DEN is already calculated, only + calculate the approximate division when NUM is not 1.0. */ + rtx xnum = force_reg (mode, num); + emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum)); } - emit_move_insn (dst, x0); + /* Finalize the approximation. */ + emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp)); + return true; } /* Return the number of instructions that can be issued per cycle. */ @@ -8046,32 +8261,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts) opts->x_align_functions = aarch64_tune_params.function_align; } - /* If nopcrelative_literal_loads is set on the command line, this + /* We default to no pc-relative literal loads. */ + + aarch64_pcrelative_literal_loads = false; + + /* If -mpc-relative-literal-loads is set on the command line, this implies that the user asked for PC relative literal loads. */ - if (opts->x_nopcrelative_literal_loads == 1) - aarch64_nopcrelative_literal_loads = false; + if (opts->x_pcrelative_literal_loads == 1) + aarch64_pcrelative_literal_loads = true; - /* If it is not set on the command line, we default to no pc - relative literal loads, unless the workaround for Cortex-A53 - erratum 843419 is in effect. 
*/ /* This is PR70113. When building the Linux kernel with CONFIG_ARM64_ERRATUM_843419, support for relocations R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is removed from the kernel to avoid loading objects with possibly - offending sequences. With nopcrelative_literal_loads, we would + offending sequences. Without -mpc-relative-literal-loads we would generate such relocations, preventing the kernel build from succeeding. */ - if (opts->x_nopcrelative_literal_loads == 2 - && !TARGET_FIX_ERR_A53_843419) - aarch64_nopcrelative_literal_loads = true; + if (opts->x_pcrelative_literal_loads == 2 + && TARGET_FIX_ERR_A53_843419) + aarch64_pcrelative_literal_loads = true; - /* In the tiny memory model it makes no sense - to disallow non PC relative literal pool loads - as many other things will break anyway. */ - if (opts->x_nopcrelative_literal_loads - && (aarch64_cmodel == AARCH64_CMODEL_TINY - || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)) - aarch64_nopcrelative_literal_loads = false; + /* In the tiny memory model it makes no sense to disallow PC relative + literal pool loads. */ + if (aarch64_cmodel == AARCH64_CMODEL_TINY + || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) + aarch64_pcrelative_literal_loads = true; + + /* When enabling the lower precision Newton series for the square root, also + enable it for the reciprocal square root, since the latter is an + intermediary step for the former. */ + if (flag_mlow_precision_sqrt) + flag_mrecip_low_precision_sqrt = true; } /* 'Unpack' up the internal tuning structs and update the options @@ -8374,9 +8594,6 @@ aarch64_override_options (void) while processing functions with potential target attributes. */ target_option_default_node = target_option_current_node = build_target_option_node (&global_options); - - aarch64_register_fma_steering (); - } /* Implement targetm.override_options_after_change. */ @@ -9279,15 +9496,18 @@ aarch64_classify_symbol (rtx x, rtx offset) switch (aarch64_cmodel) { case AARCH64_CMODEL_TINY: - /* When we retreive symbol + offset address, we have to make sure + /* When we retrieve symbol + offset address, we have to make sure the offset does not cause overflow of the final address. But we have no way of knowing the address of symbol at compile time so we can't accurately say if the distance between the PC and symbol + offset is outside the addressible range of +/-1M in the TINY code model. So we rely on images not being greater than 1M and cap the offset at 1M and anything beyond 1M will have to - be loaded using an alternative mechanism. */ - if (SYMBOL_REF_WEAK (x) + be loaded using an alternative mechanism. Furthermore if the + symbol is a weak reference to something that isn't known to + resolve to a symbol in this module, then force to memory. */ + if ((SYMBOL_REF_WEAK (x) + && !aarch64_symbol_binds_local_p (x)) || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575) return SYMBOL_FORCE_TO_MEM; return SYMBOL_TINY_ABSOLUTE; @@ -9295,7 +9515,8 @@ aarch64_classify_symbol (rtx x, rtx offset) case AARCH64_CMODEL_SMALL: /* Same reasoning as the tiny code model, but the offset cap here is 4G. */ - if (SYMBOL_REF_WEAK (x) + if ((SYMBOL_REF_WEAK (x) + && !aarch64_symbol_binds_local_p (x)) || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263), HOST_WIDE_INT_C (4294967264))) return SYMBOL_FORCE_TO_MEM; @@ -9317,8 +9538,7 @@ aarch64_classify_symbol (rtx x, rtx offset) /* This is alright even in PIC code as the constant pool reference is always PC relative and within the same translation unit. 
*/ - if (nopcrelative_literal_loads - && CONSTANT_POOL_ADDRESS_P (x)) + if (CONSTANT_POOL_ADDRESS_P (x)) return SYMBOL_SMALL_ABSOLUTE; else return SYMBOL_FORCE_TO_MEM; @@ -9454,6 +9674,13 @@ aarch64_build_builtin_va_list (void) FIELD_DECL, get_identifier ("__vr_offs"), integer_type_node); + /* Tell tree-stdarg pass about our internal offset fields. + NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision + purpose to identify whether the code is updating va_list internal + offset fields through irregular way. */ + va_list_gpr_counter_field = f_groff; + va_list_fpr_counter_field = f_vroff; + DECL_ARTIFICIAL (f_stack) = 1; DECL_ARTIFICIAL (f_grtop) = 1; DECL_ARTIFICIAL (f_vrtop) = 1; @@ -9486,15 +9713,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED) tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; tree stack, grtop, vrtop, groff, vroff; tree t; - int gr_save_area_size; - int vr_save_area_size; + int gr_save_area_size = cfun->va_list_gpr_size; + int vr_save_area_size = cfun->va_list_fpr_size; int vr_offset; cum = &crtl->args.info; - gr_save_area_size - = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD; - vr_save_area_size - = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG; + if (cfun->va_list_gpr_size) + gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD, + cfun->va_list_gpr_size); + if (cfun->va_list_fpr_size) + vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn) + * UNITS_PER_VREG, cfun->va_list_fpr_size); if (!TARGET_FLOAT) { @@ -9823,7 +10052,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, { CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); CUMULATIVE_ARGS local_cum; - int gr_saved, vr_saved; + int gr_saved = cfun->va_list_gpr_size; + int vr_saved = cfun->va_list_fpr_size; /* The caller has advanced CUM up to, but not beyond, the last named argument. Advance a local copy of CUM past the last "real" named @@ -9831,9 +10061,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, local_cum = *cum; aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true); - /* Found out how many registers we need to save. */ - gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn; - vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn; + /* Found out how many registers we need to save. + Honor tree-stdvar analysis results. */ + if (cfun->va_list_gpr_size) + gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn, + cfun->va_list_gpr_size / UNITS_PER_WORD); + if (cfun->va_list_fpr_size) + vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn, + cfun->va_list_fpr_size / UNITS_PER_VREG); if (!TARGET_FLOAT) { @@ -9861,7 +10096,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, /* We can't use move_block_from_reg, because it will use the wrong mode, storing D regs only. */ machine_mode mode = TImode; - int off, i; + int off, i, vr_start; /* Set OFF to the offset from virtual_incoming_args_rtx of the first vector register. 
The VR save area lies below
@@ -9870,14 +10105,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
 	  STACK_BOUNDARY / BITS_PER_UNIT);
       off -= vr_saved * UNITS_PER_VREG;
 
-      for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
+      vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
+      for (i = 0; i < vr_saved; ++i)
 	{
 	  rtx ptr, mem;
 
 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
 	  mem = gen_frame_mem (mode, ptr);
 	  set_mem_alias_set (mem, get_varargs_alias_set ());
-	  aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
+	  aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
 	  off += UNITS_PER_VREG;
 	}
     }
@@ -10839,33 +11075,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
 			    gen_rtx_REG (mode, rsrc + count - i - 1));
 }
 
-/* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
-   one of VSTRUCT modes: OI, CI or XI.  */
-int
-aarch64_simd_attr_length_move (rtx_insn *insn)
-{
-  machine_mode mode;
-
-  extract_insn_cached (insn);
-
-  if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
-    {
-      mode = GET_MODE (recog_data.operand[0]);
-      switch (mode)
-	{
-	case OImode:
-	  return 8;
-	case CImode:
-	  return 12;
-	case XImode:
-	  return 16;
-	default:
-	  gcc_unreachable ();
-	}
-    }
-  return 4;
-}
-
 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
    one of VSTRUCT modes: OI, CI, or XI.  */
 int
@@ -10899,6 +11108,37 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
   return true;
 }
 
+/* Return true if the vector misalignment factor is supported by the
+   target.  */
+static bool
+aarch64_builtin_support_vector_misalignment (machine_mode mode,
+					     const_tree type, int misalignment,
+					     bool is_packed)
+{
+  if (TARGET_SIMD && STRICT_ALIGNMENT)
+    {
+      /* Return if movmisalign pattern is not supported for this mode.  */
+      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
+	return false;
+
+      if (misalignment == -1)
+	{
+	  /* Misalignment factor is unknown at compile time but we know
+	     it's word aligned.  */
+	  if (aarch64_simd_vector_alignment_reachable (type, is_packed))
+	    {
+	      int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
+
+	      if (element_size != 64)
+		return true;
+	    }
+	  return false;
+	}
+    }
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+						      is_packed);
+}
+
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
    assign to the register.  Otherwise return NULL_RTX. 
*/ @@ -11947,12 +12187,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector, info.value = GEN_INT (0); else { -#define buf_size 20 + const unsigned int buf_size = 20; char float_buf[buf_size] = {'\0'}; real_to_decimal_for_mode (float_buf, CONST_DOUBLE_REAL_VALUE (info.value), buf_size, buf_size, 1, mode); -#undef buf_size if (lane_count == 1) snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf); @@ -12186,6 +12425,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_trn2v4si; break; case V2SImode: gen = gen_aarch64_trn2v2si; break; case V2DImode: gen = gen_aarch64_trn2v2di; break; + case V4HFmode: gen = gen_aarch64_trn2v4hf; break; + case V8HFmode: gen = gen_aarch64_trn2v8hf; break; case V4SFmode: gen = gen_aarch64_trn2v4sf; break; case V2SFmode: gen = gen_aarch64_trn2v2sf; break; case V2DFmode: gen = gen_aarch64_trn2v2df; break; @@ -12204,6 +12445,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_trn1v4si; break; case V2SImode: gen = gen_aarch64_trn1v2si; break; case V2DImode: gen = gen_aarch64_trn1v2di; break; + case V4HFmode: gen = gen_aarch64_trn1v4hf; break; + case V8HFmode: gen = gen_aarch64_trn1v8hf; break; case V4SFmode: gen = gen_aarch64_trn1v4sf; break; case V2SFmode: gen = gen_aarch64_trn1v2sf; break; case V2DFmode: gen = gen_aarch64_trn1v2df; break; @@ -12269,6 +12512,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_uzp2v4si; break; case V2SImode: gen = gen_aarch64_uzp2v2si; break; case V2DImode: gen = gen_aarch64_uzp2v2di; break; + case V4HFmode: gen = gen_aarch64_uzp2v4hf; break; + case V8HFmode: gen = gen_aarch64_uzp2v8hf; break; case V4SFmode: gen = gen_aarch64_uzp2v4sf; break; case V2SFmode: gen = gen_aarch64_uzp2v2sf; break; case V2DFmode: gen = gen_aarch64_uzp2v2df; break; @@ -12287,6 +12532,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_uzp1v4si; break; case V2SImode: gen = gen_aarch64_uzp1v2si; break; case V2DImode: gen = gen_aarch64_uzp1v2di; break; + case V4HFmode: gen = gen_aarch64_uzp1v4hf; break; + case V8HFmode: gen = gen_aarch64_uzp1v8hf; break; case V4SFmode: gen = gen_aarch64_uzp1v4sf; break; case V2SFmode: gen = gen_aarch64_uzp1v2sf; break; case V2DFmode: gen = gen_aarch64_uzp1v2df; break; @@ -12357,6 +12604,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_zip2v4si; break; case V2SImode: gen = gen_aarch64_zip2v2si; break; case V2DImode: gen = gen_aarch64_zip2v2di; break; + case V4HFmode: gen = gen_aarch64_zip2v4hf; break; + case V8HFmode: gen = gen_aarch64_zip2v8hf; break; case V4SFmode: gen = gen_aarch64_zip2v4sf; break; case V2SFmode: gen = gen_aarch64_zip2v2sf; break; case V2DFmode: gen = gen_aarch64_zip2v2df; break; @@ -12375,6 +12624,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) case V4SImode: gen = gen_aarch64_zip1v4si; break; case V2SImode: gen = gen_aarch64_zip1v2si; break; case V2DImode: gen = gen_aarch64_zip1v2di; break; + case V4HFmode: gen = gen_aarch64_zip1v4hf; break; + case V8HFmode: gen = gen_aarch64_zip1v8hf; break; case V4SFmode: gen = gen_aarch64_zip1v4sf; break; case V2SFmode: gen = gen_aarch64_zip1v2sf; break; case V2DFmode: gen = gen_aarch64_zip1v2df; break; @@ -12419,6 +12670,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d) case V8HImode: gen = gen_aarch64_extv8hi; break; case V2SImode: gen = gen_aarch64_extv2si; break; case V4SImode: gen = gen_aarch64_extv4si; break; + case V4HFmode: gen = gen_aarch64_extv4hf; break; + case V8HFmode: gen = 
gen_aarch64_extv8hf; break; case V2SFmode: gen = gen_aarch64_extv2sf; break; case V4SFmode: gen = gen_aarch64_extv4sf; break; case V2DImode: gen = gen_aarch64_extv2di; break; @@ -12494,6 +12747,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d) case V2SImode: gen = gen_aarch64_rev64v2si; break; case V4SFmode: gen = gen_aarch64_rev64v4sf; break; case V2SFmode: gen = gen_aarch64_rev64v2sf; break; + case V8HFmode: gen = gen_aarch64_rev64v8hf; break; + case V4HFmode: gen = gen_aarch64_rev64v4hf; break; default: return false; } @@ -12737,24 +12992,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, return ret; } -/* Implement target hook CANNOT_CHANGE_MODE_CLASS. */ -bool -aarch64_cannot_change_mode_class (machine_mode from, - machine_mode to, - enum reg_class rclass) -{ - /* We cannot allow word_mode subregs of full vector modes. - Otherwise the middle-end will assume it's ok to store to - (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits - of the 128-bit register. However, after reload the subreg will - be dropped leaving a plain DImode store. See PR67609 for a more - detailed dicussion. In all other cases, we want to be permissive - and return false. */ - return (reg_classes_intersect_p (FP_REGS, rclass) - && GET_MODE_SIZE (to) == UNITS_PER_WORD - && GET_MODE_SIZE (from) > UNITS_PER_WORD); -} - rtx aarch64_reverse_mask (enum machine_mode mode) { @@ -12776,7 +13013,14 @@ aarch64_reverse_mask (enum machine_mode mode) return force_reg (V16QImode, mask); } -/* Implement MODES_TIEABLE_P. */ +/* Implement MODES_TIEABLE_P. In principle we should always return true. + However due to issues with register allocation it is preferable to avoid + tieing integer scalar and FP scalar modes. Executing integer operations + in general registers is better than treating them as scalar vector + operations. This reduces latency and avoids redundant int<->FP moves. + So tie modes if they are either the same class, or vector modes with + other vector modes, vector structs or any scalar mode. +*/ bool aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2) @@ -12787,9 +13031,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2) /* We specifically want to allow elements of "structure" modes to be tieable to the structure. This more general condition allows other rarer situations too. */ - if (TARGET_SIMD - && aarch64_vector_mode_p (mode1) - && aarch64_vector_mode_p (mode2)) + if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2)) + return true; + + /* Also allow any scalar modes with vectors. */ + if (aarch64_vector_mode_supported_p (mode1) + || aarch64_vector_mode_supported_p (mode2)) return true; return false; @@ -12953,6 +13200,63 @@ aarch64_expand_movmem (rtx *operands) return true; } +/* Split a DImode store of a CONST_INT SRC to MEM DST as two + SImode stores. Handle the case when the constant has identical + bottom and top halves. This is beneficial when the two stores can be + merged into an STP and we avoid synthesising potentially expensive + immediates twice. Return true if such a split is possible. 
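
The precondition for the split is simple to state on its own. A minimal sketch (hypothetical helper, not the GCC implementation, which additionally weighs the synthesis costs as described below):

    #include <stdbool.h>
    #include <stdint.h>

    /* A DImode constant store can become an SImode store pair only
       when both 32-bit halves are identical, so a single wN MOV/MOVK
       sequence can feed both halves of the STP.  */
    static bool
    halves_identical (uint64_t val)
    {
      return (uint32_t) val == (uint32_t) (val >> 32);
    }

The constant in the transformation quoted below is 0x0140c0da0140c0da; both halves are 0x0140c0da, so a two-instruction SImode synthesis plus one STP replaces a four-instruction DImode synthesis plus an STR.
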
*/ + +bool +aarch64_split_dimode_const_store (rtx dst, rtx src) +{ + rtx lo = gen_lowpart (SImode, src); + rtx hi = gen_highpart_mode (SImode, DImode, src); + + bool size_p = optimize_function_for_size_p (cfun); + + if (!rtx_equal_p (lo, hi)) + return false; + + unsigned int orig_cost + = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode); + unsigned int lo_cost + = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode); + + /* We want to transform: + MOV x1, 49370 + MOVK x1, 0x140, lsl 16 + MOVK x1, 0xc0da, lsl 32 + MOVK x1, 0x140, lsl 48 + STR x1, [x0] + into: + MOV w1, 49370 + MOVK w1, 0x140, lsl 16 + STP w1, w1, [x0] + So we want to perform this only when we save two instructions + or more. When optimizing for size, however, accept any code size + savings we can. */ + if (size_p && orig_cost <= lo_cost) + return false; + + if (!size_p + && (orig_cost <= lo_cost + 1)) + return false; + + rtx mem_lo = adjust_address (dst, SImode, 0); + if (!aarch64_mem_pair_operand (mem_lo, SImode)) + return false; + + rtx tmp_reg = gen_reg_rtx (SImode); + aarch64_expand_mov_immediate (tmp_reg, lo); + rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode)); + /* Don't emit an explicit store pair as this may not be always profitable. + Let the sched-fusion logic decide whether to merge them. */ + emit_move_insn (mem_lo, tmp_reg); + emit_move_insn (mem_hi, tmp_reg); + + return true; +} + /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ static unsigned HOST_WIDE_INT @@ -13305,6 +13609,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } +/* Return true iff the instruction fusion described by OP is enabled. */ + +bool +aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op) +{ + return (aarch64_tune_params.fusible_ops & op) != 0; +} + /* If MEM is in the form of [base+offset], extract the two parts of address and set to BASE and OFFSET, otherwise return false after clearing BASE and OFFSET. */ @@ -13449,6 +13761,26 @@ aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri, return; } +/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook. + Adjust priority of sha1h instructions so they are scheduled before + other SHA1 instructions. */ + +static int +aarch64_sched_adjust_priority (rtx_insn *insn, int priority) +{ + rtx x = PATTERN (insn); + + if (GET_CODE (x) == SET) + { + x = SET_SRC (x); + + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H) + return priority + 10; + } + + return priority; +} + /* Given OPERANDS of consecutive load/store, check if we can merge them into ldp/stp. LOAD is true if they are load instructions. MODE is the mode of memory operands. */ @@ -13483,6 +13815,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)) return false; + /* If we have SImode and slow unaligned ldp, + check the alignment to be at least 8 byte. */ + if (mode == SImode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) + && !optimize_size + && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT) + return false; + /* Check if the addresses are in the form of [base+offset]. */ extract_base_offset_in_addr (mem_1, &base_1, &offset_1); if (base_1 == NULL_RTX || offset_1 == NULL_RTX) @@ -13642,6 +13983,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, return false; } + /* If we have SImode and slow unaligned ldp, + check the alignment to be at least 8 byte. 
*/ + if (mode == SImode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) + && !optimize_size + && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT) + return false; + if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1))) rclass_1 = FP_REGS; else @@ -13877,13 +14227,13 @@ aarch64_promoted_type (const_tree t) /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */ static bool -aarch64_optab_supported_p (int op, machine_mode, machine_mode, +aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode, optimization_type opt_type) { switch (op) { case rsqrt_optab: - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); default: return true; @@ -14017,6 +14367,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode, #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p +#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT +#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \ + aarch64_legitimize_address_displacement + #undef TARGET_LIBGCC_CMP_RETURN_MODE #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode @@ -14119,6 +14473,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode, #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p +#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT +#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ + aarch64_builtin_support_vector_misalignment + #undef TARGET_ARRAY_MODE_SUPPORTED_P #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p @@ -14196,6 +14554,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode, #undef TARGET_CAN_USE_DOLOOP_P #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost +#undef TARGET_SCHED_ADJUST_PRIORITY +#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority + #undef TARGET_SCHED_MACRO_FUSION_P #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p @@ -14220,6 +14581,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode, #undef TARGET_OPTAB_SUPPORTED_P #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p +#undef TARGET_OMIT_STRUCT_RETURN_REG +#define TARGET_OMIT_STRUCT_RETURN_REG true + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" --- a/src/gcc/config/aarch64/aarch64.h +++ b/src/gcc/config/aarch64/aarch64.h @@ -132,9 +132,14 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FP (1 << 1) /* Has FP. */ #define AARCH64_FL_CRYPTO (1 << 2) /* Has crypto. */ #define AARCH64_FL_CRC (1 << 3) /* Has CRC. */ -/* ARMv8.1 architecture extensions. */ +/* ARMv8.1-A architecture extensions. */ #define AARCH64_FL_LSE (1 << 4) /* Has Large System Extensions. */ -#define AARCH64_FL_V8_1 (1 << 5) /* Has ARMv8.1 extensions. */ +#define AARCH64_FL_V8_1 (1 << 5) /* Has ARMv8.1-A extensions. */ +/* ARMv8.2-A architecture extensions. */ +#define AARCH64_FL_V8_2 (1 << 8) /* Has ARMv8.2-A features. */ +#define AARCH64_FL_F16 (1 << 9) /* Has ARMv8.2-A FP16 extensions. */ +/* ARMv8.3-A architecture extensions. */ +#define AARCH64_FL_V8_3 (1 << 10) /* Has ARMv8.3-A features. */ /* Has FP and SIMD. 
*/ #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) @@ -146,6 +151,10 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8 (AARCH64_FL_FPSIMD) #define AARCH64_FL_FOR_ARCH8_1 \ (AARCH64_FL_FOR_ARCH8 | AARCH64_FL_LSE | AARCH64_FL_CRC | AARCH64_FL_V8_1) +#define AARCH64_FL_FOR_ARCH8_2 \ + (AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_V8_2) +#define AARCH64_FL_FOR_ARCH8_3 \ + (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3) /* Macros to test ISA flags. */ @@ -155,6 +164,9 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_SIMD (aarch64_isa_flags & AARCH64_FL_SIMD) #define AARCH64_ISA_LSE (aarch64_isa_flags & AARCH64_FL_LSE) #define AARCH64_ISA_RDMA (aarch64_isa_flags & AARCH64_FL_V8_1) +#define AARCH64_ISA_V8_2 (aarch64_isa_flags & AARCH64_FL_V8_2) +#define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) +#define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) /* Crypto is an optional extension to AdvSIMD. */ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) @@ -165,6 +177,13 @@ extern unsigned aarch64_architecture_version; /* Atomic instructions that can be enabled through the +lse extension. */ #define TARGET_LSE (AARCH64_ISA_LSE) +/* ARMv8.2-A FP16 support that can be enabled through the +fp16 extension. */ +#define TARGET_FP_F16INST (TARGET_FLOAT && AARCH64_ISA_F16) +#define TARGET_SIMD_F16INST (TARGET_SIMD && AARCH64_ISA_F16) + +/* ARMv8.3-A features. */ +#define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) + /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT @@ -193,7 +212,7 @@ extern unsigned aarch64_architecture_version; ((aarch64_fix_a53_err843419 == 2) \ ? TARGET_FIX_ERR_A53_843419_DEFAULT : aarch64_fix_a53_err843419) -/* ARMv8.1 Adv.SIMD support. */ +/* ARMv8.1-A Adv.SIMD support. */ #define TARGET_SIMD_RDMA (TARGET_SIMD && AARCH64_ISA_RDMA) /* Standard register usage. */ @@ -539,11 +558,14 @@ struct GTY (()) aarch64_frame STACK_BOUNDARY. */ HOST_WIDE_INT saved_varargs_size; + /* The size of the saved callee-save int/FP registers. */ + HOST_WIDE_INT saved_regs_size; - /* Padding if needed after the all the callee save registers have - been saved. */ - HOST_WIDE_INT padding0; - HOST_WIDE_INT hardfp_offset; /* HARD_FRAME_POINTER_REGNUM */ + + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of + STACK_BOUNDARY. */ + HOST_WIDE_INT locals_offset; /* Offset from the base of the frame (incomming SP) to the hard_frame_pointer. This value is always a multiple of @@ -553,12 +575,25 @@ struct GTY (()) aarch64_frame /* The size of the frame. This value is the offset from base of the * frame (incomming SP) to the stack_pointer. This value is always * a multiple of STACK_BOUNDARY. */ + HOST_WIDE_INT frame_size; + + /* The size of the initial stack adjustment before saving callee-saves. */ + HOST_WIDE_INT initial_adjust; + + /* The writeback value when pushing callee-save registers. + It is zero when no push is used. */ + HOST_WIDE_INT callee_adjust; + + /* The offset from SP to the callee-save registers after initial_adjust. + It may be non-zero if no push is used (ie. callee_adjust == 0). */ + HOST_WIDE_INT callee_offset; + + /* The size of the stack adjustment after saving callee-saves. 
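
Taken together, the adjustment fields added to this struct decompose the frame allocation performed by the rewritten prologue in aarch64.c. A reader's sanity sketch of the intended invariant; the struct and values here are illustrative, not GCC code:

    #include <assert.h>

    struct frame_layout
    {
      long initial_adjust;  /* SP drop before the callee-saves are stored */
      long callee_adjust;   /* writeback of the callee-save push, or 0 */
      long final_adjust;    /* SP drop after the saves (outgoing args area) */
      long frame_size;      /* distance from incoming SP to final SP */
    };

    static void
    check_layout (const struct frame_layout *f)
    {
      /* The prologue subtracts the three adjustments in turn, so they
         must add up to the frame size that the epilogue unwinds.  */
      assert (f->initial_adjust + f->callee_adjust + f->final_adjust
              == f->frame_size);
    }

A small frame set up with a single "stp x29, x30, [sp, -80]!" would, for example, have callee_adjust == 80 and the other two adjustments zero.
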
*/ + HOST_WIDE_INT final_adjust; unsigned wb_candidate1; unsigned wb_candidate2; - HOST_WIDE_INT frame_size; - bool laid_out; }; @@ -652,21 +687,6 @@ typedef struct #define CONSTANT_ADDRESS_P(X) aarch64_constant_address_p(X) -/* Try a machine-dependent way of reloading an illegitimate address - operand. If we find one, push the reload and jump to WIN. This - macro is used in only one place: `find_reloads_address' in reload.c. */ - -#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND_L, WIN) \ -do { \ - rtx new_x = aarch64_legitimize_reload_address (&(X), MODE, OPNUM, TYPE, \ - IND_L); \ - if (new_x) \ - { \ - X = new_x; \ - goto WIN; \ - } \ -} while (0) - #define REGNO_OK_FOR_BASE_P(REGNO) \ aarch64_regno_ok_for_base_p (REGNO, true) @@ -722,7 +742,12 @@ do { \ #define USE_STORE_PRE_INCREMENT(MODE) 0 #define USE_STORE_PRE_DECREMENT(MODE) 0 -/* ?? #define WORD_REGISTER_OPERATIONS */ +/* WORD_REGISTER_OPERATIONS does not hold for AArch64. + The assigned word_mode is DImode but operations narrower than SImode + behave as 32-bit operations if using the W-form of the registers rather + than as word_mode (64-bit) operations as WORD_REGISTER_OPERATIONS + expects. */ +#define WORD_REGISTER_OPERATIONS 0 /* Define if loading from memory in MODE, an integral mode narrower than BITS_PER_WORD will either zero-extend or sign-extend. The value of this @@ -842,10 +867,7 @@ do { \ extern void __aarch64_sync_cache_range (void *, void *); \ __aarch64_sync_cache_range (beg, end) -#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \ - aarch64_cannot_change_mode_class (FROM, TO, CLASS) - -#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD +#define SHIFT_COUNT_TRUNCATED (!TARGET_SIMD) /* Choose appropriate mode for caller saves, so we do the minimum required size of load/store. */ --- a/src/gcc/config/aarch64/aarch64.md +++ b/src/gcc/config/aarch64/aarch64.md @@ -75,6 +75,8 @@ UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32X + UNSPEC_FCVTZS + UNSPEC_FCVTZU UNSPEC_URECPE UNSPEC_FRECPE UNSPEC_FRECPS @@ -105,6 +107,7 @@ UNSPEC_NOP UNSPEC_PRLG_STK UNSPEC_RBIT + UNSPEC_SCVTF UNSPEC_SISD_NEG UNSPEC_SISD_SSHL UNSPEC_SISD_USHL @@ -122,6 +125,7 @@ UNSPEC_TLSLE24 UNSPEC_TLSLE32 UNSPEC_TLSLE48 + UNSPEC_UCVTF UNSPEC_USHL_2S UNSPEC_VSTRUCTDUMMY UNSPEC_SP_SET @@ -837,13 +841,6 @@ || aarch64_is_noplt_call_p (callee))) XEXP (operands[0], 0) = force_reg (Pmode, callee); - /* FIXME: This is a band-aid. Need to analyze why expand_expr_addr_expr - is generating an SImode symbol reference. See PR 64971. */ - if (TARGET_ILP32 - && GET_CODE (XEXP (operands[0], 0)) == SYMBOL_REF - && GET_MODE (XEXP (operands[0], 0)) == SImode) - XEXP (operands[0], 0) = convert_memory_address (Pmode, - XEXP (operands[0], 0)); if (operands[2] == NULL_RTX) operands[2] = const0_rtx; @@ -875,14 +872,6 @@ || aarch64_is_noplt_call_p (callee))) XEXP (operands[1], 0) = force_reg (Pmode, callee); - /* FIXME: This is a band-aid. Need to analyze why expand_expr_addr_expr - is generating an SImode symbol reference. See PR 64971. 
*/ - if (TARGET_ILP32 - && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF - && GET_MODE (XEXP (operands[1], 0)) == SImode) - XEXP (operands[1], 0) = convert_memory_address (Pmode, - XEXP (operands[1], 0)); - if (operands[3] == NULL_RTX) operands[3] = const0_rtx; @@ -1003,6 +992,11 @@ (match_operand:GPI 1 "general_operand" ""))] "" " + if (MEM_P (operands[0]) && CONST_INT_P (operands[1]) + && mode == DImode + && aarch64_split_dimode_const_store (operands[0], operands[1])) + DONE; + if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) operands[1] = force_reg (mode, operands[1]); @@ -1160,11 +1154,12 @@ ) (define_insn "*movhf_aarch64" - [(set (match_operand:HF 0 "nonimmediate_operand" "=w, ?r,w,w,m,r,m ,r") - (match_operand:HF 1 "general_operand" "?rY, w,w,m,w,m,rY,r"))] + [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w ,?r,w,w,m,r,m ,r") + (match_operand:HF 1 "general_operand" "Y ,?rY, w,w,m,w,m,rY,r"))] "TARGET_FLOAT && (register_operand (operands[0], HFmode) || aarch64_reg_or_fp_zero (operands[1], HFmode))" "@ + movi\\t%0.4h, #0 mov\\t%0.h[0], %w1 umov\\t%w0, %1.h[0] mov\\t%0.h[0], %1.h[0] @@ -1173,18 +1168,18 @@ ldrh\\t%w0, %1 strh\\t%w1, %0 mov\\t%w0, %w1" - [(set_attr "type" "neon_from_gp,neon_to_gp,neon_move,\ + [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\ f_loads,f_stores,load1,store1,mov_reg") - (set_attr "simd" "yes,yes,yes,*,*,*,*,*") - (set_attr "fp" "*,*,*,yes,yes,*,*,*")] + (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")] ) (define_insn "*movsf_aarch64" - [(set (match_operand:SF 0 "nonimmediate_operand" "=w, ?r,w,w ,w,m,r,m ,r") - (match_operand:SF 1 "general_operand" "?rY, w,w,Ufc,m,w,m,rY,r"))] + [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w,m,r,m ,r") + (match_operand:SF 1 "general_operand" "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))] "TARGET_FLOAT && (register_operand (operands[0], SFmode) || aarch64_reg_or_fp_zero (operands[1], SFmode))" "@ + movi\\t%0.2s, #0 fmov\\t%s0, %w1 fmov\\t%w0, %s1 fmov\\t%s0, %s1 @@ -1194,16 +1189,18 @@ ldr\\t%w0, %1 str\\t%w1, %0 mov\\t%w0, %w1" - [(set_attr "type" "f_mcr,f_mrc,fmov,fconsts,\ - f_loads,f_stores,load1,store1,mov_reg")] + [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\ + f_loads,f_stores,load1,store1,mov_reg") + (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")] ) (define_insn "*movdf_aarch64" - [(set (match_operand:DF 0 "nonimmediate_operand" "=w, ?r,w,w ,w,m,r,m ,r") - (match_operand:DF 1 "general_operand" "?rY, w,w,Ufc,m,w,m,rY,r"))] + [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w,m,r,m ,r") + (match_operand:DF 1 "general_operand" "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))] "TARGET_FLOAT && (register_operand (operands[0], DFmode) || aarch64_reg_or_fp_zero (operands[1], DFmode))" "@ + movi\\t%d0, #0 fmov\\t%d0, %x1 fmov\\t%x0, %d1 fmov\\t%d0, %d1 @@ -1213,8 +1210,9 @@ ldr\\t%x0, %1 str\\t%x1, %0 mov\\t%x0, %x1" - [(set_attr "type" "f_mcr,f_mrc,fmov,fconstd,\ - f_loadd,f_stored,load1,store1,mov_reg")] + [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\ + f_loadd,f_stored,load1,store1,mov_reg") + (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")] ) (define_insn "*movtf_aarch64" @@ -1239,7 +1237,6 @@ [(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\ f_loadd,f_stored,load2,store2,store2") (set_attr "length" "4,8,8,8,4,4,4,4,4,4,4") - (set_attr "fp" "*,*,yes,yes,*,yes,yes,yes,*,*,*") (set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")] ) @@ -1552,10 +1549,10 @@ (zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))] "" "@ - uxt\t%0, %w1 + and\t%0, %1, 
ldr\t%w0, %1 ldr\t%0, %1" - [(set_attr "type" "extend,load1,load1")] + [(set_attr "type" "logic_imm,load1,load1")] ) (define_expand "qihi2" @@ -1564,16 +1561,26 @@ "" ) -(define_insn "*qihi2_aarch64" +(define_insn "*extendqihi2_aarch64" [(set (match_operand:HI 0 "register_operand" "=r,r") - (ANY_EXTEND:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + (sign_extend:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] "" "@ - xtb\t%w0, %w1 - b\t%w0, %1" + sxtb\t%w0, %w1 + ldrsb\t%w0, %1" [(set_attr "type" "extend,load1")] ) +(define_insn "*zero_extendqihi2_aarch64" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + "" + "@ + and\t%w0, %w1, 255 + ldrb\t%w0, %1" + [(set_attr "type" "logic_imm,load1")] +) + ;; ------------------------------------------------------------------- ;; Simple arithmetic ;; ------------------------------------------------------------------- @@ -1585,25 +1592,16 @@ (match_operand:GPI 2 "aarch64_pluslong_operand" "")))] "" { - if (aarch64_pluslong_strict_immedate (operands[2], mode)) - { - /* Give CSE the opportunity to share this constant across additions. */ - if (!cse_not_expected && can_create_pseudo_p ()) - operands[2] = force_reg (mode, operands[2]); - - /* Split will refuse to operate on a modification to the stack pointer. - Aid the prologue and epilogue expanders by splitting this now. */ - else if (reload_completed && operands[0] == stack_pointer_rtx) - { - HOST_WIDE_INT i = INTVAL (operands[2]); - HOST_WIDE_INT s = (i >= 0 ? i & 0xfff : -(-i & 0xfff)); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_PLUS (mode, operands[1], - GEN_INT (i - s)))); - operands[1] = operands[0]; - operands[2] = GEN_INT (s); - } - } + /* If operands[1] is a subreg extract the inner RTX. */ + rtx op1 = REG_P (operands[1]) ? operands[1] : SUBREG_REG (operands[1]); + + /* If the constant is too large for a single instruction and isn't frame + based, split off the immediate so it is available for CSE. */ + if (!aarch64_plus_immediate (operands[2], mode) + && can_create_pseudo_p () + && (!REG_P (op1) + || !REGNO_PTR_FRAME_P (REGNO (op1)))) + operands[2] = force_reg (mode, operands[2]); }) (define_insn "*add3_aarch64" @@ -1765,7 +1763,7 @@ "aarch64_zero_extend_const_eq (mode, operands[2], mode, operands[1])" "@ - cmn\\t%0, %1 + cmn\\t%0, %1 cmp\\t%0, #%n1" [(set_attr "type" "alus_imm")] ) @@ -1797,11 +1795,11 @@ "aarch64_zero_extend_const_eq (mode, operands[3], mode, operands[2])" "@ - adds\\t%0, %1, %2 + adds\\t%0, %1, %2 subs\\t%0, %1, #%n2" [(set_attr "type" "alus_imm")] ) - + (define_insn "add3_compareC" [(set (reg:CC_C CC_REGNUM) (ne:CC_C @@ -3404,7 +3402,9 @@ (LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r") (match_operand:SI 2 "aarch64_logical_operand" "r,K"))))] "" - "\\t%w0, %w1, %w2" + "@ + \\t%w0, %w1, %w2 + \\t%w0, %w1, %2" [(set_attr "type" "logic_reg,logic_imm")] ) @@ -3417,7 +3417,9 @@ (set (match_operand:GPI 0 "register_operand" "=r,r") (and:GPI (match_dup 1) (match_dup 2)))] "" - "ands\\t%0, %1, %2" + "@ + ands\\t%0, %1, %2 + ands\\t%0, %1, %2" [(set_attr "type" "logics_reg,logics_imm")] ) @@ -3431,7 +3433,9 @@ (set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))] "" - "ands\\t%w0, %w1, %w2" + "@ + ands\\t%w0, %w1, %w2 + ands\\t%w0, %w1, %2" [(set_attr "type" "logics_reg,logics_imm")] ) @@ -3741,6 +3745,39 @@ } ) +;; Pop count be done via the "CNT" instruction in AdvSIMD. 
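+;; For example (illustrative, not part of the patch), at -O2 a function such
+;; as "int f (unsigned x) { return __builtin_popcount (x); }" now expands
+;; through this pattern into: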
+;; +;; MOV v.1d, x0 +;; CNT v1.8b, v.8b +;; ADDV b2, v1.8b +;; MOV w0, v2.b[0] + +(define_expand "popcount2" + [(match_operand:GPI 0 "register_operand") + (match_operand:GPI 1 "register_operand")] + "TARGET_SIMD" +{ + rtx v = gen_reg_rtx (V8QImode); + rtx v1 = gen_reg_rtx (V8QImode); + rtx r = gen_reg_rtx (QImode); + rtx in = operands[1]; + rtx out = operands[0]; + if(mode == SImode) + { + rtx tmp; + tmp = gen_reg_rtx (DImode); + /* If we have SImode, zero extend to DImode, pop count does + not change if we have extra zeros. */ + emit_insn (gen_zero_extendsidi2 (tmp, in)); + in = tmp; + } + emit_move_insn (v, gen_lowpart (V8QImode, in)); + emit_insn (gen_popcountv8qi2 (v1, v)); + emit_insn (gen_reduc_plus_scal_v8qi (r, v1)); + emit_insn (gen_zero_extendqi2 (out, r)); + DONE; +}) + (define_insn "clrsb2" [(set (match_operand:GPI 0 "register_operand" "=r") (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))] @@ -3757,16 +3794,23 @@ [(set_attr "type" "rbit")] ) -(define_expand "ctz2" - [(match_operand:GPI 0 "register_operand") - (match_operand:GPI 1 "register_operand")] +;; Split after reload into RBIT + CLZ. Since RBIT is represented as an UNSPEC +;; it is unlikely to fold with any other operation, so keep this as a CTZ +;; expression and split after reload to enable scheduling them apart if +;; needed. + +(define_insn_and_split "ctz2" + [(set (match_operand:GPI 0 "register_operand" "=r") + (ctz:GPI (match_operand:GPI 1 "register_operand" "r")))] "" - { - emit_insn (gen_rbit2 (operands[0], operands[1])); - emit_insn (gen_clz2 (operands[0], operands[0])); - DONE; - } -) + "#" + "reload_completed" + [(const_int 0)] + " + emit_insn (gen_rbit2 (operands[0], operands[1])); + emit_insn (gen_clz2 (operands[0], operands[0])); + DONE; +") (define_insn "*and_compare0" [(set (reg:CC_NZ CC_REGNUM) @@ -3778,6 +3822,18 @@ [(set_attr "type" "alus_imm")] ) +(define_insn "*ands_compare0" + [(set (reg:CC_NZ CC_REGNUM) + (compare:CC_NZ + (zero_extend:GPI (match_operand:SHORT 1 "register_operand" "r")) + (const_int 0))) + (set (match_operand:GPI 0 "register_operand" "=r") + (zero_extend:GPI (match_dup 1)))] + "" + "ands\\t%0, %1, " + [(set_attr "type" "alus_imm")] +) + (define_insn "*and3nr_compare0" [(set (reg:CC_NZ CC_REGNUM) (compare:CC_NZ @@ -3785,7 +3841,9 @@ (match_operand:GPI 1 "aarch64_logical_operand" "r,")) (const_int 0)))] "" - "tst\\t%0, %1" + "@ + tst\\t%0, %1 + tst\\t%0, %1" [(set_attr "type" "logics_reg,logics_imm")] ) @@ -3851,22 +3909,16 @@ (define_expand "ashl3" [(set (match_operand:SHORT 0 "register_operand") (ashift:SHORT (match_operand:SHORT 1 "register_operand") - (match_operand:QI 2 "nonmemory_operand")))] + (match_operand:QI 2 "const_int_operand")))] "" { - if (CONST_INT_P (operands[2])) - { - operands[2] = GEN_INT (INTVAL (operands[2]) - & (GET_MODE_BITSIZE (mode) - 1)); + operands[2] = GEN_INT (INTVAL (operands[2]) & GET_MODE_MASK (mode)); - if (operands[2] == const0_rtx) - { - emit_insn (gen_mov (operands[0], operands[1])); - DONE; - } + if (operands[2] == const0_rtx) + { + emit_insn (gen_mov (operands[0], operands[1])); + DONE; } - else - FAIL; } ) @@ -3915,33 +3967,35 @@ ;; Logical left shift using SISD or Integer instruction (define_insn "*aarch64_ashl_sisd_or_int_3" - [(set (match_operand:GPI 0 "register_operand" "=r,w,w") - (ashift:GPI - (match_operand:GPI 1 "register_operand" "r,w,w") - (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "rUs,Us,w")))] + [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w") + (ashift:GPI + (match_operand:GPI 1 "register_operand" "r,r,w,w") 
+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "Us,r,Us,w")))] "" "@ + lsl\t%0, %1, %2 lsl\t%0, %1, %2 shl\t%0, %1, %2 ushl\t%0, %1, %2" - [(set_attr "simd" "no,yes,yes") - (set_attr "type" "shift_reg,neon_shift_imm, neon_shift_reg")] + [(set_attr "simd" "no,no,yes,yes") + (set_attr "type" "bfx,shift_reg,neon_shift_imm, neon_shift_reg")] ) ;; Logical right shift using SISD or Integer instruction (define_insn "*aarch64_lshr_sisd_or_int_3" - [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w") - (lshiftrt:GPI - (match_operand:GPI 1 "register_operand" "r,w,w,w") - (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "rUs,Us,w,0")))] + [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w") + (lshiftrt:GPI + (match_operand:GPI 1 "register_operand" "r,r,w,w,w") + (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "Us,r,Us,w,0")))] "" "@ + lsr\t%0, %1, %2 lsr\t%0, %1, %2 ushr\t%0, %1, %2 # #" - [(set_attr "simd" "no,yes,yes,yes") - (set_attr "type" "shift_reg,neon_shift_imm,neon_shift_reg,neon_shift_reg")] + [(set_attr "simd" "no,no,yes,yes,yes") + (set_attr "type" "bfx,shift_reg,neon_shift_imm,neon_shift_reg,neon_shift_reg")] ) (define_split @@ -3976,18 +4030,19 @@ ;; Arithmetic right shift using SISD or Integer instruction (define_insn "*aarch64_ashr_sisd_or_int_3" - [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w") + [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w") (ashiftrt:GPI - (match_operand:GPI 1 "register_operand" "r,w,w,w") - (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "rUs,Us,w,0")))] + (match_operand:GPI 1 "register_operand" "r,r,w,w,w") + (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us,r,Us,w,0")))] "" "@ + asr\t%0, %1, %2 asr\t%0, %1, %2 sshr\t%0, %1, %2 # #" - [(set_attr "simd" "no,yes,yes,yes") - (set_attr "type" "shift_reg,neon_shift_imm,neon_shift_reg,neon_shift_reg")] + [(set_attr "simd" "no,no,yes,yes,yes") + (set_attr "type" "bfx,shift_reg,neon_shift_imm,neon_shift_reg,neon_shift_reg")] ) (define_split @@ -4079,21 +4134,25 @@ [(set (match_operand:GPI 0 "register_operand" "=r,r") (rotatert:GPI (match_operand:GPI 1 "register_operand" "r,r") - (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "r,Us")))] + (match_operand:QI 2 "aarch64_reg_or_shift_imm_" "Us,r")))] "" - "ror\\t%0, %1, %2" - [(set_attr "type" "shift_reg, rotate_imm")] + "@ + ror\\t%0, %1, %2 + ror\\t%0, %1, %2" + [(set_attr "type" "rotate_imm,shift_reg")] ) ;; zero_extend version of above (define_insn "*si3_insn_uxtw" - [(set (match_operand:DI 0 "register_operand" "=r") + [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (SHIFT:SI - (match_operand:SI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "rUss"))))] + (match_operand:SI 1 "register_operand" "r,r") + (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))] "" - "\\t%w0, %w1, %w2" - [(set_attr "type" "shift_reg")] + "@ + \\t%w0, %w1, %2 + \\t%w0, %w1, %w2" + [(set_attr "type" "bfx,shift_reg")] ) (define_insn "*3_insn" @@ -4105,7 +4164,7 @@ operands[3] = GEN_INT ( - UINTVAL (operands[2])); return "\t%w0, %w1, %2, %3"; } - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] ) (define_insn "*extr5_insn" @@ -4117,7 +4176,7 @@ "UINTVAL (operands[3]) < GET_MODE_BITSIZE (mode) && (UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (mode))" "extr\\t%0, %1, %2, %4" - [(set_attr "type" "shift_imm")] + [(set_attr "type" "rotate_imm")] ) ;; There are no canonicalisation rules for ashift and lshiftrt inside an ior @@ -4132,7 +4191,7 @@ && (UINTVAL 
(operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (mode))" "extr\\t%0, %1, %2, %4" - [(set_attr "type" "shift_imm")] + [(set_attr "type" "rotate_imm")] ) ;; zero_extend version of the above @@ -4146,7 +4205,7 @@ "UINTVAL (operands[3]) < 32 && (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)" "extr\\t%w0, %w1, %w2, %4" - [(set_attr "type" "shift_imm")] + [(set_attr "type" "rotate_imm")] ) (define_insn "*extrsi5_insn_uxtw_alt" @@ -4159,7 +4218,7 @@ "UINTVAL (operands[3]) < 32 && (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)" "extr\\t%w0, %w1, %w2, %4" - [(set_attr "type" "shift_imm")] + [(set_attr "type" "rotate_imm")] ) (define_insn "*ror3_insn" @@ -4198,7 +4257,7 @@ operands[3] = GEN_INT ( - UINTVAL (operands[2])); return "bfiz\t%0, %1, %2, %3"; } - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] ) (define_insn "*zero_extend_lshr" @@ -4211,7 +4270,7 @@ operands[3] = GEN_INT ( - UINTVAL (operands[2])); return "ubfx\t%0, %1, %2, %3"; } - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] ) (define_insn "*extend_ashr" @@ -4224,7 +4283,7 @@ operands[3] = GEN_INT ( - UINTVAL (operands[2])); return "sbfx\\t%0, %1, %2, %3"; } - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] ) ;; ------------------------------------------------------------------- @@ -4256,7 +4315,27 @@ "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), 1, GET_MODE_BITSIZE (mode) - 1)" "bfx\\t%0, %1, %3, %2" - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] +) + +;; When the bit position and width add up to 32 we can use a W-reg LSR +;; instruction taking advantage of the implicit zero-extension of the X-reg. +(define_split + [(set (match_operand:DI 0 "register_operand") + (zero_extract:DI (match_operand:DI 1 "register_operand") + (match_operand 2 + "aarch64_simd_shift_imm_offset_di") + (match_operand 3 + "aarch64_simd_shift_imm_di")))] + "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), 1, + GET_MODE_BITSIZE (DImode) - 1) + && (INTVAL (operands[2]) + INTVAL (operands[3])) + == GET_MODE_BITSIZE (SImode)" + [(set (match_dup 0) + (zero_extend:DI (lshiftrt:SI (match_dup 4) (match_dup 3))))] + { + operands[4] = gen_lowpart (SImode, operands[1]); + } ) ;; Bitfield Insert (insv) @@ -4338,7 +4417,7 @@ : GEN_INT ( - UINTVAL (operands[2])); return "bfiz\t%0, %1, %2, %3"; } - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] ) ;; XXX We should match (any_extend (ashift)) here, like (and (ashift)) below @@ -4348,11 +4427,27 @@ (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand" "r") (match_operand 2 "const_int_operand" "n")) (match_operand 3 "const_int_operand" "n")))] - "(INTVAL (operands[2]) < ()) - && exact_log2 ((INTVAL (operands[3]) >> INTVAL (operands[2])) + 1) >= 0 - && (INTVAL (operands[3]) & ((1 << INTVAL (operands[2])) - 1)) == 0" + "aarch64_mask_and_shift_for_ubfiz_p (mode, operands[3], operands[2])" "ubfiz\\t%0, %1, %2, %P3" - [(set_attr "type" "bfm")] + [(set_attr "type" "bfx")] +) + +;; When the bit position and width of the equivalent extraction add up to 32 +;; we can use a W-reg LSL instruction taking advantage of the implicit +;; zero-extension of the X-reg. 
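+;; For instance (illustrative, not from the patch), "unsigned long f
+;; (unsigned long x) { return (x << 12) & 0xfffff000; }" meets this condition
+;; (12 + 20 == 32) and compiles to a single "lsl w0, w0, 12"; the implicit
+;; zero-extension of the upper 32 bits makes a separate mask redundant.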
+(define_split + [(set (match_operand:DI 0 "register_operand") + (and:DI (ashift:DI (match_operand:DI 1 "register_operand") + (match_operand 2 "const_int_operand")) + (match_operand 3 "const_int_operand")))] + "aarch64_mask_and_shift_for_ubfiz_p (DImode, operands[3], operands[2]) + && (INTVAL (operands[2]) + popcount_hwi (INTVAL (operands[3]))) + == GET_MODE_BITSIZE (SImode)" + [(set (match_dup 0) + (zero_extend:DI (ashift:SI (match_dup 4) (match_dup 2))))] + { + operands[4] = gen_lowpart (SImode, operands[1]); + } ) (define_insn "bswap2" @@ -4420,22 +4515,23 @@ ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn. (define_insn "2" - [(set (match_operand:GPF 0 "register_operand" "=w") - (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")] FRINT))] "TARGET_FLOAT" "frint\\t%0, %1" - [(set_attr "type" "f_rint")] + [(set_attr "type" "f_rint")] ) ;; frcvt floating-point round to integer and convert standard patterns. ;; Expands to lbtrunc, lceil, lfloor, lround. -(define_insn "l2" +(define_insn "l2" [(set (match_operand:GPI 0 "register_operand" "=r") - (FIXUORS:GPI (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")] - FCVT)))] + (FIXUORS:GPI + (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")] + FCVT)))] "TARGET_FLOAT" - "fcvt\\t%0, %1" + "fcvt\\t%0, %1" [(set_attr "type" "f_cvtf2i")] ) @@ -4461,23 +4557,24 @@ ;; fma - no throw (define_insn "fma4" - [(set (match_operand:GPF 0 "register_operand" "=w") - (fma:GPF (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w") - (match_operand:GPF 3 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (fma:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w") + (match_operand:GPF_F16 3 "register_operand" "w")))] "TARGET_FLOAT" "fmadd\\t%0, %1, %2, %3" - [(set_attr "type" "fmac")] + [(set_attr "type" "fmac")] ) (define_insn "fnma4" - [(set (match_operand:GPF 0 "register_operand" "=w") - (fma:GPF (neg:GPF (match_operand:GPF 1 "register_operand" "w")) - (match_operand:GPF 2 "register_operand" "w") - (match_operand:GPF 3 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (fma:GPF_F16 + (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")) + (match_operand:GPF_F16 2 "register_operand" "w") + (match_operand:GPF_F16 3 "register_operand" "w")))] "TARGET_FLOAT" "fmsub\\t%0, %1, %2, %3" - [(set_attr "type" "fmac")] + [(set_attr "type" "fmac")] ) (define_insn "fms4" @@ -4563,19 +4660,11 @@ [(set_attr "type" "f_cvt")] ) -(define_insn "fix_trunc2" - [(set (match_operand:GPI 0 "register_operand" "=r") - (fix:GPI (match_operand:GPF 1 "register_operand" "w")))] - "TARGET_FLOAT" - "fcvtzs\\t%0, %1" - [(set_attr "type" "f_cvtf2i")] -) - -(define_insn "fixuns_trunc2" +(define_insn "_trunc2" [(set (match_operand:GPI 0 "register_operand" "=r") - (unsigned_fix:GPI (match_operand:GPF 1 "register_operand" "w")))] + (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))] "TARGET_FLOAT" - "fcvtzu\\t%0, %1" + "fcvtz\t%0, %1" [(set_attr "type" "f_cvtf2i")] ) @@ -4599,38 +4688,116 @@ [(set_attr "type" "f_cvti2f")] ) +(define_insn "hf2" + [(set (match_operand:HF 0 "register_operand" "=w") + (FLOATUORS:HF (match_operand:GPI 1 "register_operand" "r")))] + "TARGET_FP_F16INST" + "cvtf\t%h0, %1" + [(set_attr "type" "f_cvti2f")] +) + +;; Convert 
between fixed-point and floating-point (scalar modes) + +(define_insn "3" + [(set (match_operand: 0 "register_operand" "=r, w") + (unspec: [(match_operand:GPF 1 "register_operand" "w, w") + (match_operand:SI 2 "immediate_operand" "i, i")] + FCVT_F2FIXED))] + "" + "@ + \t%0, %1, #%2 + \t%0, %1, #%2" + [(set_attr "type" "f_cvtf2i, neon_fp_to_int_") + (set_attr "fp" "yes, *") + (set_attr "simd" "*, yes")] +) + +(define_insn "3" + [(set (match_operand: 0 "register_operand" "=w, w") + (unspec: [(match_operand:GPI 1 "register_operand" "r, w") + (match_operand:SI 2 "immediate_operand" "i, i")] + FCVT_FIXED2F))] + "" + "@ + \t%0, %1, #%2 + \t%0, %1, #%2" + [(set_attr "type" "f_cvti2f, neon_int_to_fp_") + (set_attr "fp" "yes, *") + (set_attr "simd" "*, yes")] +) + +(define_insn "hf3" + [(set (match_operand:GPI 0 "register_operand" "=r") + (unspec:GPI [(match_operand:HF 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_F2FIXED))] + "TARGET_FP_F16INST" + "\t%0, %h1, #%2" + [(set_attr "type" "f_cvtf2i")] +) + +(define_insn "hf3" + [(set (match_operand:HF 0 "register_operand" "=w") + (unspec:HF [(match_operand:GPI 1 "register_operand" "r") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_FIXED2F))] + "TARGET_FP_F16INST" + "\t%h0, %1, #%2" + [(set_attr "type" "f_cvti2f")] +) + +(define_insn "hf3" + [(set (match_operand:HI 0 "register_operand" "=w") + (unspec:HI [(match_operand:HF 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_F2FIXED))] + "TARGET_SIMD" + "\t%h0, %h1, #%2" + [(set_attr "type" "neon_fp_to_int_s")] +) + +(define_insn "hi3" + [(set (match_operand:HF 0 "register_operand" "=w") + (unspec:HF [(match_operand:HI 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + FCVT_FIXED2F))] + "TARGET_SIMD" + "\t%h0, %h1, #%2" + [(set_attr "type" "neon_int_to_fp_s")] +) + ;; ------------------------------------------------------------------- ;; Floating-point arithmetic ;; ------------------------------------------------------------------- (define_insn "add3" - [(set (match_operand:GPF 0 "register_operand" "=w") - (plus:GPF - (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (plus:GPF_F16 + (match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w")))] "TARGET_FLOAT" "fadd\\t%0, %1, %2" - [(set_attr "type" "fadd")] + [(set_attr "type" "fadd")] ) (define_insn "sub3" - [(set (match_operand:GPF 0 "register_operand" "=w") - (minus:GPF - (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (minus:GPF_F16 + (match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w")))] "TARGET_FLOAT" "fsub\\t%0, %1, %2" - [(set_attr "type" "fadd")] + [(set_attr "type" "fadd")] ) (define_insn "mul3" - [(set (match_operand:GPF 0 "register_operand" "=w") - (mult:GPF - (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (mult:GPF_F16 + (match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w")))] "TARGET_FLOAT" "fmul\\t%0, %1, %2" - [(set_attr "type" "fmul")] + [(set_attr "type" "fmul")] ) (define_insn "*fnmul3" @@ -4653,38 +4820,58 @@ [(set_attr "type" "fmul")] ) -(define_insn "div3" - [(set (match_operand:GPF 0 
"register_operand" "=w") - (div:GPF - (match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w")))] +(define_expand "div3" + [(set (match_operand:GPF_F16 0 "register_operand") + (div:GPF_F16 (match_operand:GPF_F16 1 "general_operand") + (match_operand:GPF_F16 2 "register_operand")))] + "TARGET_SIMD" +{ + if (aarch64_emit_approx_div (operands[0], operands[1], operands[2])) + DONE; + + operands[1] = force_reg (mode, operands[1]); +}) + +(define_insn "*div3" + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (div:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w")))] "TARGET_FLOAT" "fdiv\\t%0, %1, %2" - [(set_attr "type" "fdiv")] + [(set_attr "type" "fdiv")] ) (define_insn "neg2" - [(set (match_operand:GPF 0 "register_operand" "=w") - (neg:GPF (match_operand:GPF 1 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] "TARGET_FLOAT" "fneg\\t%0, %1" - [(set_attr "type" "ffarith")] + [(set_attr "type" "ffarith")] ) -(define_insn "sqrt2" - [(set (match_operand:GPF 0 "register_operand" "=w") - (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))] +(define_expand "sqrt2" + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] + "TARGET_FLOAT" +{ + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) + DONE; +}) + +(define_insn "*sqrt2" + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] "TARGET_FLOAT" "fsqrt\\t%0, %1" - [(set_attr "type" "fsqrt")] + [(set_attr "type" "fsqrt")] ) (define_insn "abs2" - [(set (match_operand:GPF 0 "register_operand" "=w") - (abs:GPF (match_operand:GPF 1 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (abs:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] "TARGET_FLOAT" "fabs\\t%0, %1" - [(set_attr "type" "ffarith")] + [(set_attr "type" "ffarith")] ) ;; Given that smax/smin do not specify the result when either input is NaN, @@ -4709,15 +4896,17 @@ [(set_attr "type" "f_minmax")] ) -;; Scalar forms for the IEEE-754 fmax()/fmin() functions -(define_insn "3" - [(set (match_operand:GPF 0 "register_operand" "=w") - (unspec:GPF [(match_operand:GPF 1 "register_operand" "w") - (match_operand:GPF 2 "register_operand" "w")] - FMAXMIN))] +;; Scalar forms for fmax, fmin, fmaxnm, fminnm. +;; fmaxnm and fminnm are used for the fmax3 standard pattern names, +;; which implement the IEEE fmax ()/fmin () functions. 
+(define_insn "3" + [(set (match_operand:GPF_F16 0 "register_operand" "=w") + (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w") + (match_operand:GPF_F16 2 "register_operand" "w")] + FMAXMIN_UNS))] "TARGET_FLOAT" - "\\t%0, %1, %2" - [(set_attr "type" "f_minmax")] + "\\t%0, %1, %2" + [(set_attr "type" "f_minmax")] ) ;; For copysign (x, y), we want to generate: @@ -4775,7 +4964,7 @@ [(set (match_operand:GPF_TF 0 "register_operand" "=w") (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S"))) (clobber (match_operand:P 2 "register_operand" "=&r"))] - "TARGET_FLOAT && aarch64_nopcrelative_literal_loads" + "TARGET_FLOAT" { aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0)); emit_move_insn (operands[0], gen_rtx_MEM (mode, operands[2])); @@ -4788,7 +4977,7 @@ [(set (match_operand:VALL 0 "register_operand" "=w") (mem:VALL (match_operand 1 "aarch64_constant_pool_symref" "S"))) (clobber (match_operand:P 2 "register_operand" "=&r"))] - "TARGET_FLOAT && aarch64_nopcrelative_literal_loads" + "TARGET_FLOAT" { aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0)); emit_move_insn (operands[0], gen_rtx_MEM (mode, operands[2])); @@ -4961,20 +5150,20 @@ ;; The TLS ABI specifically requires that the compiler does not schedule ;; instructions in the TLS stubs, in order to enable linker relaxation. ;; Therefore we treat the stubs as an atomic sequence. -(define_expand "tlsgd_small" +(define_expand "tlsgd_small_" [(parallel [(set (match_operand 0 "register_operand" "") (call (mem:DI (match_dup 2)) (const_int 1))) - (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS) + (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS) (clobber (reg:DI LR_REGNUM))])] "" { operands[2] = aarch64_tls_get_addr (); }) -(define_insn "*tlsgd_small" +(define_insn "*tlsgd_small_" [(set (match_operand 0 "register_operand" "") (call (mem:DI (match_operand:DI 2 "" "")) (const_int 1))) - (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS) + (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS) (clobber (reg:DI LR_REGNUM)) ] "" @@ -5182,7 +5371,7 @@ UNSPEC_SP_TEST)) (clobber (match_scratch:PTR 3 "=&r"))] "" - "ldr\t%3, %x1\;ldr\t%0, %x2\;eor\t%0, %3, %0" + "ldr\t%3, %1\;ldr\t%0, %2\;eor\t%0, %3, %0" [(set_attr "length" "12") (set_attr "type" "multiple")]) --- a/src/gcc/config/aarch64/aarch64.opt +++ b/src/gcc/config/aarch64/aarch64.opt @@ -146,10 +146,28 @@ EnumValue Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) mpc-relative-literal-loads -Target Report Save Var(nopcrelative_literal_loads) Init(2) Save +Target Report Save Var(pcrelative_literal_loads) Init(2) Save PC relative literal loads. mlow-precision-recip-sqrt Common Var(flag_mrecip_low_precision_sqrt) Optimization -When calculating the reciprocal square root approximation, -uses one less step than otherwise, thus reducing latency and precision. +Enable the reciprocal square root approximation. Enabling this reduces +precision of reciprocal square root results to about 16 bits for +single precision and to 32 bits for double precision. + +mlow-precision-sqrt +Common Var(flag_mlow_precision_sqrt) Optimization +Enable the square root approximation. Enabling this reduces +precision of square root results to about 16 bits for +single precision and to 32 bits for double precision. +If enabled, it implies -mlow-precision-recip-sqrt. 
+ +mlow-precision-div +Common Var(flag_mlow_precision_div) Optimization +Enable the division approximation. Enabling this reduces +precision of division results to about 16 bits for +single precision and to 32 bits for double precision. + +mverbose-cost-dump +Common Undocumented Var(flag_aarch64_verbose_cost) +Enables verbose cost model dumping in the debug dump files. --- /dev/null +++ b/src/gcc/config/aarch64/arm_fp16.h @@ -0,0 +1,579 @@ +/* ARM FP16 scalar intrinsics include file. + + Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _AARCH64_FP16_H_ +#define _AARCH64_FP16_H_ + +#include <stdint.h> + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+fp16") + +typedef __fp16 float16_t; + +/* ARMv8.2-A FP16 one operand scalar intrinsics. */ + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vabsh_f16 (float16_t __a) +{ + return __builtin_aarch64_abshf (__a); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vceqzh_f16 (float16_t __a) +{ + return __builtin_aarch64_cmeqhf_uss (__a, 0.0f); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcgezh_f16 (float16_t __a) +{ + return __builtin_aarch64_cmgehf_uss (__a, 0.0f); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcgtzh_f16 (float16_t __a) +{ + return __builtin_aarch64_cmgthf_uss (__a, 0.0f); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vclezh_f16 (float16_t __a) +{ + return __builtin_aarch64_cmlehf_uss (__a, 0.0f); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcltzh_f16 (float16_t __a) +{ + return __builtin_aarch64_cmlthf_uss (__a, 0.0f); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_s16 (int16_t __a) +{ + return __builtin_aarch64_floathihf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_s32 (int32_t __a) +{ + return __builtin_aarch64_floatsihf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_s64 (int64_t __a) +{ + return __builtin_aarch64_floatdihf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_u16 (uint16_t __a) +{ + return __builtin_aarch64_floatunshihf_us (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_u32 (uint32_t __a) +{ + return __builtin_aarch64_floatunssihf_us (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vcvth_f16_u64 (uint64_t __a) +{ + return __builtin_aarch64_floatunsdihf_us (__a); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvth_s16_f16 (float16_t __a) +{ + return __builtin_aarch64_fix_trunchfhi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvth_s32_f16 (float16_t __a) +{ + return __builtin_aarch64_fix_trunchfsi (__a); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvth_s64_f16 (float16_t __a) +{ + return __builtin_aarch64_fix_trunchfdi (__a); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcvth_u16_f16 (float16_t __a) +{ + return __builtin_aarch64_fixuns_trunchfhi_us (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvth_u32_f16 (float16_t __a) +{ + return __builtin_aarch64_fixuns_trunchfsi_us (__a); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvth_u64_f16 (float16_t __a) +{ + return __builtin_aarch64_fixuns_trunchfdi_us (__a); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvtah_s16_f16 (float16_t __a) +{ + return __builtin_aarch64_lroundhfhi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtah_s32_f16 (float16_t __a) +{ + return __builtin_aarch64_lroundhfsi (__a); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtah_s64_f16 (float16_t __a) +{ + return __builtin_aarch64_lroundhfdi (__a); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcvtah_u16_f16 (float16_t __a) +{ + return __builtin_aarch64_lrounduhfhi_us (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtah_u32_f16 (float16_t __a) +{ + return __builtin_aarch64_lrounduhfsi_us (__a); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtah_u64_f16 (float16_t __a) +{ + return __builtin_aarch64_lrounduhfdi_us (__a); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvtmh_s16_f16 (float16_t __a) +{ + return __builtin_aarch64_lfloorhfhi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtmh_s32_f16 (float16_t __a) +{ + return __builtin_aarch64_lfloorhfsi (__a); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtmh_s64_f16 (float16_t __a) +{ + return __builtin_aarch64_lfloorhfdi (__a); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcvtmh_u16_f16 (float16_t __a) +{ + return __builtin_aarch64_lflooruhfhi_us (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtmh_u32_f16 (float16_t __a) +{ + return __builtin_aarch64_lflooruhfsi_us (__a); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtmh_u64_f16 (float16_t __a) +{ + return __builtin_aarch64_lflooruhfdi_us (__a); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvtnh_s16_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnhfhi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtnh_s32_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnhfsi (__a); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtnh_s64_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnhfdi (__a); +} + +__extension__ static __inline uint16_t 
__attribute__ ((__always_inline__)) +vcvtnh_u16_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnuhfhi_us (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtnh_u32_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnuhfsi_us (__a); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtnh_u64_f16 (float16_t __a) +{ + return __builtin_aarch64_lfrintnuhfdi_us (__a); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvtph_s16_f16 (float16_t __a) +{ + return __builtin_aarch64_lceilhfhi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtph_s32_f16 (float16_t __a) +{ + return __builtin_aarch64_lceilhfsi (__a); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtph_s64_f16 (float16_t __a) +{ + return __builtin_aarch64_lceilhfdi (__a); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcvtph_u16_f16 (float16_t __a) +{ + return __builtin_aarch64_lceiluhfhi_us (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtph_u32_f16 (float16_t __a) +{ + return __builtin_aarch64_lceiluhfsi_us (__a); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtph_u64_f16 (float16_t __a) +{ + return __builtin_aarch64_lceiluhfdi_us (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vnegh_f16 (float16_t __a) +{ + return __builtin_aarch64_neghf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrecpeh_f16 (float16_t __a) +{ + return __builtin_aarch64_frecpehf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrecpxh_f16 (float16_t __a) +{ + return __builtin_aarch64_frecpxhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndh_f16 (float16_t __a) +{ + return __builtin_aarch64_btrunchf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndah_f16 (float16_t __a) +{ + return __builtin_aarch64_roundhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndih_f16 (float16_t __a) +{ + return __builtin_aarch64_nearbyinthf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndmh_f16 (float16_t __a) +{ + return __builtin_aarch64_floorhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndnh_f16 (float16_t __a) +{ + return __builtin_aarch64_frintnhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndph_f16 (float16_t __a) +{ + return __builtin_aarch64_ceilhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrndxh_f16 (float16_t __a) +{ + return __builtin_aarch64_rinthf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrsqrteh_f16 (float16_t __a) +{ + return __builtin_aarch64_rsqrtehf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vsqrth_f16 (float16_t __a) +{ + return __builtin_aarch64_sqrthf (__a); +} + +/* ARMv8.2-A FP16 two operands scalar intrinsics. 
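For example (illustrative, not part of the patch), with -march=armv8.2-a+fp16 a scalar half-precision computation such as "float16_t madd (float16_t a, float16_t b, float16_t c) { return vaddh_f16 (vmulh_f16 (a, b), c); }" compiles to single fmul/fadd instructions on H registers, with no widening to float.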
*/ + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vaddh_f16 (float16_t __a, float16_t __b) +{ + return __a + __b; +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vabdh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fabdhf (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcageh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_facgehf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcagth_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_facgthf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcaleh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_faclehf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcalth_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_faclthf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vceqh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_cmeqhf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcgeh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_cmgehf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcgth_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_cmgthf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcleh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_cmlehf_uss (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vclth_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_cmlthf_uss (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_s16 (int16_t __a, const int __b) +{ + return __builtin_aarch64_scvtfhi (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_s32 (int32_t __a, const int __b) +{ + return __builtin_aarch64_scvtfsihf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_s64 (int64_t __a, const int __b) +{ + return __builtin_aarch64_scvtfdihf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_u16 (uint16_t __a, const int __b) +{ + return __builtin_aarch64_ucvtfhi_sus (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_u32 (uint32_t __a, const int __b) +{ + return __builtin_aarch64_ucvtfsihf_sus (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_u64 (uint64_t __a, const int __b) +{ + return __builtin_aarch64_ucvtfdihf_sus (__a, __b); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vcvth_n_s16_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzshf (__a, __b); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvth_n_s32_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzshfsi (__a, __b); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvth_n_s64_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzshfdi (__a, __b); +} + 
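+/* Illustrative note, not part of the patch: the vcvth_n_* intrinsics treat
+   the integer side as fixed point with __b fractional bits; for example,
+   vcvth_n_s16_f16 ((float16_t) 1.5, 8) converts from Q8 and yields 384.  */
+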
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vcvth_n_u16_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzuhf_uss (__a, __b); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvth_n_u32_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzuhfsi_uss (__a, __b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvth_n_u64_f16 (float16_t __a, const int __b) +{ + return __builtin_aarch64_fcvtzuhfdi_uss (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vdivh_f16 (float16_t __a, float16_t __b) +{ + return __a / __b; +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vmaxh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fmaxhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vmaxnmh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fmaxhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vminh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fminhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vminnmh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fminhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vmulh_f16 (float16_t __a, float16_t __b) +{ + return __a * __b; +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vmulxh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_fmulxhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrecpsh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_frecpshf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vrsqrtsh_f16 (float16_t __a, float16_t __b) +{ + return __builtin_aarch64_rsqrtshf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vsubh_f16 (float16_t __a, float16_t __b) +{ + return __a - __b; +} + +/* ARMv8.2-A FP16 three operands scalar intrinsics. 
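Note that the accumulator comes first: vfmah_f16 (__a, __b, __c) computes __a + __b * __c, forwarding __a as the addend to __builtin_aarch64_fmahf.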
*/ + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vfmah_f16 (float16_t __a, float16_t __b, float16_t __c) +{ + return __builtin_aarch64_fmahf (__b, __c, __a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c) +{ + return __builtin_aarch64_fnmahf (__b, __c, __a); +} + +#pragma GCC pop_options + +#endif --- a/src/gcc/config/aarch64/arm_neon.h +++ b/src/gcc/config/aarch64/arm_neon.h @@ -58,6 +58,7 @@ typedef __Float64x2_t float64x2_t; typedef __Poly8x16_t poly8x16_t; typedef __Poly16x8_t poly16x8_t; typedef __Poly64x2_t poly64x2_t; +typedef __Poly64x1_t poly64x1_t; typedef __Uint8x16_t uint8x16_t; typedef __Uint16x8_t uint16x8_t; typedef __Uint32x4_t uint32x4_t; @@ -202,6 +203,36 @@ typedef struct poly16x8x2_t poly16x8_t val[2]; } poly16x8x2_t; +typedef struct poly64x1x2_t +{ + poly64x1_t val[2]; +} poly64x1x2_t; + +typedef struct poly64x1x3_t +{ + poly64x1_t val[3]; +} poly64x1x3_t; + +typedef struct poly64x1x4_t +{ + poly64x1_t val[4]; +} poly64x1x4_t; + +typedef struct poly64x2x2_t +{ + poly64x2_t val[2]; +} poly64x2x2_t; + +typedef struct poly64x2x3_t +{ + poly64x2_t val[3]; +} poly64x2x3_t; + +typedef struct poly64x2x4_t +{ + poly64x2_t val[4]; +} poly64x2x4_t; + typedef struct int8x8x3_t { int8x8_t val[3]; @@ -466,6 +497,8 @@ typedef struct poly16x8x4_t #define __aarch64_vdup_lane_any(__size, __q, __a, __b) \ vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b)) +#define __aarch64_vdup_lane_f16(__a, __b) \ + __aarch64_vdup_lane_any (f16, , __a, __b) #define __aarch64_vdup_lane_f32(__a, __b) \ __aarch64_vdup_lane_any (f32, , __a, __b) #define __aarch64_vdup_lane_f64(__a, __b) \ @@ -474,6 +507,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (p8, , __a, __b) #define __aarch64_vdup_lane_p16(__a, __b) \ __aarch64_vdup_lane_any (p16, , __a, __b) +#define __aarch64_vdup_lane_p64(__a, __b) \ + __aarch64_vdup_lane_any (p64, , __a, __b) #define __aarch64_vdup_lane_s8(__a, __b) \ __aarch64_vdup_lane_any (s8, , __a, __b) #define __aarch64_vdup_lane_s16(__a, __b) \ @@ -492,6 +527,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (u64, , __a, __b) /* __aarch64_vdup_laneq internal macros. */ +#define __aarch64_vdup_laneq_f16(__a, __b) \ + __aarch64_vdup_lane_any (f16, , __a, __b) #define __aarch64_vdup_laneq_f32(__a, __b) \ __aarch64_vdup_lane_any (f32, , __a, __b) #define __aarch64_vdup_laneq_f64(__a, __b) \ @@ -500,6 +537,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (p8, , __a, __b) #define __aarch64_vdup_laneq_p16(__a, __b) \ __aarch64_vdup_lane_any (p16, , __a, __b) +#define __aarch64_vdup_laneq_p64(__a, __b) \ + __aarch64_vdup_lane_any (p64, , __a, __b) #define __aarch64_vdup_laneq_s8(__a, __b) \ __aarch64_vdup_lane_any (s8, , __a, __b) #define __aarch64_vdup_laneq_s16(__a, __b) \ @@ -518,6 +557,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (u64, , __a, __b) /* __aarch64_vdupq_lane internal macros. 
*/ +#define __aarch64_vdupq_lane_f16(__a, __b) \ + __aarch64_vdup_lane_any (f16, q, __a, __b) #define __aarch64_vdupq_lane_f32(__a, __b) \ __aarch64_vdup_lane_any (f32, q, __a, __b) #define __aarch64_vdupq_lane_f64(__a, __b) \ @@ -526,6 +567,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (p8, q, __a, __b) #define __aarch64_vdupq_lane_p16(__a, __b) \ __aarch64_vdup_lane_any (p16, q, __a, __b) +#define __aarch64_vdupq_lane_p64(__a, __b) \ + __aarch64_vdup_lane_any (p64, q, __a, __b) #define __aarch64_vdupq_lane_s8(__a, __b) \ __aarch64_vdup_lane_any (s8, q, __a, __b) #define __aarch64_vdupq_lane_s16(__a, __b) \ @@ -544,6 +587,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (u64, q, __a, __b) /* __aarch64_vdupq_laneq internal macros. */ +#define __aarch64_vdupq_laneq_f16(__a, __b) \ + __aarch64_vdup_lane_any (f16, q, __a, __b) #define __aarch64_vdupq_laneq_f32(__a, __b) \ __aarch64_vdup_lane_any (f32, q, __a, __b) #define __aarch64_vdupq_laneq_f64(__a, __b) \ @@ -552,6 +597,8 @@ typedef struct poly16x8x4_t __aarch64_vdup_lane_any (p8, q, __a, __b) #define __aarch64_vdupq_laneq_p16(__a, __b) \ __aarch64_vdup_lane_any (p16, q, __a, __b) +#define __aarch64_vdupq_laneq_p64(__a, __b) \ + __aarch64_vdup_lane_any (p64, q, __a, __b) #define __aarch64_vdupq_laneq_s8(__a, __b) \ __aarch64_vdup_lane_any (s8, q, __a, __b) #define __aarch64_vdupq_laneq_s16(__a, __b) \ @@ -601,535 +648,619 @@ typedef struct poly16x8x4_t }) /* vadd */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_s8 (int8x8_t __a, int8x8_t __b) { return __a + __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_s16 (int16x4_t __a, int16x4_t __b) { return __a + __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_s32 (int32x2_t __a, int32x2_t __b) { return __a + __b; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_f32 (float32x2_t __a, float32x2_t __b) { return __a + __b; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_f64 (float64x1_t __a, float64x1_t __b) { return __a + __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u8 (uint8x8_t __a, uint8x8_t __b) { return __a + __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u16 (uint16x4_t __a, uint16x4_t __b) { return __a + __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u32 (uint32x2_t __a, uint32x2_t __b) { return __a + __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_s64 (int64x1_t __a, int64x1_t __b) { return __a + __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u64 (uint64x1_t __a, uint64x1_t __b) { return __a + __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s8 (int8x16_t __a, int8x16_t __b) { return __a + __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s16 (int16x8_t __a, int16x8_t __b) { return __a + __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s32 (int32x4_t __a, int32x4_t __b) { return __a + __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s64 (int64x2_t __a, int64x2_t __b) { return __a + __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_f32 (float32x4_t __a, float32x4_t __b) { return __a + __b; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_f64 (float64x2_t __a, float64x2_t __b) { return __a + __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a + __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a + __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a + __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a + __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s8 (int8x8_t __a, int8x8_t __b) { return (int16x8_t) __builtin_aarch64_saddlv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t) __builtin_aarch64_saddlv4hi (__a, __b); } -__extension__ 
static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t) __builtin_aarch64_saddlv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t) __builtin_aarch64_uaddlv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t) __builtin_aarch64_uaddlv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t) __builtin_aarch64_uaddlv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_s8 (int8x16_t __a, int8x16_t __b) { return (int16x8_t) __builtin_aarch64_saddl2v16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_s16 (int16x8_t __a, int16x8_t __b) { return (int32x4_t) __builtin_aarch64_saddl2v8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_s32 (int32x4_t __a, int32x4_t __b) { return (int64x2_t) __builtin_aarch64_saddl2v4si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint16x8_t) __builtin_aarch64_uaddl2v16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint32x4_t) __builtin_aarch64_uaddl2v8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_high_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint64x2_t) __builtin_aarch64_uaddl2v4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_s8 (int16x8_t __a, int8x8_t __b) { return (int16x8_t) __builtin_aarch64_saddwv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vaddw_s16 (int32x4_t __a, int16x4_t __b) { return (int32x4_t) __builtin_aarch64_saddwv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_s32 (int64x2_t __a, int32x2_t __b) { return (int64x2_t) __builtin_aarch64_saddwv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u8 (uint16x8_t __a, uint8x8_t __b) { return (uint16x8_t) __builtin_aarch64_uaddwv8qi ((int16x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u16 (uint32x4_t __a, uint16x4_t __b) { return (uint32x4_t) __builtin_aarch64_uaddwv4hi ((int32x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u32 (uint64x2_t __a, uint32x2_t __b) { return (uint64x2_t) __builtin_aarch64_uaddwv2si ((int64x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_s8 (int16x8_t __a, int8x16_t __b) { return (int16x8_t) __builtin_aarch64_saddw2v16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_s16 (int32x4_t __a, int16x8_t __b) { return (int32x4_t) __builtin_aarch64_saddw2v8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_s32 (int64x2_t __a, int32x4_t __b) { return (int64x2_t) __builtin_aarch64_saddw2v4si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_u8 (uint16x8_t __a, uint8x16_t __b) { return (uint16x8_t) __builtin_aarch64_uaddw2v16qi ((int16x8_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_u16 (uint32x4_t __a, uint16x8_t __b) { return (uint32x4_t) __builtin_aarch64_uaddw2v8hi ((int32x4_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_high_u32 (uint64x2_t __a, uint32x4_t __b) { return (uint64x2_t) __builtin_aarch64_uaddw2v4si ((int64x2_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t) __builtin_aarch64_shaddv8qi (__a, __b); } -__extension__ 
static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_shaddv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_shaddv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t) __builtin_aarch64_uhaddv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t) __builtin_aarch64_uhaddv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t) __builtin_aarch64_uhaddv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t) __builtin_aarch64_shaddv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_shaddv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_shaddv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t) __builtin_aarch64_uhaddv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t) __builtin_aarch64_uhaddv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t) __builtin_aarch64_uhaddv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s8 
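/* The vhadd forms above average with truncation; the vrhadd forms
   starting here round instead.  One signed 8-bit lane modeled in
   scalar C (hadd_lane/rhadd_lane are hypothetical helpers; the
   widened sum cannot overflow):

     #include <stdint.h>

     static int8_t hadd_lane (int8_t a, int8_t b)
     {
       return (int8_t) (((int16_t) a + b) >> 1);      // truncating average
     }

     static int8_t rhadd_lane (int8_t a, int8_t b)
     {
       return (int8_t) (((int16_t) a + b + 1) >> 1);  // rounding average
     }
*/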
(int8x8_t __a, int8x8_t __b) { return (int8x8_t) __builtin_aarch64_srhaddv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_srhaddv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_srhaddv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t) __builtin_aarch64_urhaddv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t) __builtin_aarch64_urhaddv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t) __builtin_aarch64_urhaddv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t) __builtin_aarch64_srhaddv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_srhaddv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_srhaddv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t) __builtin_aarch64_urhaddv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t) __builtin_aarch64_urhaddv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t) __builtin_aarch64_urhaddv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t) __builtin_aarch64_addhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t) __builtin_aarch64_addhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t) __builtin_aarch64_addhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t) __builtin_aarch64_addhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t) __builtin_aarch64_addhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t) __builtin_aarch64_addhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t) __builtin_aarch64_raddhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t) __builtin_aarch64_raddhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t) __builtin_aarch64_raddhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t) __builtin_aarch64_raddhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t) __builtin_aarch64_raddhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u64 (uint64x2_t __a, uint64x2_t __b) { return 
(uint32x2_t) __builtin_aarch64_raddhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) { return (int8x16_t) __builtin_aarch64_addhn2v8hi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) { return (int16x8_t) __builtin_aarch64_addhn2v4si (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) { return (int32x4_t) __builtin_aarch64_addhn2v2di (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { return (uint8x16_t) __builtin_aarch64_addhn2v8hi ((int8x8_t) __a, @@ -1137,7 +1268,8 @@ vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) (int16x8_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint16x8_t) __builtin_aarch64_addhn2v4si ((int16x4_t) __a, @@ -1145,7 +1277,8 @@ vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) (int32x4_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) { return (uint32x4_t) __builtin_aarch64_addhn2v2di ((int32x2_t) __a, @@ -1153,25 +1286,29 @@ vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) (int64x2_t) __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) { return (int8x16_t) __builtin_aarch64_raddhn2v8hi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) { return (int16x8_t) __builtin_aarch64_raddhn2v4si (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) { return (int32x4_t) __builtin_aarch64_raddhn2v2di (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { return 
(uint8x16_t) __builtin_aarch64_raddhn2v8hi ((int8x8_t) __a, @@ -1179,7 +1316,8 @@ vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) (int16x8_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint16x8_t) __builtin_aarch64_raddhn2v4si ((int16x4_t) __a, @@ -1187,7 +1325,8 @@ vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) (int32x4_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) { return (uint32x4_t) __builtin_aarch64_raddhn2v2di ((int32x2_t) __a, @@ -1195,1101 +1334,1280 @@ vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) (int64x2_t) __c); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdiv_f32 (float32x2_t __a, float32x2_t __b) { return __a / __b; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdiv_f64 (float64x1_t __a, float64x1_t __b) { return __a / __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdivq_f32 (float32x4_t __a, float32x4_t __b) { return __a / __b; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdivq_f64 (float64x2_t __a, float64x2_t __b) { return __a / __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s8 (int8x8_t __a, int8x8_t __b) { return __a * __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s16 (int16x4_t __a, int16x4_t __b) { return __a * __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s32 (int32x2_t __a, int32x2_t __b) { return __a * __b; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_f32 (float32x2_t __a, float32x2_t __b) { return __a * __b; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_f64 (float64x1_t __a, float64x1_t __b) { return __a * __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u8 (uint8x8_t __a, uint8x8_t __b) { 
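/* The integer vmul lanes here multiply modulo 2^n; vmul_p8 just below
   is instead a polynomial (carry-less) multiply over GF(2): partial
   products are combined with XOR and only the low 8 bits of each
   product are kept.  One lane in scalar C (pmul_lane is a
   hypothetical model):

     #include <stdint.h>

     static uint8_t pmul_lane (uint8_t a, uint8_t b)
     {
       uint16_t acc = 0;
       for (int i = 0; i < 8; i++)
         if (b & (1u << i))
           acc ^= (uint16_t) a << i;   // XOR in place of add
       return (uint8_t) acc;           // low half, as vmul_p8 returns
     }
*/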
return __a * __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u16 (uint16x4_t __a, uint16x4_t __b) { return __a * __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u32 (uint32x2_t __a, uint32x2_t __b) { return __a * __b; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_p8 (poly8x8_t __a, poly8x8_t __b) { return (poly8x8_t) __builtin_aarch64_pmulv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s8 (int8x16_t __a, int8x16_t __b) { return __a * __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s16 (int16x8_t __a, int16x8_t __b) { return __a * __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s32 (int32x4_t __a, int32x4_t __b) { return __a * __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_f32 (float32x4_t __a, float32x4_t __b) { return __a * __b; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_f64 (float64x2_t __a, float64x2_t __b) { return __a * __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a * __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a * __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a * __b; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_p8 (poly8x16_t __a, poly8x16_t __b) { return (poly8x16_t) __builtin_aarch64_pmulv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s8 (int8x8_t __a, int8x8_t __b) { return __a & __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t 
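/* The bitwise block starting at vand_s8 lowers to plain vector
   operators: vand -> a & b, vorr -> a | b, veor -> a ^ b, and further
   down vbic -> a & ~b (bit clear) and vorn -> a | ~b (or-not).  A
   small self-check, assuming an AArch64 target (check_bitops is a
   hypothetical test):

     #include <arm_neon.h>
     #include <assert.h>

     static void check_bitops (void)
     {
       uint8x8_t a = vdup_n_u8 (0xF0);
       uint8x8_t b = vdup_n_u8 (0x3C);
       assert (vget_lane_u8 (vand_u8 (a, b), 0) == 0x30);
       assert (vget_lane_u8 (vbic_u8 (a, b), 0) == 0xC0);  // a & ~b
       assert (vget_lane_u8 (vorn_u8 (a, b), 0) == 0xF3);  // a | ~b
     }
*/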
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s16 (int16x4_t __a, int16x4_t __b) { return __a & __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s32 (int32x2_t __a, int32x2_t __b) { return __a & __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u8 (uint8x8_t __a, uint8x8_t __b) { return __a & __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u16 (uint16x4_t __a, uint16x4_t __b) { return __a & __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u32 (uint32x2_t __a, uint32x2_t __b) { return __a & __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s64 (int64x1_t __a, int64x1_t __b) { return __a & __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u64 (uint64x1_t __a, uint64x1_t __b) { return __a & __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s8 (int8x16_t __a, int8x16_t __b) { return __a & __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s16 (int16x8_t __a, int16x8_t __b) { return __a & __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s32 (int32x4_t __a, int32x4_t __b) { return __a & __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s64 (int64x2_t __a, int64x2_t __b) { return __a & __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a & __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a & __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a & __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vandq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a & __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s8 (int8x8_t __a, int8x8_t __b) { return __a | __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s16 (int16x4_t __a, int16x4_t __b) { return __a | __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s32 (int32x2_t __a, int32x2_t __b) { return __a | __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u8 (uint8x8_t __a, uint8x8_t __b) { return __a | __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u16 (uint16x4_t __a, uint16x4_t __b) { return __a | __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u32 (uint32x2_t __a, uint32x2_t __b) { return __a | __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s64 (int64x1_t __a, int64x1_t __b) { return __a | __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u64 (uint64x1_t __a, uint64x1_t __b) { return __a | __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s8 (int8x16_t __a, int8x16_t __b) { return __a | __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s16 (int16x8_t __a, int16x8_t __b) { return __a | __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s32 (int32x4_t __a, int32x4_t __b) { return __a | __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s64 (int64x2_t __a, int64x2_t __b) { return __a | __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a | __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u16 
(uint16x8_t __a, uint16x8_t __b) { return __a | __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a | __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a | __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s8 (int8x8_t __a, int8x8_t __b) { return __a ^ __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s16 (int16x4_t __a, int16x4_t __b) { return __a ^ __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s32 (int32x2_t __a, int32x2_t __b) { return __a ^ __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u8 (uint8x8_t __a, uint8x8_t __b) { return __a ^ __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u16 (uint16x4_t __a, uint16x4_t __b) { return __a ^ __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u32 (uint32x2_t __a, uint32x2_t __b) { return __a ^ __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s64 (int64x1_t __a, int64x1_t __b) { return __a ^ __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u64 (uint64x1_t __a, uint64x1_t __b) { return __a ^ __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s8 (int8x16_t __a, int8x16_t __b) { return __a ^ __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s16 (int16x8_t __a, int16x8_t __b) { return __a ^ __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s32 (int32x4_t __a, int32x4_t __b) { return __a ^ __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s64 (int64x2_t __a, int64x2_t __b) { return __a ^ 
__b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a ^ __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a ^ __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a ^ __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a ^ __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s8 (int8x8_t __a, int8x8_t __b) { return __a & ~__b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s16 (int16x4_t __a, int16x4_t __b) { return __a & ~__b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s32 (int32x2_t __a, int32x2_t __b) { return __a & ~__b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u8 (uint8x8_t __a, uint8x8_t __b) { return __a & ~__b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u16 (uint16x4_t __a, uint16x4_t __b) { return __a & ~__b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u32 (uint32x2_t __a, uint32x2_t __b) { return __a & ~__b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s64 (int64x1_t __a, int64x1_t __b) { return __a & ~__b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u64 (uint64x1_t __a, uint64x1_t __b) { return __a & ~__b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s8 (int8x16_t __a, int8x16_t __b) { return __a & ~__b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s16 (int16x8_t __a, int16x8_t __b) { return __a & ~__b; } -__extension__ static 
__inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s32 (int32x4_t __a, int32x4_t __b) { return __a & ~__b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s64 (int64x2_t __a, int64x2_t __b) { return __a & ~__b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a & ~__b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a & ~__b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a & ~__b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a & ~__b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s8 (int8x8_t __a, int8x8_t __b) { return __a | ~__b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s16 (int16x4_t __a, int16x4_t __b) { return __a | ~__b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s32 (int32x2_t __a, int32x2_t __b) { return __a | ~__b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u8 (uint8x8_t __a, uint8x8_t __b) { return __a | ~__b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u16 (uint16x4_t __a, uint16x4_t __b) { return __a | ~__b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u32 (uint32x2_t __a, uint32x2_t __b) { return __a | ~__b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s64 (int64x1_t __a, int64x1_t __b) { return __a | ~__b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u64 (uint64x1_t __a, uint64x1_t __b) { return __a | ~__b; } -__extension__ static __inline int8x16_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s8 (int8x16_t __a, int8x16_t __b) { return __a | ~__b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s16 (int16x8_t __a, int16x8_t __b) { return __a | ~__b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s32 (int32x4_t __a, int32x4_t __b) { return __a | ~__b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s64 (int64x2_t __a, int64x2_t __b) { return __a | ~__b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a | ~__b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a | ~__b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a | ~__b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a | ~__b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_s8 (int8x8_t __a, int8x8_t __b) { return __a - __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_s16 (int16x4_t __a, int16x4_t __b) { return __a - __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_s32 (int32x2_t __a, int32x2_t __b) { return __a - __b; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_f32 (float32x2_t __a, float32x2_t __b) { return __a - __b; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_f64 (float64x1_t __a, float64x1_t __b) { return __a - __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_u8 (uint8x8_t __a, uint8x8_t __b) { return __a - __b; } -__extension__ static __inline uint16x4_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_u16 (uint16x4_t __a, uint16x4_t __b) { return __a - __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_u32 (uint32x2_t __a, uint32x2_t __b) { return __a - __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_s64 (int64x1_t __a, int64x1_t __b) { return __a - __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsub_u64 (uint64x1_t __a, uint64x1_t __b) { return __a - __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_s8 (int8x16_t __a, int8x16_t __b) { return __a - __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_s16 (int16x8_t __a, int16x8_t __b) { return __a - __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_s32 (int32x4_t __a, int32x4_t __b) { return __a - __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_s64 (int64x2_t __a, int64x2_t __b) { return __a - __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_f32 (float32x4_t __a, float32x4_t __b) { return __a - __b; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_f64 (float64x2_t __a, float64x2_t __b) { return __a - __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a - __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a - __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a - __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a - __b; } -__extension__ static __inline int16x8_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_s8 (int8x8_t __a, int8x8_t __b) { return (int16x8_t) __builtin_aarch64_ssublv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t) __builtin_aarch64_ssublv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t) __builtin_aarch64_ssublv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t) __builtin_aarch64_usublv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t) __builtin_aarch64_usublv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t) __builtin_aarch64_usublv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_s8 (int8x16_t __a, int8x16_t __b) { return (int16x8_t) __builtin_aarch64_ssubl2v16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_s16 (int16x8_t __a, int16x8_t __b) { return (int32x4_t) __builtin_aarch64_ssubl2v8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_s32 (int32x4_t __a, int32x4_t __b) { return (int64x2_t) __builtin_aarch64_ssubl2v4si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint16x8_t) __builtin_aarch64_usubl2v16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint32x4_t) __builtin_aarch64_usubl2v8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubl_high_u32 (uint32x4_t __a, uint32x4_t __b) { 
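/* vsubl widens both narrow operands before subtracting, so no 8-bit
   wraparound can occur; the vsubw forms below take an already-wide
   first operand and widen only the second.  The _high variants read
   the upper half of their 128-bit inputs.  One lane of each in scalar
   C (subl_lane/subw_lane are hypothetical models):

     #include <stdint.h>

     static int16_t subl_lane (int8_t a, int8_t b)
     {
       return (int16_t) a - (int16_t) b;   // both inputs widened first
     }

     static int16_t subw_lane (int16_t a, int8_t b)
     {
       return a - (int16_t) b;             // wide minus widened narrow
     }
*/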
return (uint64x2_t) __builtin_aarch64_usubl2v4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_s8 (int16x8_t __a, int8x8_t __b) { return (int16x8_t) __builtin_aarch64_ssubwv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_s16 (int32x4_t __a, int16x4_t __b) { return (int32x4_t) __builtin_aarch64_ssubwv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_s32 (int64x2_t __a, int32x2_t __b) { return (int64x2_t) __builtin_aarch64_ssubwv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_u8 (uint16x8_t __a, uint8x8_t __b) { return (uint16x8_t) __builtin_aarch64_usubwv8qi ((int16x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_u16 (uint32x4_t __a, uint16x4_t __b) { return (uint32x4_t) __builtin_aarch64_usubwv4hi ((int32x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_u32 (uint64x2_t __a, uint32x2_t __b) { return (uint64x2_t) __builtin_aarch64_usubwv2si ((int64x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_s8 (int16x8_t __a, int8x16_t __b) { return (int16x8_t) __builtin_aarch64_ssubw2v16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_s16 (int32x4_t __a, int16x8_t __b) { return (int32x4_t) __builtin_aarch64_ssubw2v8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_s32 (int64x2_t __a, int32x4_t __b) { return (int64x2_t) __builtin_aarch64_ssubw2v4si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_u8 (uint16x8_t __a, uint8x16_t __b) { return (uint16x8_t) __builtin_aarch64_usubw2v16qi ((int16x8_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_u16 (uint32x4_t __a, uint16x8_t __b) { return (uint32x4_t) __builtin_aarch64_usubw2v8hi ((int32x4_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ 
extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubw_high_u32 (uint64x2_t __a, uint32x4_t __b) { return (uint64x2_t) __builtin_aarch64_usubw2v4si ((int64x2_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s64 (int64x1_t __a, int64x1_t __b) { return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u8 (uint8x8_t __a, uint8x8_t __b) { return __builtin_aarch64_uqaddv8qi_uuu (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_shsubv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsub_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t) __builtin_aarch64_uhsubv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhsubq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t) 
__builtin_aarch64_subhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) { return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) { return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) { return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a, @@ -2297,7 +2615,8 @@ vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) (int16x8_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) 
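/* Note: the vsubhn_* intrinsics being converted around this point narrow a
   wide subtraction by keeping only the high half of each element,
   vrsubhn_* is the rounding form, and the *_high_* variants place the
   narrowed result in the upper half of a 128-bit vector whose lower half
   is passed in as the first argument.  A sketch, assuming an AArch64
   target; high_half_diff is an illustrative name only.  */

#include <arm_neon.h>

int8x8_t
high_half_diff (int16x8_t a, int16x8_t b)
{
  return vsubhn_s16 (a, b);	/* each lane: (int8_t) ((a[i] - b[i]) >> 8) */
}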
{ return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a, @@ -2305,7 +2624,8 @@ vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) (int32x4_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) { return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a, @@ -2313,25 +2633,29 @@ vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) (int64x2_t) __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) { return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) { return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) { return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a, @@ -2339,7 +2663,8 @@ vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) (int16x8_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a, @@ -2347,7 +2672,8 @@ vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) (int32x4_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) { return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a, @@ -2355,453 +2681,542 @@ vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) (int64x2_t) __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u16 (uint16x4_t __a, uint16x4_t __b) { return __builtin_aarch64_uqaddv4hi_uuu (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u32 (uint32x2_t __a, uint32x2_t __b) { return __builtin_aarch64_uqaddv2si_uuu (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u64 (uint64x1_t __a, uint64x1_t __b) { return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s64 (int64x2_t __a, int64x2_t __b) { return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return __builtin_aarch64_uqaddv16qi_uuu (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return __builtin_aarch64_uqaddv8hi_uuu (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_uqaddv4si_uuu (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u64 (uint64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_uqaddv2di_uuu (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vqsub_s64 (int64x1_t __a, int64x1_t __b) { return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u8 (uint8x8_t __a, uint8x8_t __b) { return __builtin_aarch64_uqsubv8qi_uuu (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u16 (uint16x4_t __a, uint16x4_t __b) { return __builtin_aarch64_uqsubv4hi_uuu (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u32 (uint32x2_t __a, uint32x2_t __b) { return __builtin_aarch64_uqsubv2si_uuu (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsub_u64 (uint64x1_t __a, uint64x1_t __b) { return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_s64 (int64x2_t __a, int64x2_t __b) { return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u8 (uint8x16_t __a, uint8x16_t __b) { return __builtin_aarch64_uqsubv16qi_uuu (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u16 (uint16x8_t __a, uint16x8_t __b) { return __builtin_aarch64_uqsubv8hi_uuu (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u32 (uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_uqsubv4si_uuu (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqsubq_u64 (uint64x2_t __a, uint64x2_t __b) 
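/* Note: the vqadd_* and vqsub_* groups above implement saturating
   arithmetic: lanes clamp to the type's limits instead of wrapping, and
   the vqneg_* and vqabs_* intrinsics that follow saturate the same way
   (e.g. vqabs_s8 maps INT8_MIN to INT8_MAX).  A small usage sketch,
   assuming an AArch64 target; clamped is an illustrative name only.  */

#include <arm_neon.h>

uint8x8_t
clamped (void)
{
  uint8x8_t a = vdup_n_u8 (250);	/* all eight lanes hold 250 */
  uint8x8_t b = vdup_n_u8 (10);		/* all eight lanes hold 10 */
  return vqadd_u8 (a, b);		/* saturates to 255 instead of wrapping to 4 */
}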
{ return __builtin_aarch64_uqsubv2di_uuu (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqneg_s8 (int8x8_t __a) { return (int8x8_t) __builtin_aarch64_sqnegv8qi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqneg_s16 (int16x4_t __a) { return (int16x4_t) __builtin_aarch64_sqnegv4hi (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqneg_s32 (int32x2_t __a) { return (int32x2_t) __builtin_aarch64_sqnegv2si (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqneg_s64 (int64x1_t __a) { return (int64x1_t) {__builtin_aarch64_sqnegdi (__a[0])}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqnegq_s8 (int8x16_t __a) { return (int8x16_t) __builtin_aarch64_sqnegv16qi (__a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqnegq_s16 (int16x8_t __a) { return (int16x8_t) __builtin_aarch64_sqnegv8hi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqnegq_s32 (int32x4_t __a) { return (int32x4_t) __builtin_aarch64_sqnegv4si (__a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabs_s8 (int8x8_t __a) { return (int8x8_t) __builtin_aarch64_sqabsv8qi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabs_s16 (int16x4_t __a) { return (int16x4_t) __builtin_aarch64_sqabsv4hi (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabs_s32 (int32x2_t __a) { return (int32x2_t) __builtin_aarch64_sqabsv2si (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabs_s64 (int64x1_t __a) { return (int64x1_t) {__builtin_aarch64_sqabsdi (__a[0])}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabsq_s8 (int8x16_t __a) { return (int8x16_t) __builtin_aarch64_sqabsv16qi (__a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabsq_s16 (int16x8_t __a) { return (int16x8_t) 
__builtin_aarch64_sqabsv8hi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqabsq_s32 (int32x4_t __a) { return (int32x4_t) __builtin_aarch64_sqabsv4si (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_sqdmulhv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_sqdmulhv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_sqdmulhv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_sqdmulhv4si (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t) __builtin_aarch64_sqrdmulhv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t) __builtin_aarch64_sqrdmulhv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t) __builtin_aarch64_sqrdmulhv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_s8 (uint64_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_s16 (uint64_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_s32 (uint64_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vcreate_s64 (uint64_t __a) { return (int64x1_t) {__a}; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_f16 (uint64_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_f32 (uint64_t __a) { return (float32x2_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_u8 (uint64_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_u16 (uint64_t __a) { return (uint16x4_t) __a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_u32 (uint64_t __a) { return (uint32x2_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_u64 (uint64_t __a) { return (uint64x1_t) {__a}; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_f64 (uint64_t __a) { return (float64x1_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_p8 (uint64_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_p16 (uint64_t __a) { return (poly16x4_t) __a; } +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcreate_p64 (uint64_t __a) +{ + return (poly64x1_t) __a; +} + /* vget_lane */ -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_f16 (float16x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_f32 (float32x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_f64 (float64x1_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_p8 
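/* Note: the vcreate_* group above reinterprets the 64 bits of a uint64_t
   as a 64-bit vector, and the vget_lane_* and vgetq_lane_* groups read one
   lane back out (the lane index must be a compile-time constant).  The
   patch extends these groups to the poly64 types via the newly added
   vcreate_p64, vget_lane_p64 and vgetq_lane_p64.  A sketch, assuming an
   AArch64 target; first_p64 is an illustrative name only.  */

#include <arm_neon.h>

poly64_t
first_p64 (uint64_t bits)
{
  poly64x1_t v = vcreate_p64 (bits);	/* pure bit reinterpretation */
  return vget_lane_p64 (v, 0);		/* extract lane 0 */
}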
(poly8x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_p16 (poly16x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vget_lane_p64 (poly64x1_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_s8 (int8x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_s16 (int16x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_s32 (int32x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_s64 (int64x1_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_u8 (uint8x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_u16 (uint16x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_u32 (uint32x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_lane_u64 (uint64x1_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); @@ -2809,79 +3224,99 @@ vget_lane_u64 (uint64x1_t __a, const int __b) /* vgetq_lane */ -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_f16 (float16x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_f32 (float32x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float64_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_f64 (float64x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_p8 (poly8x16_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_p16 (poly16x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vgetq_lane_p64 (poly64x2_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_s8 (int8x16_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_s16 (int16x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_s32 (int32x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_s64 (int64x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_u8 (uint8x16_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_u16 (uint16x8_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_u32 (uint32x4_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_u64 (uint64x2_t __a, const int __b) { return __aarch64_vget_lane_any (__a, __b); @@ -2889,1953 +3324,2832 @@ vgetq_lane_u64 (uint64x2_t __a, const int __b) /* vreinterpret */ -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_f16 (float16x4_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_f64 (float64x1_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s8 (int8x8_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s16 (int16x4_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s32 (int32x2_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s64 (int64x1_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_f32 (float32x2_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u8 (uint8x8_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u16 (uint16x4_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u32 (uint32x2_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u64 (uint64x1_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_p16 (poly16x4_t __a) { return (poly8x8_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_p64 (poly64x1_t __a) +{ + return (poly8x8_t) __a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_f64 (float64x2_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s8 (int8x16_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vreinterpretq_p8_s16 (int16x8_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s32 (int32x4_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s64 (int64x2_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_f16 (float16x8_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_f32 (float32x4_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u8 (uint8x16_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u16 (uint16x8_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u32 (uint32x4_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u64 (uint64x2_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_p16 (poly16x8_t __a) { return (poly8x16_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_p64 (poly64x2_t __a) +{ + return (poly8x16_t) __a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_p128 (poly128_t __a) +{ + return (poly8x16_t)__a; +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_f16 (float16x4_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_f64 (float64x1_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s8 (int8x8_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline 
poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s16 (int16x4_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s32 (int32x2_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s64 (int64x1_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_f32 (float32x2_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u8 (uint8x8_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u16 (uint16x4_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u32 (uint32x2_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u64 (uint64x1_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_p8 (poly8x8_t __a) { return (poly16x4_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_p64 (poly64x1_t __a) +{ + return (poly16x4_t) __a; +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_f64 (float64x2_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s8 (int8x16_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s16 (int16x8_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s32 (int32x4_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s64 (int64x2_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_f16 (float16x8_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_f32 (float32x4_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u8 (uint8x16_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u16 (uint16x8_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u32 (uint32x4_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u64 (uint64x2_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_p8 (poly8x16_t __a) { return (poly16x8_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_p64 (poly64x2_t __a) +{ + return (poly16x8_t) __a; +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_p128 (poly128_t __a) +{ + return (poly16x8_t)__a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_f16 (float16x4_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_f64 (float64x1_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_s8 (int8x8_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_s16 (int16x4_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_s32 (int32x2_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_s64 (int64x1_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern 
__inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_f32 (float32x2_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_u8 (uint8x8_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_u16 (uint16x4_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_u32 (uint32x2_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_u64 (uint64x1_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_p8 (poly8x8_t __a) +{ + return (poly64x1_t) __a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_p16 (poly16x4_t __a) +{ + return (poly64x1_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_f64 (float64x2_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_s8 (int8x16_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_s16 (int16x8_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_s32 (int32x4_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_s64 (int64x2_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_f16 (float16x8_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_f32 (float32x4_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_p128 (poly128_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_u8 (uint8x16_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_u16 (uint16x8_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_p16 (poly16x8_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_u32 (uint32x4_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_u64 (uint64x2_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_p8 (poly8x16_t __a) +{ + return (poly64x2_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_p8 (poly8x16_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_p16 (poly16x8_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f16 (float16x8_t __a) +{ + return (poly128_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f32 (float32x4_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_p64 (poly64x2_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_s64 (int64x2_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_u64 (uint64x2_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_s8 (int8x16_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_s16 (int16x8_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_s32 (int32x4_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_u8 (uint8x16_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_u16 (uint16x8_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_u32 (uint32x4_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_f64 (float64x1_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s8 (int8x8_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s16 (int16x4_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, 
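/* Note: the vreinterpret_* and vreinterpretq_* groups are zero-cost
   bit-casts between equal-width vector types; no instruction is emitted.
   The additions in this hunk fill in the poly64x1_t, poly64x2_t and
   poly128_t variants, so code using the polynomial (crypto) types can move
   freely between them and ordinary integer vectors.  A sketch using
   vreinterpretq_p128_u64 from this hunk, assuming an AArch64 target;
   as_p128 is an illustrative name only.  */

#include <arm_neon.h>

poly128_t
as_p128 (uint64x2_t x)
{
  return vreinterpretq_p128_u64 (x);	/* reinterpret all 128 bits in place */
}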
__gnu_inline__, __artificial__)) vreinterpret_f16_s32 (int32x2_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s64 (int64x1_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_f32 (float32x2_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u8 (uint8x8_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u16 (uint16x4_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u32 (uint32x2_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u64 (uint64x1_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_p8 (poly8x8_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_p16 (poly16x4_t __a) { return (float16x4_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f16_p64 (poly64x1_t __a) +{ + return (float16x4_t) __a; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_f64 (float64x2_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s8 (int8x16_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s16 (int16x8_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s32 (int32x4_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vreinterpretq_f16_s64 (int64x2_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_f32 (float32x4_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u8 (uint8x16_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u16 (uint16x8_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u32 (uint32x4_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u64 (uint64x2_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p8 (poly8x16_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_p128 (poly128_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p16 (poly16x8_t __a) { return (float16x8_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_p64 (poly64x2_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_f16 (float16x4_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_f64 (float64x1_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s8 (int8x8_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s16 (int16x4_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s32 (int32x2_t 
__a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s64 (int64x1_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u8 (uint8x8_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u16 (uint16x4_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u32 (uint32x2_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u64 (uint64x1_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_p8 (poly8x8_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_p16 (poly16x4_t __a) { return (float32x2_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_p64 (poly64x1_t __a) +{ + return (float32x2_t) __a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_f16 (float16x8_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_f64 (float64x2_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s8 (int8x16_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s16 (int16x8_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s32 (int32x4_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s64 (int64x2_t 
__a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u8 (uint8x16_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u16 (uint16x8_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u32 (uint32x4_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u64 (uint64x2_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p8 (poly8x16_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p16 (poly16x8_t __a) { return (float32x4_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_p64 (poly64x2_t __a) +{ + return (float32x4_t) __a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_p128 (poly128_t __a) +{ + return (float32x4_t)__a; +} + + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_f16 (float16x4_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_f32 (float32x2_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_p8 (poly8x8_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_p16 (poly16x4_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f64_p64 (poly64x1_t __a) +{ + return (float64x1_t) __a; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_s8 (int8x8_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline 
float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_s16 (int16x4_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_s32 (int32x2_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_s64 (int64x1_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_u8 (uint8x8_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_u16 (uint16x4_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_u32 (uint32x2_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x1_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f64_u64 (uint64x1_t __a) { return (float64x1_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_f16 (float16x8_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_f32 (float32x4_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_p8 (poly8x16_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_p16 (poly16x8_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_p64 (poly64x2_t __a) +{ + return (float64x2_t) __a; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_s8 (int8x16_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_s16 (int16x8_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_s32 (int32x4_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_s64 (int64x2_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_u8 (uint8x16_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_u16 (uint16x8_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_u32 (uint32x4_t __a) { return (float64x2_t) __a; } -__extension__ static __inline float64x2_t __attribute__((__always_inline__)) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f64_u64 (uint64x2_t __a) { return (float64x2_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_f16 (float16x4_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_f64 (float64x1_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s8 (int8x8_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s16 (int16x4_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s32 (int32x2_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_f32 (float32x2_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u8 (uint8x8_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u16 (uint16x4_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern 
__inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u32 (uint32x2_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u64 (uint64x1_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_p8 (poly8x8_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_p16 (poly16x4_t __a) { return (int64x1_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_p64 (poly64x1_t __a) +{ + return (int64x1_t) __a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_f64 (float64x2_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s8 (int8x16_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s16 (int16x8_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s32 (int32x4_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_f16 (float16x8_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_f32 (float32x4_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u8 (uint8x16_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u16 (uint16x8_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u32 (uint32x4_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vreinterpretq_s64_u64 (uint64x2_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p8 (poly8x16_t __a) { return (int64x2_t) __a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p16 (poly16x8_t __a) { return (int64x2_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_p64 (poly64x2_t __a) +{ + return (int64x2_t) __a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_p128 (poly128_t __a) +{ + return (int64x2_t)__a; +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_f16 (float16x4_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_f64 (float64x1_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s8 (int8x8_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s16 (int16x4_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s32 (int32x2_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s64 (int64x1_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_f32 (float32x2_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u8 (uint8x8_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u16 (uint16x4_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u32 (uint32x2_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_p8 (poly8x8_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_p16 (poly16x4_t __a) { return (uint64x1_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_p64 (poly64x1_t __a) +{ + return (uint64x1_t) __a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_f64 (float64x2_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s8 (int8x16_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s16 (int16x8_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s32 (int32x4_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s64 (int64x2_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_f16 (float16x8_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_f32 (float32x4_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u8 (uint8x16_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u16 (uint16x8_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u32 (uint32x4_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p8 (poly8x16_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p16 (poly16x8_t __a) { return (uint64x2_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_p64 (poly64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_p128 (poly128_t __a) +{ + return (uint64x2_t)__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_f16 (float16x4_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_f64 (float64x1_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s16 (int16x4_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s32 (int32x2_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s64 (int64x1_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_f32 (float32x2_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u8 (uint8x8_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u16 (uint16x4_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u32 (uint32x2_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u64 (uint64x1_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_p8 (poly8x8_t __a) { return (int8x8_t) __a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_p16 (poly16x4_t __a) { return (int8x8_t) __a; } -__extension__ 
static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_p64 (poly64x1_t __a) +{ + return (int8x8_t) __a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_f64 (float64x2_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s16 (int16x8_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s32 (int32x4_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s64 (int64x2_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_f16 (float16x8_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_f32 (float32x4_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u8 (uint8x16_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u16 (uint16x8_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u32 (uint32x4_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u64 (uint64x2_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p8 (poly8x16_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p16 (poly16x8_t __a) { return (int8x16_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_p64 (poly64x2_t __a) +{ + return (int8x16_t) __a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vreinterpretq_s8_p128 (poly128_t __a) +{ + return (int8x16_t)__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_f16 (float16x4_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_f64 (float64x1_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s8 (int8x8_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s32 (int32x2_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s64 (int64x1_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_f32 (float32x2_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u8 (uint8x8_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u16 (uint16x4_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u32 (uint32x2_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u64 (uint64x1_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_p8 (poly8x8_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_p16 (poly16x4_t __a) { return (int16x4_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_p64 (poly64x1_t __a) +{ + return (int16x4_t) __a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_f64 (float64x2_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s8 (int8x16_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s32 (int32x4_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s64 (int64x2_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_f16 (float16x8_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_f32 (float32x4_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u8 (uint8x16_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u16 (uint16x8_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u32 (uint32x4_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u64 (uint64x2_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_p8 (poly8x16_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_p16 (poly16x8_t __a) { return (int16x8_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_p64 (poly64x2_t __a) +{ + return (int16x8_t) __a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_p128 (poly128_t __a) +{ + return (int16x8_t)__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_f16 (float16x4_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vreinterpret_s32_f64 (float64x1_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s8 (int8x8_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s16 (int16x4_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s64 (int64x1_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_f32 (float32x2_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u8 (uint8x8_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u16 (uint16x4_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u32 (uint32x2_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u64 (uint64x1_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_p8 (poly8x8_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_p16 (poly16x4_t __a) { return (int32x2_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_p64 (poly64x1_t __a) +{ + return (int32x2_t) __a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_f64 (float64x2_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s8 (int8x16_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s16 (int16x8_t __a) { return (int32x4_t) __a; } -__extension__ static 
__inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s64 (int64x2_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_f16 (float16x8_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_f32 (float32x4_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u8 (uint8x16_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u16 (uint16x8_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u32 (uint32x4_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u64 (uint64x2_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_p8 (poly8x16_t __a) { return (int32x4_t) __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_p16 (poly16x8_t __a) { return (int32x4_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_p64 (poly64x2_t __a) +{ + return (int32x4_t) __a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_p128 (poly128_t __a) +{ + return (int32x4_t)__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_f16 (float16x4_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_f64 (float64x1_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s8 (int8x8_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vreinterpret_u8_s16 (int16x4_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s32 (int32x2_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s64 (int64x1_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_f32 (float32x2_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u16 (uint16x4_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u32 (uint32x2_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u64 (uint64x1_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_p8 (poly8x8_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_p16 (poly16x4_t __a) { return (uint8x8_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_p64 (poly64x1_t __a) +{ + return (uint8x8_t) __a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_f64 (float64x2_t __a) { return (uint8x16_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s8 (int8x16_t __a) { return (uint8x16_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s16 (int16x8_t __a) { return (uint8x16_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s32 (int32x4_t __a) { return (uint8x16_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s64 (int64x2_t __a) { return (uint8x16_t) __a; } 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_f16 (float16x8_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_u16 (uint16x8_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_u32 (uint32x4_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_u64 (uint64x2_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_p8 (poly8x16_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u8_p16 (poly16x8_t __a)
 {
   return (uint8x16_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_p64 (poly64x2_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_p128 (poly128_t __a)
+{
+  return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_f16 (float16x4_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_f64 (float64x1_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_s8 (int8x8_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_s16 (int16x4_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_s32 (int32x2_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_s64 (int64x1_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_f32 (float32x2_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_u8 (uint8x8_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_u32 (uint32x2_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_u64 (uint64x1_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_p8 (poly8x8_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u16_p16 (poly16x4_t __a)
 {
   return (uint16x4_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_p64 (poly64x1_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_f64 (float64x2_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_s8 (int8x16_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_s16 (int16x8_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_s32 (int32x4_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_s64 (int64x2_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_f16 (float16x8_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_u8 (uint8x16_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_u32 (uint32x4_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_u64 (uint64x2_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_p8 (poly8x16_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u16_p16 (poly16x8_t __a)
 {
   return (uint16x8_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_p64 (poly64x2_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_p128 (poly128_t __a)
+{
+  return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_f16 (float16x4_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_f64 (float64x1_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_s8 (int8x8_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_s16 (int16x4_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_s32 (int32x2_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_s64 (int64x1_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_f32 (float32x2_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_u8 (uint8x8_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_u16 (uint16x4_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_u64 (uint64x1_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_p8 (poly8x8_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_u32_p16 (poly16x4_t __a)
 {
   return (uint32x2_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_p64 (poly64x1_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_f64 (float64x2_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_s8 (int8x16_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_s16 (int16x8_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_s32 (int32x4_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_s64 (int64x2_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_f16 (float16x8_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_u8 (uint8x16_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_u16 (uint16x8_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_u64 (uint64x2_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_p8 (poly8x16_t __a)
 {
   return (uint32x4_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_u32_p16 (poly16x8_t __a)
 {
   return (uint32x4_t) __a;
 }

+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_p64 (poly64x2_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_p128 (poly128_t __a)
+{
+  return (uint32x4_t)__a;
+}
+
 /* vset_lane */

-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_f16 (float16_t __elem, float16x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_f64 (float64_t __elem, float64x1_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_p8 (poly8_t __elem, poly8x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_p16 (poly16_t __elem, poly16x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vset_lane_p64 (poly64_t __elem, poly64x1_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s16 (int16_t __elem, int16x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s32 (int32_t __elem, int32x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s64 (int64_t __elem, int64x1_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u8 (uint8_t __elem, uint8x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u16 (uint16_t __elem, uint16x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u32 (uint32_t __elem, uint32x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
@@ -4843,79 +6157,99 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)

 /* vsetq_lane */

-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_f16 (float16_t __elem, float16x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_f64 (float64_t __elem, float64x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_p8 (poly8_t __elem, poly8x16_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_p16 (poly16_t __elem, poly16x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsetq_lane_p64 (poly64_t __elem, poly64x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s16 (int16_t __elem, int16x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s32 (int32_t __elem, int32x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s64 (int64_t __elem, int64x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u8 (uint8_t __elem, uint8x16_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u16 (uint16_t __elem, uint16x8_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u32 (uint32_t __elem, uint32x4_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
 {
   return __aarch64_vset_lane_any (__elem, __vec, __index);
@@ -4926,79 +6260,99 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \
   return vreinterpret_##__TYPE##_u64 (lo);

-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f16 (float16x8_t __a)
 {
   __GET_LOW (f16);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f32 (float32x4_t __a)
 {
   __GET_LOW (f32);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f64 (float64x2_t __a)
 {
   return (float64x1_t) {vgetq_lane_f64 (__a, 0)};
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p8 (poly8x16_t __a)
 {
   __GET_LOW (p8);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p16 (poly16x8_t __a)
 {
   __GET_LOW (p16);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_p64 (poly64x2_t __a)
+{
+  __GET_LOW (p64);
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s8 (int8x16_t __a)
 {
   __GET_LOW (s8);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s16 (int16x8_t __a)
 {
   __GET_LOW (s16);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s32 (int32x4_t __a)
 {
   __GET_LOW (s32);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s64 (int64x2_t __a)
 {
   __GET_LOW (s64);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u8 (uint8x16_t __a)
 {
   __GET_LOW (u8);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u16 (uint16x8_t __a)
 {
   __GET_LOW (u16);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u32 (uint32x4_t __a)
 {
   __GET_LOW (u32);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u64 (uint64x2_t __a)
 {
   return vcreate_u64 (vgetq_lane_u64 (__a, 0));
@@ -5011,73 +6365,92 @@ vget_low_u64 (uint64x2_t __a)
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \
   return vreinterpret_##__TYPE##_u64 (hi);

-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f16 (float16x8_t __a)
 {
   __GET_HIGH (f16);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f32 (float32x4_t __a)
 {
   __GET_HIGH (f32);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f64 (float64x2_t __a)
 {
   __GET_HIGH (f64);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_p8 (poly8x16_t __a)
 {
   __GET_HIGH (p8);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_p16 (poly16x8_t __a)
 {
   __GET_HIGH (p16);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_p64 (poly64x2_t __a)
+{
+  __GET_HIGH (p64);
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_s8 (int8x16_t __a)
 {
   __GET_HIGH (s8);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_s16 (int16x8_t __a)
 {
   __GET_HIGH (s16);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_s32 (int32x4_t __a)
 {
   __GET_HIGH (s32);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_s64 (int64x2_t __a)
 {
   __GET_HIGH (s64);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_u8 (uint8x16_t __a)
 {
   __GET_HIGH (u8);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_u16 (uint16x8_t __a)
 {
   __GET_HIGH (u16);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_u32 (uint32x4_t __a)
 {
   __GET_HIGH (u32);
@@ -5085,98 +6458,120 @@ vget_high_u32 (uint32x4_t __a)

 #undef __GET_HIGH

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_u64 (uint64x2_t __a)
 {
   return vcreate_u64 (vgetq_lane_u64 (__a, 1));
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x16_t) __builtin_aarch64_combinev8qi (__a, __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x8_t) __builtin_aarch64_combinev4hi (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x4_t) __builtin_aarch64_combinev2si (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_s64 (int64x1_t __a, int64x1_t __b)
 {
   return __builtin_aarch64_combinedi (__a[0], __b[0]);
 }

-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_f16 (float16x4_t __a, float16x4_t __b)
 {
   return __builtin_aarch64_combinev4hf (__a, __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (float32x4_t) __builtin_aarch64_combinev2sf (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x16_t) __builtin_aarch64_combinev8qi ((int8x8_t) __a,
						     (int8x8_t) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x8_t) __builtin_aarch64_combinev4hi ((int16x4_t) __a,
						     (int16x4_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x4_t) __builtin_aarch64_combinev2si ((int32x2_t) __a,
						     (int32x2_t) __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
 {
   return (uint64x2_t) __builtin_aarch64_combinedi (__a[0], __b[0]);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_f64 (float64x1_t __a, float64x1_t __b)
 {
   return __builtin_aarch64_combinedf (__a[0], __b[0]);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   return (poly8x16_t) __builtin_aarch64_combinev8qi ((int8x8_t) __a,
						     (int8x8_t) __b);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   return (poly16x8_t) __builtin_aarch64_combinev4hi ((int16x4_t) __a,
						     (int16x4_t) __b);
 }

+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+  return (poly64x2_t) __builtin_aarch64_combinedi_ppp (__a[0], __b[0]);
+}
+
 /* Start of temporary inline asm implementations.  */

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
 {
   int8x8_t result;
@@ -5187,7 +6582,8 @@ vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
   return result;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
 {
   int16x4_t result;
@@ -5198,7 +6594,8 @@ vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
   return result;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
 {
   int32x2_t result;
@@ -5209,7 +6606,8 @@ vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
   return result;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
 {
   uint8x8_t result;
@@ -5220,7 +6618,8 @@ vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
   return result;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
 {
   uint16x4_t result;
@@ -5231,7 +6630,8 @@ vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
   return result;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
 {
   uint32x2_t result;
@@ -5242,7 +6642,8 @@ vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
   return result;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
 {
   int16x8_t result;
@@ -5253,7 +6654,8 @@ vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
 {
   int32x4_t result;
@@ -5264,7 +6666,8 @@ vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
   return result;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
 {
   int64x2_t result;
@@ -5275,7 +6678,8 @@ vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
 {
   uint16x8_t result;
@@ -5286,7 +6690,8 @@ vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
 {
   uint32x4_t result;
@@ -5297,7 +6702,8 @@ vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
   return result;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
 {
   uint64x2_t result;
@@ -5308,7 +6714,8 @@ vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
   return result;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
 {
   int16x8_t result;
@@ -5319,7 +6726,8 @@ vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
 {
   int32x4_t result;
@@ -5330,7 +6738,8 @@ vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
   return result;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
 {
   int64x2_t result;
@@ -5341,7 +6750,8 @@ vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
 {
   uint16x8_t result;
@@ -5352,7 +6762,8 @@ vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
 {
   uint32x4_t result;
@@ -5363,7 +6774,8 @@ vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
   return result;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
 {
   uint64x2_t result;
@@ -5374,7 +6786,8 @@ vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
   return result;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
 {
   int8x16_t result;
@@ -5385,7 +6798,8 @@ vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
   return result;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
 {
   int16x8_t result;
@@ -5396,7 +6810,8 @@ vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
 {
   int32x4_t result;
@@ -5407,7 +6822,8 @@ vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
   return result;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
 {
   uint8x16_t result;
@@ -5418,7 +6834,8 @@ vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
 {
   uint16x8_t result;
@@ -5429,7 +6846,8 @@ vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
 {
   uint32x4_t result;
@@ -5440,18 +6858,8 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
   return result;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fabd %0.2s, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_s8 (int8x8_t a, int8x8_t b)
 {
   int8x8_t result;
@@ -5462,7 +6870,8 @@ vabd_s8 (int8x8_t a, int8x8_t b)
   return result;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_s16 (int16x4_t a, int16x4_t b)
 {
   int16x4_t result;
@@ -5473,7 +6882,8 @@ vabd_s16 (int16x4_t a, int16x4_t b)
   return result;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_s32 (int32x2_t a, int32x2_t b)
 {
   int32x2_t result;
@@ -5484,7 +6894,8 @@ vabd_s32 (int32x2_t a, int32x2_t b)
   return result;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_u8 (uint8x8_t a, uint8x8_t b)
 {
   uint8x8_t result;
@@ -5495,7 +6906,8 @@ vabd_u8 (uint8x8_t a, uint8x8_t b)
   return result;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_u16 (uint16x4_t a, uint16x4_t b)
 {
   uint16x4_t result;
@@ -5506,7 +6918,8 @@ vabd_u16 (uint16x4_t a, uint16x4_t b)
   return result;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabd_u32 (uint32x2_t a, uint32x2_t b)
 {
   uint32x2_t result;
@@ -5517,18 +6930,8 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vabdd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fabd %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_s8 (int8x16_t a, int8x16_t b)
 {
   int16x8_t result;
@@ -5539,7 +6942,8 @@ vabdl_high_s8 (int8x16_t a, int8x16_t b)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_s16 (int16x8_t a, int16x8_t b)
 {
   int32x4_t result;
@@ -5550,7 +6954,8 @@ vabdl_high_s16 (int16x8_t a, int16x8_t b)
   return result;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_s32 (int32x4_t a, int32x4_t b)
 {
   int64x2_t result;
@@ -5561,7 +6966,8 @@ vabdl_high_s32 (int32x4_t a, int32x4_t b)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
 {
   uint16x8_t result;
@@ -5572,7 +6978,8 @@ vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
 {
   uint32x4_t result;
@@ -5583,7 +6990,8 @@ vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
   return result;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
 {
   uint64x2_t result;
@@ -5594,7 +7002,8 @@ vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
   return result;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_s8 (int8x8_t a, int8x8_t b)
 {
   int16x8_t result;
@@ -5605,7 +7014,8 @@ vabdl_s8 (int8x8_t a, int8x8_t b)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_s16 (int16x4_t a, int16x4_t b)
 {
   int32x4_t result;
@@ -5616,7 +7026,8 @@ vabdl_s16 (int16x4_t a, int16x4_t b)
   return result;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_s32 (int32x2_t a, int32x2_t b)
 {
   int64x2_t result;
@@ -5627,7 +7038,8 @@ vabdl_s32 (int32x2_t a, int32x2_t b)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_u8 (uint8x8_t a, uint8x8_t b)
 {
   uint16x8_t result;
@@ -5638,7 +7050,8 @@ vabdl_u8 (uint8x8_t a, uint8x8_t b)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_u16 (uint16x4_t a, uint16x4_t b)
 {
   uint32x4_t result;
@@ -5649,7 +7062,8 @@ vabdl_u16 (uint16x4_t a, uint16x4_t b)
   return result;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdl_u32 (uint32x2_t a, uint32x2_t b)
 {
   uint64x2_t result;
@@ -5660,29 +7074,8 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabdq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fabd %0.4s, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabdq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fabd %0.2d, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_s8 (int8x16_t a, int8x16_t b)
 {
   int8x16_t result;
@@ -5693,7 +7086,8 @@ vabdq_s8 (int8x16_t a, int8x16_t b)
   return result;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_s16 (int16x8_t a, int16x8_t b)
 {
   int16x8_t result;
@@ -5704,7 +7098,8 @@ vabdq_s16 (int16x8_t a, int16x8_t b)
   return result;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_s32 (int32x4_t a, int32x4_t b)
 {
   int32x4_t result;
@@ -5715,7 +7110,8 @@ vabdq_s32 (int32x4_t a, int32x4_t b)
   return result;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_u8 (uint8x16_t a, uint8x16_t b)
 {
   uint8x16_t result;
@@ -5726,7 +7122,8 @@ vabdq_u8 (uint8x16_t a, uint8x16_t b)
   return result;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_u16 (uint16x8_t a, uint16x8_t b)
 {
   uint16x8_t result;
@@ -5737,7 +7134,8 @@ vabdq_u16 (uint16x8_t a, uint16x8_t b)
   return result;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabdq_u32 (uint32x4_t a, uint32x4_t b)
 {
   uint32x4_t result;
@@ -5748,18 +7146,8 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
   return result;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vabds_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fabd %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s8 (int8x8_t a)
 {
   int16_t result;
@@ -5770,7 +7158,8 @@ vaddlv_s8 (int8x8_t a)
   return result;
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s16 (int16x4_t a)
 {
   int32_t result;
@@ -5781,7 +7170,8 @@ vaddlv_s16 (int16x4_t a)
   return result;
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u8 (uint8x8_t a)
 {
   uint16_t result;
@@ -5792,7 +7182,8 @@ vaddlv_u8 (uint8x8_t a)
   return result;
 }

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u16 (uint16x4_t a)
 {
   uint32_t result;
@@ -5803,7 +7194,8 @@ vaddlv_u16 (uint16x4_t a)
   return result;
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s8 (int8x16_t a)
 {
   int16_t result;
@@ -5814,7 +7206,8 @@ vaddlvq_s8 (int8x16_t a)
   return result;
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s16 (int16x8_t a)
 {
   int32_t result;
@@ -5825,7 +7218,8 @@ vaddlvq_s16 (int16x8_t a)
   return result;
 }

-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s32 (int32x4_t a)
 {
   int64_t result;
@@ -5836,7 +7230,8 @@ vaddlvq_s32 (int32x4_t a)
   return result;
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u8 (uint8x16_t a)
 {
   uint16_t result;
@@ -5847,7 +7242,8 @@ vaddlvq_u8 (uint8x16_t a)
   return result;
 }

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u16 (uint16x8_t a)
 {
   uint32_t result;
@@ -5858,7 +7254,8 @@ vaddlvq_u16 (uint16x8_t a)
   return result;
 }

-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u32 (uint32x4_t a)
 {
   uint64_t result;
@@ -5869,18584 +7266,23100 @@ vaddlvq_u32 (uint32x4_t a)
   return result;
 }

-#define vcopyq_lane_f32(a, b, c, d) \
-  __extension__ \
-    ({ \
-       float32x4_t c_ = (c); \
-       float32x4_t a_ = (a); \
-       float32x4_t result; \
-       __asm__ ("ins %0.s[%2], %3.s[%4]" \
-                : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtx_f32_f64 (float64x2_t a)
+{
+  float32x2_t result;
+  __asm__ ("fcvtxn %0.2s,%1.2d"
+           : "=w"(result)
+           : "w"(a)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b)
+{
+  float32x4_t result;
+  __asm__ ("fcvtxn2 %0.4s,%1.2d"
+           : "=w"(result)
+           : "w" (b), "0"(a)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtxd_f32_f64 (float64_t a)
+{
+  float32_t result;
+  __asm__ ("fcvtxn %s0,%d1"
+           : "=w"(result)
+           : "w"(a)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
+{
+  float32x2_t result;
+  float32x2_t t1;
+  __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
+           : "=w"(result), "=w"(t1)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
+{
+  int16x4_t result;
+  __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "x"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
+{
+  int32x2_t result;
+  __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
+{
+  uint16x4_t result;
+  __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "x"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
+{
+  uint32x2_t result;
+  __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
+{
+  int8x8_t result;
+  __asm__ ("mla %0.8b, %2.8b, %3.8b"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
+{
+  int16x4_t result;
+  __asm__ ("mla %0.4h, %2.4h, %3.4h"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
+{
+  int32x2_t result;
+  __asm__ ("mla %0.2s, %2.2s, %3.2s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+  uint8x8_t result;
+  __asm__ ("mla %0.8b, %2.8b, %3.8b"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
+{
+  uint16x4_t result;
+  __asm__ ("mla %0.4h, %2.4h, %3.4h"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
+{
+  uint32x2_t result;
+  __asm__ ("mla %0.2s, %2.2s, %3.2s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcopyq_lane_f64(a, b, c, d) \
+#define vmlal_high_lane_s16(a, b, c, d) \
   __extension__ \
     ({ \
-       float64x2_t c_ = (c); \
-       float64x2_t a_ = (a); \
-       float64x2_t result; \
-       __asm__ ("ins %0.d[%2], %3.d[%4]" \
+       int16x4_t c_ = (c); \
+       int16x8_t b_ = (b); \
+       int32x4_t a_ = (a); \
+       int32x4_t result; \
+       __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_p8(a, b, c, d) \
+#define vmlal_high_lane_s32(a, b, c, d) \
   __extension__ \
     ({ \
-       poly8x16_t c_ = (c); \
-       poly8x16_t a_ = (a); \
-       poly8x16_t result; \
-       __asm__ ("ins %0.b[%2], %3.b[%4]" \
+       int32x2_t c_ = (c); \
+       int32x4_t b_ = (b); \
+       int64x2_t a_ = (a); \
+       int64x2_t result; \
+       __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_p16(a, b, c, d) \
+#define vmlal_high_lane_u16(a, b, c, d) \
   __extension__ \
     ({ \
-       poly16x8_t c_ = (c); \
-       poly16x8_t a_ = (a); \
-       poly16x8_t result; \
-       __asm__ ("ins %0.h[%2], %3.h[%4]" \
+       uint16x4_t c_ = (c); \
+       uint16x8_t b_ = (b); \
+       uint32x4_t a_ = (a); \
+       uint32x4_t result; \
+       __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_s8(a, b, c, d) \
+#define vmlal_high_lane_u32(a, b, c, d) \
   __extension__ \
     ({ \
-       int8x16_t c_ = (c); \
-       int8x16_t a_ = (a); \
-       int8x16_t result; \
-       __asm__ ("ins %0.b[%2], %3.b[%4]" \
+       uint32x2_t c_ = (c); \
+       uint32x4_t b_ = (b); \
+       uint64x2_t a_ = (a); \
+       uint64x2_t result; \
+       __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_s16(a, b, c, d) \
+#define vmlal_high_laneq_s16(a, b, c, d) \
   __extension__ \
     ({ \
        int16x8_t c_ = (c); \
-       int16x8_t a_ = (a); \
-       int16x8_t result; \
-       __asm__ ("ins %0.h[%2], %3.h[%4]" \
-                : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vcopyq_lane_s32(a, b, c, d) \
-  __extension__ \
-    ({ \
-       int32x4_t c_ = (c); \
+       int16x8_t b_ = (b); \
        int32x4_t a_ = (a); \
        int32x4_t result; \
-       __asm__ ("ins %0.s[%2], %3.s[%4]" \
+       __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_s64(a, b, c, d) \
+#define vmlal_high_laneq_s32(a, b, c, d) \
   __extension__ \
     ({ \
-       int64x2_t c_ = (c); \
+       int32x4_t c_ = (c); \
+       int32x4_t b_ = (b); \
        int64x2_t a_ = (a); \
        int64x2_t result; \
-       __asm__ ("ins %0.d[%2], %3.d[%4]" \
-                : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vcopyq_lane_u8(a, b, c, d) \
-  __extension__ \
-    ({ \
-       uint8x16_t c_ = (c); \
-       uint8x16_t a_ = (a); \
-       uint8x16_t result; \
-       __asm__ ("ins %0.b[%2], %3.b[%4]" \
+       __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_u16(a, b, c, d) \
+#define vmlal_high_laneq_u16(a, b, c, d) \
   __extension__ \
     ({ \
        uint16x8_t c_ = (c); \
-       uint16x8_t a_ = (a); \
-       uint16x8_t result; \
-       __asm__ ("ins %0.h[%2], %3.h[%4]" \
-                : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vcopyq_lane_u32(a, b, c, d) \
-  __extension__ \
-    ({ \
-       uint32x4_t c_ = (c); \
+       uint16x8_t b_ = (b); \
        uint32x4_t a_ = (a); \
        uint32x4_t result; \
-       __asm__ ("ins %0.s[%2], %3.s[%4]" \
+       __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcopyq_lane_u64(a, b, c, d) \
+#define vmlal_high_laneq_u32(a, b, c, d) \
   __extension__ \
     ({ \
-       uint64x2_t c_ = (c); \
+       uint32x4_t c_ = (c); \
+       uint32x4_t b_ = (b); \
        uint64x2_t a_ = (a); \
        uint64x2_t result; \
-       __asm__ ("ins %0.d[%2], %3.d[%4]" \
+       __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
                 : "=w"(result) \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcvt_n_f32_s32(a, b) \
-  __extension__ \
-    ({ \
-       int32x2_t a_ = (a); \
-       float32x2_t result; \
-       __asm__ ("scvtf %0.2s, %1.2s, #%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
+{
+  int32x4_t result;
+  __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "x"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvt_n_f32_u32(a, b) \
-  __extension__ \
-    ({ \
-       uint32x2_t a_ = (a); \
-       float32x2_t result; \
-       __asm__ ("ucvtf %0.2s, %1.2s, #%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
+{
+  int64x2_t result;
+  __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvt_n_s32_f32(a, b) \
-  __extension__ \
-    ({ \
-       float32x2_t a_ = (a); \
-       int32x2_t result; \
-       __asm__ ("fcvtzs %0.2s, %1.2s, #%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
+{
+  uint32x4_t result;
+  __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "x"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvt_n_u32_f32(a, b) \
-  __extension__ \
-    ({ \
-       float32x2_t a_ = (a); \
-       uint32x2_t result; \
-       __asm__ ("fcvtzu %0.2s, %1.2s, #%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
+{
+  uint64x2_t result;
+  __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvtd_n_f64_s64(a, b) \
-  __extension__ \
-    ({ \
-       int64_t a_ = (a); \
-       float64_t result; \
-       __asm__ ("scvtf %d0,%d1,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
+{
+  int16x8_t result;
+  __asm__ ("smlal2 %0.8h,%2.16b,%3.16b"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvtd_n_f64_u64(a, b) \
-  __extension__ \
-    ({ \
-       uint64_t a_ = (a); \
-       float64_t result; \
-       __asm__ ("ucvtf %d0,%d1,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
+{
+  int32x4_t result;
+  __asm__ ("smlal2 %0.4s,%2.8h,%3.8h"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvtd_n_s64_f64(a, b) \
-  __extension__ \
-    ({ \
-       float64_t a_ = (a); \
-       int64_t result; \
-       __asm__ ("fcvtzs %d0,%d1,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
+{
+  int64x2_t result;
+  __asm__ ("smlal2 %0.2d,%2.4s,%3.4s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvtd_n_u64_f64(a, b) \
-  __extension__ \
-    ({ \
-       float64_t a_ = (a); \
-       uint64_t result; \
-       __asm__ ("fcvtzu %d0,%d1,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
+{
+  uint16x8_t result;
+  __asm__ ("umlal2 %0.8h,%2.16b,%3.16b"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
+{
+  uint32x4_t result;
+  __asm__ ("umlal2 %0.4s,%2.8h,%3.8h"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
+{
+  uint64x2_t result;
+  __asm__ ("umlal2 %0.2d,%2.4s,%3.4s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}

-#define vcvtq_n_f32_s32(a, b) \
+#define vmlal_lane_s16(a, b, c, d) \
   __extension__ \
     ({ \
+       int16x4_t c_ = (c); \
+       int16x4_t b_ = (b); \
        int32x4_t a_ = (a); \
-       float32x4_t result; \
-       __asm__ ("scvtf %0.4s, %1.4s, #%2" \
+       int32x4_t result; \
+       __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \
                 : "=w"(result) \
-                : "w"(a_), "i"(b) \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
                 : /* No clobbers */); \
        result; \
      })

-#define vcvtq_n_f32_u32(a, b) \
+#define vmlal_lane_s32(a, b, c, d) \
   __extension__ \
     ({ \
-       uint32x4_t a_ = (a); \
-       float32x4_t result; \
-       __asm__ ("ucvtf %0.4s, %1.4s, #%2" \
+       int32x2_t c_ = (c); \
+       int32x2_t b_ = (b); \
+       int64x2_t a_ = 
(a); \ + int64x2_t result; \ + __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_f64_s64(a, b) \ +#define vmlal_lane_u16(a, b, c, d) \ __extension__ \ ({ \ - int64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("scvtf %0.2d, %1.2d, #%2" \ + uint16x4_t c_ = (c); \ + uint16x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_f64_u64(a, b) \ +#define vmlal_lane_u32(a, b, c, d) \ __extension__ \ ({ \ + uint32x2_t c_ = (c); \ + uint32x2_t b_ = (b); \ uint64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("ucvtf %0.2d, %1.2d, #%2" \ + uint64x2_t result; \ + __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_s32_f32(a, b) \ +#define vmlal_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ - float32x4_t a_ = (a); \ + int16x8_t c_ = (c); \ + int16x4_t b_ = (b); \ + int32x4_t a_ = (a); \ int32x4_t result; \ - __asm__ ("fcvtzs %0.4s, %1.4s, #%2" \ + __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_s64_f64(a, b) \ +#define vmlal_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ - float64x2_t a_ = (a); \ + int32x4_t c_ = (c); \ + int32x2_t b_ = (b); \ + int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("fcvtzs %0.2d, %1.2d, #%2" \ + __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_u32_f32(a, b) \ +#define vmlal_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ - float32x4_t a_ = (a); \ + uint16x8_t c_ = (c); \ + uint16x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("fcvtzu %0.4s, %1.4s, #%2" \ + __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_u64_f64(a, b) \ +#define vmlal_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ - float64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \ + uint32x4_t c_ = (c); \ + uint32x2_t b_ = (b); \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vcvts_n_f32_s32(a, b) \ - __extension__ \ - ({ \ - int32_t a_ = (a); \ - float32_t result; \ - __asm__ ("scvtf %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +{ + int32x4_t result; + __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "x"(c) + : /* No clobbers */); + return result; +} -#define vcvts_n_f32_u32(a, b) \ - __extension__ \ - ({ \ - uint32_t a_ = (a); \ - float32_t result; \ - __asm__ ("ucvtf %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +{ + int64x2_t result; + __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vcvts_n_s32_f32(a, b) \ - __extension__ \ - ({ \ - float32_t a_ = (a); \ - int32_t result; \ - __asm__ ("fcvtzs %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +{ + uint32x4_t result; + __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "x"(c) + : /* No clobbers */); + return result; +} -#define vcvts_n_u32_f32(a, b) \ - __extension__ \ - ({ \ - float32_t a_ = (a); \ - uint32_t result; \ - __asm__ ("fcvtzu %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +{ + uint64x2_t result; + __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vcvtx_f32_f64 (float64x2_t a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) { - float32x2_t result; - __asm__ ("fcvtxn %0.2s,%1.2d" + int16x8_t result; + __asm__ ("smlal %0.8h,%2.8b,%3.8b" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) { - float32x4_t result; - __asm__ ("fcvtxn2 %0.4s,%1.2d" + int32x4_t result; + __asm__ ("smlal %0.4s,%2.4h,%3.4h" : "=w"(result) - : "w" (b), "0"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtxd_f32_f64 (float64_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) { - float32_t result; - __asm__ ("fcvtxn %s0,%d1" + int64x2_t result; + __asm__ ("smlal %0.2d,%2.2s,%3.2s" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) { - float32x2_t result; - float32x2_t t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" + uint16x8_t result; + __asm__ ("umlal %0.8h,%2.8b,%3.8b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +{ + uint32x4_t result; + __asm__ ("umlal %0.4s,%2.4h,%3.4h" + : "=w"(result) + : "0"(a), "w"(b), 
"w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +{ + uint64x2_t result; + __asm__ ("umlal %0.2d,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +{ + float32x4_t result; + float32x4_t t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) { - int16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + int16x8_t result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) { - int32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + int32x4_t result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) { - uint16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + uint16x8_t result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) { - uint32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + uint32x4_t result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) { - int8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" + int8x16_t result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) { - int16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" + int16x8_t result; + __asm__ ("mla 
%0.8h, %2.8h, %3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) { - int32x2_t result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" + int32x4_t result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { - uint8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" + uint8x16_t result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) { - uint16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" + uint16x8_t result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) { - uint32x2_t result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" + uint32x4_t result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlal_high_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +{ + float32x2_t result; + float32x2_t t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" + : "=w"(result), "=w"(t1) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vmlal_high_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +{ + int16x4_t result; + 
__asm__ ("mls %0.4h, %2.4h, %3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "x"(c) + : /* No clobbers */); + return result; +} -#define vmlal_high_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +{ + int32x2_t result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +{ + uint16x4_t result; + __asm__ ("mls %0.4h, %2.4h, %3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "x"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +{ + uint32x2_t result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +{ + int8x8_t result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +{ + int16x4_t result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +{ + int32x2_t result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +{ + uint8x8_t result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +{ + uint16x4_t result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +{ + uint32x2_t result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vmlsl_high_lane_s16(a, b, c, d) \ + __extension__ \ + ({ \ + int16x4_t c_ = (c); \ + int16x8_t b_ = (b); \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_high_lane_s32(a, b, c, d) \ + __extension__ \ + ({ \ + int32x2_t c_ = (c); \ + int32x4_t b_ = (b); \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ + : "=w"(result) \ + 
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_high_lane_u16(a, b, c, d) \ + __extension__ \ + ({ \ + uint16x4_t c_ = (c); \ + uint16x8_t b_ = (b); \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_high_lane_u32(a, b, c, d) \ + __extension__ \ + ({ \ + uint32x2_t c_ = (c); \ uint32x4_t b_ = (b); \ uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_s16(a, b, c, d) \ +#define vmlsl_high_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ int32x4_t a_ = (a); \ int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_s32(a, b, c, d) \ +#define vmlsl_high_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_u16(a, b, c, d) \ +#define vmlsl_high_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_u32(a, b, c, d) \ +#define vmlsl_high_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) { int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) { int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) { uint32x4_t result; - __asm__ 
("umlal2 %0.4s,%2.8h,%3.h[0]" + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) { uint64x2_t result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) { int16x8_t result; - __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" + __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) { int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" + __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) { int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" + __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) { uint16x8_t result; - __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" + __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) { uint32x4_t result; - __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" + __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) { uint64x2_t result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" + __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlal_lane_s16(a, b, c, d) \ 
+#define vmlsl_lane_s16(a, b, c, d) \ __extension__ \ ({ \ int16x4_t c_ = (c); \ int16x4_t b_ = (b); \ int32x4_t a_ = (a); \ int32x4_t result; \ - __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \ + __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_lane_s32(a, b, c, d) \ +#define vmlsl_lane_s32(a, b, c, d) \ __extension__ \ ({ \ int32x2_t c_ = (c); \ int32x2_t b_ = (b); \ int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ + __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_lane_u16(a, b, c, d) \ +#define vmlsl_lane_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x4_t c_ = (c); \ uint16x4_t b_ = (b); \ uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ + __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_lane_u32(a, b, c, d) \ +#define vmlsl_lane_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x2_t c_ = (c); \ uint32x2_t b_ = (b); \ uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_s16(a, b, c, d) \ +#define vmlsl_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x4_t b_ = (b); \ int32x4_t a_ = (a); \ int32x4_t result; \ - __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_s32(a, b, c, d) \ +#define vmlsl_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x2_t b_ = (b); \ int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_u16(a, b, c, d) \ +#define vmlsl_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x4_t b_ = (b); \ uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_u32(a, b, c, d) \ +#define vmlsl_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x2_t b_ = (b); \ uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) { int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" + __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +__extension__ extern __inline int64x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) { int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" + __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) { uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" + __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) { uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" + __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) { int16x8_t result; - __asm__ ("smlal %0.8h,%2.8b,%3.8b" + __asm__ ("smlsl %0.8h, %2.8b, %3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) { int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.4h" + __asm__ ("smlsl %0.4s, %2.4h, %3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) { int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.2s" + __asm__ ("smlsl %0.2d, %2.2s, %3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) { uint16x8_t result; - __asm__ ("umlal %0.8h,%2.8b,%3.8b" + __asm__ ("umlsl %0.8h, %2.8b, %3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) { uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.4h" + __asm__ ("umlsl %0.4s, %2.4h, %3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return 
result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) { uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.2s" + __asm__ ("umlsl %0.2d, %2.2s, %3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { float32x4_t result; float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) { int16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) { int32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) { uint16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "x"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) { uint32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) { int8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" + __asm__ ("mls %0.16b,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +__extension__ extern __inline int16x8_t 
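/* [Editor's aside -- illustrative only, not part of the patch.]
   Semantically, vmlaq_n_s16 (a, b, c) computes a + b*c in each 16-bit
   lane (MLA), while vmlsq_n_s16 computes a - b*c (MLS).  Note the "x"
   input constraint on the 16-bit scalar in these bodies: the by-element
   forms such as `mla v0.8h, v1.8h, v2.h[0]` can only encode V0-V15 for
   the indexed operand, and "x" is GCC's constraint for that restricted
   SIMD register class ("w" allows all of V0-V31).  A small usage
   sketch, assuming an AArch64 target with <arm_neon.h>:

   #include <arm_neon.h>

   static inline int16x8_t
   scale_accumulate (int16x8_t acc, int16x8_t v, int16_t s)
   {
     return vmlaq_n_s16 (acc, v, s);   -- acc + v*s per lane
   }

   static inline int16x8_t
   scale_subtract (int16x8_t acc, int16x8_t v, int16_t s)
   {
     return vmlsq_n_s16 (acc, v, s);   -- acc - v*s per lane
   }
*/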
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) { int16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" + __asm__ ("mls %0.8h,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) { int32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" + __asm__ ("mls %0.4s,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { uint8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" + __asm__ ("mls %0.16b,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) { uint16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" + __asm__ ("mls %0.8h,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) { uint32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" + __asm__ ("mls %0.4s,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_s8 (int8x16_t a) { - float32x2_t result; - float32x2_t t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + int16x8_t result; + __asm__ ("sshll2 %0.8h,%1.16b,#0" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_s16 (int16x8_t a) { - int16x4_t result; - __asm__ ("mls %0.4h, %2.4h, %3.h[0]" + int32x4_t result; + __asm__ ("sshll2 %0.4s,%1.8h,#0" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_s32 (int32x4_t a) { - int32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + int64x2_t result; + __asm__ 
("sshll2 %0.2d,%1.4s,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_u8 (uint8x16_t a) { - uint16x4_t result; - __asm__ ("mls %0.4h, %2.4h, %3.h[0]" + uint16x8_t result; + __asm__ ("ushll2 %0.8h,%1.16b,#0" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_u16 (uint16x8_t a) { - uint32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + uint32x4_t result; + __asm__ ("ushll2 %0.4s,%1.8h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_high_u32 (uint32x4_t a) { - int8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" + uint64x2_t result; + __asm__ ("ushll2 %0.2d,%1.4s,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_s8 (int8x8_t a) { - int16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" + int16x8_t result; + __asm__ ("sshll %0.8h,%1.8b,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_s16 (int16x4_t a) { - int32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" + int32x4_t result; + __asm__ ("sshll %0.4s,%1.4h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_s32 (int32x2_t a) { - uint8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" + int64x2_t result; + __asm__ ("sshll %0.2d,%1.2s,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_u8 (uint8x8_t a) { - uint16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" + uint16x8_t result; + __asm__ ("ushll %0.8h,%1.8b,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) 
+__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_u16 (uint16x4_t a) { - uint32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" + uint32x4_t result; + __asm__ ("ushll %0.4s,%1.4h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -#define vmlsl_high_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_u32 (uint32x2_t a) +{ + uint64x2_t result; + __asm__ ("ushll %0.2d,%1.2s,#0" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vmlsl_high_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_s16 (int8x8_t a, int16x8_t b) +{ + int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.16b,%1.8h" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -#define vmlsl_high_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_s32 (int16x4_t a, int32x4_t b) +{ + int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.8h,%1.4s" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -#define vmlsl_high_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_s64 (int32x2_t a, int64x2_t b) +{ + int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.4s,%1.2d" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -#define vmlsl_high_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_u16 (uint8x8_t a, uint16x8_t b) +{ + uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.16b,%1.8h" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -#define vmlsl_high_laneq_s32(a, b, c, d) \ 
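/* [Editor's aside -- illustrative only, not part of the patch.]
   The new vmovn_high_* wrappers build the full-width result up front
   with vcombine_*/vcreate_* and then let XTN2 overwrite the upper
   half, so a split/narrow/recombine pipeline collapses to two
   intrinsic calls.  Sketch, assuming <arm_neon.h>:

   #include <arm_neon.h>

   static inline int16x8_t
   narrow_s32x8 (int32x4_t lo, int32x4_t hi)
   {
     int16x4_t l = vmovn_s32 (lo);       -- xtn  v.4h, v.4s
     return vmovn_high_s32 (l, hi);      -- xtn2 v.8h, v.4s
   }
*/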
- __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) -{ - int32x4_t result; - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) -{ - int64x2_t result; - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_u32 (uint16x4_t a, uint32x4_t b) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.8h,%1.4s" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_high_u64 (uint32x2_t a, uint64x2_t b) { - uint64x2_t result; - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.4s,%1.2d" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_s16 (int16x8_t a) { - int16x8_t result; - __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" + int8x8_t result; + __asm__ ("xtn %0.8b,%1.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_s32 (int32x4_t a) { - int32x4_t result; - __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" + int16x4_t result; + __asm__ ("xtn %0.4h,%1.4s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); 
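/* [Editor's aside -- illustrative only, not part of the patch.]
   Note the "+w" (read-write) output constraint in the vmovn_high_*
   bodies above: XTN2 writes only the upper half of its destination,
   so the asm must declare the register as an input as well, otherwise
   GCC could discard the vcombine_* initialisation of the low half as
   dead.  The pattern, reduced to a stand-alone sketch that mirrors
   the vmovn_high_s64 body from the patch:

   #include <arm_neon.h>

   static inline int32x4_t
   narrow_high_sketch (int32x2_t lo, int64x2_t hi)
   {
     int32x4_t result = vcombine_s32 (lo, vcreate_s32 (0));
     __asm__ ("xtn2 %0.4s,%1.2d"
              : "+w"(result)    -- read-write: low half must survive
              : "w"(hi));
     return result;
   }
*/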
return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_s64 (int64x2_t a) { - int64x2_t result; - __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" + int32x2_t result; + __asm__ ("xtn %0.2s,%1.2d" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_u16 (uint16x8_t a) { - uint16x8_t result; - __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" + uint8x8_t result; + __asm__ ("xtn %0.8b,%1.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_u32 (uint32x4_t a) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" + uint16x4_t result; + __asm__ ("xtn %0.4h,%1.4s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovn_u64 (uint64x2_t a) { - uint64x2_t result; - __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" + uint32x2_t result; + __asm__ ("xtn %0.2s,%1.2d" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -#define vmlsl_lane_s16(a, b, c, d) \ +#define vmull_high_lane_s16(a, b, c) \ __extension__ \ ({ \ - int16x4_t c_ = (c); \ int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ + int16x8_t a_ = (a); \ int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : "w"(a_), "x"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_lane_s32(a, b, c, d) \ +#define vmull_high_lane_s32(a, b, c) \ __extension__ \ ({ \ - int32x2_t c_ = (c); \ int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ + int32x4_t a_ = (a); \ int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_lane_u16(a, b, c, d) \ +#define vmull_high_lane_u16(a, b, c) \ __extension__ \ ({ \ - uint16x4_t c_ = (c); \ uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ + uint16x8_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : "w"(a_), "x"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_lane_u32(a, b, c, d) \ +#define vmull_high_lane_u32(a, b, c) \ __extension__ \ ({ \ - uint32x2_t c_ = (c); \ uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ + uint32x4_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), 
"w"(c_), "i"(d) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_laneq_s16(a, b, c, d) \ +#define vmull_high_laneq_s16(a, b, c) \ __extension__ \ ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ + int16x8_t b_ = (b); \ + int16x8_t a_ = (a); \ int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : "w"(a_), "x"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_laneq_s32(a, b, c, d) \ +#define vmull_high_laneq_s32(a, b, c) \ __extension__ \ ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ + int32x4_t b_ = (b); \ + int32x4_t a_ = (a); \ int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_laneq_u16(a, b, c, d) \ +#define vmull_high_laneq_u16(a, b, c) \ __extension__ \ ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ + uint16x8_t b_ = (b); \ + uint16x8_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ + __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ + : "w"(a_), "x"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_laneq_u32(a, b, c, d) \ +#define vmull_high_laneq_u32(a, b, c) \ __extension__ \ ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ + uint32x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ + __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_s16 (int16x8_t a, int16_t b) { int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" + __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_s32 (int32x4_t a, int32_t b) { int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" + __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_u16 (uint16x8_t a, uint16_t b) { uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" + __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +__extension__ extern __inline uint64x2_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_u32 (uint32x4_t a, uint32_t b) { uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" + __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_p8 (poly8x16_t a, poly8x16_t b) { - int16x8_t result; - __asm__ ("smlsl %0.8h, %2.8b, %3.8b" + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s8 (int8x16_t a, int8x16_t b) { - int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.4h" + int16x8_t result; + __asm__ ("smull2 %0.8h,%1.16b,%2.16b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s16 (int16x8_t a, int16x8_t b) { - int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.2s" + int32x4_t result; + __asm__ ("smull2 %0.4s,%1.8h,%2.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s32 (int32x4_t a, int32x4_t b) { - uint16x8_t result; - __asm__ ("umlsl %0.8h, %2.8b, %3.8b" + int64x2_t result; + __asm__ ("smull2 %0.2d,%1.4s,%2.4s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_u8 (uint8x16_t a, uint8x16_t b) { - uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.4h" + uint16x8_t result; + __asm__ ("umull2 %0.8h,%1.16b,%2.16b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_u16 (uint16x8_t a, uint16x8_t b) { - uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.2s" + uint32x4_t result; + __asm__ ("umull2 %0.4s,%1.8h,%2.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vmull_high_u32 (uint32x4_t a, uint32x4_t b) { - float32x4_t result; - float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + uint64x2_t result; + __asm__ ("umull2 %0.2d,%1.4s,%2.4s" + : "=w"(result) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +#define vmull_lane_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ + : "=w"(result) \ + : "w"(a_), "x"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_lane_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x2_t b_ = (b); \ + int32x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_lane_u16(a, b, c) \ + __extension__ \ + ({ \ + uint16x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ + : "=w"(result) \ + : "w"(a_), "x"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_lane_u32(a, b, c) \ + __extension__ \ + ({ \ + uint32x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_laneq_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x8_t b_ = (b); \ + int16x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ + : "=w"(result) \ + : "w"(a_), "x"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_laneq_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x4_t b_ = (b); \ + int32x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_laneq_u16(a, b, c) \ + __extension__ \ + ({ \ + uint16x8_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ + : "=w"(result) \ + : "w"(a_), "x"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmull_laneq_u32(a, b, c) \ + __extension__ \ + ({ \ + uint32x4_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_s16 (int16x4_t a, int16_t b) { - int16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" + int32x4_t result; + __asm__ ("smull %0.4s,%1.4h,%2.h[0]" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_s32 (int32x2_t a, int32_t b) { - int32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" + int64x2_t result; + __asm__ ("smull %0.2d,%1.2s,%2.s[0]" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static 
__inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_u16 (uint16x4_t a, uint16_t b) { - uint16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" + uint32x4_t result; + __asm__ ("umull %0.4s,%1.4h,%2.h[0]" : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_u32 (uint32x2_t a, uint32_t b) { - uint32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" + uint64x2_t result; + __asm__ ("umull %0.2d,%1.2s,%2.s[0]" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_p8 (poly8x8_t a, poly8x8_t b) { - int8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" + poly16x8_t result; + __asm__ ("pmull %0.8h, %1.8b, %2.8b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s8 (int8x8_t a, int8x8_t b) { int16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" + __asm__ ("smull %0.8h, %1.8b, %2.8b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s16 (int16x4_t a, int16x4_t b) { int32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" + __asm__ ("smull %0.4s, %1.4h, %2.4h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s32 (int32x2_t a, int32x2_t b) { - uint8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" + int64x2_t result; + __asm__ ("smull %0.2d, %1.2s, %2.2s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_u8 (uint8x8_t a, uint8x8_t b) { uint16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" + __asm__ ("umull %0.8h, %1.8b, %2.8b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +__extension__ extern 
__inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_u16 (uint16x4_t a, uint16x4_t b) { uint32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" + __asm__ ("umull %0.4s, %1.4h, %2.4h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovl_high_s8 (int8x16_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_u32 (uint32x2_t a, uint32x2_t b) { - int16x8_t result; - __asm__ ("sshll2 %0.8h,%1.16b,#0" + uint64x2_t result; + __asm__ ("umull %0.2d, %1.2s, %2.2s" : "=w"(result) - : "w"(a) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovl_high_s16 (int16x8_t a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s8 (int16x4_t a, int8x8_t b) { - int32x4_t result; - __asm__ ("sshll2 %0.4s,%1.8h,#0" + int16x4_t result; + __asm__ ("sadalp %0.4h,%2.8b" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmovl_high_s32 (int32x4_t a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s16 (int32x2_t a, int16x4_t b) { - int64x2_t result; - __asm__ ("sshll2 %0.2d,%1.4s,#0" + int32x2_t result; + __asm__ ("sadalp %0.2s,%2.4h" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmovl_high_u8 (uint8x16_t a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s32 (int64x1_t a, int32x2_t b) { - uint16x8_t result; - __asm__ ("ushll2 %0.8h,%1.16b,#0" + int64x1_t result; + __asm__ ("sadalp %0.1d,%2.2s" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmovl_high_u16 (uint16x8_t a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u8 (uint16x4_t a, uint8x8_t b) { - uint32x4_t result; - __asm__ ("ushll2 %0.4s,%1.8h,#0" + uint16x4_t result; + __asm__ ("uadalp %0.4h,%2.8b" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmovl_high_u32 (uint32x4_t a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u16 (uint32x2_t a, uint16x4_t b) { - uint64x2_t result; - __asm__ ("ushll2 %0.2d,%1.4s,#0" + uint32x2_t result; + __asm__ ("uadalp %0.2s,%2.4h" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovl_s8 (int8x8_t a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u32 (uint64x1_t a, uint32x2_t b) +{ + uint64x1_t result; + __asm__ ("uadalp %0.1d,%2.2s" + : "=w"(result) + : "0"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s8 
(int16x8_t a, int8x16_t b) { int16x8_t result; - __asm__ ("sshll %0.8h,%1.8b,#0" + __asm__ ("sadalp %0.8h,%2.16b" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovl_s16 (int16x4_t a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s16 (int32x4_t a, int16x8_t b) { int32x4_t result; - __asm__ ("sshll %0.4s,%1.4h,#0" + __asm__ ("sadalp %0.4s,%2.8h" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmovl_s32 (int32x2_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s32 (int64x2_t a, int32x4_t b) { int64x2_t result; - __asm__ ("sshll %0.2d,%1.2s,#0" + __asm__ ("sadalp %0.2d,%2.4s" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmovl_u8 (uint8x8_t a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u8 (uint16x8_t a, uint8x16_t b) { uint16x8_t result; - __asm__ ("ushll %0.8h,%1.8b,#0" + __asm__ ("uadalp %0.8h,%2.16b" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmovl_u16 (uint16x4_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u16 (uint32x4_t a, uint16x8_t b) { uint32x4_t result; - __asm__ ("ushll %0.4s,%1.4h,#0" + __asm__ ("uadalp %0.4s,%2.8h" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmovl_u32 (uint32x2_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u32 (uint64x2_t a, uint32x4_t b) { uint64x2_t result; - __asm__ ("ushll %0.2d,%1.2s,#0" + __asm__ ("uadalp %0.2d,%2.4s" : "=w"(result) - : "w"(a) + : "0"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmovn_high_s16 (int8x8_t a, int16x8_t b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddl_s8 (int8x8_t a) { - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) + int16x4_t result; + __asm__ ("saddlp %0.4h,%1.8b" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovn_high_s32 (int16x4_t a, int32x4_t b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddl_s16 (int16x4_t a) { - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) + int32x2_t result; + __asm__ ("saddlp %0.2s,%1.4h" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovn_high_s64 (int32x2_t a, int64x2_t b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vpaddl_s32 (int32x2_t a) { - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) + int64x1_t result; + __asm__ ("saddlp %0.1d,%1.2s" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmovn_high_u16 (uint8x8_t a, uint16x8_t b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddl_u8 (uint8x8_t a) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) + uint16x4_t result; + __asm__ ("uaddlp %0.4h,%1.8b" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmovn_high_u32 (uint16x4_t a, uint32x4_t b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddl_u16 (uint16x4_t a) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) + uint32x2_t result; + __asm__ ("uaddlp %0.2s,%1.4h" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmovn_high_u64 (uint32x2_t a, uint64x2_t b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddl_u32 (uint32x2_t a) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) + uint64x1_t result; + __asm__ ("uaddlp %0.1d,%1.2s" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmovn_s16 (int16x8_t a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_s8 (int8x16_t a) { - int8x8_t result; - __asm__ ("xtn %0.8b,%1.8h" + int16x8_t result; + __asm__ ("saddlp %0.8h,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmovn_s32 (int32x4_t a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_s16 (int16x8_t a) { - int16x4_t result; - __asm__ ("xtn %0.4h,%1.4s" + int32x4_t result; + __asm__ ("saddlp %0.4s,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmovn_s64 (int64x2_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_s32 (int32x4_t a) { - int32x2_t result; - __asm__ ("xtn %0.2s,%1.2d" + int64x2_t result; + __asm__ ("saddlp %0.2d,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmovn_u16 (uint16x8_t a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_u8 (uint8x16_t a) { - uint8x8_t result; - __asm__ ("xtn %0.8b,%1.8h" + uint16x8_t result; + __asm__ ("uaddlp %0.8h,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ 
((__always_inline__)) -vmovn_u32 (uint32x4_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_u16 (uint16x8_t a) { - uint16x4_t result; - __asm__ ("xtn %0.4h,%1.4s" + uint32x4_t result; + __asm__ ("uaddlp %0.4s,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmovn_u64 (uint64x2_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddlq_u32 (uint32x4_t a) { - uint32x2_t result; - __asm__ ("xtn %0.2s,%1.2d" + uint64x2_t result; + __asm__ ("uaddlp %0.2d,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_s8 (int8x16_t a, int8x16_t b) { - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" + int8x16_t result; + __asm__ ("addp %0.16b,%1.16b,%2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_s16 (int16x8_t a, int16x8_t b) +{ + int16x8_t result; + __asm__ ("addp %0.8h,%1.8h,%2.8h" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_s32 (int32x4_t a, int32x4_t b) +{ + int32x4_t result; + __asm__ ("addp %0.4s,%1.4s,%2.4s" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_s64 (int64x2_t a, int64x2_t b) +{ + int64x2_t result; + __asm__ ("addp %0.2d,%1.2d,%2.2d" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_u8 (uint8x16_t a, uint8x16_t b) +{ + uint8x16_t result; + __asm__ ("addp %0.16b,%1.16b,%2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_u16 (uint16x8_t a, uint16x8_t b) +{ + uint16x8_t result; + __asm__ ("addp %0.8h,%1.8h,%2.8h" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_u32 (uint32x4_t a, uint32x4_t b) +{ + uint32x4_t result; + __asm__ ("addp %0.4s,%1.4s,%2.4s" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_u64 (uint64x2_t a, uint64x2_t b) +{ + uint64x2_t result; + __asm__ ("addp %0.2d,%1.2d,%2.2d" : "=w"(result) : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_n_s16 (int16x4_t a, int16_t b) { int16x4_t 
result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" + __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]" : "=w"(result) : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_n_s32 (int32x2_t a, int32_t b) { int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" + __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]" : "=w"(result) : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_n_s16 (int16x8_t a, int16_t b) { - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" + int16x8_t result; + __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]" : "=w"(result) : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_n_s32 (int32x4_t a, int32_t b) { - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" + int32x4_t result; + __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]" : "=w"(result) : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vmull_high_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_s16 (int8x8_t a, int16x8_t b) +{ + int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.16b, %1.8h" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -#define vmull_high_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u16(a, b, c) \ - 
__extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_s32 (int16x4_t a, int32x4_t b) +{ + int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.8h, %1.4s" + : "+w"(result) + : "w"(b) + : /* No clobbers */); + return result; +} -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_s16 (int16x8_t a, int16_t b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_s64 (int32x2_t a, int64x2_t b) { - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.4s, %1.2d" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_s32 (int32x4_t a, int32_t b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_u16 (uint8x8_t a, uint16x8_t b) { - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.16b, %1.8h" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_u16 (uint16x8_t a, uint16_t b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_u32 (uint16x4_t a, uint32x4_t b) { - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.8h, %1.4s" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_u32 (uint32x4_t a, uint32_t b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_high_u64 (uint32x2_t a, uint64x2_t b) { - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.4s, %1.2d" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_high_p8 (poly8x16_t a, poly8x16_t b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_high_s16 (uint8x8_t a, int16x8_t b) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 
%0.16b, %1.8h" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_high_s8 (int8x16_t a, int8x16_t b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_high_s32 (uint16x4_t a, int32x4_t b) { - int16x8_t result; - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 %0.8h, %1.4s" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_s16 (int16x8_t a, int16x8_t b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_high_s64 (uint32x2_t a, int64x2_t b) { - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 %0.4s, %1.2d" + : "+w"(result) + : "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_s32 (int32x4_t a, int32x4_t b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_n_s16 (int16x4_t a, int16_t b) { - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" + int16x4_t result; + __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]" : "=w"(result) - : "w"(a), "w"(b) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_high_u8 (uint8x16_t a, uint8x16_t b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_n_s32 (int32x2_t a, int32_t b) { - uint16x8_t result; - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" + int32x2_t result; + __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]" : "=w"(result) : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_u16 (uint16x8_t a, uint16x8_t b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_n_s16 (int16x8_t a, int16_t b) { - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" + int16x8_t result; + __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]" : "=w"(result) - : "w"(a), "w"(b) + : "w"(a), "x"(b) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_u32 (uint32x4_t a, uint32x4_t b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_n_s32 (int32x4_t a, int32_t b) { - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" + int32x4_t result; + __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]" : "=w"(result) : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vmull_lane_s16(a, b, c) \ +#define vqrshrn_high_n_s16(a, b, c) \ __extension__ \ ({ \ - int16x4_t b_ = (b); \ + int16x8_t b_ = (b); \ + int8x8_t a_ = (a); \ + int8x16_t result = vcombine_s8 \ + (a_, vcreate_s8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqrshrn_high_n_s32(a, b, c) \ + __extension__ \ + ({ \ + 
int32x4_t b_ = (b); \ int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ + int16x8_t result = vcombine_s16 \ + (a_, vcreate_s16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_lane_s32(a, b, c) \ +#define vqrshrn_high_n_s64(a, b, c) \ __extension__ \ ({ \ - int32x2_t b_ = (b); \ + int64x2_t b_ = (b); \ int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + int32x4_t result = vcombine_s32 \ + (a_, vcreate_s32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_lane_u16(a, b, c) \ +#define vqrshrn_high_n_u16(a, b, c) \ __extension__ \ ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ + uint16x8_t b_ = (b); \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_lane_u32(a, b, c) \ +#define vqrshrn_high_n_u32(a, b, c) \ __extension__ \ ({ \ - uint32x2_t b_ = (b); \ + uint32x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqrshrn_high_n_u64(a, b, c) \ + __extension__ \ + ({ \ + uint64x2_t b_ = (b); \ uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_laneq_s16(a, b, c) \ +#define vqrshrun_high_n_s16(a, b, c) \ __extension__ \ ({ \ int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_laneq_s32(a, b, c) \ +#define vqrshrun_high_n_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqrshrun_high_n_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqshrn_high_n_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x8_t b_ = (b); \ + int8x8_t a_ = (a); \ + int8x16_t result = vcombine_s8 \ + (a_, 
vcreate_s8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqshrn_high_n_s32(a, b, c) \ __extension__ \ ({ \ int32x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int16x8_t result = vcombine_s16 \ + (a_, vcreate_s16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vqshrn_high_n_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x2_t b_ = (b); \ int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + int32x4_t result = vcombine_s32 \ + (a_, vcreate_s32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_laneq_u16(a, b, c) \ +#define vqshrn_high_n_u16(a, b, c) \ __extension__ \ ({ \ uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vmull_laneq_u32(a, b, c) \ +#define vqshrn_high_n_u32(a, b, c) \ __extension__ \ ({ \ uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_n_s16 (int16x4_t a, int16_t b) -{ - int32x4_t result; - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} +#define vqshrn_high_n_u64(a, b, c) \ + __extension__ \ + ({ \ + uint64x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_n_s32 (int32x2_t a, int32_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vqshrun_high_n_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x8_t b_ = (b); \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_n_u16 (uint16x4_t a, uint16_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} +#define vqshrun_high_n_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ 
+ __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_n_u32 (uint32x2_t a, uint32_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vqshrun_high_n_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_p8 (poly8x8_t a, poly8x8_t b) -{ - poly16x8_t result; - __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x8_t b_ = (b); \ + int8x8_t a_ = (a); \ + int8x16_t result = vcombine_s8 \ + (a_, vcreate_s8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_s8 (int8x8_t a, int8x8_t b) -{ - int16x8_t result; - __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_s16 (int16x4_t a, int16x4_t b) -{ - int32x4_t result; - __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_s32 (int32x2_t a, int32x2_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_u8 (uint8x8_t a, uint8x8_t b) -{ - uint16x8_t result; - __asm__ ("umull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int16x8_t result = vcombine_s16 \ + (a_, vcreate_s16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_u16 (uint16x4_t a, uint16x4_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x2_t b_ = (b); \ + int32x2_t a_ = (a); \ + int32x4_t result = vcombine_s32 \ + (a_, vcreate_s32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_u32 (uint32x2_t a, uint32x2_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_u16(a, b, c) \ + __extension__ \ + 
({ \ + uint16x8_t b_ = (b); \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_n_f32 (float32x4_t a, float32_t b) -{ - float32x4_t result; - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_u32(a, b, c) \ + __extension__ \ + ({ \ + uint32x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_n_f64 (float64x2_t a, float64_t b) -{ - float64x2_t result; - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_high_n_u64(a, b, c) \ + __extension__ \ + ({ \ + uint64x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_n_s16 (int16x8_t a, int16_t b) -{ - int16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_n_s16(a, b) \ + __extension__ \ + ({ \ + int16x8_t a_ = (a); \ + int8x8_t result; \ + __asm__ ("rshrn %0.8b,%1.8h,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_n_s32 (int32x4_t a, int32_t b) -{ - int32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_n_s32(a, b) \ + __extension__ \ + ({ \ + int32x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("rshrn %0.4h,%1.4s,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_n_u16 (uint16x8_t a, uint16_t b) -{ - uint16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_n_s64(a, b) \ + __extension__ \ + ({ \ + int64x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("rshrn %0.2s,%1.2d,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_n_u32 (uint32x4_t a, uint32_t b) -{ - uint32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vrshrn_n_u16(a, b) \ + __extension__ \ + ({ \ + uint16x8_t a_ = (a); \ + uint8x8_t result; \ + __asm__ ("rshrn %0.8b,%1.8h,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vmvn_p8 (poly8x8_t a) -{ - poly8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No 
clobbers */); - return result; -} +#define vrshrn_n_u32(a, b) \ + __extension__ \ + ({ \ + uint32x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("rshrn %0.4h,%1.4s,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmvn_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vrshrn_n_u64(a, b) \ + __extension__ \ + ({ \ + uint64x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("rshrn %0.2s,%1.2d,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmvn_s16 (int16x4_t a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrte_u32 (uint32x2_t a) { - int16x4_t result; - __asm__ ("mvn %0.8b,%1.8b" + uint32x2_t result; + __asm__ ("ursqrte %0.2s,%1.2s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmvn_s32 (int32x2_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrteq_u32 (uint32x4_t a) { - int32x2_t result; - __asm__ ("mvn %0.8b,%1.8b" + uint32x4_t result; + __asm__ ("ursqrte %0.4s,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmvn_u8 (uint8x8_t a) -{ +#define vshrn_high_n_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x8_t b_ = (b); \ + int8x8_t a_ = (a); \ + int8x16_t result = vcombine_s8 \ + (a_, vcreate_s8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_high_n_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int16x8_t result = vcombine_s16 \ + (a_, vcreate_s16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_high_n_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x2_t b_ = (b); \ + int32x2_t a_ = (a); \ + int32x4_t result = vcombine_s32 \ + (a_, vcreate_s32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_high_n_u16(a, b, c) \ + __extension__ \ + ({ \ + uint16x8_t b_ = (b); \ + uint8x8_t a_ = (a); \ + uint8x16_t result = vcombine_u8 \ + (a_, vcreate_u8 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_high_n_u32(a, b, c) \ + __extension__ \ + ({ \ + uint32x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x8_t result = vcombine_u16 \ + (a_, vcreate_u16 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_high_n_u64(a, b, c) \ + __extension__ \ + ({ \ + uint64x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; 
\ + }) + +#define vshrn_n_s16(a, b) \ + __extension__ \ + ({ \ + int16x8_t a_ = (a); \ + int8x8_t result; \ + __asm__ ("shrn %0.8b,%1.8h,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_n_s32(a, b) \ + __extension__ \ + ({ \ + int32x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("shrn %0.4h,%1.4s,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_n_s64(a, b) \ + __extension__ \ + ({ \ + int64x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("shrn %0.2s,%1.2d,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_n_u16(a, b) \ + __extension__ \ + ({ \ + uint16x8_t a_ = (a); \ + uint8x8_t result; \ + __asm__ ("shrn %0.8b,%1.8h,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_n_u32(a, b) \ + __extension__ \ + ({ \ + uint32x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("shrn %0.4h,%1.4s,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vshrn_n_u64(a, b) \ + __extension__ \ + ({ \ + uint64x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("shrn %0.2s,%1.2d,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsli_n_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x8_t b_ = (b); \ + poly8x8_t a_ = (a); \ + poly8x8_t result; \ + __asm__ ("sli %0.8b,%2.8b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsli_n_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x4_t b_ = (b); \ + poly16x4_t a_ = (a); \ + poly16x4_t result; \ + __asm__ ("sli %0.4h,%2.4h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsliq_n_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ + poly8x16_t result; \ + __asm__ ("sli %0.16b,%2.16b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsliq_n_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ + poly16x8_t result; \ + __asm__ ("sli %0.8h,%2.8h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsri_n_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x8_t b_ = (b); \ + poly8x8_t a_ = (a); \ + poly8x8_t result; \ + __asm__ ("sri %0.8b,%2.8b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsri_n_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x4_t b_ = (b); \ + poly16x4_t a_ = (a); \ + poly16x4_t result; \ + __asm__ ("sri %0.4h,%2.4h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsri_n_p64(a, b, c) \ + __extension__ \ + ({ \ + poly64x1_t b_ = (b); \ + poly64x1_t a_ = (a); \ + poly64x1_t result; \ + __asm__ ("sri %d0,%d2,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers. 
*/); \ + result; \ + }) + +#define vsriq_n_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ + poly8x16_t result; \ + __asm__ ("sri %0.16b,%2.16b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsriq_n_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ + poly16x8_t result; \ + __asm__ ("sri %0.8h,%2.8h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vsriq_n_p64(a, b, c) \ + __extension__ \ + ({ \ + poly64x2_t b_ = (b); \ + poly64x2_t a_ = (a); \ + poly64x2_t result; \ + __asm__ ("sri %0.2d,%2.2d,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers. */); \ + result; \ + }) + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p8 (poly8x8_t a, poly8x8_t b) +{ + uint8x8_t result; + __asm__ ("cmtst %0.8b, %1.8b, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p16 (poly16x4_t a, poly16x4_t b) +{ + uint16x4_t result; + __asm__ ("cmtst %0.4h, %1.4h, %2.4h" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p8 (poly8x16_t a, poly8x16_t b) +{ + uint8x16_t result; + __asm__ ("cmtst %0.16b, %1.16b, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p16 (poly16x8_t a, poly16x8_t b) +{ + uint16x8_t result; + __asm__ ("cmtst %0.8h, %1.8h, %2.8h" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +/* End of temporary inline asm implementations. */ + +/* Start of temporary inline asm for vldn, vstn and friends. */ + +/* Create struct element types for duplicating loads. + + Create 2 element structures of: + + +------+----+----+----+----+ + | | 8 | 16 | 32 | 64 | + +------+----+----+----+----+ + |int | Y | Y | N | N | + +------+----+----+----+----+ + |uint | Y | Y | N | N | + +------+----+----+----+----+ + |float | - | Y | N | N | + +------+----+----+----+----+ + |poly | Y | Y | - | - | + +------+----+----+----+----+ + + Create 3 element structures of: + + +------+----+----+----+----+ + | | 8 | 16 | 32 | 64 | + +------+----+----+----+----+ + |int | Y | Y | Y | Y | + +------+----+----+----+----+ + |uint | Y | Y | Y | Y | + +------+----+----+----+----+ + |float | - | Y | Y | Y | + +------+----+----+----+----+ + |poly | Y | Y | - | - | + +------+----+----+----+----+ + + Create 4 element structures of: + + +------+----+----+----+----+ + | | 8 | 16 | 32 | 64 | + +------+----+----+----+----+ + |int | Y | N | N | Y | + +------+----+----+----+----+ + |uint | Y | N | N | Y | + +------+----+----+----+----+ + |float | - | N | N | Y | + +------+----+----+----+----+ + |poly | Y | N | - | - | + +------+----+----+----+----+ + + This is required for casting memory reference. */ +#define __STRUCTN(t, sz, nelem) \ + typedef struct t ## sz ## x ## nelem ## _t { \ + t ## sz ## _t val[nelem]; \ + } t ## sz ## x ## nelem ## _t; + +/* 2-element structs. 
  */
+__STRUCTN (int, 8, 2)
+__STRUCTN (int, 16, 2)
+__STRUCTN (uint, 8, 2)
+__STRUCTN (uint, 16, 2)
+__STRUCTN (float, 16, 2)
+__STRUCTN (poly, 8, 2)
+__STRUCTN (poly, 16, 2)
+/* 3-element structs.  */
+__STRUCTN (int, 8, 3)
+__STRUCTN (int, 16, 3)
+__STRUCTN (int, 32, 3)
+__STRUCTN (int, 64, 3)
+__STRUCTN (uint, 8, 3)
+__STRUCTN (uint, 16, 3)
+__STRUCTN (uint, 32, 3)
+__STRUCTN (uint, 64, 3)
+__STRUCTN (float, 16, 3)
+__STRUCTN (float, 32, 3)
+__STRUCTN (float, 64, 3)
+__STRUCTN (poly, 8, 3)
+__STRUCTN (poly, 16, 3)
+/* 4-element structs.  */
+__STRUCTN (int, 8, 4)
+__STRUCTN (int, 64, 4)
+__STRUCTN (uint, 8, 4)
+__STRUCTN (uint, 64, 4)
+__STRUCTN (poly, 8, 4)
+__STRUCTN (float, 64, 4)
+#undef __STRUCTN
+
+
+#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \
+                        qmode, ptr_mode, funcsuffix, signedtype) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
+                          intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_oi __o; \
+  largetype __temp; \
+  __temp.val[0] \
+    = vcombine_##funcsuffix (__b.val[0], \
+                             vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[1] \
+    = vcombine_##funcsuffix (__b.val[1], \
+                             vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
+                                             (signedtype) __temp.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
+                                             (signedtype) __temp.val[1], 1); \
+  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+                                    __ptr, __o, __c); \
+}
+
+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
+                 float16x8_t)
+__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
+                 float32x4_t)
+__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
+                 float64x2_t)
+__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
+                 int8x16_t)
+__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
+                 int16x8_t)
+__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
+                 poly64x2_t)
+__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
+                 int8x16_t)
+__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
+                 int16x8_t)
+__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
+                 int32x4_t)
+__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
+                 int64x2_t)
+__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
+                 int8x16_t)
+__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
+                 int16x8_t)
+__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
+                 int32x4_t)
+__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
+                 int64x2_t)
+
+#undef __ST2_LANE_FUNC
+#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
+                           intype __b, const int __c) \
+{ \
+  union { intype __i; \
+          __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
+  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+                                    __ptr, __temp.__o, __c); \
+}
+
+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
+__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
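/* A minimal usage sketch for the vst2_lane_* wrappers generated above;
   illustrative only, assuming an AArch64 compiler with this arm_neon.h
   on the include path:

     uint8x8x2_t v;
     uint8_t out[2];
     v.val[0] = vdup_n_u8 (0x11);
     v.val[1] = vdup_n_u8 (0x22);
     vst2_lane_u8 (out, v, 3);

   This stores lane 3 of each half, interleaved, so afterwards
   out[0] == 0x11 and out[1] == 0x22.  The lane index must be a constant
   expression, since it is passed to the st2_lane builtin as an
   immediate.  */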
+__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) +__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) +__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) + +#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ + qmode, ptr_mode, funcsuffix, signedtype) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[2], 2); \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) +__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, + float32x4_t) +__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, + float64x2_t) +__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, + int16x8_t) +__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64, + poly64x2_t) +__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, + int64x2_t) +__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, + int16x8_t) +__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, + int32x4_t) +__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, + int64x2_t) + +#undef __ST3_LANE_FUNC +#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) +__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__ST3_LANE_FUNC 
(poly8x16x3_t, poly8_t, v16qi, qi, p8) +__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) +__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) + +#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ + qmode, ptr_mode, funcsuffix, signedtype) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[3] \ + = vcombine_##funcsuffix (__b.val[3], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[2], 2); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[3], 3); \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) +__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, + float32x4_t) +__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, + float64x2_t) +__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, + int16x8_t) +__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64, + poly64x2_t) +__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, + int64x2_t) +__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16, + int16x8_t) +__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, + int32x4_t) +__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, + int64x2_t) + +#undef __ST4_LANE_FUNC +#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) 
\ + __ptr, __temp.__o, __c); \ +} + +__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) +__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) +__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) +__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddlv_s32 (int32x2_t a) +{ + int64_t result; + __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddlv_u32 (uint32x2_t a) +{ + uint64_t result; + __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); +} + +/* Table intrinsics. 
*/ + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +{ + poly8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1_s8 (int8x16_t a, uint8x8_t b) +{ + int8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +{ + uint8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +{ + poly8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1q_s8 (int8x16_t a, uint8x16_t b) +{ + int8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +{ + uint8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx) +{ + int8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +{ + uint8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +{ + 
poly8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +/* V7 legacy table intrinsics. */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl1_s8 (int8x8_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +{ uint8x8_t result; - __asm__ ("mvn %0.8b,%1.8b" + uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) - : "w"(a) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result 
= (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return result; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +{ + int8x8_t result = r; + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) : /* No clobbers */); return 
result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmvn_u16 (uint16x4_t a) +/* End of temporary inline asm. */ + +/* Start of optimal implementations in approved order. */ + +/* vabd. */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabds_f32 (float32_t __a, float32_t __b) +{ + return __builtin_aarch64_fabdsf (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabdd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_aarch64_fabddf (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabd_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fabdv2sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabd_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) {vabdd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0))}; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabdq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fabdv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabdq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fabdv2df (__a, __b); +} + +/* vabs */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_f32 (float32x2_t __a) +{ + return __builtin_aarch64_absv2sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_f64 (float64x1_t __a) +{ + return (float64x1_t) {__builtin_fabs (__a[0])}; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s8 (int8x8_t __a) +{ + return __builtin_aarch64_absv8qi (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s16 (int16x4_t __a) +{ + return __builtin_aarch64_absv4hi (__a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s32 (int32x2_t __a) +{ + return __builtin_aarch64_absv2si (__a); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s64 (int64x1_t __a) +{ + return (int64x1_t) {__builtin_aarch64_absdi (__a[0])}; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_absv4sf (__a); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_absv2df (__a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s8 (int8x16_t __a) +{ + return __builtin_aarch64_absv16qi (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s16 (int16x8_t __a) +{ + return __builtin_aarch64_absv8hi (__a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vabsq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_absv4si (__a); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s64 (int64x2_t __a) +{ + return __builtin_aarch64_absv2di (__a); +} + +/* vadd */ + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddd_s64 (int64_t __a, int64_t __b) +{ + return __a + __b; +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddd_u64 (uint64_t __a, uint64_t __b) +{ + return __a + __b; +} + +/* vaddv */ + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_s8 (int8x8_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v8qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_s16 (int16x4_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v4hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_s32 (int32x2_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v2si (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_u8 (uint8x8_t __a) +{ + return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_u16 (uint16x4_t __a) +{ + return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_u32 (uint32x2_t __a) +{ + return (int32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_s8 (int8x16_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v16qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_s16 (int16x8_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v8hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v4si (__a); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_s64 (int64x2_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v2di (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_u8 (uint8x16_t __a) +{ + return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_u16 (uint16x8_t __a) +{ + return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_u32 (uint32x4_t __a) +{ + return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_u64 (uint64x2_t __a) +{ + return (uint64_t) 
__builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddv_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v2sf (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v4sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddvq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v2df (__a); +} + +/* vbsl */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c) +{ + return __builtin_aarch64_simd_bslv4hf_suss (__a, __b, __c); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __builtin_aarch64_simd_bslv2sf_suss (__a, __b, __c); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_f64 (uint64x1_t __a, float64x1_t __b, float64x1_t __c) +{ + return (float64x1_t) + { __builtin_aarch64_simd_bsldf_suss (__a[0], __b[0], __c[0]) }; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) +{ + return __builtin_aarch64_simd_bslv8qi_pupp (__a, __b, __c); +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) +{ + return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c); +} +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c) +{ + return (poly64x1_t) + {__builtin_aarch64_simd_bsldi_pupp (__a[0], __b[0], __c[0])}; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return __builtin_aarch64_simd_bslv8qi_suss (__a, __b, __c); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return __builtin_aarch64_simd_bslv4hi_suss (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return __builtin_aarch64_simd_bslv2si_suss (__a, __b, __c); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c) +{ + return (int64x1_t) + {__builtin_aarch64_simd_bsldi_suss (__a[0], __b[0], __c[0])}; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return __builtin_aarch64_simd_bslv8qi_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + 
return __builtin_aarch64_simd_bslv4hi_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return __builtin_aarch64_simd_bslv2si_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) +{ + return (uint64x1_t) + {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])}; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c) +{ + return __builtin_aarch64_simd_bslv8hf_suss (__a, __b, __c); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __builtin_aarch64_simd_bslv4sf_suss (__a, __b, __c); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_f64 (uint64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __builtin_aarch64_simd_bslv2df_suss (__a, __b, __c); +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) +{ + return __builtin_aarch64_simd_bslv16qi_pupp (__a, __b, __c); +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c) +{ + return __builtin_aarch64_simd_bslv8hi_pupp (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c) +{ + return __builtin_aarch64_simd_bslv16qi_suss (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c); +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c) +{ + return __builtin_aarch64_simd_bslv2di_pupp (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return __builtin_aarch64_simd_bslv4si_suss (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return __builtin_aarch64_simd_bslv2di_suss (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +{ + return __builtin_aarch64_simd_bslv16qi_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return __builtin_aarch64_simd_bslv8hi_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return __builtin_aarch64_simd_bslv4si_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c); +} + +/* ARMv8.1-A instrinsics. */ +#pragma GCC push_options +#pragma GCC target ("arch=armv8.1-a") + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqv4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqv2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqv8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqv4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
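/* Per lane, the vqrdmlah_* intrinsics above compute a signed saturating
   rounding doubling multiply-accumulate returning the high half; a scalar
   sketch of the 16-bit case, assuming only standard C (the intrinsics
   themselves map to the sqrdmlah builtins shown above, and sqrdmlah_1 is
   a hypothetical helper name):

     int16_t sqrdmlah_1 (int16_t a, int16_t b, int16_t c)
     {
       int64_t p = 2 * (int64_t) b * c + (1 << 15);  // rounded doubling product
       int64_t r = a + (p >> 16);                    // accumulate high half
       if (r > 32767) r = 32767;                     // saturate to int16_t
       if (r < -32768) r = -32768;
       return (int16_t) r;
     }

   The vqrdmlsh_* forms that follow subtract the rounded product instead
   of adding it.  */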
+vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c) +{ + return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqhi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahs_s32 (int32_t __a, int32_t __b, int32_t __c) +{ + return (int32_t) __builtin_aarch64_sqrdmlahsi (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_lanesi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlahs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlah_laneqsi (__a, __b, __c, __d); +} + +__extension__ extern 
__inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanev4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanev2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanev8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanev4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshh_s16 (int16_t __a, int16_t __b, int16_t __c) +{ + return (int16_t) __builtin_aarch64_sqrdmlshhi (__a, __b, __c); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanehi (__a, __b, __c, __d); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqhi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshs_s32 (int32_t __a, int32_t __b, int32_t __c) +{ + return (int32_t) __builtin_aarch64_sqrdmlshsi (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_lanesi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmlshs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqrdmlsh_laneqsi (__a, __b, __c, __d); +} +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("+nothing+crypto") +/* vaes */ + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaeseq_u8 (uint8x16_t data, uint8x16_t key) { - uint16x4_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_crypto_aesev16qi_uuu (data, key); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmvn_u32 (uint32x2_t a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaesdq_u8 (uint8x16_t data, uint8x16_t key) { - uint32x2_t result; - __asm__ ("mvn %0.8b,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vmvnq_p8 (poly8x16_t a) +__extension__ 
extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaesmcq_u8 (uint8x16_t data) { - poly8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_crypto_aesmcv16qi_uu (data); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmvnq_s8 (int8x16_t a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaesimcq_u8 (uint8x16_t data) { - int8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_crypto_aesimcv16qi_uu (data); } +#pragma GCC pop_options + +/* vcage */ -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmvnq_s16 (int16x8_t a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcage_f64 (float64x1_t __a, float64x1_t __b) { - int16x8_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return vabs_f64 (__a) >= vabs_f64 (__b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmvnq_s32 (int32x4_t a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcages_f32 (float32_t __a, float32_t __b) { - int32x4_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmvnq_u8 (uint8x16_t a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcage_f32 (float32x2_t __a, float32x2_t __b) { - uint8x16_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return vabs_f32 (__a) >= vabs_f32 (__b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmvnq_u16 (uint16x8_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcageq_f32 (float32x4_t __a, float32x4_t __b) { - uint16x8_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return vabsq_f32 (__a) >= vabsq_f32 (__b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmvnq_u32 (uint32x4_t a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaged_f64 (float64_t __a, float64_t __b) { - uint32x4_t result; - __asm__ ("mvn %0.16b,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_fabs (__a) >= __builtin_fabs (__b) ? 
-1 : 0; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcageq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) >= vabsq_f64 (__b); } +/* vcagt */ -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vpadal_s8 (int16x4_t a, int8x8_t b) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagts_f32 (float32_t __a, float32_t __b) { - int16x4_t result; - __asm__ ("sadalp %0.4h,%2.8b" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagt_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) > vabs_f32 (__b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagt_f64 (float64x1_t __a, float64x1_t __b) +{ + return vabs_f64 (__a) > vabs_f64 (__b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) > vabsq_f32 (__b); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagtd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagtq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) > vabsq_f64 (__b); +} + +/* vcale */ + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcale_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) <= vabs_f32 (__b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcale_f64 (float64x1_t __a, float64x1_t __b) +{ + return vabs_f64 (__a) <= vabs_f64 (__b); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaled_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcales_f32 (float32_t __a, float32_t __b) +{ + return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? 
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return vabsq_f32 (__a) <= vabsq_f32 (__b);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaleq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return vabsq_f64 (__a) <= vabsq_f64 (__b);
+}
+
+/* vcalt */
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcalt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return vabs_f32 (__a) < vabs_f32 (__b);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcalt_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return vabs_f64 (__a) < vabs_f64 (__b);
+}
+
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaltd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return vabsq_f32 (__a) < vabsq_f32 (__b);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaltq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return vabsq_f64 (__a) < vabsq_f64 (__b);
+}
+
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcalts_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
+}
+
+/* vceq - vector. */
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) (__a == __b);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (uint64x1_t) (__a == __b);
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t) (__a == __b);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+  return (uint64x1_t) (__a == __b);
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t) (__a == __b);
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t) (__a == __b);
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t) (__a == __b);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t) (__a == __b);
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (__a == __b);
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (__a == __b);
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (__a == __b);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (__a == __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t) (__a == __b);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return (uint64x2_t) (__a == __b);
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t) (__a == __b);
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t) (__a == __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpadal_s16 (int32x2_t a, int16x4_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int32x2_t result;
-  __asm__ ("sadalp %0.2s,%2.4h"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) (__a == __b);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpadal_s32 (int64x1_t a, int32x2_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  int64x1_t result;
-  __asm__ ("sadalp %0.1d,%2.2s"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a == __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpadal_u8 (uint16x4_t a, uint8x8_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  uint16x4_t result;
-  __asm__ ("uadalp %0.4h,%2.8b"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a == __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpadal_u16 (uint32x2_t a, uint16x4_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  uint32x2_t result;
-  __asm__ ("uadalp %0.2s,%2.4h"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == __b);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpadal_u32 (uint64x1_t a, uint32x2_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  uint64x1_t result;
-  __asm__ ("uadalp %0.1d,%2.2s"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpadalq_s8 (int16x8_t a, int8x16_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  int16x8_t result;
-  __asm__ ("sadalp %0.8h,%2.16b"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpadalq_s16 (int32x4_t a, int16x8_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  int32x4_t result;
-  __asm__ ("sadalp %0.4s,%2.8h"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpadalq_s32 (int64x2_t a, int32x4_t b)
+/* vceq - scalar. */
+
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqs_f32 (float32_t __a, float32_t __b)
 {
-  int64x2_t result;
-  __asm__ ("sadalp %0.2d,%2.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == __b ? -1 : 0;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpadalq_u8 (uint16x8_t a, uint8x16_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqd_s64 (int64_t __a, int64_t __b)
 {
-  uint16x8_t result;
-  __asm__ ("uadalp %0.8h,%2.16b"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == __b ? -1ll : 0ll;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpadalq_u16 (uint32x4_t a, uint16x8_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqd_u64 (uint64_t __a, uint64_t __b)
 {
-  uint32x4_t result;
-  __asm__ ("uadalp %0.4s,%2.8h"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == __b ? -1ll : 0ll;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpadalq_u32 (uint64x2_t a, uint32x4_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqd_f64 (float64_t __a, float64_t __b)
 {
-  uint64x2_t result;
-  __asm__ ("uadalp %0.2d,%2.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == __b ? -1ll : 0ll;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t a, float32x2_t b)
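Illustration (not part of the patch): the vceq family returns lane masks, so the usual branch-free selection idiom with vbsl keeps working with the new C implementations:

    #include <arm_neon.h>

    /* Pick x in lanes where a == b, y elsewhere.  */
    uint8x8_t
    select_where_equal (uint8x8_t a, uint8x8_t b, uint8x8_t x, uint8x8_t y)
    {
      uint8x8_t mask = vceq_u8 (a, b);   /* 0xff where equal, 0x00 otherwise */
      return vbsl_u8 (mask, x, y);
    }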
+/* vceqz - vector. */
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_f32 (float32x2_t __a)
 {
-  float32x2_t result;
-  __asm__ ("faddp %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) (__a == 0.0f);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpaddl_s8 (int8x8_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_f64 (float64x1_t __a)
 {
-  int16x4_t result;
-  __asm__ ("saddlp %0.4h,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) (__a == (float64x1_t) {0.0});
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpaddl_s16 (int16x4_t a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p8 (poly8x8_t __a)
 {
-  int32x2_t result;
-  __asm__ ("saddlp %0.2s,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) (__a == 0);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpaddl_s32 (int32x2_t a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_s8 (int8x8_t __a)
 {
-  int64x1_t result;
-  __asm__ ("saddlp %0.1d,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) (__a == 0);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpaddl_u8 (uint8x8_t a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_s16 (int16x4_t __a)
 {
-  uint16x4_t result;
-  __asm__ ("uaddlp %0.4h,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) (__a == 0);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpaddl_u16 (uint16x4_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_s32 (int32x2_t __a)
 {
-  uint32x2_t result;
-  __asm__ ("uaddlp %0.2s,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) (__a == 0);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpaddl_u32 (uint32x2_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_s64 (int64x1_t __a)
 {
-  uint64x1_t result;
-  __asm__ ("uaddlp %0.1d,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpaddlq_s8 (int8x16_t a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_u8 (uint8x8_t __a)
 {
-  int16x8_t result;
-  __asm__ ("saddlp %0.8h,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a == 0);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpaddlq_s16 (int16x8_t a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_u16 (uint16x4_t __a)
 {
-  int32x4_t result;
-  __asm__ ("saddlp %0.4s,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a == 0);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpaddlq_s32 (int32x4_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_u32 (uint32x2_t __a)
 {
-  int64x2_t result;
-  __asm__ ("saddlp %0.2d,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a == 0);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpaddlq_u8 (uint8x16_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_u64 (uint64x1_t __a)
 {
-  uint16x8_t result;
-  __asm__ ("uaddlp %0.8h,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a == __AARCH64_UINT64_C (0));
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpaddlq_u16 (uint16x8_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_f32 (float32x4_t __a)
 {
-  uint32x4_t result;
-  __asm__ ("uaddlp %0.4s,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a == 0.0f);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpaddlq_u32 (uint32x4_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_f64 (float64x2_t __a)
 {
-  uint64x2_t result;
-  __asm__ ("uaddlp %0.2d,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a == 0.0f);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpaddq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p8 (poly8x16_t __a)
 {
-  float32x4_t result;
-  __asm__ ("faddp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint8x16_t) (__a == 0);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpaddq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_s8 (int8x16_t __a)
 {
-  float64x2_t result;
-  __asm__ ("faddp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint8x16_t) (__a == 0);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpaddq_s8 (int8x16_t a, int8x16_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_s16 (int16x8_t __a)
 {
-  int8x16_t result;
-  __asm__ ("addp %0.16b,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) (__a == 0);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpaddq_s16 (int16x8_t a, int16x8_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_s32 (int32x4_t __a)
 {
-  int16x8_t result;
-  __asm__ ("addp %0.8h,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a == 0);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_s64 (int64x2_t __a)
+{
+  return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_u8 (uint8x16_t __a)
+{
+  return (__a == 0);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpaddq_s32 (int32x4_t a, int32x4_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_u16 (uint16x8_t __a)
 {
-  int32x4_t result;
-  __asm__ ("addp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == 0);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpaddq_s64 (int64x2_t a, int64x2_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_u32 (uint32x4_t __a)
 {
-  int64x2_t result;
-  __asm__ ("addp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == 0);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpaddq_u8 (uint8x16_t a, uint8x16_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_u64 (uint64x2_t __a)
 {
-  uint8x16_t result;
-  __asm__ ("addp %0.16b,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a == __AARCH64_UINT64_C (0));
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpaddq_u16 (uint16x8_t a, uint16x8_t b)
+/* vceqz - scalar. */
+
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzs_f32 (float32_t __a)
 {
-  uint16x8_t result;
-  __asm__ ("addp %0.8h,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == 0.0f ? -1 : 0;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpaddq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzd_s64 (int64_t __a)
 {
-  uint32x4_t result;
-  __asm__ ("addp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == 0 ? -1ll : 0ll;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpaddq_u64 (uint64x2_t a, uint64x2_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzd_u64 (uint64_t __a)
 {
-  uint64x2_t result;
-  __asm__ ("addp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return __a == 0 ? -1ll : 0ll;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpadds_f32 (float32x2_t a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzd_f64 (float64_t __a)
 {
-  float32_t result;
-  __asm__ ("faddp %s0,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return __a == 0.0 ? -1ll : 0ll;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_n_s16 (int16x4_t a, int16_t b)
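Illustration (not part of the patch): the vceqz* intrinsics compare directly against zero. A sketch that counts zero lanes, assuming an AArch64 target (vaddvq_u32 is AArch64-only):

    #include <arm_neon.h>

    int
    count_zero_lanes (int32x4_t v)
    {
      uint32x4_t m = vceqzq_s32 (v);     /* all ones where lane == 0 */
      /* Reduce each mask lane to one bit, then sum across lanes.  */
      return (int) vaddvq_u32 (vshrq_n_u32 (m, 31));
    }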
+/* vcge - vector. */
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_f32 (float32x2_t __a, float32x2_t __b)
 {
-  int16x4_t result;
-  __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) (__a >= __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_n_s32 (int32x2_t a, int32_t b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_f64 (float64x1_t __a, float64x1_t __b)
 {
-  int32x2_t result;
-  __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) (__a >= __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s16 (int16x8_t a, int16_t b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_s8 (int8x8_t __a, int8x8_t __b)
 {
-  int16x8_t result;
-  __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) (__a >= __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s32 (int32x4_t a, int32_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_s16 (int16x4_t __a, int16x4_t __b)
 {
-  int32x4_t result;
-  __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) (__a >= __b);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqmovn_high_s16 (int8x8_t a, int16x8_t b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_s32 (int32x2_t __a, int32x2_t __b)
 {
-  int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtn2 %0.16b, %1.8h"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) (__a >= __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqmovn_high_s32 (int16x4_t a, int32x4_t b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtn2 %0.8h, %1.4s"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) (__a >= __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqmovn_high_s64 (int32x2_t a, int64x2_t b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtn2 %0.4s, %1.2d"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a >= __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("uqxtn2 %0.16b, %1.8h"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a >= __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("uqxtn2 %0.8h, %1.4s"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a >= __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("uqxtn2 %0.4s, %1.2d"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a >= __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtun2 %0.16b, %1.8h"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a >= __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtun2 %0.8h, %1.4s"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a >= __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("sqxtun2 %0.4s, %1.2d"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint8x16_t) (__a >= __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s16 (int16x4_t a, int16_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int16x4_t result;
-  __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) (__a >= __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s32 (int32x2_t a, int32_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  int32x2_t result;
-  __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a >= __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  int16x8_t result;
-  __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a >= __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  int32x4_t result;
-  __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a >= __b);
 }

-#define vqrshrn_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       int8x8_t a_ = (a); \
-       int8x16_t result = vcombine_s8 \
-                            (a_, vcreate_s8 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (__a >= __b);
+}

-#define vqrshrn_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       int16x4_t a_ = (a); \
-       int16x8_t result = vcombine_s16 \
-                            (a_, vcreate_s16 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (__a >= __b);
+}

-#define vqrshrn_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       int32x2_t a_ = (a); \
-       int32x4_t result = vcombine_s32 \
-                            (a_, vcreate_s32 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (__a >= __b);
+}

-#define vqrshrn_high_n_u16(a, b, c) \
-  __extension__ \
-    ({ \
-       uint16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+/* vcge - scalar. */

-#define vqrshrn_high_n_u32(a, b, c) \
-  __extension__ \
-    ({ \
-       uint32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcges_f32 (float32_t __a, float32_t __b)
+{
+  return __a >= __b ? -1 : 0;
+}

-#define vqrshrn_high_n_u64(a, b, c) \
-  __extension__ \
-    ({ \
-       uint64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcged_s64 (int64_t __a, int64_t __b)
+{
+  return __a >= __b ? -1ll : 0ll;
+}

-#define vqrshrun_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcged_u64 (uint64_t __a, uint64_t __b)
+{
+  return __a >= __b ? -1ll : 0ll;
+}

-#define vqrshrun_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcged_f64 (float64_t __a, float64_t __b)
+{
+  return __a >= __b ? -1ll : 0ll;
+}

-#define vqrshrun_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
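Illustration (not part of the patch): the scalar comparisons return a uint64_t of 0 or all ones rather than a bool, so the result can be used directly as a bit mask, for example for a branch-free maximum:

    #include <arm_neon.h>
    #include <stdint.h>

    int64_t
    max_s64 (int64_t a, int64_t b)
    {
      uint64_t m = vcged_s64 (a, b);     /* ~0 if a >= b, else 0 */
      return (int64_t) ((a & m) | (b & ~m));
    }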
+/* vcgez - vector. */

-#define vqshrn_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       int8x8_t a_ = (a); \
-       int8x16_t result = vcombine_s8 \
-                            (a_, vcreate_s8 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_f32 (float32x2_t __a)
+{
+  return (uint32x2_t) (__a >= 0.0f);
+}

-#define vqshrn_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       int16x4_t a_ = (a); \
-       int16x8_t result = vcombine_s16 \
-                            (a_, vcreate_s16 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_f64 (float64x1_t __a)
+{
+  return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
+}

-#define vqshrn_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       int32x2_t a_ = (a); \
-       int32x4_t result = vcombine_s32 \
-                            (a_, vcreate_s32 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_s8 (int8x8_t __a)
+{
+  return (uint8x8_t) (__a >= 0);
+}

-#define vqshrn_high_n_u16(a, b, c) \
-  __extension__ \
-    ({ \
-       uint16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_s16 (int16x4_t __a)
+{
+  return (uint16x4_t) (__a >= 0);
+}

-#define vqshrn_high_n_u32(a, b, c) \
-  __extension__ \
-    ({ \
-       uint32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_s32 (int32x2_t __a)
+{
+  return (uint32x2_t) (__a >= 0);
+}

-#define vqshrn_high_n_u64(a, b, c) \
-  __extension__ \
-    ({ \
-       uint64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_s64 (int64x1_t __a)
+{
+  return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
+}

-#define vqshrun_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_f32 (float32x4_t __a)
+{
+  return (uint32x4_t) (__a >= 0.0f);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_f64 (float64x2_t __a)
+{
+  return (uint64x2_t) (__a >= 0.0);
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_s8 (int8x16_t __a)
+{
+  return (uint8x16_t) (__a >= 0);
+}

-#define vqshrun_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_s16 (int16x8_t __a)
+{
+  return (uint16x8_t) (__a >= 0);
+}

-#define vqshrun_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_s32 (int32x4_t __a)
+{
+  return (uint32x4_t) (__a >= 0);
+}

-#define vrshrn_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       int8x8_t a_ = (a); \
-       int8x16_t result = vcombine_s8 \
-                            (a_, vcreate_s8 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_s64 (int64x2_t __a)
+{
+  return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
+}

-#define vrshrn_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       int16x4_t a_ = (a); \
-       int16x8_t result = vcombine_s16 \
-                            (a_, vcreate_s16 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+/* vcgez - scalar. */

-#define vrshrn_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       int32x2_t a_ = (a); \
-       int32x4_t result = vcombine_s32 \
-                            (a_, vcreate_s32 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezs_f32 (float32_t __a)
+{
+  return __a >= 0.0f ? -1 : 0;
+}

-#define vrshrn_high_n_u16(a, b, c) \
-  __extension__ \
-    ({ \
-       uint16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezd_s64 (int64_t __a)
+{
+  return __a >= 0 ? -1ll : 0ll;
+}

-#define vrshrn_high_n_u32(a, b, c) \
-  __extension__ \
-    ({ \
-       uint32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezd_f64 (float64_t __a)
+{
+  return __a >= 0.0 ? -1ll : 0ll;
+}

-#define vrshrn_high_n_u64(a, b, c) \
-  __extension__ \
-    ({ \
-       uint64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+/* vcgt - vector. */

-#define vrshrn_n_s16(a, b) \
-  __extension__ \
-    ({ \
-       int16x8_t a_ = (a); \
-       int8x8_t result; \
-       __asm__ ("rshrn %0.8b,%1.8h,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) (__a > __b);
+}

-#define vrshrn_n_s32(a, b) \
-  __extension__ \
-    ({ \
-       int32x4_t a_ = (a); \
-       int16x4_t result; \
-       __asm__ ("rshrn %0.4h,%1.4s,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (uint64x1_t) (__a > __b);
+}

-#define vrshrn_n_s64(a, b) \
-  __extension__ \
-    ({ \
-       int64x2_t a_ = (a); \
-       int32x2_t result; \
-       __asm__ ("rshrn %0.2s,%1.2d,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t) (__a > __b);
+}

-#define vrshrn_n_u16(a, b) \
-  __extension__ \
-    ({ \
-       uint16x8_t a_ = (a); \
-       uint8x8_t result; \
-       __asm__ ("rshrn %0.8b,%1.8h,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t) (__a > __b);
+}

-#define vrshrn_n_u32(a, b) \
-  __extension__ \
-    ({ \
-       uint32x4_t a_ = (a); \
-       uint16x4_t result; \
-       __asm__ ("rshrn %0.4h,%1.4s,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t) (__a > __b);
+}

-#define vrshrn_n_u64(a, b) \
-  __extension__ \
-    ({ \
-       uint64x2_t a_ = (a); \
-       uint32x2_t result; \
-       __asm__ ("rshrn %0.2s,%1.2d,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t) (__a > __b);
+}
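The rewritten bodies rely on GCC's generic vector extensions: comparing two vector values already yields a signed integer vector with 0 or -1 in each lane, so a single cast produces the unsigned NEON mask type. A minimal standalone sketch of that mechanism (illustration only, independent of arm_neon.h):

    /* GCC vector extensions: lane-wise compare yields 0 / -1 per lane.  */
    typedef int v4si __attribute__ ((vector_size (16)));
    typedef unsigned int v4ui __attribute__ ((vector_size (16)));

    v4ui
    ge_mask (v4si a, v4si b)
    {
      return (v4ui) (a >= b);   /* same idiom as the vcgeq_s32 body above */
    }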
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrte_f32 (float32x2_t a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  float32x2_t result;
-  __asm__ ("frsqrte %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrsqrte_f64 (float64x1_t a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  float64x1_t result;
-  __asm__ ("frsqrte %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsqrte_u32 (uint32x2_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  uint32x2_t result;
-  __asm__ ("ursqrte %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrted_f64 (float64_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  float64_t result;
-  __asm__ ("frsqrte %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_f32 (float32x4_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  float32x4_t result;
-  __asm__ ("frsqrte %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a > __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrteq_f64 (float64x2_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  float64x2_t result;
-  __asm__ ("frsqrte %0.2d,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a > __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_u32 (uint32x4_t a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  uint32x4_t result;
-  __asm__ ("ursqrte %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint8x16_t) (__a > __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtes_f32 (float32_t a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  float32_t result;
-  __asm__ ("frsqrte %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) (__a > __b);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrts_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  float32x2_t result;
-  __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a > __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrtsd_f64 (float64_t a, float64_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  float64_t result;
-  __asm__ ("frsqrts %d0,%d1,%d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a > __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  float32x4_t result;
-  __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  float64x2_t result;
-  __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtss_f32 (float32_t a, float32_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  float32_t result;
-  __asm__ ("frsqrts %s0,%s1,%s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (__a > __b);
 }

-#define vshrn_high_n_s16(a, b, c) \
-  __extension__ \
-    ({ \
-       int16x8_t b_ = (b); \
-       int8x8_t a_ = (a); \
-       int8x16_t result = vcombine_s8 \
-                            (a_, vcreate_s8 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vshrn_high_n_s32(a, b, c) \
-  __extension__ \
-    ({ \
-       int32x4_t b_ = (b); \
-       int16x4_t a_ = (a); \
-       int16x8_t result = vcombine_s16 \
-                            (a_, vcreate_s16 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vshrn_high_n_s64(a, b, c) \
-  __extension__ \
-    ({ \
-       int64x2_t b_ = (b); \
-       int32x2_t a_ = (a); \
-       int32x4_t result = vcombine_s32 \
-                            (a_, vcreate_s32 \
-                                   (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vshrn_high_n_u16(a, b, c) \
-  __extension__ \
-    ({ \
-       uint16x8_t b_ = (b); \
-       uint8x8_t a_ = (a); \
-       uint8x16_t result = vcombine_u8 \
-                             (a_, vcreate_u8 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vshrn_high_n_u32(a, b, c) \
-  __extension__ \
-    ({ \
-       uint32x4_t b_ = (b); \
-       uint16x4_t a_ = (a); \
-       uint16x8_t result = vcombine_u16 \
-                             (a_, vcreate_u16 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
-
-#define vshrn_high_n_u64(a, b, c) \
-  __extension__ \
-    ({ \
-       uint64x2_t b_ = (b); \
-       uint32x2_t a_ = (a); \
-       uint32x4_t result = vcombine_u32 \
-                             (a_, vcreate_u32 \
-                                    (__AARCH64_UINT64_C (0x0))); \
-       __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-                : "+w"(result) \
-                : "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (__a > __b);
+}

-#define vshrn_n_s16(a, b) \
-  __extension__ \
-    ({ \
-       int16x8_t a_ = (a); \
-       int8x8_t result; \
-       __asm__ ("shrn %0.8b,%1.8h,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+/* vcgt - scalar. */

-#define vshrn_n_s32(a, b) \
-  __extension__ \
-    ({ \
-       int32x4_t a_ = (a); \
-       int16x4_t result; \
-       __asm__ ("shrn %0.4h,%1.4s,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgts_f32 (float32_t __a, float32_t __b)
+{
+  return __a > __b ? -1 : 0;
+}

-#define vshrn_n_s64(a, b) \
-  __extension__ \
-    ({ \
-       int64x2_t a_ = (a); \
-       int32x2_t result; \
-       __asm__ ("shrn %0.2s,%1.2d,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtd_s64 (int64_t __a, int64_t __b)
+{
+  return __a > __b ? -1ll : 0ll;
+}

-#define vshrn_n_u16(a, b) \
-  __extension__ \
-    ({ \
-       uint16x8_t a_ = (a); \
-       uint8x8_t result; \
-       __asm__ ("shrn %0.8b,%1.8h,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtd_u64 (uint64_t __a, uint64_t __b)
+{
+  return __a > __b ? -1ll : 0ll;
+}

-#define vshrn_n_u32(a, b) \
-  __extension__ \
-    ({ \
-       uint32x4_t a_ = (a); \
-       uint16x4_t result; \
-       __asm__ ("shrn %0.4h,%1.4s,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtd_f64 (float64_t __a, float64_t __b)
+{
+  return __a > __b ? -1ll : 0ll;
+}

-#define vshrn_n_u64(a, b) \
-  __extension__ \
-    ({ \
-       uint64x2_t a_ = (a); \
-       uint32x2_t result; \
-       __asm__ ("shrn %0.2s,%1.2d,%2" \
-                : "=w"(result) \
-                : "w"(a_), "i"(b) \
-                : /* No clobbers */); \
-       result; \
-     })
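Illustration (not part of the patch): like the other scalar comparisons, vcgtd_s64 yields an all-ones/all-zeros uint64_t, which can gate a value without a branch:

    #include <arm_neon.h>
    #include <stdint.h>

    /* x when x > 0, otherwise 0.  */
    int64_t
    relu_s64 (int64_t x)
    {
      return (int64_t) (vcgtd_s64 (x, 0) & (uint64_t) x);
    }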
+/* vcgtz - vector. */

-#define vsli_n_p8(a, b, c) \
-  __extension__ \
-    ({ \
-       poly8x8_t b_ = (b); \
-       poly8x8_t a_ = (a); \
-       poly8x8_t result; \
-       __asm__ ("sli %0.8b,%2.8b,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_f32 (float32x2_t __a)
+{
+  return (uint32x2_t) (__a > 0.0f);
+}

-#define vsli_n_p16(a, b, c) \
-  __extension__ \
-    ({ \
-       poly16x4_t b_ = (b); \
-       poly16x4_t a_ = (a); \
-       poly16x4_t result; \
-       __asm__ ("sli %0.4h,%2.4h,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_f64 (float64x1_t __a)
+{
+  return (uint64x1_t) (__a > (float64x1_t) {0.0});
+}

-#define vsliq_n_p8(a, b, c) \
-  __extension__ \
-    ({ \
-       poly8x16_t b_ = (b); \
-       poly8x16_t a_ = (a); \
-       poly8x16_t result; \
-       __asm__ ("sli %0.16b,%2.16b,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_s8 (int8x8_t __a)
+{
+  return (uint8x8_t) (__a > 0);
+}

-#define vsliq_n_p16(a, b, c) \
-  __extension__ \
-    ({ \
-       poly16x8_t b_ = (b); \
-       poly16x8_t a_ = (a); \
-       poly16x8_t result; \
-       __asm__ ("sli %0.8h,%2.8h,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_s16 (int16x4_t __a)
+{
+  return (uint16x4_t) (__a > 0);
+}

-#define vsri_n_p8(a, b, c) \
-  __extension__ \
-    ({ \
-       poly8x8_t b_ = (b); \
-       poly8x8_t a_ = (a); \
-       poly8x8_t result; \
-       __asm__ ("sri %0.8b,%2.8b,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_s32 (int32x2_t __a)
+{
+  return (uint32x2_t) (__a > 0);
+}

-#define vsri_n_p16(a, b, c) \
-  __extension__ \
-    ({ \
-       poly16x4_t b_ = (b); \
-       poly16x4_t a_ = (a); \
-       poly16x4_t result; \
-       __asm__ ("sri %0.4h,%2.4h,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_s64 (int64x1_t __a)
+{
+  return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
+}

-#define vsriq_n_p8(a, b, c) \
-  __extension__ \
-    ({ \
-       poly8x16_t b_ = (b); \
-       poly8x16_t a_ = (a); \
-       poly8x16_t result; \
-       __asm__ ("sri %0.16b,%2.16b,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_f32 (float32x4_t __a)
+{
+  return (uint32x4_t) (__a > 0.0f);
+}

-#define vsriq_n_p16(a, b, c) \
-  __extension__ \
-    ({ \
-       poly16x8_t b_ = (b); \
-       poly16x8_t a_ = (a); \
-       poly16x8_t result; \
-       __asm__ ("sri %0.8h,%2.8h,%3" \
-                : "=w"(result) \
-                : "0"(a_), "w"(b_), "i"(c) \
-                : /* No clobbers */); \
-       result; \
-     })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_f64 (float64x2_t __a)
+{
+  return (uint64x2_t) (__a > 0.0);
+}

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_p8 (poly8x8_t a, poly8x8_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_s8 (int8x16_t __a)
 {
-  uint8x8_t result;
-  __asm__ ("cmtst %0.8b, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint8x16_t) (__a > 0);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_p16 (poly16x4_t a, poly16x4_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_s16 (int16x8_t __a)
 {
-  uint16x4_t result;
-  __asm__ ("cmtst %0.4h, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) (__a > 0);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_p8 (poly8x16_t a, poly8x16_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_s32 (int32x4_t __a)
 {
-  uint8x16_t result;
-  __asm__ ("cmtst %0.16b, %1.16b, %2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) (__a > 0);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_p16 (poly16x8_t a, poly16x8_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_s64 (int64x2_t __a)
 {
-  uint16x8_t result;
-  __asm__ ("cmtst %0.8h, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
 }

-/* End of temporary inline asm implementations. */
+/* vcgtz - scalar. */

-/* Start of temporary inline asm for vldn, vstn and friends. */
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzs_f32 (float32_t __a)
+{
+  return __a > 0.0f ? -1 : 0;
+}

-/* Create struct element types for duplicating loads.
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzd_s64 (int64_t __a)
+{
+  return __a > 0 ? -1ll : 0ll;
+}

-   Create 2 element structures of:
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzd_f64 (float64_t __a)
+{
+  return __a > 0.0 ? -1ll : 0ll;
+}

-   +------+----+----+----+----+
-   |      | 8  | 16 | 32 | 64 |
-   +------+----+----+----+----+
-   |int   | Y  | Y  | N  | N  |
-   +------+----+----+----+----+
-   |uint  | Y  | Y  | N  | N  |
-   +------+----+----+----+----+
-   |float | -  | Y  | N  | N  |
-   +------+----+----+----+----+
-   |poly  | Y  | Y  | -  | -  |
-   +------+----+----+----+----+
+/* vcle - vector. */

-   Create 3 element structures of:
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) (__a <= __b);
+}

-   +------+----+----+----+----+
-   |      | 8  | 16 | 32 | 64 |
-   +------+----+----+----+----+
-   |int   | Y  | Y  | Y  | Y  |
-   +------+----+----+----+----+
-   |uint  | Y  | Y  | Y  | Y  |
-   +------+----+----+----+----+
-   |float | -  | Y  | Y  | Y  |
-   +------+----+----+----+----+
-   |poly  | Y  | Y  | -  | -  |
-   +------+----+----+----+----+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (uint64x1_t) (__a <= __b);
+}

-   Create 4 element structures of:
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t) (__a <= __b);
+}

-   +------+----+----+----+----+
-   |      | 8  | 16 | 32 | 64 |
-   +------+----+----+----+----+
-   |int   | Y  | N  | N  | Y  |
-   +------+----+----+----+----+
-   |uint  | Y  | N  | N  | Y  |
-   +------+----+----+----+----+
-   |float | -  | N  | N  | Y  |
-   +------+----+----+----+----+
-   |poly  | Y  | N  | -  | -  |
-   +------+----+----+----+----+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t) (__a <= __b);
+}

-   This is required for casting memory reference. */
-#define __STRUCTN(t, sz, nelem) \
-  typedef struct t ## sz ## x ## nelem ## _t { \
-    t ## sz ## _t val[nelem]; \
-  } t ## sz ## x ## nelem ## _t;
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t) (__a <= __b);
+}

-/* 2-element structs. */
-__STRUCTN (int, 8, 2)
-__STRUCTN (int, 16, 2)
-__STRUCTN (uint, 8, 2)
-__STRUCTN (uint, 16, 2)
-__STRUCTN (float, 16, 2)
-__STRUCTN (poly, 8, 2)
-__STRUCTN (poly, 16, 2)
-/* 3-element structs. */
-__STRUCTN (int, 8, 3)
-__STRUCTN (int, 16, 3)
-__STRUCTN (int, 32, 3)
-__STRUCTN (int, 64, 3)
-__STRUCTN (uint, 8, 3)
-__STRUCTN (uint, 16, 3)
-__STRUCTN (uint, 32, 3)
-__STRUCTN (uint, 64, 3)
-__STRUCTN (float, 16, 3)
-__STRUCTN (float, 32, 3)
-__STRUCTN (float, 64, 3)
-__STRUCTN (poly, 8, 3)
-__STRUCTN (poly, 16, 3)
-/* 4-element structs. */
-__STRUCTN (int, 8, 4)
-__STRUCTN (int, 64, 4)
-__STRUCTN (uint, 8, 4)
-__STRUCTN (uint, 64, 4)
-__STRUCTN (poly, 8, 4)
-__STRUCTN (float, 64, 4)
-#undef __STRUCTN
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t) (__a <= __b);
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (__a <= __b);
+}

-#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \
-                        qmode, ptr_mode, funcsuffix, signedtype) \
-__extension__ static __inline void \
-__attribute__ ((__always_inline__)) \
-vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
-                          intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_oi __o; \
-  largetype __temp; \
-  __temp.val[0] \
-    = vcombine_##funcsuffix (__b.val[0], \
-                             vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1] \
-    = vcombine_##funcsuffix (__b.val[1], \
-                             vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-                                             (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-                                             (signedtype) __temp.val[1], 1); \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-                                    __ptr, __o, __c); \
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (__a <= __b);
 }

-__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-                 float16x8_t)
-__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-                 float32x4_t)
-__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-                 float64x2_t)
-__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-                 int8x16_t)
-__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-                 int16x8_t)
-__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-                 int8x16_t)
-__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-                 int16x8_t)
-__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-                 int32x4_t)
-__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-                 int64x2_t)
-__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-                 int8x16_t)
-__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-                 int16x8_t)
-__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-                 int32x4_t)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-                 int64x2_t)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (__a <= __b);
+}

-#undef __ST2_LANE_FUNC
-#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
-__extension__ static __inline void \
-__attribute__ ((__always_inline__)) \
-vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
-                           intype __b, const int __c) \
-{ \
-  union { intype __i; \
-          __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-                                    __ptr, __temp.__o, __c); \
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (__a <= __b);
 }

-__ST2_LANE_FUNC (float16x8x2_t, float16_t,
v8hf, hf, f16) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) (__a <= __b); +} -#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ static __inline void \ -__attribute__ ((__always_inline__)) \ -vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) (__a <= __b); } -__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t) (__a <= __b); +} -#undef __ST3_LANE_FUNC -#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, 
funcsuffix) \ -__extension__ static __inline void \ -__attribute__ ((__always_inline__)) \ -vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t) (__a <= __b); } -__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t) (__a <= __b); +} -#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ static __inline void \ -__attribute__ ((__always_inline__)) \ -vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[3] \ - = vcombine_##funcsuffix (__b.val[3], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], 3); \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t) (__a <= __b); } -__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST4_LANE_FUNC 
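
The __ST2_LANE_FUNC/__ST3_LANE_FUNC macros being removed here generated the vst2_lane_*/vst3_lane_* (and, below, vst4_lane_*) intrinsics, which store lane __c of each vector in the tuple to consecutive elements at __ptr; the d-register forms first widen the halves into q-registers with vcombine because the lane-store builtins operate on q-sized modes. A usage sketch, illustrative and assuming <arm_neon.h> on AArch64:

#include <arm_neon.h>

/* Stores pair.val[0][1] to out[0] and pair.val[1][1] to out[1].  */
void
store_second_lane (int32_t *out, int32x2x2_t pair)
{
  vst2_lane_s32 (out, pair, 1);
}
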
(int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (__a <= __b); +} -#undef __ST4_LANE_FUNC -#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ static __inline void \ -__attribute__ ((__always_inline__)) \ -vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (__a <= __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (__a <= __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (__a <= __b); } -__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +/* vcle - scalar. */ -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vaddlv_s32 (int32x2_t a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcles_f32 (float32_t __a, float32_t __b) { - int64_t result; - __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __a <= __b ? -1 : 0; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vaddlv_u32 (uint32x2_t a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcled_s64 (int64_t __a, int64_t __b) { - uint64_t result; - __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __a <= __b ? 
-1ll : 0ll; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcled_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); + return __a <= __b ? -1ll : 0ll; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcled_f64 (float64_t __a, float64_t __b) { - return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); + return __a <= __b ? -1ll : 0ll; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +/* vclez - vector. */ + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_f32 (float32x2_t __a) { - return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); + return (uint32x2_t) (__a <= 0.0f); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_f64 (float64x1_t __a) { - return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); + return (uint64x1_t) (__a <= (float64x1_t) {0.0}); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_s8 (int8x8_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); + return (uint8x8_t) (__a <= 0); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_s16 (int16x4_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); + return (uint16x4_t) (__a <= 0); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_s32 (int32x2_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c); + return (uint32x2_t) (__a <= 0); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_s64 (int64x1_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); + return (uint64x1_t) (__a <= __AARCH64_INT64_C (0)); } -/* Table intrinsics. 
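
As with the vector forms, the scalar comparisons (vcled_s64 and friends) return an all-ones 64-bit value, i.e. UINT64_MAX, rather than 1, mirroring what the CMGE/CMLE instructions leave in a register. A sketch of how callers typically consume the mask (illustrative, not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative: convert the all-ones/zero mask into a C boolean.  */
int
le64 (int64_t __a, int64_t __b)
{
  return vcled_s64 (__a, __b) == UINT64_MAX;
}
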
*/ - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_f32 (float32x4_t __a) { - poly8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint32x4_t) (__a <= 0.0f); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl1_s8 (int8x16_t a, uint8x8_t b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_f64 (float64x2_t __a) { - int8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint64x2_t) (__a <= 0.0); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_s8 (int8x16_t __a) { - uint8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x16_t) (__a <= 0); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_s16 (int16x8_t __a) { - poly8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x8_t) (__a <= 0); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_s8 (int8x16_t a, uint8x16_t b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_s32 (int32x4_t __a) { - int8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint32x4_t) (__a <= 0); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_s64 (int64x2_t __a) { - uint8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint64x2_t) (__a <= __AARCH64_INT64_C (0)); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx) +/* vclez - scalar. */ + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezs_f32 (float32_t __a) { - int8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __a <= 0.0f ? -1 : 0; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezd_s64 (int64_t __a) { - uint8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __a <= 0 ? 
-1ll : 0ll; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezd_f64 (float64_t __a) { - poly8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __a <= 0.0 ? -1ll : 0ll; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx) +/* vclt - vector. */ + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_f32 (float32x2_t __a, float32x2_t __b) { - int8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint32x2_t) (__a < __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_f64 (float64x1_t __a, float64x1_t __b) { - uint8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint64x1_t) (__a < __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_s8 (int8x8_t __a, int8x8_t __b) { - poly8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint8x8_t) (__a < __b); } -/* V7 legacy table intrinsics. 
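
The vqtbl1*/vqtbx1* asm bodies deleted above are the single-register TBL/TBX table lookups: each index byte selects one byte from the 16-byte table, and an out-of-range index produces 0 for TBL but leaves the destination byte unchanged for TBX. A sketch of the TBL behaviour (illustrative, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Reverse bytes 0..7 of a 16-byte table; indices >= 16 would yield 0.  */
uint8x8_t
reverse_low_bytes (uint8x16_t table)
{
  const uint8x8_t idx = {7, 6, 5, 4, 3, 2, 1, 0};
  return vqtbl1_u8 (table, idx);
}
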
*/ +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t) (__a < __b); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl1_s8 (int8x8_t tab, int8x8_t idx) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_s32 (int32x2_t __a, int32x2_t __b) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (uint32x2_t) (__a < __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_s64 (int64x1_t __a, int64x1_t __b) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (uint64x1_t) (__a < __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_u8 (uint8x8_t __a, uint8x8_t __b) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_u16 (uint16x4_t __a, uint16x4_t __b) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_u32 (uint32x2_t __a, uint32x2_t __b) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_u64 (uint64x1_t __a, uint64x1_t __b) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_f32 (float32x4_t __a, float32x4_t __b) { - int8x8_t result; - 
int8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi (__o, idx); - return result; + return (uint32x4_t) (__a < __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_f64 (float64x2_t __a, float64x2_t __b) { - uint8x8_t result; - uint8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + return (uint64x2_t) (__a < __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_s8 (int8x16_t __a, int8x16_t __b) { - poly8x8_t result; - poly8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + return (uint8x16_t) (__a < __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_s16 (int16x8_t __a, int16x8_t __b) { - int8x8_t result; - int8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi (__o, idx); - return result; + return (uint16x8_t) (__a < __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_s32 (int32x4_t __a, int32x4_t __b) { - uint8x8_t result; - uint8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + return (uint32x4_t) (__a < __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl4_p8 (poly8x8x4_t tab, uint8x8_t 
idx) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_s64 (int64x2_t __a, int64x2_t __b) { - poly8x8_t result; - poly8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + return (uint64x2_t) (__a < __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_u8 (uint8x16_t __a, uint8x16_t __b) { - int8x8_t result = r; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint8x8_t result = r; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_u32 (uint32x4_t __a, uint32x4_t __b) { - poly8x8_t result = r; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (__a < __b); } -/* End of temporary inline asm. */ +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (__a < __b); +} -/* Start of optimal implementations in approved order. */ +/* vclt - scalar. */ -/* vabs */ +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclts_f32 (float32_t __a, float32_t __b) +{ + return __a < __b ? -1 : 0; +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vabs_f32 (float32x2_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltd_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_absv2sf (__a); + return __a < __b ? -1ll : 0ll; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vabs_f64 (float64x1_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltd_u64 (uint64_t __a, uint64_t __b) { - return (float64x1_t) {__builtin_fabs (__a[0])}; + return __a < __b ? 
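
vtbl3/vtbl4 are implemented by packing the 8-byte table parts into two q-registers and calling __builtin_aarch64_tbl3v8qi, while the vtbx2 forms keep TBX semantics: an index outside 0..15 leaves the corresponding byte of the first operand untouched. A sketch of that fallback behaviour (illustrative, not part of the patch):

#include <arm_neon.h>

/* Illustrative: bytes selected from tab by idx, with r as the fallback.  */
uint8x8_t
lookup_or_keep (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
{
  return vtbx2_u8 (r, tab, idx);
}
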
-1ll : 0ll; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vabs_s8 (int8x8_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltd_f64 (float64_t __a, float64_t __b) { - return __builtin_aarch64_absv8qi (__a); + return __a < __b ? -1ll : 0ll; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vabs_s16 (int16x4_t __a) +/* vcltz - vector. */ + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_f32 (float32x2_t __a) { - return __builtin_aarch64_absv4hi (__a); + return (uint32x2_t) (__a < 0.0f); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vabs_s32 (int32x2_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_f64 (float64x1_t __a) { - return __builtin_aarch64_absv2si (__a); + return (uint64x1_t) (__a < (float64x1_t) {0.0}); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vabs_s64 (int64x1_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_s8 (int8x8_t __a) { - return (int64x1_t) {__builtin_aarch64_absdi (__a[0])}; + return (uint8x8_t) (__a < 0); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vabsq_f32 (float32x4_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_s16 (int16x4_t __a) { - return __builtin_aarch64_absv4sf (__a); + return (uint16x4_t) (__a < 0); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vabsq_f64 (float64x2_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_s32 (int32x2_t __a) { - return __builtin_aarch64_absv2df (__a); + return (uint32x2_t) (__a < 0); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vabsq_s8 (int8x16_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltz_s64 (int64x1_t __a) { - return __builtin_aarch64_absv16qi (__a); + return (uint64x1_t) (__a < __AARCH64_INT64_C (0)); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vabsq_s16 (int16x8_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_f32 (float32x4_t __a) { - return __builtin_aarch64_absv8hi (__a); + return (uint32x4_t) (__a < 0.0f); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vabsq_s32 (int32x4_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_f64 (float64x2_t __a) { - return __builtin_aarch64_absv4si (__a); + return (uint64x2_t) (__a < 0.0); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vabsq_s64 (int64x2_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_s8 (int8x16_t __a) { - return __builtin_aarch64_absv2di (__a); + return (uint8x16_t) (__a < 0); } -/* vadd */ +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_s16 (int16x8_t __a) +{ + return (uint16x8_t) (__a < 0); +} -__extension__ static __inline 
int64_t __attribute__ ((__always_inline__)) -vaddd_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_s32 (int32x4_t __a) { - return __a + __b; + return (uint32x4_t) (__a < 0); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vaddd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_s64 (int64x2_t __a) { - return __a + __b; + return (uint64x2_t) (__a < __AARCH64_INT64_C (0)); } -/* vaddv */ +/* vcltz - scalar. */ -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vaddv_s8 (int8x8_t __a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzs_f32 (float32_t __a) { - return __builtin_aarch64_reduc_plus_scal_v8qi (__a); + return __a < 0.0f ? -1 : 0; } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vaddv_s16 (int16x4_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzd_s64 (int64_t __a) { - return __builtin_aarch64_reduc_plus_scal_v4hi (__a); + return __a < 0 ? -1ll : 0ll; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddv_s32 (int32x2_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzd_f64 (float64_t __a) { - return __builtin_aarch64_reduc_plus_scal_v2si (__a); + return __a < 0.0 ? -1ll : 0ll; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vaddv_u8 (uint8x8_t __a) +/* vcls. */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_s8 (int8x8_t __a) { - return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a); + return __builtin_aarch64_clrsbv8qi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vaddv_u16 (uint16x4_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_s16 (int16x4_t __a) { - return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a); + return __builtin_aarch64_clrsbv4hi (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddv_u32 (uint32x2_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_s32 (int32x2_t __a) { - return (int32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a); + return __builtin_aarch64_clrsbv2si (__a); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vaddvq_s8 (int8x16_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_s8 (int8x16_t __a) { - return __builtin_aarch64_reduc_plus_scal_v16qi (__a); + return __builtin_aarch64_clrsbv16qi (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vaddvq_s16 (int16x8_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_s16 (int16x8_t __a) { - return __builtin_aarch64_reduc_plus_scal_v8hi (__a); + return __builtin_aarch64_clrsbv8hi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddvq_s32 (int32x4_t __a) +__extension__ extern __inline 
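
vaddd_* is ordinary 64-bit scalar addition; the vaddv/vaddvq reductions sum every lane into a single scalar (the integer forms wrap modulo the element width), now routed through __builtin_aarch64_reduc_plus_scal_* instead of hand-written ADDV/ADDLP asm. A usage sketch (illustrative, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Illustrative: v[0] + v[1] + v[2] + v[3] as one int32_t.  */
int32_t
horizontal_sum (int32x4_t v)
{
  return vaddvq_s32 (v);
}
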
int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_s32 (int32x4_t __a) { - return __builtin_aarch64_reduc_plus_scal_v4si (__a); + return __builtin_aarch64_clrsbv4si (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vaddvq_s64 (int64x2_t __a) +/* vclz. */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s8 (int8x8_t __a) { - return __builtin_aarch64_reduc_plus_scal_v2di (__a); + return __builtin_aarch64_clzv8qi (__a); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vaddvq_u8 (uint8x16_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s16 (int16x4_t __a) { - return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a); + return __builtin_aarch64_clzv4hi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vaddvq_u16 (uint16x8_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s32 (int32x2_t __a) { - return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a); + return __builtin_aarch64_clzv2si (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddvq_u32 (uint32x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u8 (uint8x8_t __a) { - return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a); + return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vaddvq_u64 (uint64x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u16 (uint16x4_t __a) { - return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a); + return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vaddv_f32 (float32x2_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u32 (uint32x2_t __a) { - return __builtin_aarch64_reduc_plus_scal_v2sf (__a); + return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vaddvq_f32 (float32x4_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclzq_s8 (int8x16_t __a) { - return __builtin_aarch64_reduc_plus_scal_v4sf (__a); + return __builtin_aarch64_clzv16qi (__a); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vaddvq_f64 (float64x2_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclzq_s16 (int16x8_t __a) { - return __builtin_aarch64_reduc_plus_scal_v2df (__a); + return __builtin_aarch64_clzv8hi (__a); } -/* vbsl */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclzq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_clzv4si (__a); +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vclzq_u8 (uint8x16_t __a) { - return __builtin_aarch64_simd_bslv2sf_suss (__a, __b, __c); + return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vbsl_f64 (uint64x1_t __a, float64x1_t __b, float64x1_t __c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclzq_u16 (uint16x8_t __a) { - return (float64x1_t) - { __builtin_aarch64_simd_bsldf_suss (__a[0], __b[0], __c[0]) }; + return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclzq_u32 (uint32x4_t __a) { - return __builtin_aarch64_simd_bslv8qi_pupp (__a, __b, __c); + return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) +/* vcnt. */ + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcnt_p8 (poly8x8_t __a) { - return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c); + return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcnt_s8 (int8x8_t __a) { - return __builtin_aarch64_simd_bslv8qi_suss (__a, __b, __c); + return __builtin_aarch64_popcountv8qi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcnt_u8 (uint8x8_t __a) { - return __builtin_aarch64_simd_bslv4hi_suss (__a, __b, __c); + return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcntq_p8 (poly8x16_t __a) { - return __builtin_aarch64_simd_bslv2si_suss (__a, __b, __c); + return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcntq_s8 (int8x16_t __a) { - return (int64x1_t) - {__builtin_aarch64_simd_bsldi_suss (__a[0], __b[0], __c[0])}; + return __builtin_aarch64_popcountv16qi (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcntq_u8 (uint8x16_t __a) { - return __builtin_aarch64_simd_bslv8qi_uuuu (__a, __b, __c); + return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vbsl_u16 (uint16x4_t __a, 
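
vcls_* (CLS) counts the bits following the sign bit that are equal to it, vclz_* (CLZ) counts leading zero bits, and vcnt_* (CNT) is a per-byte population count; all three families now map to generic builtins (clrsb, clz, popcount) rather than asm. A combined sketch (illustrative, not part of the patch):

#include <arm_neon.h>

/* Illustrative: total number of set bits across all 8 bytes of v.  */
unsigned
popcount_lanes (uint8x8_t v)
{
  return vaddv_u8 (vcnt_u8 (v));   /* per-byte CNT, then lane sum */
}
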
uint16x4_t __b, uint16x4_t __c) +/* vcopy_lane. */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_f32 (float32x2_t __a, const int __lane1, + float32x2_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv4hi_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_f64 (float64x1_t __a, const int __lane1, + float64x1_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv2si_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_p8 (poly8x8_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) { - return (uint64x1_t) - {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])}; + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_p16 (poly16x4_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv4sf_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vbslq_f64 (uint64x2_t __a, float64x2_t __b, float64x2_t __c) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_p64 (poly64x1_t __a, const int __lane1, + poly64x1_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv2df_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_s8 (int8x8_t __a, const int __lane1, + int8x8_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv16qi_pupp (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_s16 (int16x4_t __a, const int __lane1, + int16x4_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv8hi_pupp (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c) +__extension__ 
extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_s32 (int32x2_t __a, const int __lane1, + int32x2_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv16qi_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_s64 (int64x1_t __a, const int __lane1, + int64x1_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_u8 (uint8x8_t __a, const int __lane1, + uint8x8_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv4si_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_u16 (uint16x4_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv2di_suss (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_u32 (uint32x2_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv16qi_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_u64 (uint64x1_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv8hi_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +/* vcopy_laneq. 
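
The vcopy_lane_* intrinsics (and the vcopy_laneq_*/vcopyq_lane_* variants below) are now expressed as a get-lane/set-lane pair: lane __lane2 of __b replaces lane __lane1 of __a, leaving every other lane of __a intact. A usage sketch (illustrative, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Illustrative: dst with lane 0 replaced by lane 3 of src.  */
int16x4_t
copy_one_lane (int16x4_t dst, int16x4_t src)
{
  return vcopy_lane_s16 (dst, 0, src, 3);
}
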
*/ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_f32 (float32x2_t __a, const int __lane1, + float32x4_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv4si_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_f64 (float64x1_t __a, const int __lane1, + float64x2_t __b, const int __lane2) { - return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -/* ARMv8.1 instrinsics. */ -#pragma GCC push_options -#pragma GCC target ("arch=armv8.1-a") +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_p8 (poly8x8_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_p16 (poly16x4_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_p64 (poly64x1_t __a, const int __lane1, + poly64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_s8 (int8x8_t __a, const int __lane1, + int8x16_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_s16 (int16x4_t __a, const int __lane1, + int16x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_s32 (int32x2_t __a, const int __lane1, + int32x4_t __b, const int __lane2) { - return 
__builtin_aarch64_sqrdmlshv4hi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_s64 (int64x1_t __a, const int __lane1, + int64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_u8 (uint8x8_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_u16 (uint16x4_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlah_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_u32 (uint32x2_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqv4hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmlah_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_u64 (uint64x1_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqv2si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlahq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d) +/* vcopyq_lane. 
*/ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_f32 (float32x4_t __a, const int __lane1, + float32x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqv8hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlahq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_f64 (float64x2_t __a, const int __lane1, + float64x1_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqv4si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_p8 (poly8x16_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_p16 (poly16x8_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_p64 (poly64x2_t __a, const int __lane1, + poly64x1_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_s8 (int8x16_t __a, const int __lane1, + int8x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_s16 (int16x8_t __a, const int __lane1, + int16x4_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ 
((__always_inline__)) -vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_s32 (int32x4_t __a, const int __lane1, + int32x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_s64 (int64x2_t __a, const int __lane1, + int64x1_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_u8 (uint8x16_t __a, const int __lane1, + uint8x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_u16 (uint16x8_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) { - return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_u32 (uint32x4_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmlahh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_u64 (uint64x2_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqhi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlahs_s32 (int32_t __a, int32_t __b, int32_t __c) +/* vcopyq_laneq. 
*/ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_f32 (float32x4_t __a, const int __lane1, + float32x4_t __b, const int __lane2) { - return (int32_t) __builtin_aarch64_sqrdmlahsi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlahs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_f64 (float64x2_t __a, const int __lane1, + float64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_lanesi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlahs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlah_laneqsi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanev4hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_p64 (poly64x2_t __a, const int __lane1, + poly64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanev2si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_s8 (int8x16_t __a, const int __lane1, + int8x16_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanev8hi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_s16 (int16x8_t __a, const int __lane1, + int16x8_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanev4si (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ 
((__always_inline__)) -vqrdmlshh_s16 (int16_t __a, int16_t __b, int16_t __c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_s32 (int32x4_t __a, const int __lane1, + int32x4_t __b, const int __lane2) { - return (int16_t) __builtin_aarch64_sqrdmlshhi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmlshh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_s64 (int64x2_t __a, const int __lane1, + int64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanehi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmlshh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqhi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlshs_s32 (int32_t __a, int32_t __b, int32_t __c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) { - return (int32_t) __builtin_aarch64_sqrdmlshsi (__a, __b, __c); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlshs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_lanesi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmlshs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) { - return __builtin_aarch64_sqrdmlsh_laneqsi (__a, __b, __c, __d); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -#pragma GCC pop_options -#pragma GCC push_options -#pragma GCC target ("+nothing+crypto") -/* vaes */ +/* vcvt (double -> float). 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaeseq_u8 (uint8x16_t data, uint8x16_t key)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f16_f32 (float32x4_t __a)
 {
- return __builtin_aarch64_crypto_aesev16qi_uuu (data, key);
+ return __builtin_aarch64_float_truncate_lo_v4hf (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesdq_u8 (uint8x16_t data, uint8x16_t key)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
 {
- return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key);
+ return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesmcq_u8 (uint8x16_t data)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_f64 (float64x2_t __a)
 {
- return __builtin_aarch64_crypto_aesmcv16qi_uu (data);
+ return __builtin_aarch64_float_truncate_lo_v2sf (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesimcq_u8 (uint8x16_t data)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
 {
- return __builtin_aarch64_crypto_aesimcv16qi_uu (data);
+ return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
 }
-#pragma GCC pop_options
-/* vcage */
+/* vcvt (float -> double). */
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcage_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_f16 (float16x4_t __a)
 {
- return vabs_f64 (__a) >= vabs_f64 (__b);
+ return __builtin_aarch64_float_extend_lo_v4sf (__a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcages_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f64_f32 (float32x2_t __a)
 {
- return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
+
+ return __builtin_aarch64_float_extend_lo_v2df (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcage_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_high_f32_f16 (float16x8_t __a)
 {
- return vabs_f32 (__a) >= vabs_f32 (__b);
+ return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcageq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_high_f64_f32 (float32x4_t __a)
 {
- return vabsq_f32 (__a) >= vabsq_f32 (__b);
+ return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaged_f64 (float64_t __a, float64_t __b)
+/* vcvt (fixed-point -> float). */
+
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_n_f64_s64 (int64_t __a, const int __b)
 {
- return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_scvtfdi (__a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcageq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_n_f64_u64 (uint64_t __a, const int __b)
 {
- return vabsq_f64 (__a) >= vabsq_f64 (__b);
+ return __builtin_aarch64_ucvtfdi_sus (__a, __b);
 }
-/* vcagt */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcagts_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_n_f32_s32 (int32_t __a, const int __b)
 {
- return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_scvtfsi (__a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcagt_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_n_f32_u32 (uint32_t __a, const int __b)
 {
- return vabs_f32 (__a) > vabs_f32 (__b);
+ return __builtin_aarch64_ucvtfsi_sus (__a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcagt_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
 {
- return vabs_f64 (__a) > vabs_f64 (__b);
+ return __builtin_aarch64_scvtfv2si (__a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcagtq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
 {
- return vabsq_f32 (__a) > vabsq_f32 (__b);
+ return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcagtd_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f64_s64 (int64x1_t __a, const int __b)
 {
- return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
+ return (float64x1_t)
+ { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) };
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcagtq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f64_u64 (uint64x1_t __a, const int __b)
 {
- return vabsq_f64 (__a) > vabsq_f64 (__b);
+ return (float64x1_t)
+ { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) };
 }
-/* vcale */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcale_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
 {
- return vabs_f32 (__a) <= vabs_f32 (__b);
+ return __builtin_aarch64_scvtfv4si (__a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcale_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
 {
- return vabs_f64 (__a) <= vabs_f64 (__b);
+ return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaled_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
 {
- return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_scvtfv2di (__a, __b);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcales_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
 {
- return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+/* vcvt (float -> fixed-point). */
+
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_n_s64_f64 (float64_t __a, const int __b)
 {
- return vabsq_f32 (__a) <= vabsq_f32 (__b);
+ return __builtin_aarch64_fcvtzsdf (__a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcaleq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_n_u64_f64 (float64_t __a, const int __b)
 {
- return vabsq_f64 (__a) <= vabsq_f64 (__b);
+ return __builtin_aarch64_fcvtzudf_uss (__a, __b);
 }
-/* vcalt */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcalt_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_n_s32_f32 (float32_t __a, const int __b)
 {
- return vabs_f32 (__a) < vabs_f32 (__b);
+ return __builtin_aarch64_fcvtzssf (__a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcalt_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_n_u32_f32 (float32_t __a, const int __b)
 {
- return vabs_f64 (__a) < vabs_f64 (__b);
+ return __builtin_aarch64_fcvtzusf_uss (__a, __b);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaltd_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
 {
- return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_fcvtzsv2sf (__a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaltq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
 {
- return vabsq_f32 (__a) < vabsq_f32 (__b);
+ return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcaltq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_s64_f64 (float64x1_t __a, const int __b)
 {
- return vabsq_f64 (__a) < vabsq_f64 (__b);
+ return (int64x1_t)
+ { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) };
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcalts_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_u64_f64 (float64x1_t __a, const int __b)
 {
- return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
+ return (uint64x1_t)
+ { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) };
 }
-/* vceq - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
 {
- return (uint32x2_t) (__a == __b);
+ return __builtin_aarch64_fcvtzsv4sf (__a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
 {
- return (uint64x1_t) (__a == __b);
+ return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
 {
- return (uint8x8_t) (__a == __b);
+ return __builtin_aarch64_fcvtzsv2df (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
 {
- return (uint8x8_t) (__a == __b);
+ return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_s16 (int16x4_t __a, int16x4_t __b)
+/* vcvt (int -> float) */
+
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_f64_s64 (int64_t __a)
 {
- return (uint16x4_t) (__a == __b);
+ return (float64_t) __a;
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_f64_u64 (uint64_t __a)
 {
- return (uint32x2_t) (__a == __b);
+ return (float64_t) __a;
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_f32_s32 (int32_t __a)
 {
- return (uint64x1_t) (__a == __b);
+ return (float32_t) __a;
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_f32_u32 (uint32_t __a)
 {
- return (__a == __b);
+ return (float32_t) __a;
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_s32 (int32x2_t __a)
 {
- return (__a == __b);
+ return __builtin_aarch64_floatv2siv2sf (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_u32 (uint32x2_t __a)
 {
- return (__a == __b);
+ return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f64_s64 (int64x1_t __a)
 {
- return (__a == __b);
+ return (float64x1_t) { vget_lane_s64 (__a, 0) };
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f64_u64 (uint64x1_t __a)
 {
- return (uint32x4_t) (__a == __b);
+ return (float64x1_t) { vget_lane_u64 (__a, 0) };
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f32_s32 (int32x4_t __a)
 {
- return (uint64x2_t) (__a == __b);
+ return __builtin_aarch64_floatv4siv4sf (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f32_u32 (uint32x4_t __a)
 {
- return (uint8x16_t) (__a == __b);
+ return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f64_s64 (int64x2_t __a)
 {
- return (uint8x16_t) (__a == __b);
+ return __builtin_aarch64_floatv2div2df (__a);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f64_u64 (uint64x2_t __a)
 {
- return (uint16x8_t) (__a == __b);
+ return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_s32 (int32x4_t __a, int32x4_t __b)
+/* vcvt (float -> int) */
+
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_s64_f64 (float64_t __a)
 {
- return (uint32x4_t) (__a == __b);
+ return (int64_t) __a;
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtd_u64_f64 (float64_t __a)
 {
- return (uint64x2_t) (__a == __b);
+ return (uint64_t) __a;
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_s32_f32 (float32_t __a)
 {
- return (__a == __b);
+ return (int32_t) __a;
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvts_u32_f32 (float32_t __a)
 {
- return (__a == __b);
+ return (uint32_t) __a;
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_s32_f32 (float32x2_t __a)
 {
- return (__a == __b);
+ return __builtin_aarch64_lbtruncv2sfv2si (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_u32_f32 (float32x2_t __a)
 {
- return (__a == __b);
+ return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
 }
-/* vceq - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vceqs_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_s32_f32 (float32x4_t __a)
 {
- return __a == __b ? -1 : 0;
+ return __builtin_aarch64_lbtruncv4sfv4si (__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_u32_f32 (float32x4_t __a)
 {
- return __a == __b ? -1ll : 0ll;
+ return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_s64_f64 (float64x1_t __a)
 {
- return __a == __b ? -1ll : 0ll;
+ return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_u64_f64 (float64x1_t __a)
 {
- return __a == __b ? -1ll : 0ll;
+ return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
 }
-/* vceqz - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_f32 (float32x2_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_s64_f64 (float64x2_t __a)
 {
- return (uint32x2_t) (__a == 0.0f);
+ return __builtin_aarch64_lbtruncv2dfv2di (__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_f64 (float64x1_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_u64_f64 (float64x2_t __a)
 {
- return (uint64x1_t) (__a == (float64x1_t) {0.0});
+ return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_p8 (poly8x8_t __a)
+/* vcvta */
+
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtad_s64_f64 (float64_t __a)
 {
- return (uint8x8_t) (__a == 0);
+ return __builtin_aarch64_lrounddfdi (__a);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_s8 (int8x8_t __a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtad_u64_f64 (float64_t __a)
 {
- return (uint8x8_t) (__a == 0);
+ return __builtin_aarch64_lroundudfdi_us (__a);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceqz_s16 (int16x4_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtas_s32_f32 (float32_t __a)
 {
- return (uint16x4_t) (__a == 0);
+ return __builtin_aarch64_lroundsfsi (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_s32 (int32x2_t __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtas_u32_f32 (float32_t __a)
 {
- return (uint32x2_t) (__a == 0);
+ return __builtin_aarch64_lroundusfsi_us (__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_s64 (int64x1_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_s32_f32 (float32x2_t __a)
 {
- return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
+ return __builtin_aarch64_lroundv2sfv2si (__a);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_u8 (uint8x8_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_u32_f32 (float32x2_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lrounduv2sfv2si_us (__a);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceqz_u16 (uint16x4_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_s32_f32 (float32x4_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lroundv4sfv4si (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_u32 (uint32x2_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_u32_f32 (float32x4_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lrounduv4sfv4si_us (__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_u64 (uint64x1_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_s64_f64 (float64x1_t __a)
 {
- return (__a == __AARCH64_UINT64_C (0));
+ return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_f32 (float32x4_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_u64_f64 (float64x1_t __a)
 {
- return (uint32x4_t) (__a == 0.0f);
+ return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_f64 (float64x2_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_s64_f64 (float64x2_t __a)
 {
- return (uint64x2_t) (__a == 0.0f);
+ return __builtin_aarch64_lroundv2dfv2di (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_p8 (poly8x16_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_u64_f64 (float64x2_t __a)
 {
- return (uint8x16_t) (__a == 0);
+ return __builtin_aarch64_lrounduv2dfv2di_us (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_s8 (int8x16_t __a)
+/* vcvtm */
+
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmd_s64_f64 (float64_t __a)
 {
- return (uint8x16_t) (__a == 0);
+ return __builtin_llfloor (__a);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqzq_s16 (int16x8_t __a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmd_u64_f64 (float64_t __a)
 {
- return (uint16x8_t) (__a == 0);
+ return __builtin_aarch64_lfloorudfdi_us (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_s32 (int32x4_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtms_s32_f32 (float32_t __a)
 {
- return (uint32x4_t) (__a == 0);
+ return __builtin_ifloorf (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_s64 (int64x2_t __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtms_u32_f32 (float32_t __a)
 {
- return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
+ return __builtin_aarch64_lfloorusfsi_us (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_u8 (uint8x16_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_s32_f32 (float32x2_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lfloorv2sfv2si (__a);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqzq_u16 (uint16x8_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_u32_f32 (float32x2_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lflooruv2sfv2si_us (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_u32 (uint32x4_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_s32_f32 (float32x4_t __a)
 {
- return (__a == 0);
+ return __builtin_aarch64_lfloorv4sfv4si (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_u64 (uint64x2_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_u32_f32 (float32x4_t __a)
 {
- return (__a == __AARCH64_UINT64_C (0));
+ return __builtin_aarch64_lflooruv4sfv4si_us (__a);
 }
-/* vceqz - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vceqzs_f32 (float32_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_s64_f64 (float64x1_t __a)
 {
- return __a == 0.0f ? -1 : 0;
+ return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_s64 (int64_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_u64_f64 (float64x1_t __a)
 {
- return __a == 0 ? -1ll : 0ll;
+ return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_u64 (uint64_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_s64_f64 (float64x2_t __a)
 {
- return __a == 0 ? -1ll : 0ll;
+ return __builtin_aarch64_lfloorv2dfv2di (__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_f64 (float64_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_u64_f64 (float64x2_t __a)
 {
- return __a == 0.0 ? -1ll : 0ll;
+ return __builtin_aarch64_lflooruv2dfv2di_us (__a);
 }
-/* vcge - vector. */
+/* vcvtn */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnd_s64_f64 (float64_t __a)
 {
- return (uint32x2_t) (__a >= __b);
+ return __builtin_aarch64_lfrintndfdi (__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnd_u64_f64 (float64_t __a)
 {
- return (uint64x1_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnudfdi_us (__a);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtns_s32_f32 (float32_t __a)
 {
- return (uint8x8_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnsfsi (__a);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtns_u32_f32 (float32_t __a)
 {
- return (uint16x4_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnusfsi_us (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_s32_f32 (float32x2_t __a)
 {
- return (uint32x2_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnv2sfv2si (__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_u32_f32 (float32x2_t __a)
 {
- return (uint64x1_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_s32_f32 (float32x4_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lfrintnv4sfv4si (__a);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_u32_f32 (float32x4_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_s64_f64 (float64x1_t __a)
 {
- return (__a >= __b);
+ return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_u64_f64 (float64x1_t __a)
 {
- return (__a >= __b);
+ return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_s64_f64 (float64x2_t __a)
 {
- return (uint32x4_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnv2dfv2di (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_u64_f64 (float64x2_t __a)
 {
- return (uint64x2_t) (__a >= __b);
+ return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+/* vcvtp */
+
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpd_s64_f64 (float64_t __a)
 {
- return (uint8x16_t) (__a >= __b);
+ return __builtin_llceil (__a);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpd_u64_f64 (float64_t __a)
 {
- return (uint16x8_t) (__a >= __b);
+ return __builtin_aarch64_lceiludfdi_us (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtps_s32_f32 (float32_t __a)
 {
- return (uint32x4_t) (__a >= __b);
+ return __builtin_iceilf (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtps_u32_f32 (float32_t __a)
 {
- return (uint64x2_t) (__a >= __b);
+ return __builtin_aarch64_lceilusfsi_us (__a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_s32_f32 (float32x2_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lceilv2sfv2si (__a);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_u32_f32 (float32x2_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lceiluv2sfv2si_us (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_s32_f32 (float32x4_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lceilv4sfv4si (__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_u32_f32 (float32x4_t __a)
 {
- return (__a >= __b);
+ return __builtin_aarch64_lceiluv4sfv4si_us (__a);
 }
-/* vcge - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcges_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_s64_f64 (float64x1_t __a)
 {
- return __a >= __b ? -1 : 0;
+ return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_u64_f64 (float64x1_t __a)
 {
- return __a >= __b ? -1ll : 0ll;
+ return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_s64_f64 (float64x2_t __a)
 {
- return __a >= __b ? -1ll : 0ll;
+ return __builtin_aarch64_lceilv2dfv2di (__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_u64_f64 (float64x2_t __a)
 {
- return __a >= __b ? -1ll : 0ll;
+ return __builtin_aarch64_lceiluv2dfv2di_us (__a);
 }
-/* vcgez - vector. */
*/ +/* vdup_n */ -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgez_f32 (float32x2_t __a) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_f16 (float16_t __a) { - return (uint32x2_t) (__a >= 0.0f); + return (float16x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgez_f64 (float64x1_t __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_f32 (float32_t __a) { - return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0}); + return (float32x2_t) {__a, __a}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgez_s8 (int8x8_t __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_f64 (float64_t __a) { - return (uint8x8_t) (__a >= 0); + return (float64x1_t) {__a}; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgez_s16 (int16x4_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_p8 (poly8_t __a) { - return (uint16x4_t) (__a >= 0); + return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgez_s32 (int32x2_t __a) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_p16 (poly16_t __a) { - return (uint32x2_t) (__a >= 0); + return (poly16x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgez_s64 (int64x1_t __a) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_p64 (poly64_t __a) { - return (uint64x1_t) (__a >= __AARCH64_INT64_C (0)); + return (poly64x1_t) {__a}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgezq_f32 (float32x4_t __a) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_s8 (int8_t __a) { - return (uint32x4_t) (__a >= 0.0f); + return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgezq_f64 (float64x2_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_s16 (int16_t __a) { - return (uint64x2_t) (__a >= 0.0); + return (int16x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgezq_s8 (int8x16_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_s32 (int32_t __a) { - return (uint8x16_t) (__a >= 0); + return (int32x2_t) {__a, __a}; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgezq_s16 (int16x8_t __a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_s64 (int64_t __a) { - return (uint16x8_t) (__a >= 0); + return (int64x1_t) {__a}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgezq_s32 (int32x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_u8 (uint8_t __a) { - return (uint32x4_t) 
(__a >= 0); + return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgezq_s64 (int64x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_u16 (uint16_t __a) { - return (uint64x2_t) (__a >= __AARCH64_INT64_C (0)); + return (uint16x4_t) {__a, __a, __a, __a}; } -/* vcgez - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgezs_f32 (float32_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_u32 (uint32_t __a) { - return __a >= 0.0f ? -1 : 0; + return (uint32x2_t) {__a, __a}; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgezd_s64 (int64_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_u64 (uint64_t __a) { - return __a >= 0 ? -1ll : 0ll; + return (uint64x1_t) {__a}; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgezd_f64 (float64_t __a) +/* vdupq_n */ + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_f16 (float16_t __a) { - return __a >= 0.0 ? -1ll : 0ll; + return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -/* vcgt - vector. */ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_f32 (float32x2_t __a, float32x2_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_f32 (float32_t __a) { - return (uint32x2_t) (__a > __b); + return (float32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_f64 (float64x1_t __a, float64x1_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_f64 (float64_t __a) { - return (uint64x1_t) (__a > __b); + return (float64x2_t) {__a, __a}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgt_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_p8 (uint32_t __a) { - return (uint8x8_t) (__a > __b); + return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgt_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_p16 (uint32_t __a) { - return (uint16x4_t) (__a > __b); + return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_p64 (uint64_t __a) { - return (uint32x2_t) (__a > __b); + return (poly64x2_t) {__a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_s8 (int32_t __a) { - return (uint64x1_t) (__a > __b); + return (int8x16_t) {__a, __a, 
__a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgt_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_s16 (int32_t __a) { - return (__a > __b); + return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgt_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_s32 (int32_t __a) { - return (__a > __b); + return (int32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_s64 (int64_t __a) { - return (__a > __b); + return (int64x2_t) {__a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_u8 (uint32_t __a) { - return (__a > __b); + return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_u16 (uint32_t __a) { - return (uint32x4_t) (__a > __b); + return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_u32 (uint32_t __a) { - return (uint64x2_t) (__a > __b); + return (uint32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_u64 (uint64_t __a) { - return (uint8x16_t) (__a > __b); + return (uint64x2_t) {__a, __a}; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtq_s16 (int16x8_t __a, int16x8_t __b) +/* vdup_lane */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_f16 (float16x4_t __a, const int __b) { - return (uint16x8_t) (__a > __b); + return __aarch64_vdup_lane_f16 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_f32 (float32x2_t __a, const int __b) { - return (uint32x4_t) (__a > __b); + return __aarch64_vdup_lane_f32 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_f64 (float64x1_t __a, const int __b) { - return (uint64x2_t) (__a > 
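
The vdup_n/vdupq_n definitions above broadcast one scalar into every lane using plain vector initializers, which the compiler can constant-fold; and because they are now extern __inline with __gnu_inline__, no out-of-line copy is emitted, the bodies exist only for inlining. A minimal usage sketch, assuming an AArch64 target (values illustrative):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Broadcast one scalar into every lane.  */
  int32x4_t v = vdupq_n_s32 (7);        /* {7, 7, 7, 7} */
  float32x2_t f = vdup_n_f32 (1.5f);    /* {1.5, 1.5} */
  assert (vgetq_lane_s32 (v, 3) == 7);
  assert (f[0] == 1.5f && f[1] == 1.5f);
  return 0;
}
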
__b); + return __aarch64_vdup_lane_f64 (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_p8 (poly8x8_t __a, const int __b) { - return (__a > __b); + return __aarch64_vdup_lane_p8 (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_p16 (poly16x4_t __a, const int __b) { - return (__a > __b); + return __aarch64_vdup_lane_p16 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_p64 (poly64x1_t __a, const int __b) { - return (__a > __b); + return __aarch64_vdup_lane_p64 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_s8 (int8x8_t __a, const int __b) { - return (__a > __b); + return __aarch64_vdup_lane_s8 (__a, __b); } -/* vcgt - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgts_f32 (float32_t __a, float32_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_s16 (int16x4_t __a, const int __b) { - return __a > __b ? -1 : 0; + return __aarch64_vdup_lane_s16 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtd_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_s32 (int32x2_t __a, const int __b) { - return __a > __b ? -1ll : 0ll; + return __aarch64_vdup_lane_s32 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_s64 (int64x1_t __a, const int __b) { - return __a > __b ? -1ll : 0ll; + return __aarch64_vdup_lane_s64 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtd_f64 (float64_t __a, float64_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_u8 (uint8x8_t __a, const int __b) { - return __a > __b ? -1ll : 0ll; + return __aarch64_vdup_lane_u8 (__a, __b); } -/* vcgtz - vector. 
*/ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgtz_f32 (float32x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_u16 (uint16x4_t __a, const int __b) { - return (uint32x2_t) (__a > 0.0f); + return __aarch64_vdup_lane_u16 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtz_f64 (float64x1_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_u32 (uint32x2_t __a, const int __b) { - return (uint64x1_t) (__a > (float64x1_t) {0.0}); + return __aarch64_vdup_lane_u32 (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgtz_s8 (int8x8_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_u64 (uint64x1_t __a, const int __b) { - return (uint8x8_t) (__a > 0); + return __aarch64_vdup_lane_u64 (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgtz_s16 (int16x4_t __a) +/* vdup_laneq */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_f16 (float16x8_t __a, const int __b) { - return (uint16x4_t) (__a > 0); + return __aarch64_vdup_laneq_f16 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgtz_s32 (int32x2_t __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_f32 (float32x4_t __a, const int __b) { - return (uint32x2_t) (__a > 0); + return __aarch64_vdup_laneq_f32 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtz_s64 (int64x1_t __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_f64 (float64x2_t __a, const int __b) { - return (uint64x1_t) (__a > __AARCH64_INT64_C (0)); + return __aarch64_vdup_laneq_f64 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtzq_f32 (float32x4_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_p8 (poly8x16_t __a, const int __b) { - return (uint32x4_t) (__a > 0.0f); + return __aarch64_vdup_laneq_p8 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtzq_f64 (float64x2_t __a) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_p16 (poly16x8_t __a, const int __b) { - return (uint64x2_t) (__a > 0.0); + return __aarch64_vdup_laneq_p16 (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtzq_s8 (int8x16_t __a) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_p64 (poly64x2_t __a, const int __b) { - return (uint8x16_t) (__a > 0); + return __aarch64_vdup_laneq_p64 (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtzq_s16 (int16x8_t __a) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_s8 (int8x16_t __a, const int __b) { - return (uint16x8_t) (__a > 0); + return __aarch64_vdup_laneq_s8 (__a, __b); } -__extension__ 
static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtzq_s32 (int32x4_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_s16 (int16x8_t __a, const int __b) { - return (uint32x4_t) (__a > 0); + return __aarch64_vdup_laneq_s16 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtzq_s64 (int64x2_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_s32 (int32x4_t __a, const int __b) { - return (uint64x2_t) (__a > __AARCH64_INT64_C (0)); + return __aarch64_vdup_laneq_s32 (__a, __b); } -/* vcgtz - scalar. */ +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_s64 (int64x2_t __a, const int __b) +{ + return __aarch64_vdup_laneq_s64 (__a, __b); +} -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgtzs_f32 (float32_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_u8 (uint8x16_t __a, const int __b) { - return __a > 0.0f ? -1 : 0; + return __aarch64_vdup_laneq_u8 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtzd_s64 (int64_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_u16 (uint16x8_t __a, const int __b) { - return __a > 0 ? -1ll : 0ll; + return __aarch64_vdup_laneq_u16 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtzd_f64 (float64_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_u32 (uint32x4_t __a, const int __b) { - return __a > 0.0 ? -1ll : 0ll; + return __aarch64_vdup_laneq_u32 (__a, __b); } -/* vcle - vector. 
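
The _lane forms above broadcast one lane of a 64-bit vector, while the _laneq forms take a 128-bit source; in both cases the lane index must be an integer constant expression. A minimal sketch, assuming an AArch64 target (values illustrative):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int32x4_t q = {10, 11, 12, 13};
  /* _laneq: broadcast lane 2 of a 128-bit vector into a 64-bit result.  */
  int32x2_t d = vdup_laneq_s32 (q, 2);  /* {12, 12} */
  assert (d[0] == 12 && d[1] == 12);
  /* _lane: same idea from a 64-bit source.  */
  int32x2_t e = vdup_lane_s32 (d, 0);   /* {12, 12} */
  assert (e[1] == 12);
  return 0;
}
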
*/ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_f32 (float32x2_t __a, float32x2_t __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_u64 (uint64x2_t __a, const int __b) { - return (uint32x2_t) (__a <= __b); + return __aarch64_vdup_laneq_u64 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_f64 (float64x1_t __a, float64x1_t __b) +/* vdupq_lane */ + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_f16 (float16x4_t __a, const int __b) { - return (uint64x1_t) (__a <= __b); + return __aarch64_vdupq_lane_f16 (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcle_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_f32 (float32x2_t __a, const int __b) { - return (uint8x8_t) (__a <= __b); + return __aarch64_vdupq_lane_f32 (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcle_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_f64 (float64x1_t __a, const int __b) { - return (uint16x4_t) (__a <= __b); + return __aarch64_vdupq_lane_f64 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_p8 (poly8x8_t __a, const int __b) { - return (uint32x2_t) (__a <= __b); + return __aarch64_vdupq_lane_p8 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_p16 (poly16x4_t __a, const int __b) { - return (uint64x1_t) (__a <= __b); + return __aarch64_vdupq_lane_p16 (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcle_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_p64 (poly64x1_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_lane_p64 (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcle_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_s8 (int8x8_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_lane_s8 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_s16 (int16x4_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_lane_s16 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_s32 (int32x2_t __a, const int __b) { - return 
(__a <= __b); + return __aarch64_vdupq_lane_s32 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_s64 (int64x1_t __a, const int __b) { - return (uint32x4_t) (__a <= __b); + return __aarch64_vdupq_lane_s64 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_u8 (uint8x8_t __a, const int __b) { - return (uint64x2_t) (__a <= __b); + return __aarch64_vdupq_lane_u8 (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcleq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_u16 (uint16x4_t __a, const int __b) { - return (uint8x16_t) (__a <= __b); + return __aarch64_vdupq_lane_u16 (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcleq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_u32 (uint32x2_t __a, const int __b) { - return (uint16x8_t) (__a <= __b); + return __aarch64_vdupq_lane_u32 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_u64 (uint64x1_t __a, const int __b) { - return (uint32x4_t) (__a <= __b); + return __aarch64_vdupq_lane_u64 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_s64 (int64x2_t __a, int64x2_t __b) +/* vdupq_laneq */ + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_f16 (float16x8_t __a, const int __b) { - return (uint64x2_t) (__a <= __b); + return __aarch64_vdupq_laneq_f16 (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcleq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_f32 (float32x4_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_laneq_f32 (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcleq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_f64 (float64x2_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_laneq_f64 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_p8 (poly8x16_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_laneq_p8 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vdupq_laneq_p16 (poly16x8_t __a, const int __b) { - return (__a <= __b); + return __aarch64_vdupq_laneq_p16 (__a, __b); } -/* vcle - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcles_f32 (float32_t __a, float32_t __b) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_p64 (poly64x2_t __a, const int __b) { - return __a <= __b ? -1 : 0; + return __aarch64_vdupq_laneq_p64 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcled_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_s8 (int8x16_t __a, const int __b) { - return __a <= __b ? -1ll : 0ll; + return __aarch64_vdupq_laneq_s8 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcled_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_s16 (int16x8_t __a, const int __b) { - return __a <= __b ? -1ll : 0ll; + return __aarch64_vdupq_laneq_s16 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcled_f64 (float64_t __a, float64_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_s32 (int32x4_t __a, const int __b) { - return __a <= __b ? -1ll : 0ll; + return __aarch64_vdupq_laneq_s32 (__a, __b); } -/* vclez - vector. */ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclez_f32 (float32x2_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_s64 (int64x2_t __a, const int __b) { - return (uint32x2_t) (__a <= 0.0f); + return __aarch64_vdupq_laneq_s64 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclez_f64 (float64x1_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_u8 (uint8x16_t __a, const int __b) { - return (uint64x1_t) (__a <= (float64x1_t) {0.0}); + return __aarch64_vdupq_laneq_u8 (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclez_s8 (int8x8_t __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_u16 (uint16x8_t __a, const int __b) { - return (uint8x8_t) (__a <= 0); + return __aarch64_vdupq_laneq_u16 (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclez_s16 (int16x4_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_u32 (uint32x4_t __a, const int __b) { - return (uint16x4_t) (__a <= 0); + return __aarch64_vdupq_laneq_u32 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclez_s32 (int32x2_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_u64 (uint64x2_t __a, const int __b) { - return (uint32x2_t) (__a <= 0); + return __aarch64_vdupq_laneq_u64 (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclez_s64 (int64x1_t __a) +/* vdupb_lane */ +__extension__ extern 
__inline poly8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_lane_p8 (poly8x8_t __a, const int __b) { - return (uint64x1_t) (__a <= __AARCH64_INT64_C (0)); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vclezq_f32 (float32x4_t __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_lane_s8 (int8x8_t __a, const int __b) { - return (uint32x4_t) (__a <= 0.0f); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vclezq_f64 (float64x2_t __a) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_lane_u8 (uint8x8_t __a, const int __b) { - return (uint64x2_t) (__a <= 0.0); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vclezq_s8 (int8x16_t __a) +/* vduph_lane */ + +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_f16 (float16x4_t __a, const int __b) { - return (uint8x16_t) (__a <= 0); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vclezq_s16 (int16x8_t __a) +__extension__ extern __inline poly16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_p16 (poly16x4_t __a, const int __b) { - return (uint16x8_t) (__a <= 0); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vclezq_s32 (int32x4_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_s16 (int16x4_t __a, const int __b) { - return (uint32x4_t) (__a <= 0); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vclezq_s64 (int64x2_t __a) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_u16 (uint16x4_t __a, const int __b) { - return (uint64x2_t) (__a <= __AARCH64_INT64_C (0)); + return __aarch64_vget_lane_any (__a, __b); } -/* vclez - scalar. */ +/* vdups_lane */ -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vclezs_f32 (float32_t __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_lane_f32 (float32x2_t __a, const int __b) { - return __a <= 0.0f ? -1 : 0; + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vclezd_s64 (int64_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_lane_s32 (int32x2_t __a, const int __b) { - return __a <= 0 ? -1ll : 0ll; + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vclezd_f64 (float64_t __a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_lane_u32 (uint32x2_t __a, const int __b) { - return __a <= 0.0 ? -1ll : 0ll; + return __aarch64_vget_lane_any (__a, __b); } -/* vclt - vector. 
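
Unlike the vector vdup_lane forms, the vdup<b/h/s/d>_lane intrinsics above extract a single scalar lane; they reduce to __aarch64_vget_lane_any, so they behave like the corresponding vget_lane intrinsics. A minimal sketch, assuming an AArch64 target (values illustrative):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  uint16x4_t v = {1, 2, 3, 4};
  /* vduph_lane_u16 yields one scalar lane, same as vget_lane_u16.  */
  assert (vduph_lane_u16 (v, 2) == 3);
  assert (vduph_lane_u16 (v, 2) == vget_lane_u16 (v, 2));
  return 0;
}
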
*/ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_f32 (float32x2_t __a, float32x2_t __b) +/* vdupd_lane */ +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_lane_f64 (float64x1_t __a, const int __b) { - return (uint32x2_t) (__a < __b); + __AARCH64_LANE_CHECK (__a, __b); + return __a[0]; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_f64 (float64x1_t __a, float64x1_t __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_lane_s64 (int64x1_t __a, const int __b) { - return (uint64x1_t) (__a < __b); + __AARCH64_LANE_CHECK (__a, __b); + return __a[0]; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclt_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_lane_u64 (uint64x1_t __a, const int __b) { - return (uint8x8_t) (__a < __b); + __AARCH64_LANE_CHECK (__a, __b); + return __a[0]; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclt_s16 (int16x4_t __a, int16x4_t __b) +/* vdupb_laneq */ +__extension__ extern __inline poly8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_laneq_p8 (poly8x16_t __a, const int __b) { - return (uint16x4_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_laneq_s8 (int8x16_t __a, const int __b) { - return (uint32x2_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupb_laneq_u8 (uint8x16_t __a, const int __b) { - return (uint64x1_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclt_u8 (uint8x8_t __a, uint8x8_t __b) -{ - return (__a < __b); -} +/* vduph_laneq */ -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclt_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_f16 (float16x8_t __a, const int __b) { - return (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline poly16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_p16 (poly16x8_t __a, const int __b) { - return (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_s16 (int16x8_t __a, const int __b) { - return (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_f32 (float32x4_t __a, float32x4_t 
__b) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_u16 (uint16x8_t __a, const int __b) { - return (uint32x4_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_f64 (float64x2_t __a, float64x2_t __b) +/* vdups_laneq */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_laneq_f32 (float32x4_t __a, const int __b) { - return (uint64x2_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_laneq_s32 (int32x4_t __a, const int __b) { - return (uint8x16_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdups_laneq_u32 (uint32x4_t __a, const int __b) { - return (uint16x8_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_s32 (int32x4_t __a, int32x4_t __b) +/* vdupd_laneq */ +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_laneq_f64 (float64x2_t __a, const int __b) { - return (uint32x4_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_laneq_s64 (int64x2_t __a, const int __b) { - return (uint64x2_t) (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupd_laneq_u64 (uint64x2_t __a, const int __b) { - return (__a < __b); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltq_u16 (uint16x8_t __a, uint16x8_t __b) +/* vext */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c) { - return (__a < __b); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c}); +#else + return __builtin_shuffle (__a, __b, + (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c) { - return (__a < __b); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, 
__c+1}); +#endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c) { - return (__a < __b); + __AARCH64_LANE_CHECK (__a, __c); + /* The only possible index to the assembler instruction returns element 0. */ + return __a; } - -/* vclt - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vclts_f32 (float32_t __a, float32_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c) { - return __a < __b ? -1 : 0; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltd_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c) { - return __a < __b ? -1ll : 0ll; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_p64 (poly64x1_t __a, poly64x1_t __b, __const int __c) { - return __a < __b ? -1ll : 0ll; + __AARCH64_LANE_CHECK (__a, __c); + /* The only possible index to the assembler instruction returns element 0. */ + return __a; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltd_f64 (float64_t __a, float64_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c) { - return __a < __b ? -1ll : 0ll; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -/* vcltz - vector. 
*/ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcltz_f32 (float32x2_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c) { - return (uint32x2_t) (__a < 0.0f); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltz_f64 (float64x1_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c) { - return (uint64x1_t) (__a < (float64x1_t) {0.0}); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcltz_s8 (int8x8_t __a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c) { - return (uint8x8_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); + /* The only possible index to the assembler instruction returns element 0. */ + return __a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcltz_s16 (int16x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c) { - return (uint16x4_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcltz_s32 (int32x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c) { - return (uint32x2_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltz_s64 (int64x1_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c) { - return (uint64x1_t) (__a < __AARCH64_INT64_C (0)); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltzq_f32 (float32x4_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c) { - return (uint32x4_t) (__a < 0.0f); + 
__AARCH64_LANE_CHECK (__a, __c); + /* The only possible index to the assembler instruction returns element 0. */ + return __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltzq_f64 (float64x2_t __a) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c) { - return (uint64x2_t) (__a < 0.0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint16x8_t) {8 - __c, 9 - __c, 10 - __c, 11 - __c, + 12 - __c, 13 - __c, 14 - __c, + 15 - __c}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {__c, __c + 1, __c + 2, __c + 3, + __c + 4, __c + 5, __c + 6, __c + 7}); +#endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltzq_s8 (int8x16_t __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c) { - return (uint8x16_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltzq_s16 (int16x8_t __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c) { - return (uint16x8_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltzq_s32 (int32x4_t __a) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c) { - return (uint32x4_t) (__a < 0); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x16_t) + {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, + 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, + __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); +#endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltzq_s64 (int64x2_t __a) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c) { - return (uint64x2_t) (__a < __AARCH64_INT64_C (0)); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint16x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -/* vcltz - scalar. 
*/ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcltzs_f32 (float32_t __a) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_p64 (poly64x2_t __a, poly64x2_t __b, __const int __c) { - return __a < 0.0f ? -1 : 0; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltzd_s64 (int64_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c) { - return __a < 0 ? -1ll : 0ll; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x16_t) + {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, + 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, + __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltzd_f64 (float64_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c) { - return __a < 0.0 ? -1ll : 0ll; + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint16x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -/* vcls. 
*/ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vcls_s8 (int8x8_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c) { - return __builtin_aarch64_clrsbv8qi (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vcls_s16 (int16x4_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c) { - return __builtin_aarch64_clrsbv4hi (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); +#endif } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcls_s32 (int32x2_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c) { - return __builtin_aarch64_clrsbv2si (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint8x16_t) + {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, + 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, + __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); +#endif } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vclsq_s8 (int8x16_t __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c) { - return __builtin_aarch64_clrsbv16qi (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint16x8_t) + {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); +#endif } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vclsq_s16 (int16x8_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c) { - return __builtin_aarch64_clrsbv8hi (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, + (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); +#endif } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vclsq_s32 (int32x4_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c) { - return __builtin_aarch64_clrsbv4si (__a); + __AARCH64_LANE_CHECK (__a, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); +#endif } 
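
For reference, vext concatenates its two operands and extracts a window of lanes starting at the constant index __c; the __AARCH64EB__ branches mirror the shuffle indices so that the architectural lane numbering, and hence the result, is the same on big-endian. A minimal sketch of the semantics, assuming an AArch64 target (values illustrative):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  uint8x8_t lo = {0, 1, 2, 3, 4, 5, 6, 7};
  uint8x8_t hi = {8, 9, 10, 11, 12, 13, 14, 15};
  /* vext_u8 (a, b, n) == lanes n..n+7 of the 16-byte value a:b.  */
  uint8x8_t r = vext_u8 (lo, hi, 3);    /* {3, 4, 5, 6, 7, 8, 9, 10} */
  assert (r[0] == 3 && r[5] == 8 && r[7] == 10);
  return 0;
}
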
-/* vclz. */ +/* vfma */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vclz_s8 (int8x8_t __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) { - return __builtin_aarch64_clzv8qi (__a); + return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])}; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vclz_s16 (int16x4_t __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { - return __builtin_aarch64_clzv4hi (__a); + return __builtin_aarch64_fmav2sf (__b, __c, __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vclz_s32 (int32x2_t __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_aarch64_clzv2si (__a); + return __builtin_aarch64_fmav4sf (__b, __c, __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclz_u8 (uint8x8_t __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { - return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a); + return __builtin_aarch64_fmav2df (__b, __c, __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclz_u16 (uint16x4_t __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a); + return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclz_u32 (uint32x2_t __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) { - return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a); + return (float64x1_t) {__b[0] * __c + __a[0]}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vclzq_s8 (int8x16_t __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - return __builtin_aarch64_clzv16qi (__a); + return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vclzq_s16 (int16x8_t __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) { - return __builtin_aarch64_clzv8hi (__a); + return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vclzq_s32 (int32x4_t __a) +/* vfma_lane */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfma_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) { - return __builtin_aarch64_clzv4si (__a); + return __builtin_aarch64_fmav2sf (__b, 
+                                    __aarch64_vdup_lane_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vclzq_u8 (uint8x16_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
+               float64x1_t __c, const int __lane)
 {
-  return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
+  return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vclzq_u16 (uint16x8_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmad_lane_f64 (float64_t __a, float64_t __b,
+                float64x1_t __c, const int __lane)
 {
-  return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
+  return __builtin_fma (__b, __c[0], __a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vclzq_u32 (uint32x4_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmas_lane_f32 (float32_t __a, float32_t __b,
+                float32x2_t __c, const int __lane)
 {
-  return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
+  return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-/* vcnt. */
+/* vfma_laneq */
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vcnt_p8 (poly8x8_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+  return __builtin_aarch64_fmav2sf (__b,
+                                    __aarch64_vdup_laneq_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcnt_s8 (int8x8_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_popcountv8qi (__a);
+  float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+  return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcnt_u8 (uint8x8_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmad_laneq_f64 (float64_t __a, float64_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+  return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vcntq_p8 (poly8x16_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmas_laneq_f32 (float32_t __a, float32_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+  return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vcntq_s8 (int8x16_t __a)
+/* vfmaq_lane */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+                float32x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_popcountv16qi (__a);
+  return __builtin_aarch64_fmav4sf (__b,
+                                    __aarch64_vdupq_lane_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcntq_u8 (uint8x16_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
+                float64x1_t __c, const int __lane)
 {
-  return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+  return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
 }
-/* vcvt (double -> float). */
+/* vfmaq_laneq */
-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-vcvt_f16_f32 (float32x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return __builtin_aarch64_float_truncate_lo_v4hf (__a);
+  return __builtin_aarch64_fmav4sf (__b,
+                                    __aarch64_vdupq_laneq_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
+  return __builtin_aarch64_fmav2df (__b,
+                                    __aarch64_vdupq_laneq_f64 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_f64 (float64x2_t __a)
+/* vfms */
+
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
 {
-  return __builtin_aarch64_float_truncate_lo_v2sf (__a);
+  return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
 {
-  return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
+  return __builtin_aarch64_fmav2sf (-__b, __c, __a);
 }
-/* vcvt (float -> double). */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvt_f32_f16 (float16x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_aarch64_float_extend_lo_v4sf (__a);
+  return __builtin_aarch64_fmav4sf (-__b, __c, __a);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_f64_f32 (float32x2_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
 {
-
-  return __builtin_aarch64_float_extend_lo_v2df (__a);
+  return __builtin_aarch64_fmav2df (-__b, __c, __a);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvt_high_f32_f16 (float16x8_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
 {
-  return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
+  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_high_f64_f32 (float32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
 {
-  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+  return (float64x1_t) {-__b[0] * __c + __a[0]};
 }
-/* vcvt (int -> float) */
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_f64_s64 (int64_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
-  return (float64_t) __a;
+  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
 }
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_f64_u64 (uint64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
 {
-  return (float64_t) __a;
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
 }
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_f32_s32 (int32_t __a)
+/* vfms_lane */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
+               float32x2_t __c, const int __lane)
 {
-  return (float32_t) __a;
+  return __builtin_aarch64_fmav2sf (-__b,
+                                    __aarch64_vdup_lane_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_f32_u32 (uint32_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
+               float64x1_t __c, const int __lane)
 {
-  return (float32_t) __a;
+  return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_s32 (int32x2_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsd_lane_f64 (float64_t __a, float64_t __b,
+                float64x1_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatv2siv2sf (__a);
+  return __builtin_fma (-__b, __c[0], __a);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_u32 (uint32x2_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmss_lane_f32 (float32_t __a, float32_t __b,
+                float32x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
+  return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_s32 (int32x4_t __a)
+/* vfms_laneq */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatv4siv4sf (__a);
+  return __builtin_aarch64_fmav2sf (-__b,
+                                    __aarch64_vdup_laneq_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_u32 (uint32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
+  float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+  return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_s64 (int64x2_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatv2div2df (__a);
+  return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_u64 (uint64x2_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmss_laneq_f32 (float32_t __a, float32_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
+  return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
-/* vcvt (float -> int) */
+/* vfmsq_lane */
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtd_s64_f64 (float64_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+                float32x2_t __c, const int __lane)
 {
-  return (int64_t) __a;
+  return __builtin_aarch64_fmav4sf (-__b,
+                                    __aarch64_vdupq_lane_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtd_u64_f64 (float64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+                float64x1_t __c, const int __lane)
 {
-  return (uint64_t) __a;
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvts_s32_f32 (float32_t __a)
+/* vfmsq_laneq */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return (int32_t) __a;
+  return __builtin_aarch64_fmav4sf (-__b,
+                                    __aarch64_vdupq_laneq_f32 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvts_u32_f32 (float32_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return (uint32_t) __a;
+  return __builtin_aarch64_fmav2df (-__b,
+                                    __aarch64_vdupq_laneq_f64 (__c, __lane),
+                                    __a);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvt_s32_f32 (float32x2_t __a)
+/* vld1 */
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16 (const float16_t *__a)
 {
-  return __builtin_aarch64_lbtruncv2sfv2si (__a);
+  return __builtin_aarch64_ld1v4hf (__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvt_u32_f32 (float32x2_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32 (const float32_t *a)
 {
-  return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
+  return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64 (const float64_t *a)
 {
-  return __builtin_aarch64_lbtruncv4sfv4si (__a);
+  return (float64x1_t) {*a};
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8 (const poly8_t *a)
 {
-  return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
+  return (poly8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcvt_s64_f64 (float64x1_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16 (const poly16_t *a)
 {
-  return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
+  return (poly16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcvt_u64_f64 (float64x1_t __a)
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64 (const poly64_t *a)
 {
-  return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
+  return (poly64x1_t) {*a};
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8 (const int8_t *a)
 {
-  return __builtin_aarch64_lbtruncv2dfv2di (__a);
+  return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16 (const int16_t *a)
 {
-  return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
+  return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
 }
-/* vcvta */
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtad_s64_f64 (float64_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32 (const int32_t *a)
 {
-  return __builtin_aarch64_lrounddfdi (__a);
+  return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtad_u64_f64 (float64_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64 (const int64_t *a)
 {
-  return __builtin_aarch64_lroundudfdi_us (__a);
+  return (int64x1_t) {*a};
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvtas_s32_f32 (float32_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8 (const uint8_t *a)
 {
-  return __builtin_aarch64_lroundsfsi (__a);
+  return (uint8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvtas_u32_f32 (float32_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16 (const uint16_t *a)
 {
-  return __builtin_aarch64_lroundusfsi_us (__a);
+  return (uint16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvta_s32_f32 (float32x2_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32 (const uint32_t *a)
 {
-  return __builtin_aarch64_lroundv2sfv2si (__a);
+  return (uint32x2_t)
+    __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvta_u32_f32 (float32x2_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64 (const uint64_t *a)
 {
-  return __builtin_aarch64_lrounduv2sfv2si_us (__a);
+  return (uint64x1_t) {*a};
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtaq_s32_f32 (float32x4_t __a)
+/* vld1q */
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16 (const float16_t *__a)
 {
-  return __builtin_aarch64_lroundv4sfv4si (__a);
+  return __builtin_aarch64_ld1v8hf (__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtaq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32 (const float32_t *a)
 {
-  return __builtin_aarch64_lrounduv4sfv4si_us (__a);
+  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcvta_s64_f64 (float64x1_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64 (const float64_t *a)
 {
-  return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
+  return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcvta_u64_f64 (float64x1_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8 (const poly8_t *a)
 {
-  return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
+  return (poly8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtaq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16 (const poly16_t *a)
 {
-  return __builtin_aarch64_lroundv2dfv2di (__a);
+  return (poly16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtaq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64 (const poly64_t *a)
 {
-  return __builtin_aarch64_lrounduv2dfv2di_us (__a);
+  return (poly64x2_t)
+    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
 }
-/* vcvtm */
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtmd_s64_f64 (float64_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8 (const int8_t *a)
 {
-  return __builtin_llfloor (__a);
+  return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtmd_u64_f64 (float64_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16 (const int16_t *a)
 {
-  return __builtin_aarch64_lfloorudfdi_us (__a);
+  return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvtms_s32_f32 (float32_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32 (const int32_t *a)
 {
-  return __builtin_ifloorf (__a);
+  return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvtms_u32_f32 (float32_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64 (const int64_t *a)
 {
-  return __builtin_aarch64_lfloorusfsi_us (__a);
+  return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtm_s32_f32 (float32x2_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8 (const uint8_t *a)
 {
-  return __builtin_aarch64_lfloorv2sfv2si (__a);
+  return (uint8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtm_u32_f32 (float32x2_t __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16 (const uint16_t *a)
 {
-  return __builtin_aarch64_lflooruv2sfv2si_us (__a);
+  return (uint16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtmq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32 (const uint32_t *a)
 {
-  return __builtin_aarch64_lfloorv4sfv4si (__a);
+  return (uint32x4_t)
+    __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtmq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64 (const uint64_t *a)
 {
-  return __builtin_aarch64_lflooruv4sfv4si_us (__a);
+  return (uint64x2_t)
+    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcvtm_s64_f64 (float64x1_t __a)
+/* vld1_dup */
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f16 (const float16_t* __a)
 {
-  return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
+  return vdup_n_f16 (*__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcvtm_u64_f64 (float64x1_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f32 (const float32_t* __a)
 {
-  return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
+  return vdup_n_f32 (*__a);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtmq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f64 (const float64_t* __a)
 {
-  return __builtin_aarch64_lfloorv2dfv2di (__a);
+  return vdup_n_f64 (*__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtmq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_p8 (const poly8_t* __a)
 {
-  return __builtin_aarch64_lflooruv2dfv2di_us (__a);
+  return vdup_n_p8 (*__a);
 }
-/* vcvtn */
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtnd_s64_f64 (float64_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_p16 (const poly16_t* __a)
 {
-  return __builtin_aarch64_lfrintndfdi (__a);
+  return vdup_n_p16 (*__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtnd_u64_f64 (float64_t __a)
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_p64 (const poly64_t* __a)
 {
-  return __builtin_aarch64_lfrintnudfdi_us (__a);
+  return vdup_n_p64 (*__a);
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvtns_s32_f32 (float32_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s8 (const int8_t* __a)
 {
-  return __builtin_aarch64_lfrintnsfsi (__a);
+  return vdup_n_s8 (*__a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvtns_u32_f32 (float32_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s16 (const int16_t* __a)
 {
-  return __builtin_aarch64_lfrintnusfsi_us (__a);
+  return vdup_n_s16 (*__a);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtn_s32_f32 (float32x2_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s32 (const int32_t* __a)
 {
-  return __builtin_aarch64_lfrintnv2sfv2si (__a);
+  return vdup_n_s32 (*__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtn_u32_f32 (float32x2_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s64 (const int64_t* __a)
 {
-  return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
+  return vdup_n_s64 (*__a);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtnq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u8 (const uint8_t* __a)
 {
-  return __builtin_aarch64_lfrintnv4sfv4si (__a);
+  return vdup_n_u8 (*__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtnq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u16 (const uint16_t* __a)
 {
-  return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
+  return vdup_n_u16 (*__a);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcvtn_s64_f64 (float64x1_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u32 (const uint32_t* __a)
 {
-  return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
+  return vdup_n_u32 (*__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcvtn_u64_f64 (float64x1_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u64 (const uint64_t* __a)
 {
-  return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
+  return vdup_n_u64 (*__a);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtnq_s64_f64 (float64x2_t __a)
+/* vld1q_dup */
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f16 (const float16_t* __a)
 {
-  return __builtin_aarch64_lfrintnv2dfv2di (__a);
+  return vdupq_n_f16 (*__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtnq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f32 (const float32_t* __a)
 {
-  return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
+  return vdupq_n_f32 (*__a);
 }
-/* vcvtp */
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtpd_s64_f64 (float64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f64 (const float64_t* __a)
 {
-  return __builtin_llceil (__a);
+  return vdupq_n_f64 (*__a);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtpd_u64_f64 (float64_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_p8 (const poly8_t* __a)
 {
-  return __builtin_aarch64_lceiludfdi_us (__a);
+  return vdupq_n_p8 (*__a);
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvtps_s32_f32 (float32_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_p16 (const poly16_t* __a)
 {
-  return __builtin_iceilf (__a);
+  return vdupq_n_p16 (*__a);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvtps_u32_f32 (float32_t __a)
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_p64 (const poly64_t* __a)
 {
-  return __builtin_aarch64_lceilusfsi_us (__a);
+  return vdupq_n_p64 (*__a);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtp_s32_f32 (float32x2_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s8 (const int8_t* __a)
 {
-  return __builtin_aarch64_lceilv2sfv2si (__a);
+  return vdupq_n_s8 (*__a);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtp_u32_f32 (float32x2_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s16 (const int16_t* __a)
 {
-  return __builtin_aarch64_lceiluv2sfv2si_us (__a);
+  return vdupq_n_s16 (*__a);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtpq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s32 (const int32_t* __a)
 {
-  return __builtin_aarch64_lceilv4sfv4si (__a);
+  return vdupq_n_s32 (*__a);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtpq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s64 (const int64_t* __a)
 {
-  return __builtin_aarch64_lceiluv4sfv4si_us (__a);
+  return vdupq_n_s64 (*__a);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcvtp_s64_f64 (float64x1_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u8 (const uint8_t* __a)
 {
-  return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
+  return vdupq_n_u8 (*__a);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcvtp_u64_f64 (float64x1_t __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u16 (const uint16_t* __a)
 {
-  return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
+  return vdupq_n_u16 (*__a);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtpq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u32 (const uint32_t* __a)
 {
-  return __builtin_aarch64_lceilv2dfv2di (__a);
+  return vdupq_n_u32 (*__a);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtpq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u64 (const uint64_t* __a)
 {
-  return __builtin_aarch64_lceiluv2dfv2di_us (__a);
+  return vdupq_n_u64 (*__a);
 }
-/* vdup_n */
+/* vld1_lane */
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_n_f32 (float32_t __a)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
 {
-  return (float32x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vdup_n_f64 (float64_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
 {
-  return (float64x1_t) {__a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_n_p8 (poly8_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
 {
-  return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_n_p16 (poly16_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
 {
-  return (poly16x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_n_s8 (int8_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
 {
-  return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_n_s16 (int16_t __a)
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane)
 {
-  return (int16x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_n_s32 (int32_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
 {
-  return (int32x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_n_s64 (int64_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
 {
-  return (int64x1_t) {__a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_n_u8 (uint8_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
 {
-  return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_n_u16 (uint16_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
 {
-  return (uint16x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_n_u32 (uint32_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
 {
-  return (uint32x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_n_u64 (uint64_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
 {
-  return (uint64x1_t) {__a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-/* vdupq_n */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_n_f32 (float32_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
 {
-  return (float32x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vdupq_n_f64 (float64_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
 {
-  return (float64x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_n_p8 (uint32_t __a)
+/* vld1q_lane */
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
 {
-  return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-                       __a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vdupq_n_p16 (uint32_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
 {
-  return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vdupq_n_s8 (int32_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
 {
-  return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-                      __a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vdupq_n_s16 (int32_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
 {
-  return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vdupq_n_s32 (int32_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
 {
-  return (int32x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vdupq_n_s64 (int64_t __a)
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane)
 {
-  return (int64x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vdupq_n_u8 (uint32_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
 {
-  return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-                       __a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vdupq_n_u16 (uint32_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
 {
-  return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vdupq_n_u32 (uint32_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
 {
-  return (uint32x4_t) {__a, __a, __a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vdupq_n_u64 (uint64_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
 {
-  return (uint64x2_t) {__a, __a};
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-/* vdup_lane */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_lane_f32 (float32x2_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
 {
-  return __aarch64_vdup_lane_f32 (__a, __b);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vdup_lane_f64 (float64x1_t __a, const int __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
 {
-  return __aarch64_vdup_lane_f64 (__a, __b);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_lane_p8 (poly8x8_t __a, const int __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
 {
-  return __aarch64_vdup_lane_p8 (__a, __b);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_lane_p16 (poly16x4_t __a, const int __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
 {
-  return __aarch64_vdup_lane_p16 (__a, __b);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_lane_s8 (int8x8_t __a, const int __b)
+/* vldn */
+
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s64 (const int64_t * __a)
 {
-  return __aarch64_vdup_lane_s8 (__a, __b);
+  int64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_lane_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u64 (const uint64_t * __a)
 {
-  return __aarch64_vdup_lane_s16 (__a, __b);
+  uint64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_lane_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline float64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f64 (const float64_t * __a)
 {
-  return __aarch64_vdup_lane_s32 (__a, __b);
+  float64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
+  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
+  return ret;
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_lane_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s8 (const int8_t * __a)
 {
-  return __aarch64_vdup_lane_s64 (__a, __b);
+  int8x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_lane_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_p8 (const poly8_t * __a)
 {
-  return __aarch64_vdup_lane_u8 (__a, __b);
+  poly8x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
+  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_lane_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline poly64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_p64 (const poly64_t * __a)
 {
-  return __aarch64_vdup_lane_u16 (__a, __b);
+  poly64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s16 (const int16_t * __a)
 {
-  return __aarch64_vdup_lane_u32 (__a, __b);
+  int16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
+  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_lane_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_p16 (const poly16_t * __a)
 {
-  return __aarch64_vdup_lane_u64 (__a, __b);
+  poly16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
+  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
+  return ret;
 }
-/* vdup_laneq */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_laneq_f32 (float32x4_t __a, const int __b)
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s32 (const int32_t * __a)
 {
-  return __aarch64_vdup_laneq_f32 (__a, __b);
+  int32x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
+  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
+  return ret;
 }
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vdup_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u8 (const uint8_t * __a)
 {
-  return __aarch64_vdup_laneq_f64 (__a, __b);
+  uint8x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
+  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_laneq_p8 (poly8x16_t __a, const int __b)
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u16 (const uint16_t * __a)
 {
-  return __aarch64_vdup_laneq_p8 (__a, __b);
+  uint16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
+  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
+  return ret;
 }
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_laneq_p16 (poly16x8_t __a, const int __b)
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u32 (const uint32_t * __a)
 {
-  return __aarch64_vdup_laneq_p16 (__a, __b);
+  uint32x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
+  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
+  return ret;
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_laneq_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f16 (const float16_t * __a)
 {
-  return __aarch64_vdup_laneq_s8 (__a, __b);
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_laneq_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f32 (const float32_t * __a)
 {
-  return __aarch64_vdup_laneq_s16 (__a, __b);
+  float32x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
+  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
+  return ret;
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_laneq_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s8 (const int8_t * __a)
 {
-  return __aarch64_vdup_laneq_s32 (__a, __b);
+  int8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_laneq_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_p8 (const poly8_t * __a)
 {
-  return __aarch64_vdup_laneq_s64 (__a, __b);
+  poly8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_laneq_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s16 (const int16_t * __a)
 {
-  return __aarch64_vdup_laneq_u8 (__a, __b);
+  int16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_laneq_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_p16 (const poly16_t * __a)
 {
-  return __aarch64_vdup_laneq_u16 (__a, __b);
+  poly16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_laneq_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline poly64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_p64 (const poly64_t * __a)
 {
-  return __aarch64_vdup_laneq_u32 (__a, __b);
+  poly64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_laneq_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s32 (const int32_t * __a)
 {
-  return __aarch64_vdup_laneq_u64 (__a, __b);
+  int32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
+  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
+  return ret;
 }
-/* vdupq_lane */
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_f32 (float32x2_t __a, const int __b)
+__extension__ extern __inline int64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s64 (const int64_t * __a)
 {
-  return __aarch64_vdupq_lane_f32 (__a, __b);
+  int64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+  return ret;
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vdupq_lane_f64 (float64x1_t __a, const int __b)
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u8 (const uint8_t * __a)
 {
-  return __aarch64_vdupq_lane_f64 (__a, __b);
+  uint8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_p8 (poly8x8_t __a, const int __b)
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u16 (const uint16_t * __a)
 {
-  return __aarch64_vdupq_lane_p8 (__a, __b);
+  uint16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_p16 (poly16x4_t __a, const int __b)
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u32 (const uint32_t * __a)
 {
-  return __aarch64_vdupq_lane_p16 (__a, __b);
+  uint32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
+  return ret;
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u64 (const uint64_t * __a)
 {
-  return __aarch64_vdupq_lane_s8 (__a, __b);
+  uint64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+  return ret;
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f16 (const float16_t * __a)
 {
-  return __aarch64_vdupq_lane_s16 (__a, __b);
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f32 (const float32_t * __a)
 {
-  return __aarch64_vdupq_lane_s32 (__a, __b);
+  float32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
+  return ret;
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vdupq_lane_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f64 (const float64_t * __a)
 {
-  return __aarch64_vdupq_lane_s64 (__a, __b);
+  float64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
+  return ret;
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s64 (const int64_t * __a)
 {
-  return __aarch64_vdupq_lane_u8 (__a, __b);
+  int64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_u64 (const uint64_t * __a)
 {
-  return __aarch64_vdupq_lane_u16 (__a, __b);
+  uint64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_f64 (const float64_t * __a)
 {
-  return __aarch64_vdupq_lane_u32 (__a, __b);
+  float64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
+  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
+  ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
+  return ret;
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vdupq_lane_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s8 (const int8_t * __a)
 {
-  return __aarch64_vdupq_lane_u64 (__a, __b);
+  int8x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+  return ret;
 }
-/* vdupq_laneq */
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_laneq_f32 (float32x4_t __a, const int __b)
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_p8 (const poly8_t * __a)
 {
-  return __aarch64_vdupq_laneq_f32 (__a, __b);
+  poly8x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+  ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+  return ret;
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vdupq_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s16 (const int16_t * __a)
 {
-  return __aarch64_vdupq_laneq_f64 (__a, __b);
+  int16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+  return ret;
 }
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_laneq_p8
(poly8x16_t __a, const int __b) +__extension__ extern __inline poly16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_p16 (const poly16_t * __a) { - return __aarch64_vdupq_laneq_p8 (__a, __b); + poly16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vdupq_laneq_p16 (poly16x8_t __a, const int __b) +__extension__ extern __inline int32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_s32 (const int32_t * __a) { - return __aarch64_vdupq_laneq_p16 (__a, __b); + int32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vdupq_laneq_s8 (int8x16_t __a, const int __b) +__extension__ extern __inline uint8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_u8 (const uint8_t * __a) { - return __aarch64_vdupq_laneq_s8 (__a, __b); + uint8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vdupq_laneq_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline uint16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_u16 (const uint16_t * __a) { - return __aarch64_vdupq_laneq_s16 (__a, __b); + uint16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vdupq_laneq_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline uint32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_u32 (const uint32_t * __a) { - return __aarch64_vdupq_laneq_s32 (__a, __b); + uint32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vdupq_laneq_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline float16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_f16 (const float16_t 
* __a) { - return __aarch64_vdupq_laneq_s64 (__a, __b); + float16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2); + return ret; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vdupq_laneq_u8 (uint8x16_t __a, const int __b) +__extension__ extern __inline float32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_f32 (const float32_t * __a) { - return __aarch64_vdupq_laneq_u8 (__a, __b); + float32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); + return ret; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vdupq_laneq_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline poly64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_p64 (const poly64_t * __a) { - return __aarch64_vdupq_laneq_u16 (__a, __b); + poly64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1); + ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2); + return ret; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vdupq_laneq_u32 (uint32x4_t __a, const int __b) +__extension__ extern __inline int8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_s8 (const int8_t * __a) { - return __aarch64_vdupq_laneq_u32 (__a, __b); + int8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vdupq_laneq_u64 (uint64x2_t __a, const int __b) +__extension__ extern __inline poly8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_p8 (const poly8_t * __a) { - return __aarch64_vdupq_laneq_u64 (__a, __b); + poly8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; } -/* vdupb_lane */ -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) -vdupb_lane_p8 (poly8x8_t __a, const int __b) +__extension__ extern __inline int16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_s16 (const int16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = 
(int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vdupb_lane_s8 (int8x8_t __a, const int __b) +__extension__ extern __inline poly16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_p16 (const poly16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + poly16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vdupb_lane_u8 (uint8x8_t __a, const int __b) +__extension__ extern __inline int32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_s32 (const int32_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; } -/* vduph_lane */ -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) -vduph_lane_p16 (poly16x4_t __a, const int __b) +__extension__ extern __inline int64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_s64 (const int64_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return ret; } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vduph_lane_s16 (int16x4_t __a, const int __b) +__extension__ extern __inline uint8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_u8 (const uint8_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vduph_lane_u16 (uint16x4_t __a, const int __b) +__extension__ extern __inline uint16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_u16 (const uint16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (uint16x8_t) 
__builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; } -/* vdups_lane */ -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vdups_lane_f32 (float32x2_t __a, const int __b) +__extension__ extern __inline uint32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_u32 (const uint32_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vdups_lane_s32 (int32x2_t __a, const int __b) +__extension__ extern __inline uint64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_u64 (const uint64_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return ret; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vdups_lane_u32 (uint32x2_t __a, const int __b) +__extension__ extern __inline float16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_f16 (const float16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + float16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2); + return ret; } -/* vdupd_lane */ -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vdupd_lane_f64 (float64x1_t __a, const int __b) +__extension__ extern __inline float32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_f32 (const float32_t * __a) { - __AARCH64_LANE_CHECK (__a, __b); - return __a[0]; + float32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); + return ret; } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vdupd_lane_s64 (int64x1_t __a, const int __b) +__extension__ extern __inline float64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_f64 (const float64_t * __a) { - __AARCH64_LANE_CHECK (__a, __b); - return __a[0]; + float64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); + return ret; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vdupd_lane_u64 (uint64x1_t __a, const int 
__b) +__extension__ extern __inline poly64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_p64 (const poly64_t * __a) { - __AARCH64_LANE_CHECK (__a, __b); - return __a[0]; + poly64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2); + return ret; } -/* vdupb_laneq */ -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) -vdupb_laneq_p8 (poly8x16_t __a, const int __b) +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_s64 (const int64_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b) +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_u64 (const uint64_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vdupb_laneq_u8 (uint8x16_t __a, const int __b) +__extension__ extern __inline float64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_f64 (const float64_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + float64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; + ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; + ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)}; + return ret; } -/* vduph_laneq */ -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) -vduph_laneq_p16 (poly16x8_t __a, const int __b) +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_s8 (const int8_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (int8x8_t) 
__builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vduph_laneq_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_p8 (const poly8_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + poly8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vduph_laneq_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_s16 (const int16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } -/* vdups_laneq */ -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vdups_laneq_f32 (float32x4_t __a, const int __b) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_p16 (const poly16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + poly16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vdups_laneq_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_s32 (const int32_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + int32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vdups_laneq_u32 (uint32x4_t __a, const int __b) +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_u8 (const uint8_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi 
(__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } -/* vdupd_laneq */ -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vdupd_laneq_f64 (float64x2_t __a, const int __b) +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_u16 (const uint16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vdupd_laneq_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_u32 (const uint32_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + uint32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vdupd_laneq_u64 (uint64x2_t __a, const int __b) +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_f16 (const float16_t * __a) { - return __aarch64_vget_lane_any (__a, __b); + float16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); + return ret; } -/* vext */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c) +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_f32 (const float32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1}); -#endif + float32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); + ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); + return ret; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c) +__extension__ extern __inline poly64x1x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_p64 (const poly64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); - /* The only possible index to the assembler instruction returns element 0. */ - return __a; + poly64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1); + ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2); + ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3); + return ret; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c) + +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_s8 (const int8_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + int8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c) +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_p8 (const poly8_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + poly8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c) +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_s16 (const int16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + int16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + 
ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c) +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_p16 (const poly16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + poly16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c) +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_s32 (const int32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1}); -#endif + int32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c) +__extension__ extern __inline int64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_s64 (const int64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); - /* The only possible index to the assembler instruction returns element 0. 
*/ - return __a; + int64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c) +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_u8 (const uint8_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + uint8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c) +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_u16 (const uint16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + uint16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c) +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_u32 (const uint32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1}); -#endif + uint32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c) +__extension__ extern __inline uint64x2x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vld4q_u64 (const uint64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); - /* The only possible index to the assembler instruction returns element 0. */ - return __a; + uint64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c) +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_f16 (const float16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + float16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3); + return ret; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c) +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_f32 (const float32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); -#endif + float32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2); + ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3); + return ret; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c) +__extension__ extern __inline float64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_f64 (const float64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x16_t) - {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, - 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, - __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); -#endif + float64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2); + ret.val[3] = (float64x2_t) 
__builtin_aarch64_get_qregxiv2df (__o, 3); + return ret; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c) +__extension__ extern __inline poly64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_p64 (const poly64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint16x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + poly64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2); + ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3); + return ret; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c) +/* vldn_dup */ + +__extension__ extern __inline int8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s8 (const int8_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x16_t) - {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, - 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, - __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); -#endif + int8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c) +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s16 (const int16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint16x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + int16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c) +__extension__ extern __inline int32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s32 (const int32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + int32x2x2_t ret; + __builtin_aarch64_simd_oi __o; 
+ __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c) +__extension__ extern __inline float16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_f16 (const float16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); -#endif + float16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1); + return ret; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c) +__extension__ extern __inline float32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_f32 (const float32_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint8x16_t) - {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c, - 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7, - __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15}); -#endif + float32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); + return ret; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c) +__extension__ extern __inline float64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_f64 (const float64_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint16x8_t) - {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c}); -#else - return __builtin_shuffle (__a, __b, - (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7}); -#endif + float64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; + return ret; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c) +__extension__ extern __inline uint8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u8 (const uint8_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, - (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3}); -#endif + uint8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) 
__a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c) +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u16 (const uint16_t * __a) { - __AARCH64_LANE_CHECK (__a, __c); -#ifdef __AARCH64EB__ - return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1}); -#endif + uint16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -/* vfma */ - -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u32 (const uint32_t * __a) { - return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])}; + uint32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p8 (const poly8_t * __a) { - return __builtin_aarch64_fmav2sf (__b, __c, __a); + poly8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p16 (const poly16_t * __a) { - return __builtin_aarch64_fmav4sf (__b, __c, __a); + poly16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +__extension__ extern __inline poly64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p64 (const poly64_t * __a) { - return __builtin_aarch64_fmav2df (__b, __c, __a); + poly64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1); + return ret; } -__extension__ static __inline float32x2_t 
__attribute__ ((__always_inline__))
-vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_s64 (const int64_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+  int64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u64 (const uint64_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
+  uint64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s8 (const int8_t * __a)
 {
-  return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+  int8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }

-/* vfma_lane */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
-               float32x2_t __c, const int __lane)
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p8 (const poly8_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (__b,
-                                    __aarch64_vdup_lane_f32 (__c, __lane),
-                                    __a);
+  poly8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
-               float64x1_t __c, const int __lane)
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s16 (const int16_t * __a)
 {
-  return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
+  int16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vfmad_lane_f64 (float64_t __a, float64_t __b,
-                float64x1_t __c, const int __lane)
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p16 (const poly16_t * __a)
 {
-  return __builtin_fma (__b, __c[0], __a);
+  poly16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vfmas_lane_f32 (float32_t __a, float32_t __b,
-                float32x2_t __c, const int __lane)
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s32 (const int32_t * __a)
 {
-  return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  int32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
+  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
+  return ret;
 }

-/* vfma_laneq */
+__extension__ extern __inline int64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s64 (const int64_t * __a)
+{
+  int64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+  return ret;
+}

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
-                float32x4_t __c, const int __lane)
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u8 (const uint8_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (__b,
-                                    __aarch64_vdup_laneq_f32 (__c, __lane),
-                                    __a);
+  uint8x16x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
+  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
-                float64x2_t __c, const int __lane)
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u16 (const uint16_t * __a)
 {
-  float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-  return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
+  uint16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
+  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vfmad_laneq_f64 (float64_t __a, float64_t __b,
-                 float64x2_t __c, const int __lane)
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u32 (const uint32_t * __a)
 {
-  return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  uint32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
+  return ret;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vfmas_laneq_f32 (float32_t __a, float32_t __b,
-                 float32x4_t __c, const int __lane)
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u64 (const uint64_t * __a)
 {
-  return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  uint64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+  return ret;
 }

-/* vfmaq_lane */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-                float32x2_t __c, const int __lane)
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f16 (const float16_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (__b,
-                                    __aarch64_vdupq_lane_f32 (__c, __lane),
-                                    __a);
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
-                float64x1_t __c, const int __lane)
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f32 (const float32_t * __a)
 {
-  return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
+  float32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
+  return ret;
 }

-/* vfmaq_laneq */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-                 float32x4_t __c, const int __lane)
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f64 (const float64_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (__b,
-                                    __aarch64_vdupq_laneq_f32 (__c, __lane),
-                                    __a);
+  float64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-                 float64x2_t __c, const int __lane)
+__extension__ extern __inline poly64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p64 (const poly64_t * __a)
 {
-  return __builtin_aarch64_fmav2df (__b,
-                                    __aarch64_vdupq_laneq_f64 (__c, __lane),
-                                    __a);
+  poly64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
+  return ret;
 }
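The hunks above add the previously missing vldN_dup (load-and-replicate) intrinsics, which map to the AArch64 LD2R/LD3R/LD4R instructions: N consecutive elements are loaded once and each is broadcast across every lane of one result vector. A minimal usage sketch, not part of the patch (hypothetical values; assumes an AArch64 compiler with this arm_neon.h):

  #include <arm_neon.h>

  int16x8x2_t
  broadcast_pair (void)
  {
    /* pair[0] fills all lanes of val[0], pair[1] all lanes of val[1]:
       val[0] = {3,3,3,3,3,3,3,3}, val[1] = {7,7,7,7,7,7,7,7}.  */
    static const int16_t pair[2] = { 3, 7 };
    return vld2q_dup_s16 (pair);
  }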

-/* vfms */
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s64 (const int64_t * __a)
 {
-  return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+  int64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u64 (const uint64_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (-__b, __c, __a);
+  uint64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f64 (const float64_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (-__b, __c, __a);
+  float64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
+  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
+  ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s8 (const int8_t * __a)
 {
-  return __builtin_aarch64_fmav2df (-__b, __c, __a);
+  int8x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+  return ret;
 }
-
-/* vfms_lane */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
-               float32x2_t __c, const int __lane)
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_p8 (const poly8_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (-__b,
-                                    __aarch64_vdup_lane_f32 (__c, __lane),
-                                    __a);
+  poly8x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+  ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
-               float64x1_t __c, const int __lane)
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s16 (const int16_t * __a)
 {
-  return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+  int16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vfmsd_lane_f64 (float64_t __a, float64_t __b,
-                float64x1_t __c, const int __lane)
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_p16 (const poly16_t * __a)
 {
-  return __builtin_fma (-__b, __c[0], __a);
+  poly16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+  ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vfmss_lane_f32 (float32_t __a, float32_t __b,
-                float32x2_t __c, const int __lane)
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s32 (const int32_t * __a)
 {
-  return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  int32x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
+  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
+  ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
+  return ret;
 }

-/* vfms_laneq */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
-                float32x4_t __c, const int __lane)
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u8 (const uint8_t * __a)
 {
-  return __builtin_aarch64_fmav2sf (-__b,
-                                    __aarch64_vdup_laneq_f32 (__c, __lane),
-                                    __a);
+  uint8x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+  ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
-                float64x2_t __c, const int __lane)
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u16 (const uint16_t * __a)
 {
-  float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-  return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
+  uint16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+  ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vfmsd_laneq_f64 (float64_t __a, float64_t __b,
-                 float64x2_t __c, const int __lane)
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u32 (const uint32_t * __a)
 {
-  return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  uint32x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
+  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
+  ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
+  return ret;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vfmss_laneq_f32 (float32_t __a, float32_t __b,
-                 float32x4_t __c, const int __lane)
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f16 (const float16_t * __a)
 {
-  return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+  float16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
+  return ret;
 }

-/* vfmsq_lane */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-                float32x2_t __c, const int __lane)
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f32 (const float32_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (-__b,
-                                    __aarch64_vdupq_lane_f32 (__c, __lane),
-                                    __a);
+  float32x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
+  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
+  ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
-                float64x1_t __c, const int __lane)
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_p64 (const poly64_t * __a)
 {
-  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
+  poly64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
+  return ret;
 }

-/* vfmsq_laneq */
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-                 float32x4_t __c, const int __lane)
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s8 (const int8_t * __a)
 {
-  return __builtin_aarch64_fmav4sf (-__b,
-                                    __aarch64_vdupq_laneq_f32 (__c, __lane),
-                                    __a);
+  int8x16x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-                 float64x2_t __c, const int __lane)
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_p8 (const poly8_t * __a)
 {
-  return __builtin_aarch64_fmav2df (-__b,
-                                    __aarch64_vdupq_laneq_f64 (__c, __lane),
-                                    __a);
+  poly8x16x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+  return ret;
 }

-/* vld1 */
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s16 (const int16_t * __a)
+{
+  int16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+  return ret;
+}

-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-vld1_f16 (const float16_t *__a)
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_p16 (const poly16_t * __a)
 {
-  return __builtin_aarch64_ld1v4hf (__a);
+  poly16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+  return ret;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_f32 (const float32_t *a)
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s32 (const int32_t * __a)
 {
-  return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
+  int32x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
+  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
+  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vld1_f64 (const float64_t *a)
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s64 (const int64_t * __a)
 {
-  return (float64x1_t) {*a};
+  int64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+  return ret;
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_p8 (const poly8_t *a)
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u8 (const uint8_t * __a)
 {
-  return (poly8x8_t)
-    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+  uint8x16x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+  return ret;
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_p16 (const poly16_t *a)
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u16 (const uint16_t * __a)
 {
-  return (poly16x4_t)
-    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+  uint16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+  return ret;
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_s8 (const int8_t *a)
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u32 (const uint32_t * __a)
 {
-  return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+  uint32x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
+  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
+  return ret;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_s16 (const int16_t *a)
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u64 (const uint64_t * __a)
 {
-  return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+  uint64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+  return ret;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_s32 (const int32_t *a)
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f16 (const float16_t * __a)
 {
-  return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+  float16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
+  return ret;
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_s64 (const int64_t *a)
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f32 (const float32_t * __a)
 {
-  return (int64x1_t) {*a};
+  float32x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
+  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
+  return ret;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_u8 (const uint8_t *a)
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f64 (const float64_t * __a)
 {
-  return (uint8x8_t)
-    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+  float64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
+  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
+  return ret;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_u16 (const uint16_t *a)
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_p64 (const poly64_t * __a)
 {
-  return (uint16x4_t)
-    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+  poly64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
+  return ret;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_u32 (const uint32_t *a)
+__extension__ extern __inline int64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s64 (const int64_t * __a)
 {
-  return (uint32x2_t)
-    __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+  int64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
+  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
+  ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_u64 (const uint64_t *a)
+__extension__ extern __inline uint64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u64 (const uint64_t * __a)
 {
-  return (uint64x1_t) {*a};
+  uint64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
+  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
+  ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
+  return ret;
 }

-/* vld1q */
+__extension__ extern __inline float64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f64 (const float64_t * __a)
+{
+  float64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
+  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
+  ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
+  ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
+  return ret;
+}

-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-vld1q_f16 (const float16_t *__a)
+__extension__ extern __inline int8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s8 (const int8_t * __a)
 {
-  return __builtin_aarch64_ld1v8hf (__a);
+  int8x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
+  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
+  ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
+  return ret;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_f32 (const float32_t *a)
+__extension__ extern __inline poly8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_p8 (const poly8_t * __a)
 {
-  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
+  poly8x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
+  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
+  ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
+  ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
+  return ret;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vld1q_f64 (const float64_t *a)
+__extension__ extern __inline int16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s16 (const int16_t * __a)
 {
-  return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
+  int16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
+  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
+  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
+  ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_p8 (const poly8_t *a)
+__extension__ extern __inline poly16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_p16 (const poly16_t * __a)
 {
-  return (poly8x16_t)
-    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+  poly16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
+  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
+  ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
+  ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_p16 (const poly16_t *a)
+__extension__ extern __inline int32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s32 (const int32_t * __a)
 {
-  return (poly16x8_t)
-    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+  int32x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
+  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
+  ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
+  ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
+  return ret;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_s8 (const int8_t *a)
+__extension__ extern __inline uint8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u8 (const uint8_t * __a)
 {
-  return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+  uint8x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
+  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
+  ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
+  ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
+  return ret;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_s16 (const int16_t *a)
+__extension__ extern __inline uint16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u16 (const uint16_t * __a)
 {
-  return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+  uint16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
+  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
+  ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
+  ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_s32 (const int32_t *a)
+__extension__ extern __inline uint32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u32 (const uint32_t * __a)
 {
-  return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+  uint32x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
+  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
+  ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
+  ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
+  return ret;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_s64 (const int64_t *a)
+__extension__ extern __inline float16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f16 (const float16_t * __a)
 {
-  return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+  float16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
+  ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_u8 (const uint8_t *a)
+__extension__ extern __inline float32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f32 (const float32_t * __a)
 {
-  return (uint8x16_t)
-    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+  float32x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
+  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
+  ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
+  ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_u16 (const uint16_t *a)
+__extension__ extern __inline poly64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_p64 (const poly64_t * __a)
 {
-  return (uint16x8_t)
-    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+  poly64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
+  ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_u32 (const uint32_t *a)
+__extension__ extern __inline int8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s8 (const int8_t * __a)
 {
-  return (uint32x4_t)
-    __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+  int8x16x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
+  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
+  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
+  ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_u64 (const uint64_t *a)
+__extension__ extern __inline poly8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_p8 (const poly8_t * __a)
 {
-  return (uint64x2_t)
-    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+  poly8x16x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
+  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
+  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
+  ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
+  return ret;
 }

-/* vld1_dup */
-
-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-vld1_dup_f16 (const float16_t* __a)
+__extension__ extern __inline int16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s16 (const int16_t * __a)
 {
-  float16_t __f = *__a;
-  return (float16x4_t) { __f, __f, __f, __f };
+  int16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
+  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
+  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
+  ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_dup_f32 (const float32_t* __a)
+__extension__ extern __inline poly16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_p16 (const poly16_t * __a)
 {
-  return vdup_n_f32 (*__a);
+  poly16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
+  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
+  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
+  ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vld1_dup_f64 (const float64_t* __a)
+__extension__ extern __inline int32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s32 (const int32_t * __a)
 {
-  return vdup_n_f64 (*__a);
+  int32x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
+  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
+  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
+  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
+  return ret;
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_dup_p8 (const poly8_t* __a)
+__extension__ extern __inline int64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s64 (const int64_t * __a)
 {
-  return vdup_n_p8 (*__a);
+  int64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
+  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
+  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
+  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
+  return ret;
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_dup_p16 (const poly16_t* __a)
+__extension__ extern __inline uint8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u8 (const uint8_t * __a)
 {
-  return vdup_n_p16 (*__a);
+  uint8x16x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
+  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
+  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
+  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
+  return ret;
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_dup_s8 (const int8_t* __a)
+__extension__ extern __inline uint16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u16 (const uint16_t * __a)
 {
-  return vdup_n_s8 (*__a);
+  uint16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
+  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
+  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
+  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
+  return ret;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_dup_s16 (const int16_t* __a)
+__extension__ extern __inline uint32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u32 (const uint32_t * __a)
 {
-  return vdup_n_s16 (*__a);
+  uint32x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
+  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
+  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
+  return ret;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_dup_s32 (const int32_t* __a)
+__extension__ extern __inline uint64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u64 (const uint64_t * __a)
 {
-  return vdup_n_s32 (*__a);
+  uint64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
+  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
+  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
+  return ret;
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_dup_s64 (const int64_t* __a)
+__extension__ extern __inline float16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f16 (const float16_t * __a)
 {
-  return vdup_n_s64 (*__a);
+  float16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
+  ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_dup_u8 (const uint8_t* __a)
+__extension__ extern __inline float32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f32 (const float32_t * __a)
 {
-  return vdup_n_u8 (*__a);
+  float32x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
+  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
+  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_dup_u16 (const uint16_t* __a)
+__extension__ extern __inline float64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f64 (const float64_t * __a)
 {
-  return vdup_n_u16 (*__a);
+  float64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
+  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
+  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_dup_u32 (const uint32_t* __a)
+__extension__ extern __inline poly64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_p64 (const poly64_t * __a)
 {
-  return vdup_n_u32 (*__a);
+  poly64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
+  ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
+  return ret;
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_dup_u64 (const uint64_t* __a)
-{
-  return vdup_n_u64 (*__a);
+/* vld2_lane */
+
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+                        qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_oi __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
+                                             (signedtype) __temp.val[0], \
+                                             0); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
+                                             (signedtype) __temp.val[1], \
+                                             1); \
+  __o = __builtin_aarch64_ld2_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
+  return __b; \
 }

-/* vld1q_dup */
+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
+                 v8hf, hf, f16, float16x8_t)
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
+                 sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
+                 df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
+                 int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
+                 p16, int16x8_t)
+__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di,
+                 v2di_ssps, di, p64, poly64x2_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
+                 int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
+                 int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
+                 int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64,
+                 int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
+                 int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi,
+                 u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si,
+                 u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
+                 u64, int64x2_t)

-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_f16 (const float16_t* __a)
-{
-  float16_t __f = *__a;
-  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+#undef __LD2_LANE_FUNC
+
+/* vld2q_lane */
+
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_oi __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_ld2_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
+  return ret; \
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_f32 (const float32_t* __a)
-{
-  return vdupq_n_f32 (*__a);
+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+#undef __LD2_LANE_FUNC
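__LD2_LANE_FUNC generates one vld2_lane_<type> (and, in its second definition, vld2q_lane_<type>) intrinsic per element type: two interleaved elements are loaded from memory into lane __c of each vector of an existing pair, with all other lanes left untouched. A hedged usage sketch of one generated function, not part of the patch (illustrative values only):

  #include <arm_neon.h>

  int16x8x2_t
  patch_lane_3 (const int16_t *src, int16x8x2_t acc)
  {
    /* acc.val[0][3] = src[0], acc.val[1][3] = src[1];
       the lane number must be a compile-time constant.  */
    return vld2q_lane_s16 (src, acc, 3);
  }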
+
+/* vld3_lane */
+
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+                        qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_ci __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __temp.val[2] = \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o, \
+                                             (signedtype) __temp.val[0], \
+                                             0); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o, \
+                                             (signedtype) __temp.val[1], \
+                                             1); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o, \
+                                             (signedtype) __temp.val[2], \
+                                             2); \
+  __o = __builtin_aarch64_ld3_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
+  return __b; \
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vld1q_dup_f64 (const float64_t* __a)
-{
-  return vdupq_n_f64 (*__a);
-}
+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
+                 v8hf, hf, f16, float16x8_t)
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
+                 sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
+                 df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
+                 int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
+                 p16, int16x8_t)
+__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di,
+                 v2di_ssps, di, p64, poly64x2_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
+                 int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
+                 int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
+                 int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64,
+                 int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
+                 int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi,
+                 u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si,
+                 u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
+                 u64, int64x2_t)

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_p8 (const poly8_t* __a)
-{
-  return vdupq_n_p8 (*__a);
-}
+#undef __LD3_LANE_FUNC

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_p16 (const poly16_t* __a)
-{
-  return vdupq_n_p16 (*__a);
-}
+/* vld3q_lane */

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_s8 (const int8_t* __a)
-{
-  return vdupq_n_s8 (*__a);
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_ci __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_ld3_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
+  return ret; \
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_s16 (const int16_t* __a)
-{
-  return vdupq_n_s16 (*__a);
-}
+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_s32 (const int32_t* __a)
-{
-  return vdupq_n_s32 (*__a);
-}
+#undef __LD3_LANE_FUNC

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_dup_s64 (const int64_t* __a)
-{
-  return vdupq_n_s64 (*__a);
-}
+/* vld4_lane */

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_u8 (const uint8_t* __a)
-{
-  return vdupq_n_u8 (*__a);
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+                        qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_xi __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __temp.val[2] = \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+  __temp.val[3] = \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+                                             (signedtype) __temp.val[0], \
+                                             0); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+                                             (signedtype) __temp.val[1], \
+                                             1); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+                                             (signedtype) __temp.val[2], \
+                                             2); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+                                             (signedtype) __temp.val[3], \
+                                             3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+  return __b; \
 }
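In the d-register variants generated by this macro, each 64-bit input vector is first widened with vcombine_<type> (padding the upper half via vcreate_<type> (0)) so it fits the 128-bit slots of the tuple register __o, and the results are narrowed back through the get_dreg* builtins after the lane load. A rough sketch of what one expansion amounts to, with hypothetical values (not part of the patch):

  #include <arm_neon.h>

  uint8x8x4_t
  replace_lane_0 (const uint8_t *quad)
  {
    /* Start from four broadcast vectors, then overwrite lane 0 of each
       with quad[0..3], as the vld4_lane_u8 expansion does.  */
    static const uint8_t seed[4] = { 9, 9, 9, 9 };
    uint8x8x4_t v = vld4_dup_u8 (seed);
    return vld4_lane_u8 (quad, v, 0);
  }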

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_u16 (const uint16_t* __a)
-{
-  return vdupq_n_u16 (*__a);
-}
+/* vld4q_lane */

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_u32 (const uint32_t* __a)
-{
-  return vdupq_n_u32 (*__a);
-}
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
+                 v8hf, hf, f16, float16x8_t)
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
+                 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
+                 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
+                 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
+                 p16, int16x8_t)
+__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di,
+                 v2di_ssps, di, p64, poly64x2_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
+                 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
+                 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
+                 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64,
+                 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
+                 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi,
+                 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si,
+                 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
+                 u64, int64x2_t)

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_dup_u64 (const uint64_t* __a)
-{
-  return vdupq_n_u64 (*__a);
-}
+#undef __LD4_LANE_FUNC

-/* vld1_lane */
+/* vld4q_lane */

-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
-{
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_xi __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
+  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
+  return ret; \
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
-{
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
+__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
-{
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
+#undef __LD4_LANE_FUNC

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
-{
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
+/* vmax */

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __builtin_aarch64_smax_nanv2sf (__a, __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f64 (float64x1_t __a, float64x1_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return (float64x1_t)
+    { __builtin_aarch64_smax_nandf (vget_lane_f64 (__a, 0),
+                                    vget_lane_f64 (__b, 0)) };
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __builtin_aarch64_smaxv8qi (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __builtin_aarch64_smaxv4hi (__a, __b);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __builtin_aarch64_smaxv2si (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a,
+                                                 (int8x8_t)
__b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_u16 (uint16x4_t __a, uint16x4_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a, + (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_u32 (uint32x2_t __a, uint32x2_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a, + (int32x2_t) __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_f32 (float32x4_t __a, float32x4_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_smax_nanv4sf (__a, __b); } -/* vld1q_lane */ - -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) -vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_f64 (float64x2_t __a, float64x2_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_smax_nanv2df (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s8 (int8x16_t __a, int8x16_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_smaxv16qi (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s16 (int16x8_t __a, int16x8_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_smaxv8hi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s32 (int32x4_t __a, int32x4_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_smaxv4si (__a, __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline int8x16_t 
__attribute__ ((__always_inline__)) -vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a, + (int32x4_t) __b); } +/* vmulx */ -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_f32 (float32x2_t __a, float32x2_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_fmulxv2sf (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_f32 (float32x4_t __a, float32x4_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_fmulxv4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_f64 (float64x1_t __a, float64x1_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])}; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_f64 (float64x2_t __a, float64x2_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_fmulxv2df (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_f32 (float32_t __a, float32_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_fmulxsf (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_f64 (float64_t __a, float64_t __b) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return __builtin_aarch64_fmulxdf (__a, __b); } -/* vldn */ - -__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__)) -vld2_s64 (const int64_t * __a) 
+__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane) { - int64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); - return ret; + return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane)); } -__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__)) -vld2_u64 (const uint64_t * __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane) { - uint64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); - return ret; + return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane)); } -__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__)) -vld2_f64 (const float64_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane) { - float64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; - return ret; + return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane)); } -__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) -vld2_s8 (const int8_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane) { - int8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; + return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane)); } -__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) -vld2_p8 (const poly8_t * __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane) { - poly8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; + return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane)); } -__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) -vld2_s16 (const int16_t * __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane) { - int16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = 
(int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane)); } -__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) -vld2_p16 (const poly16_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane) { - poly16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane)); } -__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) -vld2_s32 (const int32_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane) { - int32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); - return ret; + return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane)); } -__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) -vld2_u8 (const uint8_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane) { - uint8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; + return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); } -__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) -vld2_u16 (const uint16_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane) { - uint16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); } -__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) -vld2_u32 (const uint32_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane) { - uint32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); - return ret; + return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); } -__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) -vld2_f16 (const float16_t 
* __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane) { - float16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1); - return ret; + return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); } -__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) -vld2_f32 (const float32_t * __a) +/* vpmax */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s8 (int8x8_t a, int8x8_t b) { - float32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); - return ret; + return __builtin_aarch64_smaxpv8qi (a, b); } -__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) -vld2q_s8 (const int8_t * __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s16 (int16x4_t a, int16x4_t b) { - int8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; + return __builtin_aarch64_smaxpv4hi (a, b); } -__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) -vld2q_p8 (const poly8_t * __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s32 (int32x2_t a, int32x2_t b) { - poly8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; + return __builtin_aarch64_smaxpv2si (a, b); } -__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) -vld2q_s16 (const int16_t * __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u8 (uint8x8_t a, uint8x8_t b) { - int16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a, + (int8x8_t) b); } -__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) -vld2q_p16 (const poly16_t * __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u16 (uint16x4_t a, uint16x4_t b) { - poly16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a, + (int16x4_t) b); } 
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) -vld2q_s32 (const int32_t * __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u32 (uint32x2_t a, uint32x2_t b) { - int32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); - ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); - return ret; + return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a, + (int32x2_t) b); } -__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__)) -vld2q_s64 (const int64_t * __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_s8 (int8x16_t a, int8x16_t b) { - int64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); - ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); - return ret; + return __builtin_aarch64_smaxpv16qi (a, b); } -__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) -vld2q_u8 (const uint8_t * __a) -{ - uint8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_s16 (int16x8_t a, int16x8_t b) +{ + return __builtin_aarch64_smaxpv8hi (a, b); } -__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) -vld2q_u16 (const uint16_t * __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_s32 (int32x4_t a, int32x4_t b) { - uint16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + return __builtin_aarch64_smaxpv4si (a, b); } -__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) -vld2q_u32 (const uint32_t * __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u8 (uint8x16_t a, uint8x16_t b) { - uint32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); - ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); - return ret; + return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a, + (int8x16_t) b); } -__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__)) -vld2q_u64 (const uint64_t * __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u16 (uint16x8_t a, uint16x8_t b) { - uint64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); - ret.val[1] = 
(uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); - return ret; + return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a, + (int16x8_t) b); } -__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__)) -vld2q_f16 (const float16_t * __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u32 (uint32x4_t a, uint32x4_t b) { - float16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hf (__a); - ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1); - return ret; + return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a, + (int32x4_t) b); } -__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) -vld2q_f32 (const float32_t * __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_f32 (float32x2_t a, float32x2_t b) { - float32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0); - ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1); - return ret; + return __builtin_aarch64_smax_nanpv2sf (a, b); } -__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__)) -vld2q_f64 (const float64_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_f32 (float32x4_t a, float32x4_t b) { - float64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0); - ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1); - return ret; + return __builtin_aarch64_smax_nanpv4sf (a, b); } -__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__)) -vld3_s64 (const int64_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_f64 (float64x2_t a, float64x2_t b) { - int64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); - ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); - return ret; + return __builtin_aarch64_smax_nanpv2df (a, b); } -__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__)) -vld3_u64 (const uint64_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxqd_f64 (float64x2_t a) { - uint64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); - ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); - return ret; + return __builtin_aarch64_reduc_smax_nan_scal_v2df (a); } -__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__)) -vld3_f64 (const float64_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxs_f32 (float32x2_t a) { - 
float64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)}; - ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)}; - return ret; + return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a); } -__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__)) -vld3_s8 (const int8_t * __a) +/* vpmaxnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnm_f32 (float32x2_t a, float32x2_t b) { - int8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return __builtin_aarch64_smaxpv2sf (a, b); } -__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__)) -vld3_p8 (const poly8_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmq_f32 (float32x4_t a, float32x4_t b) { - poly8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return __builtin_aarch64_smaxpv4sf (a, b); } -__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__)) -vld3_s16 (const int16_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmq_f64 (float64x2_t a, float64x2_t b) { - int16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return __builtin_aarch64_smaxpv2df (a, b); } -__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__)) -vld3_p16 (const poly16_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmqd_f64 (float64x2_t a) { - poly16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return __builtin_aarch64_reduc_smax_scal_v2df (a); } -__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__)) -vld3_s32 (const int32_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnms_f32 (float32x2_t a) { - int32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); 
- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); - ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); - return ret; + return __builtin_aarch64_reduc_smax_scal_v2sf (a); } -__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__)) -vld3_u8 (const uint8_t * __a) +/* vpmin */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_s8 (int8x8_t a, int8x8_t b) { - uint8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return __builtin_aarch64_sminpv8qi (a, b); } -__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__)) -vld3_u16 (const uint16_t * __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_s16 (int16x4_t a, int16x4_t b) { - uint16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return __builtin_aarch64_sminpv4hi (a, b); } -__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__)) -vld3_u32 (const uint32_t * __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_s32 (int32x2_t a, int32x2_t b) { - uint32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); - ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); - return ret; + return __builtin_aarch64_sminpv2si (a, b); } -__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) -vld3_f16 (const float16_t * __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u8 (uint8x8_t a, uint8x8_t b) { - float16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2); - return ret; + return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a, + (int8x8_t) b); } -__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) -vld3_f32 (const float32_t * __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u16 (uint16x4_t a, uint16x4_t b) { - float32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); - ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); - return ret; + return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a, + (int16x4_t) 
b); } -__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__)) -vld3q_s8 (const int8_t * __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u32 (uint32x2_t a, uint32x2_t b) { - int8x16x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a, + (int32x2_t) b); } -__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__)) -vld3q_p8 (const poly8_t * __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s8 (int8x16_t a, int8x16_t b) { - poly8x16x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return __builtin_aarch64_sminpv16qi (a, b); } -__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__)) -vld3q_s16 (const int16_t * __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s16 (int16x8_t a, int16x8_t b) { - int16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return __builtin_aarch64_sminpv8hi (a, b); } -__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__)) -vld3q_p16 (const poly16_t * __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s32 (int32x4_t a, int32x4_t b) { - poly16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return __builtin_aarch64_sminpv4si (a, b); } -__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__)) -vld3q_s32 (const int32_t * __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u8 (uint8x16_t a, uint8x16_t b) { - int32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); - ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); - ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); - return ret; + return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a, + (int8x16_t) b); } -__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__)) -vld3q_s64 (const int64_t * __a) -{ - int64x2x3_t ret; 
- __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); - ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); - ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); - return ret; +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u16 (uint16x8_t a, uint16x8_t b) +{ + return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a, + (int16x8_t) b); } -__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__)) -vld3q_u8 (const uint8_t * __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u32 (uint32x4_t a, uint32x4_t b) { - uint8x16x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a, + (int32x4_t) b); } -__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__)) -vld3q_u16 (const uint16_t * __a) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_f32 (float32x2_t a, float32x2_t b) { - uint16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return __builtin_aarch64_smin_nanpv2sf (a, b); } -__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__)) -vld3q_u32 (const uint32_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_f32 (float32x4_t a, float32x4_t b) { - uint32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); - ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); - ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); - return ret; + return __builtin_aarch64_smin_nanpv4sf (a, b); } -__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__)) -vld3q_u64 (const uint64_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_f64 (float64x2_t a, float64x2_t b) { - uint64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); - ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); - ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); - return ret; + return __builtin_aarch64_smin_nanpv2df (a, b); } -__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__)) -vld3q_f16 (const float16_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vpminqd_f64 (float64x2_t a) { - float16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hf (__a); - ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2); - return ret; + return __builtin_aarch64_reduc_smin_nan_scal_v2df (a); } -__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) -vld3q_f32 (const float32_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmins_f32 (float32x2_t a) { - float32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); - ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); - ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); - return ret; + return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a); } -__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__)) -vld3q_f64 (const float64_t * __a) +/* vpminnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnm_f32 (float32x2_t a, float32x2_t b) { - float64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); - ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); - ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); - return ret; + return __builtin_aarch64_sminpv2sf (a, b); } -__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__)) -vld4_s64 (const int64_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmq_f32 (float32x4_t a, float32x4_t b) { - int64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); - ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); - ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); - return ret; + return __builtin_aarch64_sminpv4sf (a, b); } -__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__)) -vld4_u64 (const uint64_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmq_f64 (float64x2_t a, float64x2_t b) { - uint64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); - ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); - ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); - return ret; + return __builtin_aarch64_sminpv2df (a, b); } -__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__)) -vld4_f64 (const float64_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmqd_f64 (float64x2_t a) { - float64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = 
__builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; - ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; - ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)}; - return ret; + return __builtin_aarch64_reduc_smin_scal_v2df (a); } -__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) -vld4_s8 (const int8_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnms_f32 (float32x2_t a) { - int8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __builtin_aarch64_reduc_smin_scal_v2sf (a); } -__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) -vld4_p8 (const poly8_t * __a) +/* vmaxnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnm_f32 (float32x2_t __a, float32x2_t __b) { - poly8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __builtin_aarch64_fmaxv2sf (__a, __b); } -__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) -vld4_s16 (const int16_t * __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnm_f64 (float64x1_t __a, float64x1_t __b) { - int16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return (float64x1_t) + { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; } -__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) -vld4_p16 (const poly16_t * __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmq_f32 (float32x4_t __a, float32x4_t __b) { - poly16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return __builtin_aarch64_fmaxv4sf (__a, __b); } -__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) -vld4_s32 
(const int32_t * __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmq_f64 (float64x2_t __a, float64x2_t __b) { - int32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); - ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); - ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); - return ret; + return __builtin_aarch64_fmaxv2df (__a, __b); } -__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) -vld4_u8 (const uint8_t * __a) +/* vmaxv */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_f32 (float32x2_t __a) { - uint8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a); } -__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) -vld4_u16 (const uint16_t * __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s8 (int8x8_t __a) { - uint16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return __builtin_aarch64_reduc_smax_scal_v8qi (__a); } -__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) -vld4_u32 (const uint32_t * __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s16 (int16x4_t __a) { - uint32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); - ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); - ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); - return ret; + return __builtin_aarch64_reduc_smax_scal_v4hi (__a); } -__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) -vld4_f16 (const float16_t * __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s32 (int32x2_t __a) { - float16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); - ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); - return ret; + return __builtin_aarch64_reduc_smax_scal_v2si (__a); } -__extension__ static __inline 
float32x2x4_t __attribute__ ((__always_inline__)) -vld4_f32 (const float32_t * __a) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u8 (uint8x8_t __a) { - float32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); - ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); - ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); - return ret; + return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a); } -__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__)) -vld4q_s8 (const int8_t * __a) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u16 (uint16x4_t __a) { - int8x16x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); - ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); - ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); - return ret; + return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a); } -__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__)) -vld4q_p8 (const poly8_t * __a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u32 (uint32x2_t __a) { - poly8x16x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); - ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); - ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); - return ret; + return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a); } -__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) -vld4q_s16 (const int16_t * __a) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_f32 (float32x4_t __a) { - int16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); - ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); - ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); - return ret; + return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a); } -__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) -vld4q_p16 (const poly16_t * __a) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_f64 (float64x2_t __a) { - poly16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); - ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); - ret.val[3] = 
(poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
 }
 
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_s32 (const int32_t * __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_s8 (int8x16_t __a)
 {
-  int32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
 }
 
-__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
-vld4q_s64 (const int64_t * __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_s16 (int16x8_t __a)
 {
-  int64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
 }
 
-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-vld4q_u8 (const uint8_t * __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_s32 (int32x4_t __a)
 {
-  uint8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v4si (__a);
 }
 
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_u16 (const uint16_t * __a)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_u8 (uint8x16_t __a)
 {
-  uint16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
 }
 
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_u32 (const uint32_t * __a)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_u16 (uint16x8_t __a)
 {
-  uint32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
 }
 
-__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
-vld4q_u64 (const uint64_t * __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxvq_u32 (uint32x4_t __a)
 {
-  uint64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
 }
 
-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-vld4q_f16 (const float16_t * __a)
+/* vmaxnmv */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmv_f32 (float32x2_t __a)
 {
-  float16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hf (__a);
-  ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
-  ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
-  ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
-  ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
 }
 
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_f32 (const float32_t * __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmvq_f32 (float32x4_t __a)
 {
-  float32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
 }
 
-__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
-vld4q_f64 (const float64_t * __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmvq_f64 (float64x2_t __a)
 {
-  float64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-  return ret;
+  return __builtin_aarch64_reduc_smax_scal_v2df (__a);
 }
 
-/* vldn_dup */
+/* vmin */
 
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_s8 (const int8_t * __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_f32 (float32x2_t __a, float32x2_t __b)
 {
-  int8x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-  return ret;
+  return __builtin_aarch64_smin_nanv2sf (__a, __b);
 }
 
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_s16 (const int16_t * __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_f64 (float64x1_t __a, float64x1_t __b)
 {
-  int16x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-  return ret;
+  return (float64x1_t)
+    { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0),
+                                    vget_lane_f64 (__b, 0)) };
 }
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_s32 (const int32_t * __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_s8 (int8x8_t __a, int8x8_t __b)
 {
-  int32x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv8qi (__a, __b);
 }
 
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_f16 (const float16_t * __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_s16 (int16x4_t __a, int16x4_t __b)
 {
-  float16x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
-  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv4hi (__a, __b);
 }
 
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_f32 (const float32_t * __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_s32 (int32x2_t __a, int32x2_t __b)
 {
-  float32x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
-  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv2si (__a, __b);
 }
 
-__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_f64 (const float64_t * __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  float64x1x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
-  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
-  return ret;
+  return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
+                                                 (int8x8_t) __b);
 }
 
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_u8 (const uint8_t * __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  uint8x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-  return ret;
+  return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
+                                                  (int16x4_t) __b);
 }
 
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  uint16x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-  return ret;
+  return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
+                                                  (int32x2_t) __b);
 }
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_u32 (const uint32_t * __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  uint32x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-  return ret;
+  return __builtin_aarch64_smin_nanv4sf (__a, __b);
 }
 
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_p8 (const poly8_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  poly8x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-  return ret;
+  return __builtin_aarch64_smin_nanv2df (__a, __b);
 }
 
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  poly16x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv16qi (__a, __b);
 }
 
-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_s64 (const int64_t * __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int64x1x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv8hi (__a, __b);
 }
 
-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_u64 (const uint64_t * __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  uint64x1x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-  return ret;
+  return __builtin_aarch64_sminv4si (__a, __b);
 }
 
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s8 (const int8_t * __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  int8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
+                                                   (int8x16_t) __b);
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
+                                                  (int16x8_t) __b);
 }
 
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_p8 (const poly8_t * __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
+                                                  (int32x4_t) __b);
+}
+
+/* vminnm */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnm_f32 (float32x2_t __a, float32x2_t __b)
 {
-  poly8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  return __builtin_aarch64_fminv2sf (__a, __b);
 }
 
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s16 (const int16_t * __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnm_f64 (float64x1_t __a, float64x1_t __b)
 {
-  int16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  return (float64x1_t)
+    { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0),
+                                vget_lane_f64 (__b, 0)) };
 }
 
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  poly16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  return __builtin_aarch64_fminv4sf (__a, __b);
 }
 
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s32 (const int32_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  int32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  return __builtin_aarch64_fminv2df (__a, __b);
 }
 
-__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s64 (const int64_t * __a)
+/* vminv */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_f32 (float32x2_t __a)
 {
-  int64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
 }
 
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u8 (const uint8_t * __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_s8 (int8x8_t __a)
 {
-  uint8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
 }
 
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_s16 (int16x4_t __a)
 {
-  uint16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
 }
 
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u32 (const uint32_t * __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_s32 (int32x2_t __a)
 {
-  uint32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v2si (__a);
 }
 
-__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u64 (const uint64_t * __a)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_u8 (uint8x8_t __a)
 {
-  uint64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
 }
 
-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f16 (const float16_t * __a)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_u16 (uint16x4_t __a)
 {
-  float16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
-  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
 }
 
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f32 (const float32_t * __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminv_u32 (uint32x2_t __a)
 {
-  float32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
 }
 
-__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f64 (const float64_t * __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_f32 (float32x4_t __a)
 {
-  float64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-  return ret;
+  return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
 }
 
-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_s64 (const int64_t * __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_f64 (float64x2_t __a)
 {
-  int64x1x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
 }
 
-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_u64 (const uint64_t * __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_s8 (int8x16_t __a)
 {
-  uint64x1x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
 }
 
-__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_f64 (const float64_t * __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_s16 (int16x8_t __a)
 {
-  float64x1x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
-  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
-  ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
 }
 
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_s8 (const int8_t * __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_s32 (int32x4_t __a)
 {
-  int8x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v4si (__a);
 }
 
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_p8 (const poly8_t * __a)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_u8 (uint8x16_t __a)
 {
-  poly8x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-  ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
 }
 
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_s16 (const int16_t * __a)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_u16 (uint16x8_t __a)
 {
-  int16x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
 }
 
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminvq_u32 (uint32x4_t __a)
 {
-  poly16x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-  ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
 }
 
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_s32 (const int32_t * __a)
+/* vminnmv */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmv_f32 (float32x2_t __a)
 {
-  int32x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-  ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
 }
 
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_u8 (const uint8_t * __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmvq_f32 (float32x4_t __a)
 {
-  uint8x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-  ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
 }
 
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmvq_f64 (float64x2_t __a)
 {
-  uint16x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-  ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-  return ret;
+  return __builtin_aarch64_reduc_smin_scal_v2df (__a);
 }
 
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_u32 (const uint32_t * __a)
+/* vmla */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
 {
-  uint32x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-  ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-  return ret;
+  return a + b * c;
 }
 
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_f16 (const float16_t * __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
 {
-  float16x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
-  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
-  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
-  return ret;
+  return __a + __b * __c;
 }
 
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_f32 (const float32_t * __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
 {
-  float32x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
-  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
-  ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
-  return ret;
+  return a + b * c;
 }
 
-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s8 (const int8_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
 {
-  int8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  return a + b * c;
 }
 
-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_p8 (const poly8_t * __a)
+/* vmla_lane */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
+               float32x2_t __c, const int __lane)
 {
-  poly8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s16 (const int16_t * __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
+               int16x4_t __c, const int __lane)
 {
-  int16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
+               int32x2_t __c, const int __lane)
 {
-  poly16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s32 (const int32_t * __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x4_t __c, const int __lane)
 {
-  int32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s64 (const int64_t * __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+               uint32x2_t __c, const int __lane)
 {
-  int64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u8 (const uint8_t * __a)
+/* vmla_laneq */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
+                float32x4_t __c, const int __lane)
 {
-  uint8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
+                int16x8_t __c, const int __lane)
 {
-  uint16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u32 (const uint32_t * __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
+                int32x4_t __c, const int __lane)
 {
-  uint32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u64 (const uint64_t * __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+                uint16x8_t __c, const int __lane)
 {
-  uint64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f16 (const float16_t * __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+                uint32x4_t __c, const int __lane)
 {
-  float16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
-  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
-  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f32 (const float32_t * __a)
+/* vmlaq_lane */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+                float32x2_t __c, const int __lane)
 {
-  float32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f64 (const float64_t * __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
+                int16x4_t __c, const int __lane)
 {
-  float64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_s64 (const int64_t * __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
+                int32x2_t __c, const int __lane)
 {
-  int64x1x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-  ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_u64 (const uint64_t * __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+                uint16x4_t __c, const int __lane)
 {
-  uint64x1x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-  ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_f64 (const float64_t * __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+                uint32x2_t __c, const int __lane)
 {
-  float64x1x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
-  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
-  ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
-  ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_s8 (const int8_t * __a)
+ /* vmlaq_laneq */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  int8x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-  ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_p8 (const poly8_t * __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+                 int16x8_t __c, const int __lane)
 {
-  poly8x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-  ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-  ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-  ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_s16 (const int16_t * __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+                 int32x4_t __c, const int __lane)
 {
-  int16x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-  ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+                 uint16x8_t __c, const int __lane)
 {
-  poly16x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-  ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-  ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-  ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_s32 (const int32_t * __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+                 uint32x4_t __c, const int __lane)
 {
-  int32x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-  ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-  ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-  return ret;
+  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_u8 (const uint8_t * __a)
+/* vmls */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
 {
-  uint8x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-  ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-  ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-  ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-  return ret;
+  return a - b * c;
 }
 
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
 {
-  uint16x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-  ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-  ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-  ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-  return ret;
+  return __a - __b * __c;
 }
 
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_u32 (const uint32_t * __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
 {
-  uint32x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-  ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-  ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-  ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-  return ret;
+  return a - b * c;
 }
 
-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_f16 (const float16_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
 {
-  float16x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
-  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
-  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
-  ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
-  return ret;
+  return a - b * c;
 }
 
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_f32 (const float32_t * __a)
+/* vmls_lane */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
+               float32x2_t __c, const int __lane)
 {
-  float32x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
-  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
-  ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
-  ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s8 (const int8_t * __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
+               int16x4_t __c, const int __lane)
 {
-  int8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_p8 (const poly8_t * __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
+               int32x2_t __c, const int __lane)
 {
-  poly8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s16 (const int16_t * __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x4_t __c, const int __lane)
 {
-  int16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_p16 (const poly16_t * __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+               uint32x2_t __c, const int __lane)
 {
-  poly16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s32 (const int32_t * __a)
+/* vmls_laneq */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
+                float32x4_t __c, const int __lane)
 {
-  int32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s64 (const int64_t * __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
+                int16x8_t __c, const int __lane)
 {
-  int64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u8 (const uint8_t * __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
+                int32x4_t __c, const int __lane)
 {
-  uint8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u16 (const uint16_t * __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+                uint16x8_t __c, const int __lane)
 {
-  uint16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u32 (const uint32_t * __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+                uint32x4_t __c, const int __lane)
 {
-  uint32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u64 (const uint64_t * __a)
+/* vmlsq_lane */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+                float32x2_t __c, const int __lane)
 {
-  uint64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f16 (const float16_t * __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
+                int16x4_t __c, const int __lane)
 {
-  float16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
-  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
-  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
-  ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f32 (const float32_t * __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
+                int32x2_t __c, const int __lane)
 {
-  float32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f64 (const float64_t * __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+                uint16x4_t __c, const int __lane)
 {
-  float64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-  return ret;
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-/* vld2_lane */
-
-#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-			qmode, ptrmode, funcsuffix, signedtype) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_oi __o; \
-  largetype __temp; \
-  __temp.val[0] = \
-    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-  __temp.val[1] = \
-    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-					     (signedtype) __temp.val[0], \
-					     0); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-					     (signedtype) __temp.val[1], \
-					     1); \
-  __o = __builtin_aarch64_ld2_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
-  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
-  return __b; \
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+                uint32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }
 
-__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
-		 v8hf, hf, f16, float16x8_t)
-__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
-		 sf, f32, float32x4_t)
-__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
-		 df, f64, float64x2_t)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
-		 p16, int16x8_t)
-__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi,
-		 u16, int16x8_t)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si,
-		 u32, int32x4_t)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
-		 u64, int64x2_t)
+/* vmlsq_laneq */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+		 float32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}

-#undef __LD2_LANE_FUNC
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+		 int16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}

-/* vld2q_lane */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+		 int32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}

+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+		 uint16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}

-#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_oi __o; \
-  intype ret; \
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
-  __o = __builtin_aarch64_ld2_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
-  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
-  return ret; \
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+		 uint32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
 }

-__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
-__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
+/* vmov_n_ */

-#undef __LD2_LANE_FUNC
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_f16 (float16_t __a)
+{
+  return vdup_n_f16 (__a);
+}

-/* vld3_lane */
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_f32 (float32_t __a)
+{
+  return vdup_n_f32 (__a);
+}

-#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-			qmode, ptrmode, funcsuffix, signedtype) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_ci __o; \
-  largetype __temp; \
-  __temp.val[0] = \
-    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-  __temp.val[1] = \
-    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-  __temp.val[2] = \
-    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o, \
-					     (signedtype) __temp.val[0], \
-					     0); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o, \
-					     (signedtype) __temp.val[1], \
-					     1); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o, \
-					     (signedtype) __temp.val[2], \
-					     2); \
-  __o = __builtin_aarch64_ld3_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
-  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
-  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
-  return __b; \
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_f64 (float64_t __a)
+{
+  return (float64x1_t) {__a};
 }

-__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
-		 v8hf, hf, f16, float16x8_t)
-__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
-		 sf, f32, float32x4_t)
-__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
-		 df, f64, float64x2_t)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
-		 p16, int16x8_t)
-__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi,
-		 u16, int16x8_t)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si,
-		 u32, int32x4_t)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
-		 u64, int64x2_t)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_p8 (poly8_t __a)
+{
+  return vdup_n_p8 (__a);
+}

-#undef __LD3_LANE_FUNC
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_p16 (poly16_t __a)
+{
+  return vdup_n_p16 (__a);
+}

-/* vld3q_lane */
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_p64 (poly64_t __a)
+{
+  return vdup_n_p64 (__a);
+}

-#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_ci __o; \
-  intype ret; \
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
-  __o = __builtin_aarch64_ld3_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
-  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
-  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
-  return ret; \
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_s8 (int8_t __a)
+{
+  return vdup_n_s8 (__a);
 }

-__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
-__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_s16 (int16_t __a)
+{
+  return vdup_n_s16 (__a);
+}

-#undef __LD3_LANE_FUNC
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_s32 (int32_t __a)
+{
+  return vdup_n_s32 (__a);
+}

-/* vld4_lane */
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_s64 (int64_t __a)
+{
+  return (int64x1_t) {__a};
+}

-#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-			qmode, ptrmode, funcsuffix, signedtype) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_xi __o; \
-  largetype __temp; \
-  __temp.val[0] = \
-    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-  __temp.val[1] = \
-    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-  __temp.val[2] = \
-    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-  __temp.val[3] = \
-    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-					     (signedtype) __temp.val[0], \
-					     0); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-					     (signedtype) __temp.val[1], \
-					     1); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-					     (signedtype) __temp.val[2], \
-					     2); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-					     (signedtype) __temp.val[3], \
-					     3); \
-  __o = __builtin_aarch64_ld4_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
-  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
-  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
-  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
-  return __b; \
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_u8 (uint8_t __a)
+{
+  return vdup_n_u8 (__a);
 }

-/* vld4q_lane */
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_u16 (uint16_t __a)
+{
+  return vdup_n_u16 (__a);
+}

-__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
-		 v8hf, hf, f16, float16x8_t)
-__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
-		 sf, f32, float32x4_t)
-__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
-		 df, f64, float64x2_t)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
-		 p16, int16x8_t)
-__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi,
-		 u16, int16x8_t)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si,
-		 u32, int32x4_t)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
-		 u64, int64x2_t)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_u32 (uint32_t __a)
+{
+  return vdup_n_u32 (__a);
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_u64 (uint64_t __a)
+{
+  return (uint64x1_t) {__a};
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_f16 (float16_t __a)
+{
+  return vdupq_n_f16 (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_f32 (float32_t __a)
+{
+  return vdupq_n_f32 (__a);
+}
+
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_f64 (float64_t __a)
+{
+  return vdupq_n_f64 (__a);
+}

-#undef __LD4_LANE_FUNC
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_p8 (poly8_t __a)
+{
+  return vdupq_n_p8 (__a);
+}

-/* vld4q_lane */
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_p16 (poly16_t __a)
+{
+  return vdupq_n_p16 (__a);
+}

-#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
-vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_xi __o; \
-  intype ret; \
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
-  __o = __builtin_aarch64_ld4_lane##mode ( \
-	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
-  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
-  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
-  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
-  return ret; \
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_p64 (poly64_t __a)
+{
+  return vdupq_n_p64 (__a);
 }

-__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
-__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_s8 (int8_t __a)
+{
+  return vdupq_n_s8 (__a);
+}

-#undef __LD4_LANE_FUNC
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_s16 (int16_t __a)
+{
+  return vdupq_n_s16 (__a);
+}

-/* vmax */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_s32 (int32_t __a)
+{
+  return vdupq_n_s32 (__a);
+}

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmax_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_s64 (int64_t __a)
 {
-  return __builtin_aarch64_smax_nanv2sf (__a, __b);
+  return vdupq_n_s64 (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmax_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_u8 (uint8_t __a)
 {
-  return __builtin_aarch64_smaxv8qi (__a, __b);
+  return vdupq_n_u8 (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmax_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_u16 (uint16_t __a)
 {
-  return __builtin_aarch64_smaxv4hi (__a, __b);
+  return vdupq_n_u16 (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmax_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_u32 (uint32_t __a)
 {
-  return __builtin_aarch64_smaxv2si (__a, __b);
+  return vdupq_n_u32 (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmax_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_u64 (uint64_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a,
-						 (int8x8_t) __b);
+  return vdupq_n_u64 (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmax_u16 (uint16x4_t __a, uint16x4_t __b)
+/* vmul_lane */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
 {
-  return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a,
-						  (int16x4_t) __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmax_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
 {
-  return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a,
-						  (int32x2_t) __b);
+  return __a * __b;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmaxq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
 {
-  return __builtin_aarch64_smax_nanv4sf (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmaxq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_smax_nanv2df (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmaxq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
 {
-  return __builtin_aarch64_smaxv16qi (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmaxq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_smaxv8hi (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmaxq_s32 (int32x4_t __a, int32x4_t __b)
+/* vmuld_lane */
+
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
 {
-  return __builtin_aarch64_smaxv4si (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
 {
-  return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a,
-						   (int8x16_t) __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
+/* vmuls_lane */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
 {
-  return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a,
-						  (int16x8_t) __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
 {
-  return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a,
-						  (int32x4_t) __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-/* vmulx */
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t __a, float32x2_t __b)
+/* vmul_laneq */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
 {
-  return __builtin_aarch64_fmulxv2sf (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_fmulxv4sf (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_f64 (float64x1_t __a, float64x1_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
 {
-  return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
 {
-  return __builtin_aarch64_fmulxv2df (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t __a, float32_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
 {
-  return __builtin_aarch64_fmulxsf (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t __a, float64_t __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
 {
-  return __builtin_aarch64_fmulxdf (__a, __b);
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
+/* vmul_n */
+
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_f64 (float64x1_t __a, float64_t __b)
 {
-  return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
+  return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
+/* vmulq_lane */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
 {
-  return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
 {
-  return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
+  __AARCH64_LANE_CHECK (__a, __lane);
+  return __a * __b[0];
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
 {
-  return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
 {
-  return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
 {
-  return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
 {
-  return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
+/* vmulq_laneq */
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
 {
-  return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
 {
-  return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
 {
-  return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
 {
-  return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
 {
-  return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-/* vpmax */
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_any (__b, __lane);
+}

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmax_s8 (int8x8_t a, int8x8_t b)
+/* vmul_n.  */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
 {
-  return __builtin_aarch64_smaxpv8qi (a, b);
+  return __a * __b;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmax_s16 (int16x4_t a, int16x4_t b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
 {
-  return __builtin_aarch64_smaxpv4hi (a, b);
+  return __a * __b;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmax_s32 (int32x2_t a, int32x2_t b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
 {
-  return __builtin_aarch64_smaxpv2si (a, b);
+  return __a * __b;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmax_u8 (uint8x8_t a, uint8x8_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
-						  (int8x8_t) b);
+  return __a * __b;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmax_u16 (uint16x4_t a, uint16x4_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
-						   (int16x4_t) b);
+  return __a * __b;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmax_u32 (uint32x2_t a, uint32x2_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
-						   (int32x2_t) b);
+  return __a * __b;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpmaxq_s8 (int8x16_t a, int8x16_t b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_aarch64_smaxpv16qi (a, b);
+  return __a * __b;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpmaxq_s16 (int16x8_t a, int16x8_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
 {
-  return __builtin_aarch64_smaxpv8hi (a, b);
+  return __a * __b;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpmaxq_s32 (int32x4_t a, int32x4_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
+/* vmvn */
+
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_p8 (poly8x8_t __a)
 {
-  return __builtin_aarch64_smaxpv4si (a, b);
+  return (poly8x8_t) ~((int8x8_t) __a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_s8 (int8x8_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
-						    (int8x16_t) b);
+  return ~__a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_s16 (int16x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
-						   (int16x8_t) b);
+  return ~__a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_s32 (int32x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
-						   (int32x4_t) b);
+  return ~__a;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmax_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_u8 (uint8x8_t __a)
 {
-  return __builtin_aarch64_smax_nanpv2sf (a, b);
+  return ~__a;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpmaxq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_u16 (uint16x4_t __a)
 {
-  return __builtin_aarch64_smax_nanpv4sf (a, b);
+  return ~__a;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpmaxq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvn_u32 (uint32x2_t __a)
 {
-  return __builtin_aarch64_smax_nanpv2df (a, b);
+  return ~__a;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpmaxqd_f64 (float64x2_t a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_p8 (poly8x16_t __a)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
+  return (poly8x16_t) ~((int8x16_t) __a);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmaxs_f32 (float32x2_t a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_s8 (int8x16_t __a)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
+  return ~__a;
 }

-/* vpmaxnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmaxnm_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_s16 (int16x8_t __a)
 {
-  return __builtin_aarch64_smaxpv2sf (a, b);
+  return ~__a;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_s32 (int32x4_t __a)
 {
-  return __builtin_aarch64_smaxpv4sf (a, b);
+  return ~__a;
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_u8 (uint8x16_t __a)
 {
-  return __builtin_aarch64_smaxpv2df (a, b);
+  return ~__a;
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpmaxnmqd_f64 (float64x2_t a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_u16 (uint16x8_t __a)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2df (a);
+  return ~__a;
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmaxnms_f32 (float32x2_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmvnq_u32 (uint32x4_t __a)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2sf (a);
+  return ~__a;
 }

-/* vpmin */
+/* vneg */

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmin_s8 (int8x8_t a, int8x8_t b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_f32 (float32x2_t __a)
 {
-  return __builtin_aarch64_sminpv8qi (a, b);
+  return -__a;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmin_s16 (int16x4_t a, int16x4_t b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_f64 (float64x1_t __a)
 {
-  return __builtin_aarch64_sminpv4hi (a, b);
+  return -__a;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmin_s32 (int32x2_t a, int32x2_t b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_s8 (int8x8_t __a)
 {
-  return __builtin_aarch64_sminpv2si (a, b);
+  return -__a;
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmin_u8 (uint8x8_t a, uint8x8_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_s16 (int16x4_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
-						  (int8x8_t) b);
+  return -__a;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmin_u16 (uint16x4_t a, uint16x4_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_s32 (int32x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
-						   (int16x4_t) b);
+  return -__a;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmin_u32 (uint32x2_t a, uint32x2_t b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_s64 (int64x1_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
-						   (int32x2_t) b);
+  return -__a;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpminq_s8 (int8x16_t a, int8x16_t b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_f32 (float32x4_t __a)
 {
-  return __builtin_aarch64_sminpv16qi (a, b);
+  return -__a;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpminq_s16 (int16x8_t a, int16x8_t b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_f64 (float64x2_t __a)
 {
-  return __builtin_aarch64_sminpv8hi (a, b);
+  return -__a;
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpminq_s32 (int32x4_t a, int32x4_t b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_s8 (int8x16_t __a)
 {
-  return __builtin_aarch64_sminpv4si (a, b);
+  return -__a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpminq_u8 (uint8x16_t a, uint8x16_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_s16 (int16x8_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
-						    (int8x16_t) b);
+  return -__a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpminq_u16 (uint16x8_t a, uint16x8_t b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_s32 (int32x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
-						   (int16x8_t) b);
+  return -__a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpminq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_s64 (int64x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
-						   (int32x4_t) b);
+  return -__a;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmin_f32 (float32x2_t a, float32x2_t b)
+/* vpadd */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return __builtin_aarch64_smin_nanpv2sf (a, b);
+  return __builtin_aarch64_faddpv2sf (__a, __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpminq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_aarch64_smin_nanpv4sf (a, b);
+  return __builtin_aarch64_faddpv4sf (__a, __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpminq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  return __builtin_aarch64_smin_nanpv2df (a, b);
+  return __builtin_aarch64_faddpv2df (__a, __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpminqd_f64 (float64x2_t a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
+  return __builtin_aarch64_addpv8qi (__a, __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmins_f32 (float32x2_t a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
+  return __builtin_aarch64_addpv4hi (__a, __b);
 }

-/* vpminnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpminnm_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return __builtin_aarch64_sminpv2sf (a, b);
+  return __builtin_aarch64_addpv2si (__a, __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpminnmq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_sminpv4sf (a, b);
+  return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
+						 (int8x8_t) __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpminnmq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return __builtin_aarch64_sminpv2df (a, b);
+  return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
+						  (int16x4_t) __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpminnmqd_f64 (float64x2_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2df (a);
+  return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
+						  (int32x2_t) __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpminnms_f32 (float32x2_t a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadds_f32 (float32x2_t __a)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2sf (a);
+  return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
 }

-/* vmaxnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddd_f64 (float64x2_t __a)
 {
-  return __builtin_aarch64_fmaxv2sf (__a, __b);
+  return __builtin_aarch64_reduc_plus_scal_v2df (__a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddd_s64 (int64x2_t __a)
 {
-  return __builtin_aarch64_fmaxv4sf (__a, __b);
+  return __builtin_aarch64_addpdi (__a);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddd_u64 (uint64x2_t __a)
 {
-  return __builtin_aarch64_fmaxv2df (__a, __b);
+  return __builtin_aarch64_addpdi ((int64x2_t) __a);
 }

-/* vmaxv */
+/* vqabs */

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxv_f32 (float32x2_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqabsq_s64 (int64x2_t __a)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
+  return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vmaxv_s8 (int8x8_t __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqabsb_s8 (int8_t __a)
 {
-  return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
+  return (int8_t) __builtin_aarch64_sqabsqi (__a);
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vmaxv_s16 (int16x4_t __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqabsh_s16 (int16_t __a)
 {
-  return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
+  return (int16_t) __builtin_aarch64_sqabshi (__a);
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vmaxv_s32 (int32x2_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqabss_s32 (int32_t __a)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2si (__a);
+  return (int32_t) __builtin_aarch64_sqabssi (__a);
 }

-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vmaxv_u8 (uint8x8_t __a)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqabsd_s64 (int64_t __a)
 {
-  return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
+  return __builtin_aarch64_sqabsdi (__a);
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vmaxv_u16 (uint16x4_t __a)
-{
-  return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
-}
+/* vqadd */

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vmaxv_u32 (uint32x2_t __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddb_s8 (int8_t __a, int8_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
+  return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxvq_f32 (float32x4_t __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddh_s16 (int16_t __a, int16_t __b)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
+  return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmaxvq_f64 (float64x2_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqadds_s32 (int32_t __a, int32_t __b)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
+  return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vmaxvq_s8 (int8x16_t __a)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
+  return __builtin_aarch64_sqadddi (__a, __b);
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vmaxvq_s16 (int16x8_t __a)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddb_u8 (uint8_t __a, uint8_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
+  return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vmaxvq_s32 (int32x4_t __a)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddh_u16 (uint16_t __a, uint16_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v4si (__a);
+  return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
 }

-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vmaxvq_u8 (uint8x16_t __a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqadds_u32 (uint32_t __a, uint32_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
+  return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vmaxvq_u16 (uint16x8_t __a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqaddd_u64 (uint64_t __a, uint64_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
+  return __builtin_aarch64_uqadddi_uuu (__a, __b);
 }

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vmaxvq_u32 (uint32x4_t __a)
+/* vqdmlal */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
+  return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
 }

-/* vmaxnmv */
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxnmv_f32 (float32x2_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
+  return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxnmvq_f32 (float32x4_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
+		       int const __d)
 {
-  return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
+  return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmaxnmvq_f64 (float64x2_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
+			int const __d)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2df (__a);
+  return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
 }

-/* vmin */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmin_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
 {
-  return __builtin_aarch64_smin_nanv2sf (__a, __b);
+  return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmin_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
 {
-  return __builtin_aarch64_sminv8qi (__a, __b);
+  return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmin_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
 {
-  return __builtin_aarch64_sminv4hi (__a, __b);
+  return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmin_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
-  return __builtin_aarch64_sminv2si (__a, __b);
+  return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmin_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
-						 (int8x8_t) __b);
+  return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmin_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
 {
-  return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
-						  (int16x4_t) __b);
+  return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmin_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
+		       int const __d)
 {
-  return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
-						  (int32x2_t) __b);
+  return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vminq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
+			int const __d)
 {
-  return __builtin_aarch64_smin_nanv4sf (__a, __b);
+  return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vminq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
 {
-  return __builtin_aarch64_smin_nanv2df (__a, __b);
+  return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vminq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
 {
-  return __builtin_aarch64_sminv16qi (__a, __b);
+  return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vminq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
 {
-  return __builtin_aarch64_sminv8hi (__a, __b);
+  return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vminq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
-  return __builtin_aarch64_sminv4si (__a, __b);
+  return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vminq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
 {
-  return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
-						   (int8x16_t) __b);
+  return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
 {
-  return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
-						  (int16x8_t) __b);
+  return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
 {
-  return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
-						  (int32x4_t) __b);
+  return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
 }

-/* vminnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vminnm_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
 {
-  return __builtin_aarch64_fminv2sf (__a, __b);
+  return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vminnmq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
 {
-  return __builtin_aarch64_fminv4sf (__a, __b);
+  return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vminnmq_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
 {
-  return __builtin_aarch64_fminv2df (__a, __b);
+  return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
 }

-/* vminv */
+/* vqdmlsl */

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminv_f32 (float32x2_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
+  return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vminv_s8 (int8x8_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - return __builtin_aarch64_reduc_smin_scal_v8qi (__a); + return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vminv_s16 (int16x4_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, + int const __d) { - return __builtin_aarch64_reduc_smin_scal_v4hi (__a); + return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vminv_s32 (int32x2_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, + int const __d) { - return __builtin_aarch64_reduc_smin_scal_v2si (__a); + return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vminv_u8 (uint8x8_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) { - return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a); + return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vminv_u16 (uint16x4_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) { - return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a); + return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vminv_u32 (uint32x2_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) { - return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a); + return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminvq_f32 (float32x4_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) { - return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a); + return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vminvq_f64 (float64x2_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a); + return __builtin_aarch64_sqdmlslv2si (__a, __b, __c); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vminvq_s8 (int8x16_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - return __builtin_aarch64_reduc_smin_scal_v16qi (__a); + return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c); } -__extension__ static __inline 
int16_t __attribute__ ((__always_inline__)) -vminvq_s16 (int16x8_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, + int const __d) { - return __builtin_aarch64_reduc_smin_scal_v8hi (__a); + return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vminvq_s32 (int32x4_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, + int const __d) { - return __builtin_aarch64_reduc_smin_scal_v4si (__a); + return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vminvq_u8 (uint8x16_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) { - return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a); + return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vminvq_u16 (uint16x8_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) { - return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a); + return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vminvq_u32 (uint32x4_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) { - return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a); + return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d); } -/* vminnmv */ - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminnmv_f32 (float32x2_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) { - return __builtin_aarch64_reduc_smin_scal_v2sf (__a); + return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminnmvq_f32 (float32x4_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c) { - return __builtin_aarch64_reduc_smin_scal_v4sf (__a); + return __builtin_aarch64_sqdmlslhi (__a, __b, __c); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vminnmvq_f64 (float64x2_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) { - return __builtin_aarch64_reduc_smin_scal_v2df (__a); + return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d); } -/* vmla */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_laneq_s16 
(int32_t __a, int16_t __b, int16x8_t __c, const int __d) { - return a + b * c; + return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c) { - return __a + __b * __c; + return __builtin_aarch64_sqdmlslsi (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) { - return a + b * c; + return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) { - return a + b * c; + return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d); } -/* vmla_lane */ +/* vqdmulh */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmla_lane_f32 (float32x2_t __a, float32x2_t __b, - float32x2_t __c, const int __lane) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_lane_s16 (int16x4_t __a, int16x4_t __b, - int16x4_t __c, const int __lane) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_lane_s32 (int32x2_t __a, int32x2_t __b, - int32x2_t __c, const int __lane) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, - uint16x4_t __c, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, - uint32x2_t __c, const int __lane) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_s16 (int16_t __a, int16_t __b) { 
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b); } -/* vmla_laneq */ +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c); +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmla_laneq_f32 (float32x2_t __a, float32x2_t __b, - float32x4_t __c, const int __lane) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_laneq_s16 (int16x4_t __a, int16x4_t __b, - int16x8_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_s32 (int32_t __a, int32_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_laneq_s32 (int32x2_t __a, int32x2_t __b, - int32x4_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b, - uint16x8_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b, - uint32x4_t __c, const int __lane) +/* vqdmull */ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_s16 (int16x4_t __a, int16x4_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmullv4hi (__a, __b); } -/* vmlaq_lane */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_sqdmull2v8hi (__a, __b); +} -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, - float32x2_t __c, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, - int16x4_t __c, const int __lane) +__extension__ extern __inline 
int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, - int32x2_t __c, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_n_s16 (int16x8_t __a, int16_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_nv8hi (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, - uint16x4_t __c, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, - uint32x2_t __c, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c); } - /* vmlaq_laneq */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_sqdmull_nv4hi (__a, __b); +} -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b, - float32x4_t __c, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_s32 (int32x2_t __a, int32x2_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmullv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b, - int16x8_t __c, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_s32 (int32x4_t __a, int32x4_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2v4si (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b, - int32x4_t __c, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, - uint16x8_t __c, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_laneq_s32 (int32x4_t __a, 
int32x4_t __b, int const __c) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, - uint32x4_t __c, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_n_s32 (int32x4_t __a, int32_t __b) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull2_nv4si (__a, __b); } -/* vmls */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c) { - return a - b * c; + return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c) { - return __a - __b * __c; + return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_n_s32 (int32x2_t __a, int32_t __b) { - return a - b * c; + return __builtin_aarch64_sqdmull_nv2si (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmullh_s16 (int16_t __a, int16_t __b) { - return a - b * c; + return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b); } -/* vmls_lane */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmls_lane_f32 (float32x2_t __a, float32x2_t __b, - float32x2_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_lane_s16 (int16x4_t __a, int16x4_t __b, - int16x4_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_lane_s32 (int32x2_t __a, int32x2_t __b, - int32x2_t __c, const int __lane) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulls_s32 (int32_t __a, int32_t __b) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmullsi (__a, __b); } -__extension__ static __inline 
uint16x4_t __attribute__ ((__always_inline__)) -vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, - uint16x4_t __c, const int __lane) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, - uint32x2_t __c, const int __lane) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c); } -/* vmls_laneq */ +/* vqmovn */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmls_laneq_f32 (float32x2_t __a, float32x2_t __b, - float32x4_t __c, const int __lane) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_s16 (int16x8_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_laneq_s16 (int16x4_t __a, int16x4_t __b, - int16x8_t __c, const int __lane) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_s32 (int32x4_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_laneq_s32 (int32x2_t __a, int32x2_t __b, - int32x4_t __c, const int __lane) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_s64 (int64x2_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b, - uint16x8_t __c, const int __lane) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_u16 (uint16x8_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b, - uint32x4_t __c, const int __lane) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_u32 (uint32x4_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a); } -/* vmlsq_lane */ - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, - float32x2_t __c, const int __lane) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovn_u64 (uint64x2_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a); } -__extension__ 
static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, - int16x4_t __c, const int __lane) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovnh_s16 (int16_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int8_t) __builtin_aarch64_sqmovnhi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, - int32x2_t __c, const int __lane) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovns_s32 (int32_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int16_t) __builtin_aarch64_sqmovnsi (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, - uint16x4_t __c, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovnd_s64 (int64_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (int32_t) __builtin_aarch64_sqmovndi (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, - uint32x2_t __c, const int __lane) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovnh_u16 (uint16_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint8_t) __builtin_aarch64_uqmovnhi (__a); } - /* vmlsq_laneq */ - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b, - float32x4_t __c, const int __lane) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovns_u32 (uint32_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint16_t) __builtin_aarch64_uqmovnsi (__a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b, - int16x8_t __c, const int __lane) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovnd_u64 (uint64_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint32_t) __builtin_aarch64_uqmovndi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b, - int32x4_t __c, const int __lane) +/* vqmovun */ + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_s16 (int16x8_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, - uint16x8_t __c, const int __lane) + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_s32 (int32x4_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, - uint32x4_t __c, const int 
__lane) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovun_s64 (int64x2_t __a) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); } -/* vmov_n_ */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmov_n_f32 (float32_t __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovunh_s16 (int16_t __a) { - return vdup_n_f32 (__a); + return (int8_t) __builtin_aarch64_sqmovunhi (__a); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmov_n_f64 (float64_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovuns_s32 (int32_t __a) { - return (float64x1_t) {__a}; + return (int16_t) __builtin_aarch64_sqmovunsi (__a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vmov_n_p8 (poly8_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqmovund_s64 (int64_t __a) { - return vdup_n_p8 (__a); + return (int32_t) __builtin_aarch64_sqmovundi (__a); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vmov_n_p16 (poly16_t __a) +/* vqneg */ + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqnegq_s64 (int64x2_t __a) { - return vdup_n_p16 (__a); + return (int64x2_t) __builtin_aarch64_sqnegv2di (__a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmov_n_s8 (int8_t __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqnegb_s8 (int8_t __a) { - return vdup_n_s8 (__a); + return (int8_t) __builtin_aarch64_sqnegqi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmov_n_s16 (int16_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqnegh_s16 (int16_t __a) { - return vdup_n_s16 (__a); + return (int16_t) __builtin_aarch64_sqneghi (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmov_n_s32 (int32_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqnegs_s32 (int32_t __a) { - return vdup_n_s32 (__a); + return (int32_t) __builtin_aarch64_sqnegsi (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vmov_n_s64 (int64_t __a) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqnegd_s64 (int64_t __a) { - return (int64x1_t) {__a}; + return __builtin_aarch64_sqnegdi (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmov_n_u8 (uint8_t __a) +/* vqrdmulh */ + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return vdup_n_u8 (__a); + return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmov_n_u16 (uint16_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - 
return vdup_n_u16 (__a); + return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmov_n_u32 (uint32_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) { - return vdup_n_u32 (__a); + return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vmov_n_u64 (uint64_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) { - return (uint64x1_t) {__a}; + return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmovq_n_f32 (float32_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhh_s16 (int16_t __a, int16_t __b) { - return vdupq_n_f32 (__a); + return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmovq_n_f64 (float64_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) { - return vdupq_n_f64 (__a); + return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vmovq_n_p8 (poly8_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) { - return vdupq_n_p8 (__a); + return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmovq_n_p16 (poly16_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhs_s32 (int32_t __a, int32_t __b) { - return vdupq_n_p16 (__a); + return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmovq_n_s8 (int8_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) { - return vdupq_n_s8 (__a); + return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovq_n_s16 (int16_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) { - return vdupq_n_s16 (__a); + return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovq_n_s32 (int32_t __a) +/* vqrshl */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_s8 (int8x8_t __a, int8x8_t __b) { - return vdupq_n_s32 (__a); + return __builtin_aarch64_sqrshlv8qi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmovq_n_s64 (int64_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vqrshl_s16 (int16x4_t __a, int16x4_t __b) { - return vdupq_n_s64 (__a); + return __builtin_aarch64_sqrshlv4hi (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmovq_n_u8 (uint8_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_s32 (int32x2_t __a, int32x2_t __b) { - return vdupq_n_u8 (__a); + return __builtin_aarch64_sqrshlv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmovq_n_u16 (uint16_t __a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_s64 (int64x1_t __a, int64x1_t __b) { - return vdupq_n_u16 (__a); + return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmovq_n_u32 (uint32_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_u8 (uint8x8_t __a, int8x8_t __b) { - return vdupq_n_u32 (__a); + return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmovq_n_u64 (uint64_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_u16 (uint16x4_t __a, int16x4_t __b) { - return vdupq_n_u64 (__a); + return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b); } -/* vmul_lane */ +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_uqrshlv2si_uus ( __a, __b); +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])}; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __a * __b; + return __builtin_aarch64_sqrshlv16qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_s16 (int16x8_t __a, int16x8_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlv8hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_s32 (int32x4_t __a, int32x4_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlv4si (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane) +__extension__ 
extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_s64 (int64x2_t __a, int64x2_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlv2di (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b); } -/* vmuld_lane */ - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlv4si_uus ( __a, __b); } -/* vmuls_lane */ - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlv2di_uus ( __a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlb_s8 (int8_t __a, int8_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlqi (__a, __b); } -/* vmul_laneq */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlh_s16 (int16_t __a, int16_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlhi (__a, __b); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshls_s32 (int32_t __a, int32_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshlsi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshld_s64 (int64_t __a, int64_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_sqrshldi (__a, __b); } 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlb_u8 (uint8_t __a, uint8_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlqi_uus (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshlh_u16 (uint16_t __a, uint16_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlhi_uus (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshls_u32 (uint32_t __a, uint32_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshlsi_uus (__a, __b); } -/* vmul_n */ - -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vmul_n_f64 (float64x1_t __a, float64_t __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshld_u64 (uint64_t __a, uint64_t __b) { - return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; + return __builtin_aarch64_uqrshldi_uus (__a, __b); } -/* vmulq_lane */ +/* vqrshrn */ -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_s16 (int16x8_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_s32 (int32x4_t __a, const int __b) { - __AARCH64_LANE_CHECK (__a, __lane); - return __a * __b[0]; + return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_s64 (int64x2_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_u16 (uint16x8_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane) +__extension__ extern __inline uint16x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_u32 (uint32x4_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_n_u64 (uint64x2_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b); } -/* vmulq_laneq */ - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrnh_n_s16 (int16_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrns_n_s32 (int32_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrnd_n_s64 (int64_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrnh_n_u16 (uint16_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrns_n_u32 (uint32_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrnd_n_u64 (uint64_t __a, const int __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b); } -/* vneg */ +/* vqrshrun */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vneg_f32 (float32x2_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_n_s16 (int16x8_t __a, const int __b) { - return -__a; + return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b); } -__extension__ static 
__inline float64x1_t __attribute__ ((__always_inline__)) -vneg_f64 (float64x1_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_n_s32 (int32x4_t __a, const int __b) { - return -__a; + return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vneg_s8 (int8x8_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_n_s64 (int64x2_t __a, const int __b) { - return -__a; + return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vneg_s16 (int16x4_t __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrunh_n_s16 (int16_t __a, const int __b) { - return -__a; + return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vneg_s32 (int32x2_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshruns_n_s32 (int32_t __a, const int __b) { - return -__a; + return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vneg_s64 (int64x1_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrund_n_s64 (int64_t __a, const int __b) { - return -__a; + return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vnegq_f32 (float32x4_t __a) +/* vqshl */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_s8 (int8x8_t __a, int8x8_t __b) { - return -__a; + return __builtin_aarch64_sqshlv8qi (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vnegq_f64 (float64x2_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_s16 (int16x4_t __a, int16x4_t __b) { - return -__a; + return __builtin_aarch64_sqshlv4hi (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vnegq_s8 (int8x16_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_s32 (int32x2_t __a, int32x2_t __b) { - return -__a; + return __builtin_aarch64_sqshlv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vnegq_s16 (int16x8_t __a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_s64 (int64x1_t __a, int64x1_t __b) { - return -__a; + return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])}; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vnegq_s32 (int32x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_u8 (uint8x8_t __a, int8x8_t __b) { - return -__a; + return __builtin_aarch64_uqshlv8qi_uus ( __a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vnegq_s64 (int64x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vqshl_u16 (uint16x4_t __a, int16x4_t __b) { - return -__a; + return __builtin_aarch64_uqshlv4hi_uus ( __a, __b); } -/* vpadd */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vpadd_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_u32 (uint32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_addpv8qi (__a, __b); + return __builtin_aarch64_uqshlv2si_uus ( __a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vpadd_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_addpv4hi (__a, __b); + return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])}; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vpadd_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_addpv2si (__a, __b); + return __builtin_aarch64_sqshlv16qi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vpadd_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return __builtin_aarch64_sqshlv8hi (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vpadd_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return __builtin_aarch64_sqshlv4si (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vpadd_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a, - (int32x2_t) __b); + return __builtin_aarch64_sqshlv2di (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vpaddd_f64 (float64x2_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_reduc_plus_scal_v2df (__a); + return __builtin_aarch64_uqshlv16qi_uus ( __a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vpaddd_s64 (int64x2_t __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_addpdi (__a); + return __builtin_aarch64_uqshlv8hi_uus ( __a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vpaddd_u64 (uint64x2_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return 
__builtin_aarch64_addpdi ((int64x2_t) __a); + return __builtin_aarch64_uqshlv4si_uus ( __a, __b); } -/* vqabs */ - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqabsq_s64 (int64x2_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_sqabsv2di (__a); + return __builtin_aarch64_uqshlv2di_uus ( __a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqabsb_s8 (int8_t __a) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlb_s8 (int8_t __a, int8_t __b) { - return (int8_t) __builtin_aarch64_sqabsqi (__a); + return __builtin_aarch64_sqshlqi (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqabsh_s16 (int16_t __a) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlh_s16 (int16_t __a, int16_t __b) { - return (int16_t) __builtin_aarch64_sqabshi (__a); + return __builtin_aarch64_sqshlhi (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqabss_s32 (int32_t __a) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshls_s32 (int32_t __a, int32_t __b) { - return (int32_t) __builtin_aarch64_sqabssi (__a); + return __builtin_aarch64_sqshlsi (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqabsd_s64 (int64_t __a) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshld_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_sqabsdi (__a); + return __builtin_aarch64_sqshldi (__a, __b); } -/* vqadd */ - -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqaddb_s8 (int8_t __a, int8_t __b) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlb_u8 (uint8_t __a, uint8_t __b) { - return (int8_t) __builtin_aarch64_sqaddqi (__a, __b); + return __builtin_aarch64_uqshlqi_uus (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqaddh_s16 (int16_t __a, int16_t __b) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlh_u16 (uint16_t __a, uint16_t __b) { - return (int16_t) __builtin_aarch64_sqaddhi (__a, __b); + return __builtin_aarch64_uqshlhi_uus (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqadds_s32 (int32_t __a, int32_t __b) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshls_u32 (uint32_t __a, uint32_t __b) { - return (int32_t) __builtin_aarch64_sqaddsi (__a, __b); + return __builtin_aarch64_uqshlsi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqaddd_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshld_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_sqadddi (__a, __b); + return __builtin_aarch64_uqshldi_uus (__a, __b); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqaddb_u8 (uint8_t __a, uint8_t __b) +__extension__ extern __inline int8x8_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_s8 (int8x8_t __a, const int __b) { - return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b); + return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqaddh_u16 (uint16_t __a, uint16_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_s16 (int16x4_t __a, const int __b) { - return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b); + return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqadds_u32 (uint32_t __a, uint32_t __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_s32 (int32x2_t __a, const int __b) { - return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b); + return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqaddd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_uqadddi_uuu (__a, __b); + return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)}; } -/* vqdmlal */ - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c); + return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_u16 (uint16x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c); + return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, - int const __d) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_u32 (uint32x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d); + return __builtin_aarch64_uqshl_nv2si_uus (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, - int const __d) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshl_n_u64 (uint64x1_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d); + return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)}; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_s8 (int8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c); + return (int8x16_t) 
__builtin_aarch64_sqshl_nv16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d); + return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d); + return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c); + return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmlalv2si (__a, __b, __c); + return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c); + return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, - int const __d) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d); + return __builtin_aarch64_uqshl_nv4si_uus (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, - int const __d) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlq_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d); + return __builtin_aarch64_uqshl_nv2di_uus (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlb_n_s8 (int8_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c); + return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) 
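/* [Editorial usage sketch -- not part of the patch; helper names are ours.
   The hunks above switch each intrinsic from "static __inline" to
   "extern __inline" with __gnu_inline__/__artificial__, which appears to
   follow the x86 *mmintrin.h convention: GNU89 inline semantics, so no
   out-of-line copy is emitted and the header stays usable at -O0.
   vqshl is the saturating shift: each lane is shifted by a per-lane
   signed count (negative counts shift right), vqshl_n_* takes an
   immediate count, and results clamp instead of wrapping.  */
#include <arm_neon.h>

static int8x8_t
sat_double_lanes (int8x8_t v)
{
  /* Shift every lane left by one, saturating to [-128, 127].  */
  return vqshl_s8 (v, vdup_n_s8 (1));
}

static int16x4_t
sat_scale16 (int16x4_t v)
{
  /* Immediate form: 0x7fff stays 0x7fff instead of overflowing.  */
  return vqshl_n_s16 (v, 4);
}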
-vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlh_n_s16 (int16_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d); + return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshls_n_s32 (int32_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d); + return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshld_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c); + return __builtin_aarch64_sqshl_ndi (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlb_n_u8 (uint8_t __a, const int __b) { - return __builtin_aarch64_sqdmlalhi (__a, __b, __c); + return __builtin_aarch64_uqshl_nqi_uus (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlh_n_u16 (uint16_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d); + return __builtin_aarch64_uqshl_nhi_uus (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshls_n_u32 (uint32_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d); + return __builtin_aarch64_uqshl_nsi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshld_n_u64 (uint64_t __a, const int __b) { - return __builtin_aarch64_sqdmlalsi (__a, __b, __c); + return __builtin_aarch64_uqshl_ndi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +/* vqshlu */ + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlu_n_s8 (int8x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d); + return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlu_n_s16 
(int16x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d); + return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b); } -/* vqdmlsl */ - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlu_n_s32 (int32x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c); + return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlu_n_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c); + return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)}; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, - int const __d) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshluq_n_s8 (int8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d); + return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, - int const __d) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshluq_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d); + return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshluq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c); + return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshluq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d); + return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlub_n_s8 (int8_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d); + return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshluh_n_s16 (int16_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c); + 
return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlus_n_s32 (int32_t __a, const int __b) { - return __builtin_aarch64_sqdmlslv2si (__a, __b, __c); + return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshlud_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c); + return __builtin_aarch64_sqshlu_ndi_uss (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, - int const __d) +/* vqshrn */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d); + return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, - int const __d) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d); + return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c); + return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d); + return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d); + return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c); + return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b); } -__extension__ static __inline 
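/* [Editorial usage sketch -- not part of the patch; helper name is ours.]
   vqshlu_n_* is the signed-to-unsigned saturating shift: negative lanes
   clamp to 0 and anything above the unsigned maximum clamps to it.  */
#include <arm_neon.h>

static uint8x8_t
to_u8_scaled (int8x8_t v)
{
  /* A lane of -3 yields 0; a lane of 100 yields 200.  */
  return vqshlu_n_s8 (v, 1);
}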
int32_t __attribute__ ((__always_inline__)) -vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrnh_n_s16 (int16_t __a, const int __b) { - return __builtin_aarch64_sqdmlslhi (__a, __b, __c); + return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrns_n_s32 (int32_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d); + return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrnd_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d); + return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrnh_n_u16 (uint16_t __a, const int __b) { - return __builtin_aarch64_sqdmlslsi (__a, __b, __c); + return __builtin_aarch64_uqshrn_nhi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrns_n_u32 (uint32_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d); + return __builtin_aarch64_uqshrn_nsi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrnd_n_u64 (uint64_t __a, const int __b) { - return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d); + return __builtin_aarch64_uqshrn_ndi_uus (__a, __b); } -/* vqdmulh */ +/* vqshrun */ -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c); + return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c); + return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline uint32x2_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_n_s64 (int64x2_t __a, const int __b)
 {
-  return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
+  return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrunh_n_s16 (int16_t __a, const int __b)
 {
-  return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
+  return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
 }
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_s16 (int16_t __a, int16_t __b)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshruns_n_s32 (int32_t __a, const int __b)
 {
-  return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
+  return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
 }
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrund_n_s64 (int64_t __a, const int __b)
 {
-  return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
+  return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
 }
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
+/* vqsub */
+
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubb_s8 (int8_t __a, int8_t __b)
 {
-  return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
+  return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubh_s16 (int16_t __a, int16_t __b)
 {
-  return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
+  return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubs_s32 (int32_t __a, int32_t __b)
 {
-  return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
+  return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubd_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
+  return __builtin_aarch64_sqsubdi (__a, __b);
 }
 
-/* vqdmull */
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmull_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubb_u8 (uint8_t __a, uint8_t __b)
 {
-  return __builtin_aarch64_sqdmullv4hi (__a, __b);
+  return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
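/* [Editorial usage sketch -- not part of the patch; helper name is ours.]
   The vqshrn_n_*/vqshrun_n_* intrinsics shift each wide lane right by an
   immediate and pack it into a lane of half the width, saturating rather
   than truncating; the "un" forms also convert signed input to an
   unsigned result.  A typical use is repacking a Q7.7 accumulator into
   8-bit pixels: */
#include <arm_neon.h>

static uint8x8_t
pack_pixels (int16x8_t acc)
{
  /* Drop seven fraction bits and clamp to [0, 255]; negatives become 0. */
  return vqshrun_n_s16 (acc, 7);
}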
-vqdmull_high_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqsubh_u16 (uint16_t __a, uint16_t __b) { - return __builtin_aarch64_sqdmull2v8hi (__a, __b); + return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqsubs_u32 (uint32_t __a, uint32_t __b) { - return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c); + return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqsubd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c); + return __builtin_aarch64_uqsubdi_uuu (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_high_n_s16 (int16x8_t __a, int16_t __b) +/* vqtbl2 */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull2_nv8hi (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); + return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmull_n_s16 (int16x4_t __a, int16_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull_nv4hi (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static 
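/* [Editorial usage sketch -- not part of the patch; helper name is ours.]
   The scalar vqsub* forms saturate a single-element subtraction: */
#include <arm_neon.h>

static uint8_t
clamped_decrement (uint8_t level, uint8_t step)
{
  /* Never wraps below zero: vqsubb_u8 (3, 10) is 0, not 249.  */
  return vqsubb_u8 (level, step);
}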
__inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmullv2si (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_high_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull2v4si (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c) +/* vqtbl3 */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_high_n_s32 (int32x4_t __a, int32_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull2_nv4si (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx) { - return 
__builtin_aarch64_sqdmull_lanev2si (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqdmull_n_s32 (int32x2_t __a, int32_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull_nv2si (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmullh_s16 (int16_t __a, int16_t __b) +/* vqtbl4 */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx) { - return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) { - return 
__builtin_aarch64_sqdmull_laneqhi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmulls_s32 (int32_t __a, int32_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmullsi (__a, __b); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); } -/* vqmovn */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqmovn_s16 (int16x8_t __a) +/* vqtbx2 */ +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx) { - return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); + return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqmovn_s32 (int32x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t 
idx) { - return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqmovn_s64 (int64x2_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) { - return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqmovn_u16 (uint16x8_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx) { - return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); + return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqmovn_u32 (uint32x4_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) { - return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqmovn_u64 (uint64x2_t __a) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) { - return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); + return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqmovnh_s16 (int16_t __a) +/* vqtbx3 */ +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx) { - return (int8_t) __builtin_aarch64_sqmovnhi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); + return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqmovns_s32 (int32_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ 
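/* [Editorial usage sketch -- not part of the patch; helper names are ours.]
   vqtbl2* treats the two q registers of tab as one 32-byte table; index
   lanes outside [0, 31] produce 0.  vqtbx2* is identical except that
   out-of-range lanes keep the corresponding lane of the first operand.  */
#include <arm_neon.h>

static uint8x8_t
lookup32 (uint8x16x2_t table, uint8x8_t idx)
{
  return vqtbl2_u8 (table, idx);            /* idx >= 32: lane = 0  */
}

static uint8x8_t
lookup32_keep (uint8x8_t fallback, uint8x16x2_t table, uint8x8_t idx)
{
  return vqtbx2_u8 (fallback, table, idx);  /* idx >= 32: lane kept */
}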
((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) { - return (int16_t) __builtin_aarch64_sqmovnsi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqmovnd_s64 (int64_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) { - return (int32_t) __builtin_aarch64_sqmovndi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqmovnh_u16 (uint16_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx) { - return (uint8_t) __builtin_aarch64_uqmovnhi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); + return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqmovns_u32 (uint32_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) { - return (uint16_t) __builtin_aarch64_uqmovnsi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqmovnd_u64 (uint64_t __a) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) { - return (uint32_t) __builtin_aarch64_uqmovndi (__a); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); + return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -/* vqmovun */ +/* vqtbx4 */ -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqmovun_s16 (int16x8_t __a) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx) { - return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); + __builtin_aarch64_simd_xi __o; 
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); + return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqmovun_s32 (int32x4_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) { - return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqmovun_s64 (int64x2_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) { - return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, + (int8x8_t)idx); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqmovunh_s16 (int16_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx) { - return (int8_t) __builtin_aarch64_sqmovunhi (__a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); + return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqmovuns_s32 (int32_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) { - return (int16_t) __builtin_aarch64_sqmovunsi (__a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqmovund_s64 (int64_t __a) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) { - return 
(int32_t) __builtin_aarch64_sqmovundi (__a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); + return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, + (int8x16_t)idx); } -/* vqneg */ +/* vrbit */ -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqnegq_s64 (int64x2_t __a) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbit_p8 (poly8x8_t __a) { - return (int64x2_t) __builtin_aarch64_sqnegv2di (__a); + return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqnegb_s8 (int8_t __a) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbit_s8 (int8x8_t __a) { - return (int8_t) __builtin_aarch64_sqnegqi (__a); + return __builtin_aarch64_rbitv8qi (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqnegh_s16 (int16_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbit_u8 (uint8x8_t __a) { - return (int16_t) __builtin_aarch64_sqneghi (__a); + return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqnegs_s32 (int32_t __a) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbitq_p8 (poly8x16_t __a) { - return (int32_t) __builtin_aarch64_sqnegsi (__a); + return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqnegd_s64 (int64_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbitq_s8 (int8x16_t __a) { - return __builtin_aarch64_sqnegdi (__a); + return __builtin_aarch64_rbitv16qi (__a); } -/* vqrdmulh */ - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrbitq_u8 (uint8x16_t __a) { - return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c); + return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +/* vrecpe */ + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpe_u32 (uint32x2_t __a) { - return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c); + return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpeq_u32 (uint32x4_t __a) { - return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c); + return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); } -__extension__ static 
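/* [Editorial usage sketch -- not part of the patch; helper name is ours.]
   vrbit reverses the bit order within every byte, e.g. 0x01 -> 0x80.  */
#include <arm_neon.h>

static uint8x8_t
reverse_bits_per_byte (uint8x8_t v)
{
  return vrbit_u8 (v);
}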
__inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpes_f32 (float32_t __a) { - return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c); + return __builtin_aarch64_frecpesf (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmulhh_s16 (int16_t __a, int16_t __b) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecped_f64 (float64_t __a) { - return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b); + return __builtin_aarch64_frecpedf (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpe_f32 (float32x2_t __a) { - return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c); + return __builtin_aarch64_frecpev2sf (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpe_f64 (float64x1_t __a) { - return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c); + return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) }; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmulhs_s32 (int32_t __a, int32_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpeq_f32 (float32x4_t __a) { - return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b); + return __builtin_aarch64_frecpev4sf (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpeq_f64 (float64x2_t __a) { - return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c); + return __builtin_aarch64_frecpev2df (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +/* vrecps */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpss_f32 (float32_t __a, float32_t __b) { - return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c); + return __builtin_aarch64_frecpssf (__a, __b); } -/* vqrshl */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqrshl_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpsd_f64 (float64_t __a, float64_t __b) { - return __builtin_aarch64_sqrshlv8qi (__a, __b); + return __builtin_aarch64_frecpsdf (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrshl_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecps_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_sqrshlv4hi (__a, __b); + return __builtin_aarch64_frecpsv2sf (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ 
((__always_inline__)) -vqrshl_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecps_f64 (float64x1_t __a, float64x1_t __b) { - return __builtin_aarch64_sqrshlv2si (__a, __b); + return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vqrshl_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpsq_f32 (float32x4_t __a, float32x4_t __b) { - return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])}; + return __builtin_aarch64_frecpsv4sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqrshl_u8 (uint8x8_t __a, int8x8_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpsq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b); + return __builtin_aarch64_frecpsv2df (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqrshl_u16 (uint16x4_t __a, int16x4_t __b) +/* vrecpx */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpxs_f32 (float32_t __a) { - return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b); + return __builtin_aarch64_frecpxsf (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqrshl_u32 (uint32x2_t __a, int32x2_t __b) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrecpxd_f64 (float64_t __a) { - return __builtin_aarch64_uqrshlv2si_uus ( __a, __b); + return __builtin_aarch64_frecpxdf (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vqrshl_u64 (uint64x1_t __a, int64x1_t __b) + +/* vrev */ + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16_p8 (poly8x8_t a) { - return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])}; + return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqrshlq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16_s8 (int8x8_t a) { - return __builtin_aarch64_sqrshlv16qi (__a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrshlq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16_u8 (uint8x8_t a) { - return __builtin_aarch64_sqrshlv8hi (__a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrshlq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16q_p8 (poly8x16_t a) { - return __builtin_aarch64_sqrshlv4si (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -__extension__ static __inline int64x2_t 
__attribute__ ((__always_inline__)) -vqrshlq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16q_s8 (int8x16_t a) { - return __builtin_aarch64_sqrshlv2di (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqrshlq_u8 (uint8x16_t __a, int8x16_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev16q_u8 (uint8x16_t a) { - return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vqrshlq_u16 (uint16x8_t __a, int16x8_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_p8 (poly8x8_t a) { - return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vqrshlq_u32 (uint32x4_t __a, int32x4_t __b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_p16 (poly16x4_t a) { - return __builtin_aarch64_uqrshlv4si_uus ( __a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vqrshlq_u64 (uint64x2_t __a, int64x2_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_s8 (int8x8_t a) { - return __builtin_aarch64_uqrshlv2di_uus ( __a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqrshlb_s8 (int8_t __a, int8_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_s16 (int16x4_t a) { - return __builtin_aarch64_sqrshlqi (__a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrshlh_s16 (int16_t __a, int16_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_u8 (uint8x8_t a) { - return __builtin_aarch64_sqrshlhi (__a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrshls_s32 (int32_t __a, int32_t __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32_u16 (uint16x4_t a) { - return __builtin_aarch64_sqrshlsi (__a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqrshld_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_p8 (poly8x16_t a) { - return __builtin_aarch64_sqrshldi (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqrshlb_u8 
(uint8_t __a, uint8_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_p16 (poly16x8_t a) { - return __builtin_aarch64_uqrshlqi_uus (__a, __b); + return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqrshlh_u16 (uint16_t __a, uint16_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_s8 (int8x16_t a) { - return __builtin_aarch64_uqrshlhi_uus (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqrshls_u32 (uint32_t __a, uint32_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_s16 (int16x8_t a) { - return __builtin_aarch64_uqrshlsi_uus (__a, __b); + return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqrshld_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_u8 (uint8x16_t a) { - return __builtin_aarch64_uqrshldi_uus (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } -/* vqrshrn */ +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev32q_u16 (uint16x8_t a) +{ + return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqrshrn_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_f16 (float16x4_t __a) { - return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrshrn_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_f32 (float32x2_t a) { - return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b); + return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrshrn_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_p8 (poly8x8_t a) { - return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqrshrn_n_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_p16 (poly16x4_t a) { - return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqrshrn_n_u32 (uint32x4_t __a, const int __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_s8 
(int8x8_t a) { - return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqrshrn_n_u64 (uint64x2_t __a, const int __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_s16 (int16x4_t a) { - return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqrshrnh_n_s16 (int16_t __a, const int __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_s32 (int32x2_t a) { - return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b); + return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrshrns_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_u8 (uint8x8_t a) { - return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b); + return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrshrnd_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_u16 (uint16x4_t a) { - return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b); + return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqrshrnh_n_u16 (uint16_t __a, const int __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64_u32 (uint32x2_t a) { - return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b); + return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqrshrns_n_u32 (uint32_t __a, const int __b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_f16 (float16x8_t __a) { - return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b); + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqrshrnd_n_u64 (uint64_t __a, const int __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_f32 (float32x4_t a) { - return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b); + return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); } -/* vqrshrun */ - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqrshrun_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_p8 (poly8x16_t a) { - return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqrshrun_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_p16 (poly16x8_t a) 
{ - return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b); + return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqrshrun_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_s8 (int8x16_t a) { - return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqrshrunh_n_s16 (int16_t __a, const int __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_s16 (int16x8_t a) { - return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b); + return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqrshruns_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_s32 (int32x4_t a) { - return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b); + return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqrshrund_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_u8 (uint8x16_t a) { - return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b); + return __builtin_shuffle (a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -/* vqshl */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqshl_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_u16 (uint16x8_t a) { - return __builtin_aarch64_sqshlv8qi (__a, __b); + return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqshl_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrev64q_u32 (uint32x4_t a) { - return __builtin_aarch64_sqshlv4hi (__a, __b); + return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqshl_s32 (int32x2_t __a, int32x2_t __b) +/* vrnd */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd_f32 (float32x2_t __a) { - return __builtin_aarch64_sqshlv2si (__a, __b); + return __builtin_aarch64_btruncv2sf (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vqshl_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd_f64 (float64x1_t __a) { - return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])}; + return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqshl_u8 (uint8x8_t __a, int8x8_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vrndq_f32 (float32x4_t __a) { - return __builtin_aarch64_uqshlv8qi_uus ( __a, __b); + return __builtin_aarch64_btruncv4sf (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqshl_u16 (uint16x4_t __a, int16x4_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndq_f64 (float64x2_t __a) { - return __builtin_aarch64_uqshlv4hi_uus ( __a, __b); + return __builtin_aarch64_btruncv2df (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqshl_u32 (uint32x2_t __a, int32x2_t __b) +/* vrnda */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnda_f32 (float32x2_t __a) { - return __builtin_aarch64_uqshlv2si_uus ( __a, __b); + return __builtin_aarch64_roundv2sf (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vqshl_u64 (uint64x1_t __a, int64x1_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnda_f64 (float64x1_t __a) { - return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])}; + return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqshlq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndaq_f32 (float32x4_t __a) { - return __builtin_aarch64_sqshlv16qi (__a, __b); + return __builtin_aarch64_roundv4sf (__a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqshlq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndaq_f64 (float64x2_t __a) { - return __builtin_aarch64_sqshlv8hi (__a, __b); + return __builtin_aarch64_roundv2df (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqshlq_s32 (int32x4_t __a, int32x4_t __b) +/* vrndi */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndi_f32 (float32x2_t __a) { - return __builtin_aarch64_sqshlv4si (__a, __b); + return __builtin_aarch64_nearbyintv2sf (__a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqshlq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndi_f64 (float64x1_t __a) { - return __builtin_aarch64_sqshlv2di (__a, __b); + return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqshlq_u8 (uint8x16_t __a, int8x16_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndiq_f32 (float32x4_t __a) { - return __builtin_aarch64_uqshlv16qi_uus ( __a, __b); + return __builtin_aarch64_nearbyintv4sf (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vqshlq_u16 (uint16x8_t __a, int16x8_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndiq_f64 (float64x2_t __a) { - return __builtin_aarch64_uqshlv8hi_uus ( __a, __b); + return __builtin_aarch64_nearbyintv2df (__a); } -__extension__ 
static __inline uint32x4_t __attribute__ ((__always_inline__)) -vqshlq_u32 (uint32x4_t __a, int32x4_t __b) +/* vrndm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndm_f32 (float32x2_t __a) { - return __builtin_aarch64_uqshlv4si_uus ( __a, __b); + return __builtin_aarch64_floorv2sf (__a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vqshlq_u64 (uint64x2_t __a, int64x2_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndm_f64 (float64x1_t __a) { - return __builtin_aarch64_uqshlv2di_uus ( __a, __b); + return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqshlb_s8 (int8_t __a, int8_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndmq_f32 (float32x4_t __a) { - return __builtin_aarch64_sqshlqi (__a, __b); + return __builtin_aarch64_floorv4sf (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqshlh_s16 (int16_t __a, int16_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndmq_f64 (float64x2_t __a) { - return __builtin_aarch64_sqshlhi (__a, __b); + return __builtin_aarch64_floorv2df (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqshls_s32 (int32_t __a, int32_t __b) +/* vrndn */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndn_f32 (float32x2_t __a) { - return __builtin_aarch64_sqshlsi (__a, __b); + return __builtin_aarch64_frintnv2sf (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqshld_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndn_f64 (float64x1_t __a) { - return __builtin_aarch64_sqshldi (__a, __b); + return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])}; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqshlb_u8 (uint8_t __a, uint8_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndnq_f32 (float32x4_t __a) { - return __builtin_aarch64_uqshlqi_uus (__a, __b); + return __builtin_aarch64_frintnv4sf (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqshlh_u16 (uint16_t __a, uint16_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndnq_f64 (float64x2_t __a) { - return __builtin_aarch64_uqshlhi_uus (__a, __b); + return __builtin_aarch64_frintnv2df (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqshls_u32 (uint32_t __a, uint32_t __b) +/* vrndp */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndp_f32 (float32x2_t __a) { - return __builtin_aarch64_uqshlsi_uus (__a, __b); + return __builtin_aarch64_ceilv2sf (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqshld_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndp_f64 (float64x1_t __a) { - return 
__builtin_aarch64_uqshldi_uus (__a, __b); + return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqshl_n_s8 (int8x8_t __a, const int __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndpq_f32 (float32x4_t __a) { - return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b); + return __builtin_aarch64_ceilv4sf (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqshl_n_s16 (int16x4_t __a, const int __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndpq_f64 (float64x2_t __a) { - return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b); + return __builtin_aarch64_ceilv2df (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqshl_n_s32 (int32x2_t __a, const int __b) +/* vrndx */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndx_f32 (float32x2_t __a) { - return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b); + return __builtin_aarch64_rintv2sf (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vqshl_n_s64 (int64x1_t __a, const int __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndx_f64 (float64x1_t __a) { - return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)}; + return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqshl_n_u8 (uint8x8_t __a, const int __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndxq_f32 (float32x4_t __a) { - return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b); + return __builtin_aarch64_rintv4sf (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqshl_n_u16 (uint16x4_t __a, const int __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndxq_f64 (float64x2_t __a) { - return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b); + return __builtin_aarch64_rintv2df (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqshl_n_u32 (uint32x2_t __a, const int __b) +/* vrshl */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_uqshl_nv2si_uus (__a, __b); + return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vqshl_n_u64 (uint64x1_t __a, const int __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_s16 (int16x4_t __a, int16x4_t __b) { - return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)}; + return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqshlq_n_s8 (int8x16_t __a, const int __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_s32 (int32x2_t __a, int32x2_t __b) { - return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b); + return 
(int32x2_t) __builtin_aarch64_srshlv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqshlq_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_s64 (int64x1_t __a, int64x1_t __b) { - return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b); + return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])}; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqshlq_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_u8 (uint8x8_t __a, int8x8_t __b) { - return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b); + return __builtin_aarch64_urshlv8qi_uus (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vqshlq_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_u16 (uint16x4_t __a, int16x4_t __b) { - return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b); + return __builtin_aarch64_urshlv4hi_uus (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqshlq_n_u8 (uint8x16_t __a, const int __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_u32 (uint32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b); + return __builtin_aarch64_urshlv2si_uus (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vqshlq_n_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b); + return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vqshlq_n_u32 (uint32x4_t __a, const int __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_uqshl_nv4si_uus (__a, __b); + return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vqshlq_n_u64 (uint64x2_t __a, const int __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_uqshl_nv2di_uus (__a, __b); + return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqshlb_n_s8 (int8_t __a, const int __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_s32 (int32x4_t __a, int32x4_t __b) { - return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b); + return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqshlh_n_s16 (int16_t __a, const int __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_s64 (int64x2_t __a, int64x2_t __b) { - return (int16_t) __builtin_aarch64_sqshl_nhi 
(__a, __b); + return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqshls_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b); + return __builtin_aarch64_urshlv16qi_uus (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqshld_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_sqshl_ndi (__a, __b); + return __builtin_aarch64_urshlv8hi_uus (__a, __b); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqshlb_n_u8 (uint8_t __a, const int __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_uqshl_nqi_uus (__a, __b); + return __builtin_aarch64_urshlv4si_uus (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqshlh_n_u16 (uint16_t __a, const int __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_uqshl_nhi_uus (__a, __b); + return __builtin_aarch64_urshlv2di_uus (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqshls_n_u32 (uint32_t __a, const int __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshld_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_uqshl_nsi_uus (__a, __b); + return __builtin_aarch64_srshldi (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqshld_n_u64 (uint64_t __a, const int __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshld_u64 (uint64_t __a, int64_t __b) { - return __builtin_aarch64_uqshl_ndi_uus (__a, __b); + return __builtin_aarch64_urshldi_uus (__a, __b); } -/* vqshlu */ +/* vrshr */ -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqshlu_n_s8 (int8x8_t __a, const int __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_s8 (int8x8_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b); + return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqshlu_n_s16 (int16x4_t __a, const int __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_s16 (int16x4_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b); + return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqshlu_n_s32 (int32x2_t __a, const int __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_s32 (int32x2_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b); + return (int32x2_t) 
__builtin_aarch64_srshr_nv2si (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vqshlu_n_s64 (int64x1_t __a, const int __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_s64 (int64x1_t __a, const int __b) { - return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)}; + return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqshluq_n_s8 (int8x16_t __a, const int __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b); + return __builtin_aarch64_urshr_nv8qi_uus (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vqshluq_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_u16 (uint16x4_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b); + return __builtin_aarch64_urshr_nv4hi_uus (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vqshluq_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_u32 (uint32x2_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b); + return __builtin_aarch64_urshr_nv2si_uus (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vqshluq_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_u64 (uint64x1_t __a, const int __b) { - return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b); + return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)}; } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqshlub_n_s8 (int8_t __a, const int __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_s8 (int8x16_t __a, const int __b) { - return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b); + return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqshluh_n_s16 (int16_t __a, const int __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_s16 (int16x8_t __a, const int __b) { - return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b); + return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqshlus_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_s32 (int32x4_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b); + return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqshlud_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_s64 (int64x2_t __a, const int __b) { - return 
__builtin_aarch64_sqshlu_ndi_uss (__a, __b); + return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b); } -/* vqshrn */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqshrn_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_u8 (uint8x16_t __a, const int __b) { - return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b); + return __builtin_aarch64_urshr_nv16qi_uus (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqshrn_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_u16 (uint16x8_t __a, const int __b) { - return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b); + return __builtin_aarch64_urshr_nv8hi_uus (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqshrn_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_u32 (uint32x4_t __a, const int __b) { - return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b); + return __builtin_aarch64_urshr_nv4si_uus (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqshrn_n_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b); + return __builtin_aarch64_urshr_nv2di_uus (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqshrn_n_u32 (uint32x4_t __a, const int __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrd_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b); + return __builtin_aarch64_srshr_ndi (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqshrn_n_u64 (uint64x2_t __a, const int __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrd_n_u64 (uint64_t __a, const int __b) { - return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b); + return __builtin_aarch64_urshr_ndi_uus (__a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqshrnh_n_s16 (int16_t __a, const int __b) +/* vrsqrte. 
*/ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrtes_f32 (float32_t __a) { - return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b); + return __builtin_aarch64_rsqrtesf (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqshrns_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrted_f64 (float64_t __a) { - return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b); + return __builtin_aarch64_rsqrtedf (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqshrnd_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrte_f32 (float32x2_t __a) { - return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b); + return __builtin_aarch64_rsqrtev2sf (__a); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqshrnh_n_u16 (uint16_t __a, const int __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrte_f64 (float64x1_t __a) { - return __builtin_aarch64_uqshrn_nhi_uus (__a, __b); + return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))}; } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqshrns_n_u32 (uint32_t __a, const int __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrteq_f32 (float32x4_t __a) { - return __builtin_aarch64_uqshrn_nsi_uus (__a, __b); + return __builtin_aarch64_rsqrtev4sf (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqshrnd_n_u64 (uint64_t __a, const int __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrteq_f64 (float64x2_t __a) { - return __builtin_aarch64_uqshrn_ndi_uus (__a, __b); + return __builtin_aarch64_rsqrtev2df (__a); } -/* vqshrun */ +/* vrsqrts. 
*/ -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqshrun_n_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrtss_f32 (float32_t __a, float32_t __b) { - return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b); + return __builtin_aarch64_rsqrtssf (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vqshrun_n_s32 (int32x4_t __a, const int __b) +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrtsd_f64 (float64_t __a, float64_t __b) { - return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b); + return __builtin_aarch64_rsqrtsdf (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vqshrun_n_s64 (int64x2_t __a, const int __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrts_f32 (float32x2_t __a, float32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b); + return __builtin_aarch64_rsqrtsv2sf (__a, __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqshrunh_n_s16 (int16_t __a, const int __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrts_f64 (float64x1_t __a, float64x1_t __b) { - return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b); + return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0))}; } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqshruns_n_s32 (int32_t __a, const int __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b) { - return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b); + return __builtin_aarch64_rsqrtsv4sf (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqshrund_n_s64 (int64_t __a, const int __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b) { - return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b); + return __builtin_aarch64_rsqrtsv2df (__a, __b); } -/* vqsub */ +/* vrsra */ -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vqsubb_s8 (int8_t __a, int8_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return (int8_t) __builtin_aarch64_sqsubqi (__a, __b); + return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vqsubh_s16 (int16_t __a, int16_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return (int16_t) __builtin_aarch64_sqsubhi (__a, __b); + return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vqsubs_s32 (int32_t __a, int32_t __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - 
return (int32_t) __builtin_aarch64_sqsubsi (__a, __b); + return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vqsubd_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return __builtin_aarch64_sqsubdi (__a, __b); + return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)}; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vqsubb_u8 (uint8_t __a, uint8_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b); + return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vqsubh_u16 (uint16_t __a, uint16_t __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b); + return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vqsubs_u32 (uint32_t __a, uint32_t __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b); + return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vqsubd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return __builtin_aarch64_uqsubdi_uuu (__a, __b); + return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)}; } -/* vqtbl2 */ +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return 
(uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c); } -/* vqtbl3 */ +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_srsra_ndi (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ 
((__always_inline__)) -vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +#pragma GCC push_options +#pragma GCC target ("+nothing+crypto") + +/* vsha1 */ + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1h_u32 (uint32_t hash_e) +{ + return __builtin_aarch64_crypto_sha1hsi_uu (hash_e); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11); } -/* vqtbl4 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_p64 (poly64_t a, poly64_t b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + return + __builtin_aarch64_crypto_pmulldi_ppp (a, b); } - -/* vqtbx2 */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_p64 (poly64x2_t a, poly64x2_t b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx); + return __builtin_aarch64_crypto_pmullv2di_ppp (a, b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +#pragma GCC pop_options + +/* vshl */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_s8 (int8x8_t __a, const int __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_s16 (int16x4_t __a, const int __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_s32 (int32x2_t __a, const int __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = 
__builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx); + return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_s64 (int64x1_t __a, const int __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)}; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_u8 (uint8x8_t __a, const int __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b); } -/* vqtbx3 */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_u16 (uint16x4_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx); + return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_u32 (uint32x2_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_n_u64 (uint64x1_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)}; } -__extension__ 
static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_s8 (int8x16_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx); + return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_s16 (int16x8_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_s32 (int32x4_t __a, const int __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b); } -/* vqtbx4 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_s64 (int64x2_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx); + return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_u8 (uint8x16_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b); } -__extension__ 
static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_u16 (uint16x8_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_u32 (uint32x4_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx); + return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_n_u64 (uint64x2_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshld_n_s64 (int64_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + return __builtin_aarch64_ashldi (__a, __b); } -/* vrbit */ - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vrbit_p8 (poly8x8_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshld_n_u64 (uint64_t __a, const int __b) { - return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); + return (uint64_t) __builtin_aarch64_ashldi (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrbit_s8 (int8x8_t __a) 
+__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_rbitv8qi (__a); + return __builtin_aarch64_sshlv8qi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrbit_u8 (uint8x8_t __a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_s16 (int16x4_t __a, int16x4_t __b) { - return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); + return __builtin_aarch64_sshlv4hi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vrbitq_p8 (poly8x16_t __a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_s32 (int32x2_t __a, int32x2_t __b) { - return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a); + return __builtin_aarch64_sshlv2si (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrbitq_s8 (int8x16_t __a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_s64 (int64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_rbitv16qi (__a); + return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrbitq_u8 (uint8x16_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_u8 (uint8x8_t __a, int8x8_t __b) { - return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a); + return __builtin_aarch64_ushlv8qi_uus (__a, __b); } -/* vrecpe */ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrecpe_u32 (uint32x2_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_u16 (uint16x4_t __a, int16x4_t __b) { - return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); + return __builtin_aarch64_ushlv4hi_uus (__a, __b); } - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrecpeq_u32 (uint32x4_t __a) + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_u32 (uint32x2_t __a, int32x2_t __b) { - return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); + return __builtin_aarch64_ushlv2si_uus (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vrecpes_f32 (float32_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_frecpesf (__a); + return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])}; } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vrecped_f64 (float64_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_frecpedf (__a); + return __builtin_aarch64_sshlv16qi (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrecpe_f32 (float32x2_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_s16 (int16x8_t __a, int16x8_t __b) { - return 
__builtin_aarch64_frecpev2sf (__a); + return __builtin_aarch64_sshlv8hi (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrecpeq_f32 (float32x4_t __a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_frecpev4sf (__a); + return __builtin_aarch64_sshlv4si (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrecpeq_f64 (float64x2_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_s64 (int64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_frecpev2df (__a); + return __builtin_aarch64_sshlv2di (__a, __b); } -/* vrecps */ - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vrecpss_f32 (float32_t __a, float32_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_frecpssf (__a, __b); + return __builtin_aarch64_ushlv16qi_uus (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vrecpsd_f64 (float64_t __a, float64_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_frecpsdf (__a, __b); + return __builtin_aarch64_ushlv8hi_uus (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrecps_f32 (float32x2_t __a, float32x2_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_frecpsv2sf (__a, __b); + return __builtin_aarch64_ushlv4si_uus (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrecpsq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_frecpsv4sf (__a, __b); + return __builtin_aarch64_ushlv2di_uus (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrecpsq_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshld_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_frecpsv2df (__a, __b); + return __builtin_aarch64_sshldi (__a, __b); } -/* vrecpx */ - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vrecpxs_f32 (float32_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshld_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_frecpxsf (__a); + return __builtin_aarch64_ushldi_uus (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vrecpxd_f64 (float64_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_s8 (int8x16_t __a, const int __b) { - return __builtin_aarch64_frecpxdf (__a); + return __builtin_aarch64_sshll2_nv16qi (__a, __b); } - -/* vrev */ - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) 
-vrev16_p8 (poly8x8_t a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_s16 (int16x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_aarch64_sshll2_nv8hi (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrev16_s8 (int8x8_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_s32 (int32x4_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_aarch64_sshll2_nv4si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrev16_u8 (uint8x8_t a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vrev16q_p8 (poly8x16_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrev16q_s8 (int8x16_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_high_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrev16q_u8 (uint8x16_t a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_s8 (int8x8_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + return __builtin_aarch64_sshll_nv8qi (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vrev32_p8 (poly8x8_t a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_s16 (int16x4_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_aarch64_sshll_nv4hi (__a, __b); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vrev32_p16 (poly16x4_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_s32 (int32x2_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return __builtin_aarch64_sshll_nv2si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrev32_s8 (int8x8_t a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return 
__builtin_aarch64_ushll_nv8qi_uus (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrev32_s16 (int16x4_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_u16 (uint16x4_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return __builtin_aarch64_ushll_nv4hi_uus (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrev32_u8 (uint8x8_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshll_n_u32 (uint32x2_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_aarch64_ushll_nv2si_uus (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrev32_u16 (uint16x4_t a) +/* vshr */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_s8 (int8x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vrev32q_p8 (poly8x16_t a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_s16 (int16x4_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vrev32q_p16 (poly16x8_t a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_s32 (int32x2_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrev32q_s8 (int8x16_t a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_s64 (int64x1_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)}; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vrev32q_s16 (int16x8_t a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrev32q_u8 (uint8x16_t a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_u16 (uint16x4_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vrev32q_u16 (uint16x8_t a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_u32 (uint32x2_t 
__a, const int __b) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrev64_f32 (float32x2_t a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshr_n_u64 (uint64x1_t __a, const int __b) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)}; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vrev64_p8 (poly8x8_t a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_s8 (int8x16_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vrev64_p16 (poly16x4_t a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_s16 (int16x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrev64_s8 (int8x8_t a) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrev64_s16 (int16x4_t a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vrev64_s32 (int32x2_t a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrev64_u8 (uint8x8_t a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrev64_u16 (uint16x4_t a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrev64_u32 (uint32x2_t a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrq_n_u64 (uint64x2_t 
__a, const int __b) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrev64q_f32 (float32x4_t a) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrd_n_s64 (int64_t __a, const int __b) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return __builtin_aarch64_ashr_simddi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vrev64q_p8 (poly8x16_t a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrd_n_u64 (uint64_t __a, const int __b) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + return __builtin_aarch64_lshr_simddi_uus (__a, __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vrev64q_p16 (poly16x8_t a) +/* vsli */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrev64q_s8 (int8x16_t a) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vrev64q_s16 (int16x8_t a) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vrev64q_s32 (int32x4_t a) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrev64q_u8 (uint8x16_t a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vrev64q_u16 (uint16x8_t a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c); } -__extension__ static __inline uint32x4_t 
__attribute__ ((__always_inline__)) -vrev64q_u32 (uint32x4_t a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c); } -/* vrnd */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrnd_f32 (float32x2_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return __builtin_aarch64_btruncv2sf (__a); + return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)}; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrnd_f64 (float64x1_t __a) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c) { - return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0); + return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)}; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndq_f32 (float32x4_t __a) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return __builtin_aarch64_btruncv4sf (__a); + return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndq_f64 (float64x2_t __a) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - return __builtin_aarch64_btruncv2df (__a); + return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c); } -/* vrnda */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c); +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrnda_f32 (float32x2_t __a) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return __builtin_aarch64_roundv2sf (__a); + return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrnda_f64 (float64x1_t __a) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0); + return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndaq_f32 (float32x4_t __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_roundv4sf (__a); + return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndaq_f64 
(float64x2_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_roundv2df (__a); + return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c); } -/* vrndi */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrndi_f32 (float32x2_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return __builtin_aarch64_nearbyintv2sf (__a); + return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrndi_f64 (float64x1_t __a) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c) { - return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0); + return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndiq_f32 (float32x4_t __a) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vslid_n_s64 (int64_t __a, int64_t __b, const int __c) { - return __builtin_aarch64_nearbyintv4sf (__a); + return __builtin_aarch64_ssli_ndi (__a, __b, __c); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndiq_f64 (float64x2_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return __builtin_aarch64_nearbyintv2df (__a); + return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c); } -/* vrndm */ +/* vsqadd */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrndm_f32 (float32x2_t __a) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqadd_u8 (uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_floorv2sf (__a); + return __builtin_aarch64_usqaddv8qi_uus (__a, __b); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrndm_f64 (float64x1_t __a) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqadd_u16 (uint16x4_t __a, int16x4_t __b) { - return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0); + return __builtin_aarch64_usqaddv4hi_uus (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndmq_f32 (float32x4_t __a) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqadd_u32 (uint32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_floorv4sf (__a); + return __builtin_aarch64_usqaddv2si_uus (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndmq_f64 (float64x2_t __a) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqadd_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_floorv2df (__a); + return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])}; } -/* vrndn */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrndn_f32 (float32x2_t __a) 
+__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddq_u8 (uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_frintnv2sf (__a); + return __builtin_aarch64_usqaddv16qi_uus (__a, __b); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrndn_f64 (float64x1_t __a) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddq_u16 (uint16x8_t __a, int16x8_t __b) { - return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])}; + return __builtin_aarch64_usqaddv8hi_uus (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndnq_f32 (float32x4_t __a) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddq_u32 (uint32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_frintnv4sf (__a); + return __builtin_aarch64_usqaddv4si_uus (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndnq_f64 (float64x2_t __a) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddq_u64 (uint64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_frintnv2df (__a); + return __builtin_aarch64_usqaddv2di_uus (__a, __b); } -/* vrndp */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrndp_f32 (float32x2_t __a) +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddb_u8 (uint8_t __a, int8_t __b) { - return __builtin_aarch64_ceilv2sf (__a); + return __builtin_aarch64_usqaddqi_uus (__a, __b); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrndp_f64 (float64x1_t __a) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddh_u16 (uint16_t __a, int16_t __b) { - return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0); + return __builtin_aarch64_usqaddhi_uus (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndpq_f32 (float32x4_t __a) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqadds_u32 (uint32_t __a, int32_t __b) { - return __builtin_aarch64_ceilv4sf (__a); + return __builtin_aarch64_usqaddsi_uus (__a, __b); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndpq_f64 (float64x2_t __a) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddd_u64 (uint64_t __a, int64_t __b) { - return __builtin_aarch64_ceilv2df (__a); + return __builtin_aarch64_usqadddi_uus (__a, __b); } -/* vrndx */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vrndx_f32 (float32x2_t __a) +/* vsqrt */ +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqrt_f32 (float32x2_t a) { - return __builtin_aarch64_rintv2sf (__a); + return __builtin_aarch64_sqrtv2sf (a); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vrndx_f64 (float64x1_t __a) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqrtq_f32 (float32x4_t a) { - return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0); + return 
__builtin_aarch64_sqrtv4sf (a); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vrndxq_f32 (float32x4_t __a) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqrt_f64 (float64x1_t a) { - return __builtin_aarch64_rintv4sf (__a); + return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) }; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vrndxq_f64 (float64x2_t __a) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqrtq_f64 (float64x2_t a) { - return __builtin_aarch64_rintv2df (__a); + return __builtin_aarch64_sqrtv2df (a); } -/* vrshl */ +/* vsra */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrshl_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrshl_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vrshl_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vrshl_s64 (int64x1_t __a, int64x1_t __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])}; + return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrshl_u8 (uint8x8_t __a, int8x8_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_urshlv8qi_uus (__a, __b); + return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrshl_u16 (uint16x4_t __a, int16x4_t __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_aarch64_urshlv4hi_uus (__a, __b); + return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrshl_u32 (uint32x2_t __a, int32x2_t __b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return 
__builtin_aarch64_urshlv2si_uus (__a, __b); + return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vrshl_u64 (uint64x1_t __a, int64x1_t __b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])}; + return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)}; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrshlq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b); + return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vrshlq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b); + return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vrshlq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b); + return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vrshlq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b); + return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrshlq_u8 (uint8x16_t __a, int8x16_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return __builtin_aarch64_urshlv16qi_uus (__a, __b); + return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vrshlq_u16 (uint16x8_t __a, int16x8_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_urshlv8hi_uus (__a, __b); + return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrshlq_u32 (uint32x4_t __a, int32x4_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_urshlv4si_uus (__a, __b); + return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) 
-vrshlq_u64 (uint64x2_t __a, int64x2_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return __builtin_aarch64_urshlv2di_uus (__a, __b); + return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vrshld_s64 (int64_t __a, int64_t __b) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsrad_n_s64 (int64_t __a, int64_t __b, const int __c) { - return __builtin_aarch64_srshldi (__a, __b); + return __builtin_aarch64_ssra_ndi (__a, __b, __c); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vrshld_u64 (uint64_t __a, int64_t __b) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return __builtin_aarch64_urshldi_uus (__a, __b); + return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c); } -/* vrshr */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrshr_n_s8 (int8x8_t __a, const int __b) -{ - return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b); -} +/* vsri */ -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrshr_n_s16 (int16x4_t __a, const int __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b); + return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vrshr_n_s32 (int32x2_t __a, const int __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b); + return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vrshr_n_s64 (int64x1_t __a, const int __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)}; + return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrshr_n_u8 (uint8x8_t __a, const int __b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return __builtin_aarch64_urshr_nv8qi_uus (__a, __b); + return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)}; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrshr_n_u16 (uint16x4_t __a, const int __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_urshr_nv4hi_uus (__a, __b); + return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrshr_n_u32 (uint32x2_t __a, const int __b) 
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_nv2si_uus (__a, __b);
+  return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrshr_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
-  return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)};
+  return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
-  return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b);
+  return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)};
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
-  return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
-  return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b);
+  return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
-  return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b);
+  return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_nv16qi_uus (__a, __b);
+  return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_nv8hi_uus (__a, __b);
+  return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_nv4si_uus (__a, __b);
+  return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_nv2di_uus (__a, __b);
+  return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vrshrd_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
-  return __builtin_aarch64_srshr_ndi (__a, __b);
+  return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vrshrd_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsrid_n_s64 (int64_t __a, int64_t __b, const int __c)
 {
-  return __builtin_aarch64_urshr_ndi_uus (__a, __b);
+  return __builtin_aarch64_ssri_ndi (__a, __b, __c);
 }
 
-/* vrsra */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
 {
-  return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c);
+  return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c);
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c);
-}
+/* vst1 */
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16 (float16_t *__a, float16x4_t __b)
 {
-  return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c);
+  __builtin_aarch64_st1v4hf (__a, __b);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32 (float32_t *a, float32x2_t b)
 {
-  return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)};
+  __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64 (float64_t *a, float64x1_t b)
 {
-  return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c);
+  *a = b[0];
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8 (poly8_t *a, poly8x8_t b)
 {
-  return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16 (poly16_t *a, poly16x4_t b)
 {
-  return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64 (poly64_t *a, poly64x1_t b)
 {
-  return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
+  *a = b[0];
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8 (int8_t *a, int8x8_t b)
 {
-  return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16 (int16_t *a, int16x4_t b)
 {
-  return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32 (int32_t *a, int32x2_t b)
 {
-  return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64 (int64_t *a, int64x1_t b)
 {
-  return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
+  *a = b[0];
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8 (uint8_t *a, uint8x8_t b)
 {
-  return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16 (uint16_t *a, uint16x4_t b)
 {
-  return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32 (uint32_t *a, uint32x2_t b)
 {
-  return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
+			     (int32x2_t) b);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64 (uint64_t *a, uint64x1_t b)
 {
-  return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
+  *a = b[0];
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
+/* vst1q */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16 (float16_t *__a, float16x8_t __b)
 {
-  return __builtin_aarch64_srsra_ndi (__a, __b, __c);
+  __builtin_aarch64_st1v8hf (__a, __b);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32 (float32_t *a, float32x4_t b)
 {
-  return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
+  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
 }
 
-#pragma GCC push_options
-#pragma GCC target ("+nothing+crypto")
-
-/* vsha1 */
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64 (float64_t *a, float64x2_t b)
 {
-  return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
+  __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8 (poly8_t *a, poly8x16_t b)
 {
-  return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16 (poly16_t *a, poly16x8_t b)
 {
-  return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vsha1h_u32 (uint32_t hash_e)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64 (poly64_t *a, poly64x2_t b)
 {
-  return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
+  __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a,
+				(poly64x2_t) b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8 (int8_t *a, int8x16_t b)
 {
-  return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16 (int16_t *a, int16x8_t b)
 {
-  return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32 (int32_t *a, int32x4_t b)
 {
-  return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64 (int64_t *a, int64x2_t b)
 {
-  return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8 (uint8_t *a, uint8x16_t b)
 {
-  return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16 (uint16_t *a, uint16x8_t b)
 {
-  return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
 }
 
-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-vmull_p64 (poly64_t a, poly64_t b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32 (uint32_t *a, uint32x4_t b)
 {
-  return
-    __builtin_aarch64_crypto_pmulldi_ppp (a, b);
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
+			     (int32x4_t) b);
 }
 
-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-vmull_high_p64 (poly64x2_t a, poly64x2_t b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64 (uint64_t *a, uint64x2_t b)
 {
-  return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
+			     (int64x2_t) b);
 }
 
-#pragma GCC pop_options
+/* vst1_lane */
 
-/* vshl */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshl_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
 {
-  return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshl_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane)
 {
-  return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshl_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane)
 {
-  return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshl_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
 {
-  return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane)
 {
-  return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshl_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
 {
-  return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshl_n_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane)
 {
-  return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshl_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane)
 {
-  return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshlq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane)
 {
-  return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane)
 {
-  return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshlq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane)
 {
-  return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshlq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane)
 {
-  return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshlq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
 {
-  return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshlq_n_u16 (uint16x8_t __a, const int __b)
+/* vst1q_lane */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
 {
-  return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshlq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
 {
-  return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshlq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane)
 {
-  return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vshld_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane)
 {
-  return __builtin_aarch64_ashldi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vshld_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
 {
-  return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshl_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_sshlv8qi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshl_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
 {
-  return __builtin_aarch64_sshlv4hi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshl_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane)
 {
-  return __builtin_aarch64_sshlv2si (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshl_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane)
 {
-  return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_u8 (uint8x8_t __a, int8x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_ushlv8qi_uus (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshl_u16 (uint16x4_t __a, int16x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane)
 {
-  return __builtin_aarch64_ushlv4hi_uus (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshl_u32 (uint32x2_t __a, int32x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane)
 {
-  return __builtin_aarch64_ushlv2si_uus (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshl_u64 (uint64x1_t __a, int64x1_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane)
 {
-  return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshlq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
 {
-  return __builtin_aarch64_sshlv16qi (__a, __b);
+  *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_s16 (int16x8_t __a, int16x8_t __b)
+/* vstn */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s64 (int64_t * __a, int64x1x2_t val)
 {
-  return __builtin_aarch64_sshlv8hi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshlq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u64 (uint64_t * __a, uint64x1x2_t val)
 {
-  return __builtin_aarch64_sshlv4si (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshlq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f64 (float64_t * __a, float64x1x2_t val)
 {
-  return __builtin_aarch64_sshlv2di (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshlq_u8 (uint8x16_t __a, int8x16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s8 (int8_t * __a, int8x8x2_t val)
 {
-  return __builtin_aarch64_ushlv16qi_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshlq_u16 (uint16x8_t __a, int16x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_p8 (poly8_t * __a, poly8x8x2_t val)
 {
-  return __builtin_aarch64_ushlv8hi_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshlq_u32 (uint32x4_t __a, int32x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s16 (int16_t * __a, int16x4x2_t val)
 {
-  return __builtin_aarch64_ushlv4si_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshlq_u64 (uint64x2_t __a, int64x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_p16 (poly16_t * __a, poly16x4x2_t val)
 {
-  return __builtin_aarch64_ushlv2di_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vshld_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s32 (int32_t * __a, int32x2x2_t val)
 {
-  return __builtin_aarch64_sshldi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vshld_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u8 (uint8_t * __a, uint8x8x2_t val)
 {
-  return __builtin_aarch64_ushldi_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshll_high_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u16 (uint16_t * __a, uint16x4x2_t val)
 {
-  return __builtin_aarch64_sshll2_nv16qi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshll_high_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u32 (uint32_t * __a, uint32x2x2_t val)
 {
-  return __builtin_aarch64_sshll2_nv8hi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshll_high_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f16 (float16_t * __a, float16x4x2_t val)
 {
-  return __builtin_aarch64_sshll2_nv4si (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st2v4hf (__a, __o);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshll_high_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f32 (float32_t * __a, float32x2x2_t val)
 {
-  return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshll_high_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_p64 (poly64_t * __a, poly64x1x2_t val)
 {
-  return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshll_high_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s8 (int8_t * __a, int8x16x2_t val)
 {
-  return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshll_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
 {
-  return __builtin_aarch64_sshll_nv8qi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshll_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s16 (int16_t * __a, int16x8x2_t val)
 {
-  return __builtin_aarch64_sshll_nv4hi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshll_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
 {
-  return __builtin_aarch64_sshll_nv2si (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshll_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s32 (int32_t * __a, int32x4x2_t val)
 {
-  return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshll_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s64 (int64_t * __a, int64x2x2_t val)
 {
-  return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshll_n_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
 {
-  return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
 
-/* vshr */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshr_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
 {
-  return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshr_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
 {
-  return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshr_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
 {
-  return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshr_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f16 (float16_t * __a, float16x8x2_t val)
 {
-  return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st2v8hf (__a, __o);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshr_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f32 (float32_t * __a, float32x4x2_t val)
 {
-  return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshr_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f64 (float64_t * __a, float64x2x2_t val)
 {
-  return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshr_n_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_p64 (poly64_t * __a, poly64x2x2_t val)
 {
-  return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshr_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s64 (int64_t * __a, int64x1x3_t val)
 {
-  return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshrq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u64 (uint64_t * __a, uint64x1x3_t val)
 {
-  return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshrq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f64 (float64_t * __a, float64x1x3_t val)
 {
-  return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshrq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s8 (int8_t * __a, int8x8x3_t val)
 {
-  return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshrq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_p8 (poly8_t * __a, poly8x8x3_t val)
 {
-  return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshrq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s16 (int16_t * __a, int16x4x3_t val)
 {
-  return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshrq_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_p16 (poly16_t * __a, poly16x4x3_t val)
 {
-  return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshrq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s32 (int32_t * __a, int32x2x3_t val)
 {
-  return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshrq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u8 (uint8_t * __a, uint8x8x3_t val)
 {
-  return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vshrd_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u16 (uint16_t * __a, uint16x4x3_t val)
 {
-  return __builtin_aarch64_ashr_simddi (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vshrd_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u32 (uint32_t * __a, uint32x2x3_t val)
 {
-  return __builtin_aarch64_lshr_simddi_uus (__a, __b);
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-/* vsli */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f16 (float16_t * __a, float16x4x3_t val)
 {
-  return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f32 (float32_t * __a, float32x2x3_t val)
 {
-  return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_p64 (poly64_t * __a, poly64x1x3_t val)
 {
-  return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s8 (int8_t * __a, int8x16x3_t val)
 {
-  return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
 {
-  return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s16 (int16_t * __a, int16x8x3_t val)
 {
-  return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
 {
-  return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s32 (int32_t * __a, int32x4x3_t val)
 {
-  return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s64 (int64_t * __a, int64x2x3_t val)
 {
-  return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
 {
-  return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
 {
-  return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
 {
-  return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
 {
-  return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
 }
 
-__extension__ static
__inline uint16x8_t __attribute__ ((__always_inline__)) -vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_f16 (float16_t * __a, float16x8x3_t val) { - return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_f32 (float32_t * __a, float32x4x3_t val) { - return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_f64 (float64_t * __a, float64x2x3_t val) { - return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vslid_n_s64 (int64_t __a, int64_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_p64 (poly64_t * __a, poly64x2x3_t val) { - return __builtin_aarch64_ssli_ndi (__a, __b, __c); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_s64 (int64_t * __a, int64x1x4_t val) { - return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + int64x2x4_t temp; + temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = 
__builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -/* vsqadd */ - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsqadd_u8 (uint8x8_t __a, int8x8_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_u64 (uint64_t * __a, uint64x1x4_t val) { - return __builtin_aarch64_usqaddv8qi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + uint64x2x4_t temp; + temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsqadd_u16 (uint16x4_t __a, int16x4_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_f64 (float64_t * __a, float64x1x4_t val) { - return __builtin_aarch64_usqaddv4hi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + float64x2x4_t temp; + temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); + __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsqadd_u32 (uint32x2_t __a, int32x2_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_s8 (int8_t * __a, int8x8x4_t val) { - return __builtin_aarch64_usqaddv2si_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + int8x16x4_t temp; + temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsqadd_u64 (uint64x1_t __a, int64x1_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_p8 (poly8_t * __a, poly8x8x4_t val) { - return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])}; + __builtin_aarch64_simd_xi __o; + poly8x16x4_t temp; + temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsqaddq_u8 (uint8x16_t __a, int8x16_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_s16 (int16_t * __a, int16x4x4_t val) { - return __builtin_aarch64_usqaddv16qi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + int16x8x4_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsqaddq_u16 (uint16x8_t __a, int16x8_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_p16 (poly16_t * __a, poly16x4x4_t val) { - return __builtin_aarch64_usqaddv8hi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + poly16x8x4_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsqaddq_u32 (uint32x4_t __a, int32x4_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_s32 (int32_t * __a, int32x2x4_t val) { - return 
__builtin_aarch64_usqaddv4si_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + int32x4x4_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsqaddq_u64 (uint64x2_t __a, int64x2_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_u8 (uint8_t * __a, uint8x8x4_t val) { - return __builtin_aarch64_usqaddv2di_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + uint8x16x4_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vsqaddb_u8 (uint8_t __a, int8_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_u16 (uint16_t * __a, uint16x4x4_t val) { - return __builtin_aarch64_usqaddqi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + uint16x8x4_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vsqaddh_u16 (uint16_t __a, int16_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_u32 (uint32_t * __a, uint32x2x4_t val) { - return __builtin_aarch64_usqaddhi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + uint32x4x4_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C 
(0))); + temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vsqadds_u32 (uint32_t __a, int32_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_f16 (float16_t * __a, float16x4x4_t val) { - return __builtin_aarch64_usqaddsi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + float16x8x4_t temp; + temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vsqaddd_u64 (uint64_t __a, int64_t __b) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_f32 (float32_t * __a, float32x2x4_t val) { - return __builtin_aarch64_usqadddi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + float32x4x4_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -/* vsqrt */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vsqrt_f32 (float32x2_t a) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_p64 (poly64_t * __a, poly64x1x4_t val) { - return __builtin_aarch64_sqrtv2sf (a); + __builtin_aarch64_simd_xi __o; + poly64x2x4_t temp; + temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + 
(poly64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vsqrtq_f32 (float32x4_t a) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_s8 (int8_t * __a, int8x16x4_t val) { - return __builtin_aarch64_sqrtv4sf (a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vsqrt_f64 (float64x1_t a) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_p8 (poly8_t * __a, poly8x16x4_t val) { - return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) }; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vsqrtq_f64 (float64x2_t a) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_s16 (int16_t * __a, int16x8x4_t val) { - return __builtin_aarch64_sqrtv2df (a); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -/* vsra */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_p16 (poly16_t * __a, poly16x8x4_t val) { - return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_s32 (int32_t * __a, int32x4x4_t val) { - return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); + 
__o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_s64 (int64_t * __a, int64x2x4_t val) { - return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_u8 (uint8_t * __a, uint8x16x4_t val) { - return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)}; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_u16 (uint16_t * __a, uint16x8x4_t val) { - return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_u32 (uint32_t * __a, uint32x4x4_t val) { - return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vst4q_u64 (uint64_t * __a, uint64x2x4_t val) { - return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_f16 (float16_t * __a, float16x8x4_t val) { - return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)}; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_f32 (float32_t * __a, float32x4x4_t val) { - return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_f64 (float64_t * __a, float64x2x4_t val) { - return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_p64 (poly64_t * __a, poly64x2x4_t val) { - return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) val.val[2], 2); + __o = 
__builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) -{ - return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); -} +/* vsub */ -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsubd_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c); + return __a - __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsubd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c); + return __a - __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) -{ - return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c); -} +/* vtbx1 */ -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) { - return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (8)); + int8x8_t __tbl = vtbl1_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vsrad_n_s64 (int64_t __a, int64_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_ssra_ndi (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); } -/* vsri */ +/* vtbx3 */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) { - return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (24)); + int8x8_t __tbl = vtbl3_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) 
+__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) { - return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) { - return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +/* vtbx4 */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { - return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)}; + int8x8_t result; + int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); + return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c); + uint8x8_t result; + uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c); + poly8x8_t result; + poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +/* vtrn */ + +__extension__ extern __inline 
float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_f32 (float32x2_t __a, float32x2_t __b) { - return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_p8 (poly8x8_t __a, poly8x8_t __b) { - return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_p16 (poly16x4_t __a, poly16x4_t __b) { - return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_s8 (int8x8_t __a, int8x8_t __b) { - return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_s16 (int16x4_t __a, int16x4_t __b) { - return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) 
{0, 2}); +#endif } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_u16 (uint16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vsrid_n_s64 (int64_t __a, int64_t __b, const int __c) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_ssri_ndi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } -/* vst1 */ - -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_f16 (float16_t *__a, float16x4_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_f64 (float64x2_t __a, float64x2_t __b) { - __builtin_aarch64_st1v4hf (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_f32 (float32_t *a, float32x2_t b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b) { - __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle 
(__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_f64 (float64_t *a, float64x1_t b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_p8 (poly8_t *a, poly8x8_t b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_s8 (int8x16_t __a, int8x16_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_p16 (poly16_t *a, poly16x4_t b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_s16 (int16x8_t __a, int16x8_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s8 (int8_t *a, int8x8_t b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s16 (int16_t *a, int16x4_t b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s32 (int32_t *a, int32x2_t b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s64 (int64_t *a, int64x1_t b) 
+__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u8 (uint8_t *a, uint8x8_t b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u16 (uint16_t *a, uint16x4_t b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u32 (uint32_t *a, uint32x2_t b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, - (int32x2_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u64 (uint64_t *a, uint64x1_t b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_f32 (float32x2_t __a, float32x2_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -/* vst1q */ - -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_f16 (float16_t *__a, float16x8_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_p8 (poly8x8_t __a, poly8x8_t __b) { - __builtin_aarch64_st1v8hf (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_f32 (float32_t *a, float32x4_t b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_p16 (poly16x4_t __a, poly16x4_t __b) { - __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_f64 (float64_t *a, float64x2_t b) +__extension__ extern __inline int8x8_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_s8 (int8x8_t __a, int8x8_t __b) { - __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_p8 (poly8_t *a, poly8x16_t b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_s16 (int16x4_t __a, int16x4_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_p16 (poly16_t *a, poly16x8_t b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_s32 (int32x2_t __a, int32x2_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s8 (int8_t *a, int8x16_t b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_u8 (uint8x8_t __a, uint8x8_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s16 (int16_t *a, int16x8_t b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_u16 (uint16x4_t __a, uint16x4_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s32 (int32_t *a, int32x4_t b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2_u32 (uint32x2_t __a, uint32x2_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s64 (int64_t *a, int64x2_t b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u8 (uint8_t *a, 
uint8x16_t b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_f32 (float32x4_t __a, float32x4_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u16 (uint16_t *a, uint16x8_t b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_f64 (float64x2_t __a, float64x2_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u32 (uint32_t *a, uint32x4_t b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, - (int32x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u64 (uint64_t *a, uint64x2_t b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, - (int64x2_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -/* vst1_lane */ - -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_s8 (int8x16_t __a, int8x16_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_s16 (int16x8_t __a, int16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_s32 (int32x4_t __a, 
int32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_s64 (int64x2_t __a, int64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane) +__extension__ extern __inline float16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_f16 (float16x4_t __a, float16x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane) +__extension__ extern __inline float32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vtrn_f32 (float32x2_t a, float32x2_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane) +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_p8 (poly8x8_t a, poly8x8_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane) +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_p16 (poly16x4_t a, poly16x4_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane) +__extension__ extern __inline int8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_s8 (int8x8_t a, int8x8_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; } -/* vst1q_lane */ - -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane) +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_s16 (int16x4_t a, int16x4_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane) +__extension__ extern __inline int32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_s32 (int32x2_t a, int32x2_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane) +__extension__ extern __inline uint8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_u8 (uint8x8_t a, uint8x8_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane) +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_u16 (uint16x4_t a, uint16x4_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane) +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn_u32 (uint32x2_t a, uint32x2_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane) +__extension__ extern 
__inline float16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_f16 (float16x8_t __a, float16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane) +__extension__ extern __inline float32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_f32 (float32x4_t a, float32x4_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane) +__extension__ extern __inline poly8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_p8 (poly8x16_t a, poly8x16_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane) +__extension__ extern __inline poly16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_p16 (poly16x8_t a, poly16x8_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane) +__extension__ extern __inline int8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_s8 (int8x16_t a, int8x16_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane) +__extension__ extern __inline int16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_s16 (int16x8_t a, int16x8_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane) +__extension__ extern __inline int32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_s32 (int32x4_t a, int32x4_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) +__extension__ extern __inline uint8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_u8 (uint8x16_t a, uint8x16_t b) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; } -/* vstn */ - -__extension__ static __inline void -vst2_s64 (int64_t * __a, int64x1x2_t val) +__extension__ extern __inline uint16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_u16 (uint16x8_t a, uint16x8_t b) { - __builtin_aarch64_simd_oi __o; - int64x2x2_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o 
= __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; } -__extension__ static __inline void -vst2_u64 (uint64_t * __a, uint64x1x2_t val) +__extension__ extern __inline uint32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrnq_u32 (uint32x4_t a, uint32x4_t b) { - __builtin_aarch64_simd_oi __o; - uint64x2x2_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; } -__extension__ static __inline void -vst2_f64 (float64_t * __a, float64x1x2_t val) -{ - __builtin_aarch64_simd_oi __o; - float64x2x2_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); - __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); -} +/* vtst */ -__extension__ static __inline void -vst2_s8 (int8_t * __a, int8x8x2_t val) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_s8 (int8x8_t __a, int8x8_t __b) { - __builtin_aarch64_simd_oi __o; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return (uint8x8_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_p8 (poly8_t * __a, poly8x8x2_t val) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_s16 (int16x4_t __a, int16x4_t __b) { - __builtin_aarch64_simd_oi __o; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return (uint16x4_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_s16 (int16_t * __a, int16x4x2_t val) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_s32 (int32x2_t __a, int32x2_t __b) { - __builtin_aarch64_simd_oi __o; - int16x8x2_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) 
temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return (uint32x2_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_p16 (poly16_t * __a, poly16x4x2_t val) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_s64 (int64x1_t __a, int64x1_t __b) { - __builtin_aarch64_simd_oi __o; - poly16x8x2_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_s32 (int32_t * __a, int32x2x2_t val) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_u8 (uint8x8_t __a, uint8x8_t __b) { - __builtin_aarch64_simd_oi __o; - int32x4x2_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + return ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u8 (uint8_t * __a, uint8x8x2_t val) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_u16 (uint16x4_t __a, uint16x4_t __b) { - __builtin_aarch64_simd_oi __o; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u16 (uint16_t * __a, uint16x4x2_t val) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_u32 (uint32x2_t __a, uint32x2_t __b) { - __builtin_aarch64_simd_oi __o; - uint16x8x2_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u32 (uint32_t * __a, uint32x2x2_t val) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_u64 (uint64x1_t __a, uint64x1_t __b) { - __builtin_aarch64_simd_oi __o; - uint32x4x2_t temp; - temp.val[0] = vcombine_u32 
(val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + return ((__a & __b) != __AARCH64_UINT64_C (0)); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_f16 (float16_t * __a, float16x4x2_t val) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_s8 (int8x16_t __a, int8x16_t __b) { - __builtin_aarch64_simd_oi __o; - float16x8x2_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); - __builtin_aarch64_st2v4hf (__a, __o); + return (uint8x16_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_f32 (float32_t * __a, float32x2x2_t val) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_s16 (int16x8_t __a, int16x8_t __b) { - __builtin_aarch64_simd_oi __o; - float32x4x2_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + return (uint16x8_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s8 (int8_t * __a, int8x16x2_t val) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return (uint32x4_t) ((__a & __b) != 0); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_p8 (poly8_t * __a, poly8x16x2_t val) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0)); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s16 (int16_t * __a, int16x8x2_t val) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); 
+  return ((__a & __b) != 0);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  return ((__a & __b) != 0);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_s32 (int32_t * __a, int32x4x2_t val)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  return ((__a & __b) != 0);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_s64 (int64_t * __a, int64x2x2_t val)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  return ((__a & __b) != __AARCH64_UINT64_C (0));
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtstd_s64 (int64_t __a, int64_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  return (__a & __b) ? -1ll : 0ll;
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtstd_u64 (uint64_t __a, uint64_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  return (__a & __b) ? -1ll : 0ll;
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
+/* vuqadd */
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqadd_s8 (int8x8_t __a, uint8x8_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  return __builtin_aarch64_suqaddv8qi_ssu (__a, __b);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqadd_s16 (int16x4_t __a, uint16x4_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  return __builtin_aarch64_suqaddv4hi_ssu (__a, __b);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_f16 (float16_t * __a, float16x8x2_t val)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqadd_s32 (int32x2_t __a, uint32x2_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
-  __builtin_aarch64_st2v8hf (__a, __o);
+  return __builtin_aarch64_suqaddv2si_ssu (__a, __b);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_f32 (float32_t * __a, float32x4x2_t val)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqadd_s64 (int64x1_t __a, uint64x1_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])};
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_f64 (float64_t * __a, float64x2x2_t val)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqaddq_s8 (int8x16_t __a, uint8x16_t __b)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  return __builtin_aarch64_suqaddv16qi_ssu (__a, __b);
 }

-__extension__ static __inline void
-vst3_s64 (int64_t * __a, int64x1x3_t val)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuqaddq_s16 (int16x8_t __a, uint16x8_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  int64x2x3_t temp;
-  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
-  __o =
__builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_suqaddv8hi_ssu (__a, __b); } -__extension__ static __inline void -vst3_u64 (uint64_t * __a, uint64x1x3_t val) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) { - __builtin_aarch64_simd_ci __o; - uint64x2x3_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_suqaddv4si_ssu (__a, __b); } -__extension__ static __inline void -vst3_f64 (float64_t * __a, float64x1x3_t val) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) { - __builtin_aarch64_simd_ci __o; - float64x2x3_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); - __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_suqaddv2di_ssu (__a, __b); } -__extension__ static __inline void -vst3_s8 (int8_t * __a, int8x8x3_t val) +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqaddb_s8 (int8_t __a, uint8_t __b) { - __builtin_aarch64_simd_ci __o; - int8x16x3_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_suqaddqi_ssu (__a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_p8 (poly8_t * __a, poly8x8x3_t val) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqaddh_s16 (int16_t __a, uint16_t __b) { - __builtin_aarch64_simd_ci __o; - poly8x16x3_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o 
= __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_suqaddhi_ssu (__a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_s16 (int16_t * __a, int16x4x3_t val) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqadds_s32 (int32_t __a, uint32_t __b) { - __builtin_aarch64_simd_ci __o; - int16x8x3_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_suqaddsi_ssu (__a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_p16 (poly16_t * __a, poly16x4x3_t val) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuqaddd_s64 (int64_t __a, uint64_t __b) { - __builtin_aarch64_simd_ci __o; - poly16x8x3_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_suqadddi_ssu (__a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_s32 (int32_t * __a, int32x2x3_t val) +#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ + __extension__ extern __inline rettype \ + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ + v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \ + { \ + return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \ + v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \ + } + +#define __INTERLEAVE_LIST(op) \ + __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,) \ + __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \ + __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \ + __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \ + __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \ + __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \ + __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \ + __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \ + __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \ + __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \ + __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q) \ + __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \ + __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \ + __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \ + __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \ + __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \ + __DEFINTERLEAVE (op, 
int32x4x2_t, int32x4_t, s32, q) \ + __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \ + __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \ + __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q) + +/* vuzp */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_ci __o; - int32x4x3_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u8 (uint8_t * __a, uint8x8x3_t val) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_f32 (float32x2_t __a, float32x2_t __b) { - __builtin_aarch64_simd_ci __o; - uint8x16x3_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u16 (uint16_t * __a, uint16x4x3_t val) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_p8 (poly8x8_t __a, poly8x8_t __b) { - __builtin_aarch64_simd_ci __o; - uint16x8x3_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u32 (uint32_t * __a, uint32x2x3_t val) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_p16 (poly16x4_t __a, poly16x4_t __b) { - __builtin_aarch64_simd_ci __o; - 
uint32x4x3_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_f16 (float16_t * __a, float16x4x3_t val) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_s8 (int8x8_t __a, int8x8_t __b) { - __builtin_aarch64_simd_ci __o; - float16x8x3_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_f32 (float32_t * __a, float32x2x3_t val) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_s16 (int16x4_t __a, int16x4_t __b) { - __builtin_aarch64_simd_ci __o; - float32x4x3_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_s8 (int8_t * __a, int8x16x3_t val) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_s32 (int32x2_t __a, int32x2_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return 
__builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_p8 (poly8_t * __a, poly8x16x3_t val) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_u8 (uint8x8_t __a, uint8x8_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_s16 (int16_t * __a, int16x8x3_t val) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_u16 (uint16x4_t __a, uint16x4_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_p16 (poly16_t * __a, poly16x8x3_t val) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1_u32 (uint32x2_t __a, uint32x2_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_s32 (int32_t * __a, int32x4x3_t val) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_s64 (int64_t * __a, int64x2x3_t val) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_f32 (float32x4_t __a, float32x4_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di 
(__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_u8 (uint8_t * __a, uint8x16x3_t val) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_f64 (float64x2_t __a, float64x2_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_u16 (uint16_t * __a, uint16x8x3_t val) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_u32 (uint32_t * __a, uint32x4x3_t val) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_u64 (uint64_t * __a, uint64x2x3_t val) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_s8 (int8x16_t __a, int8x16_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 
21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_f16 (float16_t * __a, float16x8x3_t val) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_s16 (int16x8_t __a, int16x8_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_f32 (float32_t * __a, float32x4x3_t val) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_f64 (float64_t * __a, float64x2x3_t val) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline void -vst4_s64 (int64_t * __a, int64x1x4_t val) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b) { - __builtin_aarch64_simd_xi __o; - int64x2x4_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); - __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return 
__builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline void -vst4_u64 (uint64_t * __a, uint64x1x4_t val) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b) { - __builtin_aarch64_simd_xi __o; - uint64x2x4_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); - __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline void -vst4_f64 (float64_t * __a, float64x1x4_t val) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b) { - __builtin_aarch64_simd_xi __o; - float64x2x4_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); - __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline void -vst4_s8 (int8_t * __a, int8x8x4_t val) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) { - __builtin_aarch64_simd_xi __o; - int8x16x4_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi 
((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_f16 (float16x4_t __a, float16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_p8 (poly8_t * __a, poly8x8x4_t val) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_f32 (float32x2_t __a, float32x2_t __b) { - __builtin_aarch64_simd_xi __o; - poly8x16x4_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_s16 (int16_t * __a, int16x4x4_t val) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_p8 (poly8x8_t __a, poly8x8_t __b) { - __builtin_aarch64_simd_xi __o; - int16x8x4_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_p16 (poly16_t * __a, poly16x4x4_t val) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_p16 (poly16x4_t __a, poly16x4_t __b) { - __builtin_aarch64_simd_xi __o; - poly16x8x4_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p16 (val.val[3], 
vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_s32 (int32_t * __a, int32x2x4_t val) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_s8 (int8x8_t __a, int8x8_t __b) { - __builtin_aarch64_simd_xi __o; - int32x4x4_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_u8 (uint8_t * __a, uint8x8x4_t val) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_s16 (int16x4_t __a, int16x4_t __b) { - __builtin_aarch64_simd_xi __o; - uint8x16x4_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_u16 (uint16_t * __a, uint16x4x4_t val) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_s32 (int32x2_t __a, int32x2_t __b) { - __builtin_aarch64_simd_xi __o; - uint16x8x4_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - 
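All of these new bodies use GCC's generic __builtin_shuffle instead of target-specific builtins. The #ifdef __AARCH64EB__ arms exist because on big-endian AArch64 GCC numbers generic vector lanes in the opposite order to the architectural lane order, so the constant selectors must be remapped. The builtin itself is target-independent; a minimal sketch on plain vector types (illustrative name):

/* __builtin_shuffle (a, b, mask): result lane i is lane mask[i] of
   the 2N-lane concatenation of a and b (indices taken modulo 2N).  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
pick_odd_lanes (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) {1, 3, 5, 7});
}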
temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_u32 (uint32_t * __a, uint32x2x4_t val) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_u8 (uint8x8_t __a, uint8x8_t __b) { - __builtin_aarch64_simd_xi __o; - uint32x4x4_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_f16 (float16_t * __a, float16x4x4_t val) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_u16 (uint16x4_t __a, uint16x4_t __b) { - __builtin_aarch64_simd_xi __o; - float16x8x4_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4_f32 (float32_t * __a, float32x2x4_t val) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2_u32 (uint32x2_t __a, uint32x2_t __b) { - __builtin_aarch64_simd_xi __o; - float32x4x4_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = 
vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_s8 (int8_t * __a, int8x16x4_t val) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_p8 (poly8_t * __a, poly8x16x4_t val) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_f32 (float32x4_t __a, float32x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_s16 (int16_t * __a, int16x8x4_t val) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_f64 (float64x2_t __a, float64x2_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_p16 (poly16_t * __a, poly16x8x4_t val) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b) { - 
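Note that the '-' lines in this region do not drop the vst3/vst4 store intrinsics from the API: the patch reorganizes the whole header, so the diff merely pairs the old store bodies textually with unrelated new vuzp2 definitions, and the store intrinsics are expected to reappear elsewhere in the reordered file. Their user-visible contract is the standard interleaved store; a usage sketch (illustrative name):

#include <arm_neon.h>

/* vst4_u8 writes lane 0 of each of the four vectors, then lane 1 of
   each, and so on: r0 g0 b0 a0 r1 g1 b1 a1 ... (32 bytes total).  */
void
store_rgba (uint8_t *dst, uint8x8_t r, uint8x8_t g, uint8x8_t b, uint8x8_t a)
{
  uint8x8x4_t v = { { r, g, b, a } };
  vst4_u8 (dst, v);
}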
__builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_s32 (int32_t * __a, int32x4x4_t val) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_s64 (int64_t * __a, int64x2x4_t val) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_s8 (int8x16_t __a, int8x16_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_u8 (uint8_t * __a, uint8x16x4_t val) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_s16 (int16x8_t __a, int16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_u16 (uint16_t * __a, uint16x8x4_t val) +__extension__ extern __inline int32x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_u32 (uint32_t * __a, uint32x4x4_t val) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_u64 (uint64_t * __a, uint64x2x4_t val) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_f16 (float16_t * __a, float16x8x4_t val) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_f32 (float32_t * __a, float32x4x4_t val) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst4q_f64 (float64_t * __a, float64x2x4_t val) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -/* vsub */ +__INTERLEAVE_LIST (uzp) -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vsubd_s64 (int64_t __a, int64_t __b) +/* vzip */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_f16 (float16x4_t __a, float16x4_t __b) { - return __a - __b; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vsubd_u64 (uint64_t __a, uint64_t __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_f32 (float32x2_t __a, float32x2_t __b) { - return __a - __b; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -/* vtbx1 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_p8 (poly8x8_t __a, poly8x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (8)); - int8x8_t __tbl = vtbl1_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_p16 (poly16x4_t __a, poly16x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); - - return vbsl_u8 
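The __INTERLEAVE_LIST (uzp) invocation relies on a __DEFINTERLEAVE helper macro defined earlier in the header (outside this hunk) to stamp out the combined two-result forms. Assuming it follows the same pattern as the combined vtrn_* bodies removed further down, the expansion for one type would look roughly like:

__extension__ extern __inline int8x8x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vuzp_s8 (int8x8_t __a, int8x8_t __b)
{
  return (int8x8x2_t) {vuzp1_s8 (__a, __b), vuzp2_s8 (__a, __b)};
}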
(__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_s8 (int8x8_t __a, int8x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -/* vtbx3 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_s16 (int16x4_t __a, int16x4_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (24)); - int8x8_t __tbl = vtbl3_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_s32 (int32x2_t __a, int32x2_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} - return vbsl_p8 (__mask, __tbl, __r); +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_u16 (uint16x4_t __a, uint16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -/* vtbx4 */ +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1_u32 (uint32x2_t __a, uint32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, 
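The attribute change that runs through every hunk ('static __inline' becoming 'extern __inline' plus __gnu_inline__ and __artificial__) switches the header to GNU89 inline semantics regardless of the -std= level: the definition is used only for inlining, no out-of-line copy is ever emitted, and __artificial__ lets debuggers treat each intrinsic as a single step. The idiom in isolation (illustrative function):

/* Always inlined; never emits a standalone symbol, under any -std=.  */
__extension__ extern __inline int
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
add_one (int __x)
{
  return __x + 1;
}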
__gnu_inline__, __artificial__)) +vzip1q_f16 (float16x8_t __a, float16x8_t __b) { - int8x8_t result; - int8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); - return result; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_f32 (float32x4_t __a, float32x4_t __b) { - uint8x8_t result; - uint8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return result; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_f64 (float64x2_t __a, float64x2_t __b) { - poly8x8_t result; - poly8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return result; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -/* vtrn */ +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_p8 (poly8x16_t __a, poly8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif +} -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vtrn1_f32 (float32x2_t __a, float32x2_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_p16 (poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); #else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint16x8_t) 
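The vtbx4_* bodies shown on the '-' side here implement a 32-entry table lookup with fallback: the four 64-bit table halves are packed into two Q registers for the TBX instruction, and any lane whose index is out of range keeps the corresponding lane of __r. A scalar model of that behaviour (sketch, not part of the patch):

/* Behavioural model of vtbx4_u8 (r, tab, idx).  */
void
tbx4_model (uint8_t res[8], const uint8_t r[8],
            const uint8_t tab[32], const uint8_t idx[8])
{
  for (int i = 0; i < 8; i++)
    res[i] = idx[i] < 32 ? tab[idx[i]] : r[i];
}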
{0, 8, 1, 9, 2, 10, 3, 11}); #endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtrn1_p8 (poly8x8_t __a, poly8x8_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_s8 (int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); #endif } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vtrn1_p16 (poly16x4_t __a, poly16x4_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_s16 (int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); #endif } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtrn1_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_s32 (int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); #endif } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vtrn1_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_s64 (int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); #endif } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vtrn1_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_u8 (uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); #else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); #endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtrn1_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_u16 (uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return 
__builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); #endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtrn1_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_u32 (uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); #endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtrn1_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); #else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); #endif } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vtrn1q_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_f16 (float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vtrn1q_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_f32 (float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); #else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); #endif } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_p8 (poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_p16 (poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ - return 
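From this point the hunks pair the old vtrn1*/vtrn2* bodies (the '-' masks) with the new vzip1*/vzip2* bodies (the '+' masks); the two families share the same shape and differ only in their selector constants. vtrn1/vtrn2 pick the even and odd lanes of each input pairwise, which together transpose 2x2 tiles; a sketch (illustrative name):

#include <arm_neon.h>

/* Rows {a0,a1} and {b0,b1} become columns {a0,b0} and {a1,b1}.  */
void
transpose_2x2 (int32x2_t *row0, int32x2_t *row1)
{
  int32x2_t c0 = vtrn1_s32 (*row0, *row1);  /* {a0, b0} */
  int32x2_t c1 = vtrn2_s32 (*row0, *row1);  /* {a1, b1} */
  *row0 = c0;
  *row1 = c1;
}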
__builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vtrn1q_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_s8 (int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vtrn1q_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_s16 (int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vtrn1q_s32 (int32x4_t __a, int32x4_t __b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_s32 (int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); #else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); #endif } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vtrn1q_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_u8 (uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_u16 (uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b) 
+__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2_u32 (uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); #else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); #endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_f16 (float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); + return __builtin_shuffle (__a, __b, + (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); + return __builtin_shuffle (__a, __b, + (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_f32 (float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vtrn2_f32 (float32x2_t __a, float32x2_t __b) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_f64 (float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); #else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); #endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtrn2_p8 (poly8x8_t __a, poly8x8_t __b) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_p8 (poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); #endif } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vtrn2_p16 (poly16x4_t __a, poly16x4_t __b) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_p16 (poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ 
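vzip1/vzip2 are the interleaving counterpart of uzp: vzip1 merges the low halves of the two inputs lane by lane and vzip2 the high halves. On little-endian, for 8-lane inputs (sketch, illustrative name):

#include <arm_neon.h>

void
zip_demo (int8x8_t a, int8x8_t b, int8x8_t *lo, int8x8_t *hi)
{
  *lo = vzip1_s8 (a, b);  /* {a0, b0, a1, b1, a2, b2, a3, b3} */
  *hi = vzip2_s8 (a, b);  /* {a4, b4, a5, b5, a6, b6, a7, b7} */
}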
static __inline int8x8_t __attribute__ ((__always_inline__)) -vtrn2_s8 (int8x8_t __a, int8x8_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_s8 (int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); #endif } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vtrn2_s16 (int16x4_t __a, int16x4_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_s16 (int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vtrn2_s32 (int32x2_t __a, int32x2_t __b) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_s32 (int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtrn2_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_s64 (int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); #else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); #endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtrn2_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_u8 (uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); #else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); #endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtrn2_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_u16 (uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); 
#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); #endif } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vtrn2q_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_u32 (uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); #else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); #endif } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vtrn2q_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); @@ -24455,1319 +30368,1184 @@ vtrn2q_f64 (float64x2_t __a, float64x2_t __b) #endif } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b) +__INTERLEAVE_LIST (zip) + +#undef __INTERLEAVE_LIST +#undef __DEFINTERLEAVE + +/* End of optimal implementations in approved order. */ + +#pragma GCC pop_options + +/* ARMv8.2-A FP16 intrinsics. */ + +#include "arm_fp16.h" + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+fp16") + +/* ARMv8.2-A FP16 one operand vector intrinsics. */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_f16 (float16x4_t __a) +{ + return __builtin_aarch64_absv4hf (__a); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_f16 (float16x8_t __a) +{ + return __builtin_aarch64_absv8hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_f16 (float16x4_t __a) +{ + return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_f16 (float16x8_t __a) +{ + return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgez_f16 (float16x4_t __a) +{ + return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgezq_f16 (float16x8_t __a) +{ + return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgtz_f16 (float16x4_t __a) +{ + return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgtzq_f16 (float16x8_t __a) +{ + return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f)); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_f16 (float16x4_t __a) +{ + return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f)); +} + 
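The new ARMv8.2-A FP16 section is wrapped in `#pragma GCC target ("arch=armv8.2-a+fp16")`, so the definitions are accepted even when the command line lacks FP16 support, but since they are always_inline, code that calls them must itself be compiled for an FP16-capable target. The zero-comparison forms compare against a splatted zero (vdup_n_f16 (0.0f)) and return an all-ones/all-zeros mask per lane; a usage sketch (illustrative name):

/* Compile with e.g. -march=armv8.2-a+fp16.  */
#include <arm_neon.h>

/* Lane i is 0xffff where a[i] == 0.0, else 0.  */
uint16x4_t
lanes_equal_zero (float16x4_t a)
{
  return vceqz_f16 (a);
}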
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclezq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f16_s16 (int16x4_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-#endif
+  return __builtin_aarch64_floatv4hiv4hf (__a);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f16_s16 (int16x8_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-#endif
+  return __builtin_aarch64_floatv8hiv8hf (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vtrn2q_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f16_u16 (uint16x4_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-#endif
+  return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vtrn2q_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f16_u16 (uint16x8_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-#endif
+  return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vtrn2q_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_s16_f16 (float16x4_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-#endif
+  return __builtin_aarch64_lbtruncv4hfv4hi (__a);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vtrn2q_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_s16_f16 (float16x8_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_lbtruncv8hfv8hi (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_u16_f16 (float16x4_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-#endif
+  return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_u16_f16 (float16x8_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-#endif
+  return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_s16_f16 (float16x4_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-#endif
+  return __builtin_aarch64_lroundv4hfv4hi (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_s16_f16 (float16x8_t __a)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_lroundv8hfv8hi (__a);
 }

-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vtrn_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_u16_f16 (float16x4_t __a)
 {
-  return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)};
+  return __builtin_aarch64_lrounduv4hfv4hi_us (__a);
 }

-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vtrn_p8 (poly8x8_t a, poly8x8_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_u16_f16 (float16x8_t __a)
 {
-  return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)};
+  return __builtin_aarch64_lrounduv8hfv8hi_us (__a);
 }

-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vtrn_p16 (poly16x4_t a, poly16x4_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_s16_f16 (float16x4_t __a)
 {
-  return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)};
+  return __builtin_aarch64_lfloorv4hfv4hi (__a);
 }

-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vtrn_s8 (int8x8_t a, int8x8_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_s16_f16 (float16x8_t __a)
 {
-  return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)};
+  return __builtin_aarch64_lfloorv8hfv8hi (__a);
 }

-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vtrn_s16 (int16x4_t a, int16x4_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_u16_f16 (float16x4_t __a)
 {
-  return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)};
+  return __builtin_aarch64_lflooruv4hfv4hi_us (__a);
 }

-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vtrn_s32 (int32x2_t a, int32x2_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_u16_f16 (float16x8_t __a)
 {
-  return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)};
+  return __builtin_aarch64_lflooruv8hfv8hi_us (__a);
 }

-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vtrn_u8 (uint8x8_t a, uint8x8_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_s16_f16 (float16x4_t __a)
 {
-  return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)};
+  return __builtin_aarch64_lfrintnv4hfv4hi (__a);
 }

-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vtrn_u16 (uint16x4_t a, uint16x4_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_s16_f16 (float16x8_t __a)
 {
-  return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)};
+  return __builtin_aarch64_lfrintnv8hfv8hi (__a);
 }

-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vtrn_u32 (uint32x2_t a, uint32x2_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_u16_f16 (float16x4_t __a)
 {
-  return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
+  return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a);
 }

-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_u16_f16 (float16x8_t __a)
 {
-  return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)};
+  return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a);
 }

-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_p8 (poly8x16_t a, poly8x16_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_s16_f16 (float16x4_t __a)
 {
-  return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)};
+  return __builtin_aarch64_lceilv4hfv4hi (__a);
 }

-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_p16 (poly16x8_t a, poly16x8_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_s16_f16 (float16x8_t __a)
 {
-  return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)};
+  return __builtin_aarch64_lceilv8hfv8hi (__a);
 }

-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_s8 (int8x16_t a, int8x16_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_u16_f16 (float16x4_t __a)
 {
-  return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)};
+  return __builtin_aarch64_lceiluv4hfv4hi_us (__a);
 }

-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_s16 (int16x8_t a, int16x8_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_u16_f16 (float16x8_t __a)
 {
-  return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)};
+  return __builtin_aarch64_lceiluv8hfv8hi_us (__a);
 }

-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_s32 (int32x4_t a, int32x4_t b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_f16 (float16x4_t __a)
 {
-  return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)};
+  return -__a;
 }

-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_u8 (uint8x16_t a, uint8x16_t b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_f16 (float16x8_t __a)
 {
-  return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)};
+  return -__a;
 }

-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_u16 (uint16x8_t a, uint16x8_t b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_f16 (float16x4_t __a)
 {
-  return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)};
+  return __builtin_aarch64_frecpev4hf (__a);
 }

-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_f16 (float16x8_t __a)
 {
-  return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)};
+  return __builtin_aarch64_frecpev8hf (__a);
 }

-/* vtst */
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnd_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_btruncv4hf (__a);
+}

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndq_f16 (float16x8_t __a)
 {
-  return (uint8x8_t) ((__a & __b) != 0);
+  return __builtin_aarch64_btruncv8hf (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnda_f16 (float16x4_t __a)
 {
-  return (uint16x4_t) ((__a & __b) != 0);
+  return __builtin_aarch64_roundv4hf (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vtst_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndaq_f16 (float16x8_t __a)
 {
-  return (uint32x2_t) ((__a & __b) != 0);
+  return __builtin_aarch64_roundv8hf (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vtst_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndi_f16 (float16x4_t __a)
 {
-  return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0));
+  return __builtin_aarch64_nearbyintv4hf (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndiq_f16 (float16x8_t __a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_nearbyintv8hf (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndm_f16 (float16x4_t __a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_floorv4hf (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vtst_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndmq_f16 (float16x8_t __a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_floorv8hf (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vtst_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndn_f16 (float16x4_t __a)
 {
-  return ((__a & __b) != __AARCH64_UINT64_C (0));
+  return __builtin_aarch64_frintnv4hf (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndnq_f16 (float16x8_t __a)
 {
-  return (uint8x16_t) ((__a & __b) != 0);
+  return __builtin_aarch64_frintnv8hf (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndp_f16 (float16x4_t __a)
 {
-  return (uint16x8_t) ((__a & __b) != 0);
+  return __builtin_aarch64_ceilv4hf (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vtstq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndpq_f16 (float16x8_t __a)
 {
-  return (uint32x4_t) ((__a & __b) != 0);
+  return __builtin_aarch64_ceilv8hf (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vtstq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndx_f16 (float16x4_t __a)
 {
-  return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0));
+  return __builtin_aarch64_rintv4hf (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndxq_f16 (float16x8_t __a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_rintv8hf (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_f16 (float16x4_t a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_rsqrtev4hf (a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_f16 (float16x8_t a)
 {
-  return ((__a & __b) != 0);
+  return __builtin_aarch64_rsqrtev8hf (a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsqrt_f16 (float16x4_t a)
 {
-  return ((__a & __b) != __AARCH64_UINT64_C (0));
+  return __builtin_aarch64_sqrtv4hf (a);
 }

-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vtstd_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsqrtq_f16 (float16x8_t a)
 {
-  return (__a & __b) ? -1ll : 0ll;
+  return __builtin_aarch64_sqrtv8hf (a);
 }

-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vtstd_u64 (uint64_t __a, uint64_t __b)
+/* ARMv8.2-A FP16 two operands vector intrinsics.  */
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return (__a & __b) ? -1ll : 0ll;
+  return __a + __b;
 }

-/* vuqadd */
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __a + __b;
+}

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vuqadd_s8 (int8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabd_f16 (float16x4_t a, float16x4_t b)
 {
-  return __builtin_aarch64_suqaddv8qi_ssu (__a, __b);
+  return __builtin_aarch64_fabdv4hf (a, b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vuqadd_s16 (int16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabdq_f16 (float16x8_t a, float16x8_t b)
 {
-  return __builtin_aarch64_suqaddv4hi_ssu (__a, __b);
+  return __builtin_aarch64_fabdv8hf (a, b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vuqadd_s32 (int32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcage_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return __builtin_aarch64_suqaddv2si_ssu (__a, __b);
+  return __builtin_aarch64_facgev4hf_uss (__a, __b);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vuqadd_s64 (int64x1_t __a, uint64x1_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcageq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])};
+  return __builtin_aarch64_facgev8hf_uss (__a, __b);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vuqaddq_s8 (int8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcagt_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return __builtin_aarch64_suqaddv16qi_ssu (__a, __b);
+  return __builtin_aarch64_facgtv4hf_uss (__a, __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vuqaddq_s16 (int16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcagtq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_aarch64_suqaddv8hi_ssu (__a, __b);
+  return __builtin_aarch64_facgtv8hf_uss (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vuqaddq_s32 (int32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcale_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return __builtin_aarch64_suqaddv4si_ssu (__a, __b);
+  return __builtin_aarch64_faclev4hf_uss (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vuqaddq_s64 (int64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaleq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_aarch64_suqaddv2di_ssu (__a, __b);
+  return __builtin_aarch64_faclev8hf_uss (__a, __b);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vuqaddb_s8 (int8_t __a, uint8_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcalt_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return __builtin_aarch64_suqaddqi_ssu (__a, __b);
+  return __builtin_aarch64_facltv4hf_uss (__a, __b);
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vuqaddh_s16 (int16_t __a, uint16_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaltq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_aarch64_suqaddhi_ssu (__a, __b);
+  return __builtin_aarch64_facltv8hf_uss (__a, __b);
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vuqadds_s32 (int32_t __a, uint32_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_f16 (float16x4_t __a, float16x4_t __b)
 {
-  return __builtin_aarch64_suqaddsi_ssu (__a, __b);
+  return __builtin_aarch64_cmeqv4hf_uss (__a, __b);
 }

-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vuqaddd_s64 (int64_t __a, uint64_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_aarch64_suqadddi_ssu (__a, __b);
+  return __builtin_aarch64_cmeqv8hf_uss (__a, __b);
 }

-#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-  { \
-    return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-                      v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-  }
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_cmgev4hf_uss (__a, __b);
+}

-#define __INTERLEAVE_LIST(op) \
-  __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-  __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-  __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-  __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-  __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-  __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-  __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-  __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-  __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-  __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-  __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-  __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-  __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-  __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-  __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-  __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-  __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-  __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_aarch64_cmgev8hf_uss (__a, __b);
+}

-/* vuzp */
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_cmgtv4hf_uss (__a, __b);
+}

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vuzp1_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_cmgtv8hf_uss (__a, __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vuzp1_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_cmlev4hf_uss (__a, __b);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vuzp1_p16 (poly16x4_t __a, poly16x4_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcleq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_cmlev8hf_uss (__a, __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vuzp1_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclt_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_cmltv4hf_uss (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vuzp1_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_cmltv8hf_uss (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vuzp1_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_scvtfv4hi (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vuzp1_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_scvtfv8hi (__a, __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vuzp1_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_ucvtfv4hi_sus (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv8hi_sus (__a, __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_fcvtzsv4hf (__a, __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_fcvtzsv8hf (__a, __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vuzp1q_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_fcvtzuv4hf_uss (__a, __b);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-#endif
+  return __builtin_aarch64_fcvtzuv8hf_uss (__a, __b);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdiv_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __a / __b;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vuzp1q_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdivq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-#endif
+  return __a / __b;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vuzp1q_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_smax_nanv4hf (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vuzp1q_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_smax_nanv8hf (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vuzp1q_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_fmaxv4hf (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-#endif
+  return __builtin_aarch64_fmaxv8hf (__a, __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_smin_nanv4hf (__a, __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_smin_nanv8hf (__a, __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnm_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_fminv4hf (__a, __b);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vuzp2_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_fminv8hf (__a, __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vuzp2_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __a * __b;
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vuzp2_p16 (poly16x4_t __a, poly16x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-#endif
+  return __a * __b;
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vuzp2_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulx_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __builtin_aarch64_fmulxv4hf (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vuzp2_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulxq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-#endif
+  return __builtin_aarch64_fmulxv8hf (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vuzp2_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_faddpv4hf (a, b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vuzp2_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __builtin_aarch64_faddpv8hf (a, b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vuzp2_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmax_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-#endif
+  return __builtin_aarch64_smax_nanpv4hf (a, b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmaxq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_smax_nanpv8hf (a, b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmaxnm_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-#endif
+  return __builtin_aarch64_smaxpv4hf (a, b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vuzp2q_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmaxnmq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_smaxpv8hf (a, b);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmin_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-#endif
+  return __builtin_aarch64_smin_nanpv4hf (a, b);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpminq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __builtin_aarch64_smin_nanpv8hf (a, b);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vuzp2q_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpminnm_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-#else
-  return __builtin_shuffle (__a, __b,
-    (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-#endif
+  return __builtin_aarch64_sminpv4hf (a, b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vuzp2q_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpminnmq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __builtin_aarch64_sminpv8hf (a, b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vuzp2q_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecps_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-#endif
+  return __builtin_aarch64_frecpsv4hf (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vuzp2q_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-#endif
+  return __builtin_aarch64_frecpsv8hf (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrts_f16 (float16x4_t a, float16x4_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-#endif
+  return __builtin_aarch64_rsqrtsv4hf (a, b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsq_f16 (float16x8_t a, float16x8_t b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-#endif
+  return __builtin_aarch64_rsqrtsv8hf (a, b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsub_f16 (float16x4_t __a, float16x4_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-#endif
+  return __a - __b;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsubq_f16 (float16x8_t __a, float16x8_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-#endif
+  return __a - __b;
 }

-__INTERLEAVE_LIST (uzp)
-
-/* vzip */
+/* ARMv8.2-A FP16 three operands vector intrinsics.  */

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vzip1_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_fmav4hf (__b, __c, __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vzip1_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return __builtin_aarch64_fmav8hf (__b, __c, __a);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vzip1_p16 (poly16x4_t __a, poly16x4_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-#endif
+  return __builtin_aarch64_fnmav4hf (__b, __c, __a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vzip1_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return __builtin_aarch64_fnmav8hf (__b, __c, __a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vzip1_s16 (int16x4_t __a, int16x4_t __b)
-{
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-#endif
+/* ARMv8.2-A FP16 lane vector intrinsics.  */
+
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmah_lane_f16 (float16_t __a, float16_t __b,
+                float16x4_t __c, const int __lane)
+{
+  return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vzip1_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmah_laneq_f16 (float16_t __a, float16_t __b,
+                 float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vzip1_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
+               float16x4_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vzip1_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
+                float16x4_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-#endif
+  return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
+                float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vzip1q_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
+                 float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
-#endif
+  return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vzip1q_f64 (float64x2_t __a, float64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return vfma_f16 (__a, __b, vdup_n_f16 (__c));
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vzip1q_p8 (poly8x16_t __a, poly8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
-#endif
+  return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vzip1q_p16 (poly16x8_t __a, poly16x8_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsh_lane_f16 (float16_t __a, float16_t __b,
+                float16x4_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t)
-    {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vzip1q_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsh_laneq_f16 (float16_t __a, float16_t __b,
+                 float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
-#endif
+  return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vzip1q_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
+               float16x4_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t)
-    {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vzip1q_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
+                float16x4_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
-#endif
+  return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vzip1q_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
+                float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vzip1q_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
+                 float16x8_t __c, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x16_t)
-    {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
-#endif
+  return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vzip1q_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x8_t)
-    {12, 4, 13, 5, 14, 6, 15, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-#endif
+  return vfms_f16 (__a, __b, vdup_n_f16 (__c));
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vzip1q_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
-#endif
+  return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-#endif
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vzip2_f32 (float32x2_t __a, float32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-#endif
+  return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vzip2_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
-#endif
+  return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vzip2_p16 (poly16x4_t __a, poly16x4_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
-#endif
+  return __a * __aarch64_vget_lane_any (__b, __lane);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vzip2_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
-#endif
+  return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vzip2_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
-#endif
+  return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vzip2_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_f16 (float16x4_t __a, float16_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-#endif
+  return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vzip2_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_f16 (float16x8_t __a, float16_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
-#endif
+  return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vzip2_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulxh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane)
{ -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); -#endif + return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vzip2_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane)); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vzip2q_f32 (float32x4_t __a, float32x4_t __b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane)); } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vzip2q_f64 (float64x2_t __a, float64x2_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vzip2q_p8 (poly8x16_t __a, poly8x16_t __b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane)); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vzip2q_p16 (poly16x8_t __a, poly16x8_t __b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane)); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vzip2q_s8 (int8x16_t __a, int8x16_t __b) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_n_f16 (float16x4_t __a, float16_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 
25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return vmulx_f16 (__a, vdup_n_f16 (__b)); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vzip2q_s16 (int16x8_t __a, int16x8_t __b) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_n_f16 (float16x8_t __a, float16_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return vmulxq_f16 (__a, vdupq_n_f16 (__b)); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vzip2q_s32 (int32x4_t __a, int32x4_t __b) +/* ARMv8.2-A FP16 reduction vector intrinsics. */ + +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_f16 (float16x4_t __a) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_reduc_smax_nan_scal_v4hf (__a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vzip2q_s64 (int64x2_t __a, int64x2_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_f16 (float16x8_t __a) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_reduc_smax_nan_scal_v8hf (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vzip2q_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_f16 (float16x4_t __a) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return __builtin_aarch64_reduc_smin_nan_scal_v4hf (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vzip2q_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_f16 (float16x8_t __a) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_reduc_smin_nan_scal_v8hf (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vzip2q_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmv_f16 (float16x4_t __a) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_reduc_smax_scal_v4hf (__a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmvq_f16 (float16x8_t __a) { -#ifdef 
__AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_reduc_smax_scal_v8hf (__a); } -__INTERLEAVE_LIST (zip) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmv_f16 (float16x4_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v4hf (__a); +} -#undef __INTERLEAVE_LIST -#undef __DEFINTERLEAVE +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmvq_f16 (float16x8_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v8hf (__a); +} -/* End of optimal implementations in approved order. */ +#pragma GCC pop_options #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any +#undef __aarch64_vdup_lane_f16 #undef __aarch64_vdup_lane_f32 #undef __aarch64_vdup_lane_f64 #undef __aarch64_vdup_lane_p8 @@ -25780,6 +31558,7 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vdup_lane_u16 #undef __aarch64_vdup_lane_u32 #undef __aarch64_vdup_lane_u64 +#undef __aarch64_vdup_laneq_f16 #undef __aarch64_vdup_laneq_f32 #undef __aarch64_vdup_laneq_f64 #undef __aarch64_vdup_laneq_p8 @@ -25792,6 +31571,7 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vdup_laneq_u16 #undef __aarch64_vdup_laneq_u32 #undef __aarch64_vdup_laneq_u64 +#undef __aarch64_vdupq_lane_f16 #undef __aarch64_vdupq_lane_f32 #undef __aarch64_vdupq_lane_f64 #undef __aarch64_vdupq_lane_p8 @@ -25804,6 +31584,7 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vdupq_lane_u16 #undef __aarch64_vdupq_lane_u32 #undef __aarch64_vdupq_lane_u64 +#undef __aarch64_vdupq_laneq_f16 #undef __aarch64_vdupq_laneq_f32 #undef __aarch64_vdupq_laneq_f64 #undef __aarch64_vdupq_laneq_p8 @@ -25817,6 +31598,4 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64 -#pragma GCC pop_options - #endif --- a/src/gcc/config/aarch64/atomics.md +++ b/src/gcc/config/aarch64/atomics.md @@ -583,7 +583,7 @@ } ) -;; ARMv8.1 LSE instructions. +;; ARMv8.1-A LSE instructions. ;; Atomic swap with memory. (define_insn "aarch64_atomic_swp" --- a/src/gcc/config/aarch64/cortex-a57-fma-steering.c +++ b/src/gcc/config/aarch64/cortex-a57-fma-steering.c @@ -35,7 +35,6 @@ #include "context.h" #include "tree-pass.h" #include "regrename.h" -#include "cortex-a57-fma-steering.h" #include "aarch64-protos.h" /* For better performance, the destination of FMADD/FMSUB instructions should @@ -923,10 +922,10 @@ func_fma_steering::analyze () FOR_BB_INSNS (bb, insn) { operand_rr_info *dest_op_info; - struct du_chain *chain; + struct du_chain *chain = NULL; unsigned dest_regno; - fma_forest *forest; - du_head_p head; + fma_forest *forest = NULL; + du_head_p head = NULL; int i; if (!is_fmul_fmac_insn (insn, true)) @@ -1068,21 +1067,8 @@ public: /* Create a new fma steering pass instance. */ -static rtl_opt_pass * +rtl_opt_pass * make_pass_fma_steering (gcc::context *ctxt) { return new pass_fma_steering (ctxt); } - -/* Register the FMA steering pass to the pass manager. */ - -void -aarch64_register_fma_steering () -{ - opt_pass *pass_fma_steering = make_pass_fma_steering (g); - - struct register_pass_info fma_steering_info - = { pass_fma_steering, "rnreg", 1, PASS_POS_INSERT_AFTER }; - - register_pass (&fma_steering_info); -} --- a/src/gcc/config/aarch64/cortex-a57-fma-steering.h +++ /dev/null @@ -1,22 +0,0 @@ -/* This file contains declarations for the FMA steering optimization - pass for Cortex-A57.
- Copyright (C) 2015-2016 Free Software Foundation, Inc. - Contributed by ARM Ltd. - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - GCC is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - . */ - -void aarch64_register_fma_steering (void); --- a/src/gcc/config/aarch64/geniterators.sh +++ b/src/gcc/config/aarch64/geniterators.sh @@ -23,10 +23,7 @@ # BUILTIN_ macros, which expand to VAR Macros covering the # same set of modes as the iterator in iterators.md # -# Find the definitions (may span several lines), skip the ones -# which does not have a simple format because it contains characters we -# don't want to or can't handle (e.g P, PTR iterators change depending on -# Pmode and ptr_mode). +# Find the definitions (may span several lines). LC_ALL=C awk ' BEGIN { print "/* -*- buffer-read-only: t -*- */" @@ -49,12 +46,24 @@ iterdef { sub(/.*\(define_mode_iterator/, "", s) } -iterdef && s ~ /\)/ { +iterdef { + # Count the parentheses, the iterator definition ends + # if there are more closing ones than opening ones. + nopen = gsub(/\(/, "(", s) + nclose = gsub(/\)/, ")", s) + if (nopen >= nclose) + next + iterdef = 0 gsub(/[ \t]+/, " ", s) - sub(/ *\).*/, "", s) + sub(/ *\)[^)]*$/, "", s) sub(/^ /, "", s) + + # Drop the conditions. + gsub(/ *"[^"]*" *\)/, "", s) + gsub(/\( */, "", s) + if (s !~ /^[A-Za-z0-9_]+ \[[A-Z0-9 ]*\]$/) next sub(/\[ */, "", s) --- a/src/gcc/config/aarch64/iterators.md +++ b/src/gcc/config/aarch64/iterators.md @@ -26,6 +26,9 @@ ;; Iterator for General Purpose Integer registers (32- and 64-bit modes) (define_mode_iterator GPI [SI DI]) +;; Iterator for HI, SI, DI, some instructions can only work on these modes. +(define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI]) + ;; Iterator for QI and HI modes (define_mode_iterator SHORT [QI HI]) @@ -38,6 +41,9 @@ ;; Iterator for General Purpose Floating-point registers (32- and 64-bit modes) (define_mode_iterator GPF [SF DF]) +;; Iterator for all scalar floating point modes (HF, SF, DF) +(define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF]) + ;; Iterator for all scalar floating point modes (HF, SF, DF and TF) (define_mode_iterator GPF_TF_F16 [HF SF DF TF]) @@ -88,11 +94,22 @@ ;; Vector Float modes suitable for moving, loading and storing. (define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF]) -;; Vector Float modes, barring HF modes. +;; Vector Float modes. (define_mode_iterator VDQF [V2SF V4SF V2DF]) +(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF V2DF]) ;; Vector Float modes, and DF. (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) +(define_mode_iterator VHSDF_DF [(V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF V2DF DF]) +(define_mode_iterator VHSDF_HSDF [(V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF V2DF + (HF "TARGET_SIMD_F16INST") + SF DF]) ;; Vector single Float modes. (define_mode_iterator VDQSF [V2SF V4SF]) @@ -150,10 +167,30 @@ ;; Vector modes except double int. 
(define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +(define_mode_iterator VDQIF_F16 [V8QI V16QI V4HI V8HI V2SI V4SI + V4HF V8HF V2SF V4SF V2DF]) ;; Vector modes for S type. (define_mode_iterator VDQ_SI [V2SI V4SI]) +;; Vector modes for S and D +(define_mode_iterator VDQ_SDI [V2SI V4SI V2DI]) + +;; Vector modes for H, S and D +(define_mode_iterator VDQ_HSDI [(V4HI "TARGET_SIMD_F16INST") + (V8HI "TARGET_SIMD_F16INST") + V2SI V4SI V2DI]) + +;; Scalar and Vector modes for S and D +(define_mode_iterator VSDQ_SDI [V2SI V4SI V2DI SI DI]) + +;; Scalar and Vector modes for S and D, Vector modes for H. +(define_mode_iterator VSDQ_HSDI [(V4HI "TARGET_SIMD_F16INST") + (V8HI "TARGET_SIMD_F16INST") + V2SI V4SI V2DI + (HI "TARGET_SIMD_F16INST") + SI DI]) + ;; Vector modes for Q and H types. (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI]) @@ -193,7 +230,10 @@ (define_mode_iterator DX [DI DF]) ;; Modes available for mul lane operations. -(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI + (V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF V2DF]) ;; Modes available for mul lane operations changing lane count. (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF]) @@ -342,8 +382,8 @@ (define_mode_attr w [(QI "w") (HI "w") (SI "w") (DI "x") (SF "s") (DF "d")]) ;; For inequal width int to float conversion -(define_mode_attr w1 [(SF "w") (DF "x")]) -(define_mode_attr w2 [(SF "x") (DF "w")]) +(define_mode_attr w1 [(HF "w") (SF "w") (DF "x")]) +(define_mode_attr w2 [(HF "x") (SF "x") (DF "w")]) (define_mode_attr short_mask [(HI "65535") (QI "255")]) @@ -355,12 +395,13 @@ ;; For scalar usage of vector/FP registers (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d") - (SF "s") (DF "d") + (HF "h") (SF "s") (DF "d") (V8QI "") (V16QI "") (V4HI "") (V8HI "") (V2SI "") (V4SI "") (V2DI "") (V2SF "") - (V4SF "") (V2DF "")]) + (V4SF "") (V4HF "") + (V8HF "") (V2DF "")]) ;; For scalar usage of vector/FP registers, narrowing (define_mode_attr vn2 [(QI "") (HI "b") (SI "h") (DI "s") @@ -385,7 +426,7 @@ (define_mode_attr vas [(DI "") (SI ".2s")]) ;; Map a floating point mode to the appropriate register name prefix -(define_mode_attr s [(SF "s") (DF "d")]) +(define_mode_attr s [(HF "h") (SF "s") (DF "d")]) ;; Give the length suffix letter for a sign- or zero-extension. (define_mode_attr size [(QI "b") (HI "h") (SI "w")]) @@ -421,8 +462,8 @@ (V4SF ".4s") (V2DF ".2d") (DI "") (SI "") (HI "") (QI "") - (TI "") (SF "") - (DF "")]) + (TI "") (HF "") + (SF "") (DF "")]) ;; Register suffix narrowed modes for VQN. (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h") @@ -437,10 +478,21 @@ (V2DI "d") (V4HF "h") (V8HF "h") (V2SF "s") (V4SF "s") (V2DF "d") + (HF "h") (SF "s") (DF "d") (QI "b") (HI "h") (SI "s") (DI "d")]) +;; Vetype is used everywhere in scheduling type and assembly output, +;; sometimes they are not the same, for example HF modes on some +;; instructions. stype is defined to represent scheduling type +;; more accurately. +(define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s") + (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s") + (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") + (HF "s") (SF "s") (DF "d") (QI "b") (HI "s") + (SI "s") (DI "d")]) + ;; Mode-to-bitwise operation type mapping. 
(define_mode_attr Vbtype [(V8QI "8b") (V16QI "16b") (V4HI "8b") (V8HI "16b") @@ -598,7 +650,7 @@ (V4HF "V4HI") (V8HF "V8HI") (V2SF "V2SI") (V4SF "V4SI") (V2DF "V2DI") (DF "DI") - (SF "SI")]) + (SF "SI") (HF "HI")]) ;; Lower case mode of results of comparison operations. (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi") @@ -648,12 +700,21 @@ (define_mode_attr atomic_sfx [(QI "b") (HI "h") (SI "") (DI "")]) -(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") (SF "si") (DF "di")]) -(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") (SF "SI") (DF "DI")]) +(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") + (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf") + (SF "si") (DF "di") (SI "sf") (DI "df") + (V4HF "v4hi") (V8HF "v8hi") (V4HI "v4hf") + (V8HI "v8hf") (HF "hi") (HI "hf")]) +(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") + (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF") + (SF "SI") (DF "DI") (SI "SF") (DI "DF") + (V4HF "V4HI") (V8HF "V8HI") (V4HI "V4HF") + (V8HI "V8HF") (HF "HI") (HI "HF")]) + ;; for the inequal width integer to fp conversions -(define_mode_attr fcvt_iesize [(SF "di") (DF "si")]) -(define_mode_attr FCVT_IESIZE [(SF "DI") (DF "SI")]) +(define_mode_attr fcvt_iesize [(HF "di") (SF "di") (DF "si")]) +(define_mode_attr FCVT_IESIZE [(HF "DI") (SF "DI") (DF "SI")]) (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") (V4HI "V8HI") (V8HI "V4HI") @@ -676,6 +737,7 @@ ;; the 'x' constraint. All other modes may use the 'w' constraint. (define_mode_attr h_con [(V2SI "w") (V4SI "w") (V4HI "x") (V8HI "x") + (V4HF "w") (V8HF "w") (V2SF "w") (V4SF "w") (V2DF "w") (DF "w")]) @@ -684,6 +746,7 @@ (V4HI "") (V8HI "") (V2SI "") (V4SI "") (DI "") (V2DI "") + (V4HF "f") (V8HF "f") (V2SF "f") (V4SF "f") (V2DF "f") (DF "f")]) @@ -692,6 +755,7 @@ (V4HI "") (V8HI "") (V2SI "") (V4SI "") (DI "") (V2DI "") + (V4HF "_fp") (V8HF "_fp") (V2SF "_fp") (V4SF "_fp") (V2DF "_fp") (DF "_fp") (SF "_fp")]) @@ -704,17 +768,19 @@ (V4HF "") (V8HF "_q") (V2SF "") (V4SF "_q") (V2DF "_q") - (QI "") (HI "") (SI "") (DI "") (SF "") (DF "")]) + (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")]) (define_mode_attr vp [(V8QI "v") (V16QI "v") (V4HI "v") (V8HI "v") (V2SI "p") (V4SI "v") - (V2DI "p") (V2DF "p") - (V2SF "p") (V4SF "v")]) + (V2DI "p") (V2DF "p") + (V2SF "p") (V4SF "v") + (V4HF "v") (V8HF "v")]) (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) +;; Sum of lengths of instructions needed to move vector registers of a mode. (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) ;; -fpic small model GOT reloc modifers: gotpage_lo15/lo14 for ILP64/32. 
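;; The VHSDF-style iterators added above let one pattern cover the
;; ARMv8.2-A FP16 vector modes together with the existing SF/DF ones:
;; the (V4HF "TARGET_SIMD_F16INST") entries gate the half-precision
;; variants on the FP16 SIMD extension, and the new <stype> attribute
;; supplies the scheduling-type suffix where it differs from the
;; assembly suffix.  A minimal sketch of such a pattern, modelled on
;; the existing neg<mode>2 definition in aarch64-simd.md (illustrative
;; only, not part of this diff):
(define_insn "neg<mode>2"
  [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
  "TARGET_SIMD"
  "fneg\t%0.<Vtype>, %1.<Vtype>"
  [(set_attr "type" "neon_fp_neg_<stype><q>")]
)
;; For V8HF this emits "fneg %0.8h, %1.8h" and schedules as
;; neon_fp_neg_s_q; when TARGET_SIMD_F16INST is not enabled, the
;; V4HF/V8HF variants of the pattern simply do not exist.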
@@ -876,9 +942,6 @@ ;; Similar, but when not(op) (define_code_attr nlogical [(and "bic") (ior "orn") (xor "eon")]) -;; Sign- or zero-extending load -(define_code_attr ldrxt [(sign_extend "ldrs") (zero_extend "ldr")]) - ;; Sign- or zero-extending data-op (define_code_attr su [(sign_extend "s") (zero_extend "u") (sign_extract "s") (zero_extract "u") @@ -953,9 +1016,8 @@ (define_int_iterator ADDSUBHN2 [UNSPEC_ADDHN2 UNSPEC_RADDHN2 UNSPEC_SUBHN2 UNSPEC_RSUBHN2]) -(define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN]) - -(define_int_iterator FMAXMIN [UNSPEC_FMAXNM UNSPEC_FMINNM]) +(define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN + UNSPEC_FMAXNM UNSPEC_FMINNM]) (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH]) @@ -1001,6 +1063,9 @@ (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM UNSPEC_FRINTA UNSPEC_FRINTN]) +(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU]) +(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF]) + (define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX]) (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W @@ -1036,7 +1101,9 @@ (UNSPEC_FMAXV "smax_nan") (UNSPEC_FMIN "smin_nan") (UNSPEC_FMINNMV "smin") - (UNSPEC_FMINV "smin_nan")]) + (UNSPEC_FMINV "smin_nan") + (UNSPEC_FMAXNM "fmax") + (UNSPEC_FMINNM "fmin")]) (define_int_attr maxmin_uns_op [(UNSPEC_UMAXV "umax") (UNSPEC_UMINV "umin") @@ -1047,13 +1114,9 @@ (UNSPEC_FMAXV "fmax") (UNSPEC_FMIN "fmin") (UNSPEC_FMINNMV "fminnm") - (UNSPEC_FMINV "fmin")]) - -(define_int_attr fmaxmin [(UNSPEC_FMAXNM "fmax") - (UNSPEC_FMINNM "fmin")]) - -(define_int_attr fmaxmin_op [(UNSPEC_FMAXNM "fmaxnm") - (UNSPEC_FMINNM "fminnm")]) + (UNSPEC_FMINV "fmin") + (UNSPEC_FMAXNM "fmaxnm") + (UNSPEC_FMINNM "fminnm")]) (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u") (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur") @@ -1137,6 +1200,11 @@ (UNSPEC_FRINTP "ceil") (UNSPEC_FRINTM "floor") (UNSPEC_FRINTN "frintn")]) +(define_int_attr fcvt_fixed_insn [(UNSPEC_SCVTF "scvtf") + (UNSPEC_UCVTF "ucvtf") + (UNSPEC_FCVTZS "fcvtzs") + (UNSPEC_FCVTZU "fcvtzu")]) + (define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip") (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn") (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")]) --- a/src/gcc/config/aarch64/predicates.md +++ b/src/gcc/config/aarch64/predicates.md @@ -54,9 +54,9 @@ (match_test "op == const0_rtx")))) (define_predicate "aarch64_reg_or_fp_zero" - (and (match_code "reg,subreg,const_double") - (ior (match_operand 0 "register_operand") - (match_test "aarch64_float_const_zero_rtx_p (op)")))) + (ior (match_operand 0 "register_operand") + (and (match_code "const_double") + (match_test "aarch64_float_const_zero_rtx_p (op)")))) (define_predicate "aarch64_reg_zero_or_m1_or_1" (and (match_code "reg,subreg,const_int") --- a/src/gcc/config/aarch64/t-aarch64 +++ b/src/gcc/config/aarch64/t-aarch64 @@ -52,16 +52,17 @@ aarch-common.o: $(srcdir)/config/arm/aarch-common.c $(CONFIG_H) $(SYSTEM_H) \ $(srcdir)/config/arm/aarch-common.c aarch64-c.o: $(srcdir)/config/aarch64/aarch64-c.c $(CONFIG_H) $(SYSTEM_H) \ - coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) + coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) $(TARGET_H) $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/aarch64-c.c +PASSES_EXTRA += $(srcdir)/config/aarch64/aarch64-passes.def + cortex-a57-fma-steering.o: $(srcdir)/config/aarch64/cortex-a57-fma-steering.c \ $(CONFIG_H) $(SYSTEM_H) $(TM_H) $(REGS_H) insn-config.h $(RTL_BASE_H) \ dominance.h cfg.h 
cfganal.h $(BASIC_BLOCK_H) $(INSN_ATTR_H) $(RECOG_H) \ output.h hash-map.h $(DF_H) $(OBSTACK_H) $(TARGET_H) $(RTL_H) \ $(CONTEXT_H) $(TREE_PASS_H) regrename.h \ - $(srcdir)/config/aarch64/cortex-a57-fma-steering.h \ $(srcdir)/config/aarch64/aarch64-protos.h $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/cortex-a57-fma-steering.c --- a/src/gcc/config/aarch64/thunderx.md +++ b/src/gcc/config/aarch64/thunderx.md @@ -39,7 +39,7 @@ (define_insn_reservation "thunderx_shift" 1 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "bfm,extend,rotate_imm,shift_imm,shift_reg,rbit,rev")) + (eq_attr "type" "bfm,bfx,extend,rotate_imm,shift_imm,shift_reg,rbit,rev")) "thunderx_pipe0 | thunderx_pipe1") --- a/src/gcc/config/alpha/alpha.c +++ b/src/gcc/config/alpha/alpha.c @@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "df.h" #include "tm_p.h" --- a/src/gcc/config/arm/aarch-cost-tables.h +++ b/src/gcc/config/arm/aarch-cost-tables.h @@ -191,35 +191,35 @@ const struct cpu_cost_table cortexa53_extra_costs = { /* FP SFmode */ { - COSTS_N_INSNS (15), /* div. */ - COSTS_N_INSNS (3), /* mult. */ - COSTS_N_INSNS (7), /* mult_addsub. */ - COSTS_N_INSNS (7), /* fma. */ - COSTS_N_INSNS (3), /* addsub. */ - COSTS_N_INSNS (1), /* fpconst. */ - COSTS_N_INSNS (2), /* neg. */ - COSTS_N_INSNS (1), /* compare. */ - COSTS_N_INSNS (3), /* widen. */ - COSTS_N_INSNS (3), /* narrow. */ - COSTS_N_INSNS (3), /* toint. */ - COSTS_N_INSNS (3), /* fromint. */ - COSTS_N_INSNS (3) /* roundint. */ + COSTS_N_INSNS (5), /* div. */ + COSTS_N_INSNS (1), /* mult. */ + COSTS_N_INSNS (2), /* mult_addsub. */ + COSTS_N_INSNS (2), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + 0, /* fpconst. */ + COSTS_N_INSNS (1), /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ }, /* FP DFmode */ { - COSTS_N_INSNS (30), /* div. */ - COSTS_N_INSNS (3), /* mult. */ - COSTS_N_INSNS (7), /* mult_addsub. */ - COSTS_N_INSNS (7), /* fma. */ - COSTS_N_INSNS (3), /* addsub. */ - COSTS_N_INSNS (1), /* fpconst. */ - COSTS_N_INSNS (2), /* neg. */ - COSTS_N_INSNS (1), /* compare. */ - COSTS_N_INSNS (3), /* widen. */ - COSTS_N_INSNS (3), /* narrow. */ - COSTS_N_INSNS (3), /* toint. */ - COSTS_N_INSNS (3), /* fromint. */ - COSTS_N_INSNS (3) /* roundint. */ + COSTS_N_INSNS (10), /* div. */ + COSTS_N_INSNS (1), /* mult. */ + COSTS_N_INSNS (2), /* mult_addsub. */ + COSTS_N_INSNS (2), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + 0, /* fpconst. */ + COSTS_N_INSNS (1), /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ } }, /* Vector */ @@ -294,35 +294,35 @@ const struct cpu_cost_table cortexa57_extra_costs = { /* FP SFmode */ { - COSTS_N_INSNS (17), /* div. */ - COSTS_N_INSNS (5), /* mult. */ - COSTS_N_INSNS (9), /* mult_addsub. */ - COSTS_N_INSNS (9), /* fma. */ - COSTS_N_INSNS (4), /* addsub. */ - COSTS_N_INSNS (2), /* fpconst. */ - COSTS_N_INSNS (2), /* neg. */ - COSTS_N_INSNS (2), /* compare. */ - COSTS_N_INSNS (4), /* widen. */ - COSTS_N_INSNS (4), /* narrow. */ - COSTS_N_INSNS (4), /* toint. */ - COSTS_N_INSNS (4), /* fromint. */ - COSTS_N_INSNS (4) /* roundint. */ + COSTS_N_INSNS (6), /* div. 
*/ + COSTS_N_INSNS (1), /* mult. */ + COSTS_N_INSNS (2), /* mult_addsub. */ + COSTS_N_INSNS (2), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + 0, /* fpconst. */ + 0, /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ }, /* FP DFmode */ { - COSTS_N_INSNS (31), /* div. */ - COSTS_N_INSNS (5), /* mult. */ - COSTS_N_INSNS (9), /* mult_addsub. */ - COSTS_N_INSNS (9), /* fma. */ - COSTS_N_INSNS (4), /* addsub. */ - COSTS_N_INSNS (2), /* fpconst. */ - COSTS_N_INSNS (2), /* neg. */ - COSTS_N_INSNS (2), /* compare. */ - COSTS_N_INSNS (4), /* widen. */ - COSTS_N_INSNS (4), /* narrow. */ - COSTS_N_INSNS (4), /* toint. */ - COSTS_N_INSNS (4), /* fromint. */ - COSTS_N_INSNS (4) /* roundint. */ + COSTS_N_INSNS (11), /* div. */ + COSTS_N_INSNS (1), /* mult. */ + COSTS_N_INSNS (2), /* mult_addsub. */ + COSTS_N_INSNS (2), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + 0, /* fpconst. */ + 0, /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ } }, /* Vector */ @@ -537,4 +537,107 @@ const struct cpu_cost_table xgene1_extra_costs = } }; +const struct cpu_cost_table qdf24xx_extra_costs = +{ + /* ALU */ + { + 0, /* arith. */ + 0, /* logical. */ + 0, /* shift. */ + 0, /* shift_reg. */ + COSTS_N_INSNS (1), /* arith_shift. */ + COSTS_N_INSNS (1), /* arith_shift_reg. */ + 0, /* log_shift. */ + 0, /* log_shift_reg. */ + 0, /* extend. */ + 0, /* extend_arith. */ + 0, /* bfi. */ + 0, /* bfx. */ + 0, /* clz. */ + 0, /* rev. */ + 0, /* non_exec. */ + true /* non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (2), /* simple. */ + COSTS_N_INSNS (2), /* flag_setting. */ + COSTS_N_INSNS (2), /* extend. */ + COSTS_N_INSNS (2), /* add. */ + COSTS_N_INSNS (2), /* extend_add. */ + COSTS_N_INSNS (4) /* idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (3), /* simple. */ + 0, /* flag_setting (N/A). */ + COSTS_N_INSNS (3), /* extend. */ + COSTS_N_INSNS (3), /* add. */ + COSTS_N_INSNS (3), /* extend_add. */ + COSTS_N_INSNS (9) /* idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (2), /* load. */ + COSTS_N_INSNS (2), /* load_sign_extend. */ + COSTS_N_INSNS (2), /* ldrd. */ + COSTS_N_INSNS (2), /* ldm_1st. */ + 1, /* ldm_regs_per_insn_1st. */ + 2, /* ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (2), /* loadf. */ + COSTS_N_INSNS (2), /* loadd. */ + COSTS_N_INSNS (3), /* load_unaligned. */ + 0, /* store. */ + 0, /* strd. */ + 0, /* stm_1st. */ + 1, /* stm_regs_per_insn_1st. */ + 2, /* stm_regs_per_insn_subsequent. */ + 0, /* storef. */ + 0, /* stored. */ + COSTS_N_INSNS (1), /* store_unaligned. */ + COSTS_N_INSNS (1), /* loadv. */ + COSTS_N_INSNS (1) /* storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (6), /* div. */ + COSTS_N_INSNS (5), /* mult. */ + COSTS_N_INSNS (5), /* mult_addsub. */ + COSTS_N_INSNS (5), /* fma. */ + COSTS_N_INSNS (3), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + COSTS_N_INSNS (1), /* neg. */ + COSTS_N_INSNS (2), /* compare. */ + COSTS_N_INSNS (4), /* widen. */ + COSTS_N_INSNS (4), /* narrow. */ + COSTS_N_INSNS (4), /* toint. */ + COSTS_N_INSNS (4), /* fromint. */ + COSTS_N_INSNS (2) /* roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (11), /* div. */ + COSTS_N_INSNS (6), /* mult. */ + COSTS_N_INSNS (6), /* mult_addsub. */ + COSTS_N_INSNS (6), /* fma. 
*/ + COSTS_N_INSNS (3), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + COSTS_N_INSNS (1), /* neg. */ + COSTS_N_INSNS (2), /* compare. */ + COSTS_N_INSNS (4), /* widen. */ + COSTS_N_INSNS (4), /* narrow. */ + COSTS_N_INSNS (4), /* toint. */ + COSTS_N_INSNS (4), /* fromint. */ + COSTS_N_INSNS (2) /* roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (1) /* alu. */ + } +}; + #endif /* GCC_AARCH_COST_TABLES_H */ --- a/src/gcc/config/arm/arm-arches.def +++ b/src/gcc/config/arm/arm-arches.def @@ -58,10 +58,22 @@ ARM_ARCH("armv7e-m", cortexm4, 7EM, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_F ARM_ARCH("armv8-a", cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_FOR_ARCH8A)) ARM_ARCH("armv8-a+crc",cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A)) ARM_ARCH("armv8.1-a", cortexa53, 8A, - ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8A, FL2_FOR_ARCH8_1A)) + ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A, + FL2_FOR_ARCH8_1A)) ARM_ARCH("armv8.1-a+crc",cortexa53, 8A, ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A, FL2_FOR_ARCH8_1A)) +ARM_ARCH ("armv8.2-a", cortexa53, 8A, + ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A, + FL2_FOR_ARCH8_2A)) +ARM_ARCH ("armv8.2-a+fp16", cortexa53, 8A, + ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A, + FL2_FOR_ARCH8_2A | FL2_FP16INST)) +ARM_ARCH("armv8-m.base", cortexm23, 8M_BASE, + ARM_FSET_MAKE (FL_FOR_ARCH8M_BASE, FL2_CMSE)) +ARM_ARCH("armv8-m.main", cortexm7, 8M_MAIN, + ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8M_MAIN, FL2_CMSE)) +ARM_ARCH("armv8-m.main+dsp", cortexm33, 8M_MAIN, + ARM_FSET_MAKE (FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN, FL2_CMSE)) ARM_ARCH("iwmmxt", iwmmxt, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT)) ARM_ARCH("iwmmxt2", iwmmxt2, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT | FL_IWMMXT2)) - --- a/src/gcc/config/arm/arm-builtins.c +++ b/src/gcc/config/arm/arm-builtins.c @@ -190,6 +190,8 @@ arm_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define ti_UP TImode #define ei_UP EImode #define oi_UP OImode +#define hf_UP HFmode +#define si_UP SImode #define UP(X) X##_UP @@ -239,12 +241,22 @@ typedef struct { VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, L) -/* The NEON builtin data can be found in arm_neon_builtins.def. - The mode entries in the following table correspond to the "key" type of the - instruction variant, i.e. equivalent to that which would be specified after - the assembler mnemonic, which usually refers to the last vector operand. - The modes listed per instruction should be the same as those defined for - that instruction's pattern in neon.md. */ +/* The NEON builtin data can be found in arm_neon_builtins.def and + arm_vfp_builtins.def. The entries in arm_neon_builtins.def require + TARGET_NEON to be true. The feature tests are checked when the + builtins are expanded. + + The mode entries in the following table correspond to the "key" + type of the instruction variant, i.e. equivalent to that which + would be specified after the assembler mnemonic, which usually + refers to the last vector operand. The modes listed per + instruction should be the same as those defined for that + instruction's pattern in neon.md. 
*/ + +static neon_builtin_datum vfp_builtin_data[] = +{ +#include "arm_vfp_builtins.def" +}; static neon_builtin_datum neon_builtin_data[] = { @@ -515,6 +527,8 @@ enum arm_builtins ARM_BUILTIN_GET_FPSCR, ARM_BUILTIN_SET_FPSCR, + ARM_BUILTIN_CMSE_NONSECURE_CALLER, + #undef CRYPTO1 #undef CRYPTO2 #undef CRYPTO3 @@ -534,6 +548,10 @@ enum arm_builtins #undef CRYPTO2 #undef CRYPTO3 + ARM_BUILTIN_VFP_BASE, + +#include "arm_vfp_builtins.def" + ARM_BUILTIN_NEON_BASE, ARM_BUILTIN_NEON_LANE_CHECK = ARM_BUILTIN_NEON_BASE, @@ -542,8 +560,11 @@ enum arm_builtins ARM_BUILTIN_MAX }; +#define ARM_BUILTIN_VFP_PATTERN_START \ + (ARM_BUILTIN_VFP_BASE + 1) + #define ARM_BUILTIN_NEON_PATTERN_START \ - (ARM_BUILTIN_MAX - ARRAY_SIZE (neon_builtin_data)) + (ARM_BUILTIN_NEON_BASE + 1) #undef CF #undef VAR1 @@ -895,6 +916,110 @@ arm_init_simd_builtin_scalar_types (void) "__builtin_neon_uti"); } +/* Set up a NEON builtin. */ + +static void +arm_init_neon_builtin (unsigned int fcode, + neon_builtin_datum *d) +{ + bool print_type_signature_p = false; + char type_signature[SIMD_MAX_BUILTIN_ARGS] = { 0 }; + char namebuf[60]; + tree ftype = NULL; + tree fndecl = NULL; + + d->fcode = fcode; + + /* We must track two variables here. op_num is + the operand number as in the RTL pattern. This is + required to access the mode (e.g. V4SF mode) of the + argument, from which the base type can be derived. + arg_num is an index in to the qualifiers data, which + gives qualifiers to the type (e.g. const unsigned). + The reason these two variables may differ by one is the + void return type. While all return types take the 0th entry + in the qualifiers array, there is no operand for them in the + RTL pattern. */ + int op_num = insn_data[d->code].n_operands - 1; + int arg_num = d->qualifiers[0] & qualifier_void + ? op_num + 1 + : op_num; + tree return_type = void_type_node, args = void_list_node; + tree eltype; + + /* Build a function type directly from the insn_data for this + builtin. The build_function_type () function takes care of + removing duplicates for us. */ + for (; op_num >= 0; arg_num--, op_num--) + { + machine_mode op_mode = insn_data[d->code].operand[op_num].mode; + enum arm_type_qualifiers qualifiers = d->qualifiers[arg_num]; + + if (qualifiers & qualifier_unsigned) + { + type_signature[arg_num] = 'u'; + print_type_signature_p = true; + } + else if (qualifiers & qualifier_poly) + { + type_signature[arg_num] = 'p'; + print_type_signature_p = true; + } + else + type_signature[arg_num] = 's'; + + /* Skip an internal operand for vget_{low, high}. */ + if (qualifiers & qualifier_internal) + continue; + + /* Some builtins have different user-facing types + for certain arguments, encoded in d->mode. */ + if (qualifiers & qualifier_map_mode) + op_mode = d->mode; + + /* For pointers, we want a pointer to the basic type + of the vector. */ + if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode)) + op_mode = GET_MODE_INNER (op_mode); + + eltype = arm_simd_builtin_type + (op_mode, + (qualifiers & qualifier_unsigned) != 0, + (qualifiers & qualifier_poly) != 0); + gcc_assert (eltype != NULL); + + /* Add qualifiers. */ + if (qualifiers & qualifier_const) + eltype = build_qualified_type (eltype, TYPE_QUAL_CONST); + + if (qualifiers & qualifier_pointer) + eltype = build_pointer_type (eltype); + + /* If we have reached arg_num == 0, we are at a non-void + return type. Otherwise, we are still processing + arguments. 
*/ + if (arg_num == 0) + return_type = eltype; + else + args = tree_cons (NULL_TREE, eltype, args); + } + + ftype = build_function_type (return_type, args); + + gcc_assert (ftype != NULL); + + if (print_type_signature_p) + snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s_%s", + d->name, type_signature); + else + snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s", + d->name); + + fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD, + NULL, NULL_TREE); + arm_builtin_decls[fcode] = fndecl; +} + /* Set up all the NEON builtins, even builtins for instructions that are not in the current target ISA to allow the user to compile particular modules with different target specific options that differ from the command line @@ -924,103 +1049,22 @@ arm_init_neon_builtins (void) for (i = 0; i < ARRAY_SIZE (neon_builtin_data); i++, fcode++) { - bool print_type_signature_p = false; - char type_signature[SIMD_MAX_BUILTIN_ARGS] = { 0 }; neon_builtin_datum *d = &neon_builtin_data[i]; - char namebuf[60]; - tree ftype = NULL; - tree fndecl = NULL; - - d->fcode = fcode; - - /* We must track two variables here. op_num is - the operand number as in the RTL pattern. This is - required to access the mode (e.g. V4SF mode) of the - argument, from which the base type can be derived. - arg_num is an index in to the qualifiers data, which - gives qualifiers to the type (e.g. const unsigned). - The reason these two variables may differ by one is the - void return type. While all return types take the 0th entry - in the qualifiers array, there is no operand for them in the - RTL pattern. */ - int op_num = insn_data[d->code].n_operands - 1; - int arg_num = d->qualifiers[0] & qualifier_void - ? op_num + 1 - : op_num; - tree return_type = void_type_node, args = void_list_node; - tree eltype; - - /* Build a function type directly from the insn_data for this - builtin. The build_function_type () function takes care of - removing duplicates for us. */ - for (; op_num >= 0; arg_num--, op_num--) - { - machine_mode op_mode = insn_data[d->code].operand[op_num].mode; - enum arm_type_qualifiers qualifiers = d->qualifiers[arg_num]; - - if (qualifiers & qualifier_unsigned) - { - type_signature[arg_num] = 'u'; - print_type_signature_p = true; - } - else if (qualifiers & qualifier_poly) - { - type_signature[arg_num] = 'p'; - print_type_signature_p = true; - } - else - type_signature[arg_num] = 's'; - - /* Skip an internal operand for vget_{low, high}. */ - if (qualifiers & qualifier_internal) - continue; - - /* Some builtins have different user-facing types - for certain arguments, encoded in d->mode. */ - if (qualifiers & qualifier_map_mode) - op_mode = d->mode; - - /* For pointers, we want a pointer to the basic type - of the vector. */ - if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode)) - op_mode = GET_MODE_INNER (op_mode); - - eltype = arm_simd_builtin_type - (op_mode, - (qualifiers & qualifier_unsigned) != 0, - (qualifiers & qualifier_poly) != 0); - gcc_assert (eltype != NULL); - - /* Add qualifiers. */ - if (qualifiers & qualifier_const) - eltype = build_qualified_type (eltype, TYPE_QUAL_CONST); - - if (qualifiers & qualifier_pointer) - eltype = build_pointer_type (eltype); - - /* If we have reached arg_num == 0, we are at a non-void - return type. Otherwise, we are still processing - arguments. 
*/ - if (arg_num == 0) - return_type = eltype; - else - args = tree_cons (NULL_TREE, eltype, args); - } - - ftype = build_function_type (return_type, args); + arm_init_neon_builtin (fcode, d); + } +} - gcc_assert (ftype != NULL); +/* Set up all the scalar floating point builtins. */ - if (print_type_signature_p) - snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s_%s", - d->name, type_signature); - else - snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s", - d->name); +static void +arm_init_vfp_builtins (void) +{ + unsigned int i, fcode = ARM_BUILTIN_VFP_PATTERN_START; - fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD, - NULL, NULL_TREE); - arm_builtin_decls[fcode] = fndecl; + for (i = 0; i < ARRAY_SIZE (vfp_builtin_data); i++, fcode++) + { + neon_builtin_datum *d = &vfp_builtin_data[i]; + arm_init_neon_builtin (fcode, d); } } @@ -1768,14 +1812,14 @@ arm_init_builtins (void) if (TARGET_HARD_FLOAT) { arm_init_neon_builtins (); - + arm_init_vfp_builtins (); arm_init_crypto_builtins (); } if (TARGET_CRC32) arm_init_crc32_builtins (); - if (TARGET_VFP && TARGET_HARD_FLOAT) + if (TARGET_HARD_FLOAT) { tree ftype_set_fpscr = build_function_type_list (void_type_node, unsigned_type_node, NULL); @@ -1789,6 +1833,17 @@ arm_init_builtins (void) = add_builtin_function ("__builtin_arm_stfscr", ftype_set_fpscr, ARM_BUILTIN_SET_FPSCR, BUILT_IN_MD, NULL, NULL_TREE); } + + if (use_cmse) + { + tree ftype_cmse_nonsecure_caller + = build_function_type_list (unsigned_type_node, NULL); + arm_builtin_decls[ARM_BUILTIN_CMSE_NONSECURE_CALLER] + = add_builtin_function ("__builtin_arm_cmse_nonsecure_caller", + ftype_cmse_nonsecure_caller, + ARM_BUILTIN_CMSE_NONSECURE_CALLER, BUILT_IN_MD, + NULL, NULL_TREE); + } } /* Return the ARM builtin for CODE. */ @@ -2211,40 +2266,16 @@ constant_arg: return target; } -/* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds. - Most of these are "special" because they don't have symbolic - constants defined per-instruction or per instruction-variant. Instead, the - required info is looked up in the table neon_builtin_data. */ +/* Expand a neon builtin. This is also used for vfp builtins, which behave in + the same way. These builtins are "special" because they don't have symbolic + constants defined per-instruction or per instruction-variant. Instead, the + required info is looked up in the NEON_BUILTIN_DATA record that is passed + into the function. */ + static rtx -arm_expand_neon_builtin (int fcode, tree exp, rtx target) +arm_expand_neon_builtin_1 (int fcode, tree exp, rtx target, + neon_builtin_datum *d) { - /* Check in the context of the function making the call whether the - builtin is supported. */ - if (! TARGET_NEON) - { - fatal_error (input_location, - "You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use these intrinsics."); - return const0_rtx; - } - - if (fcode == ARM_BUILTIN_NEON_LANE_CHECK) - { - /* Builtin is only to check bounds of the lane passed to some intrinsics - that are implemented with gcc vector extensions in arm_neon.h. */ - - tree nlanes = CALL_EXPR_ARG (exp, 0); - gcc_assert (TREE_CODE (nlanes) == INTEGER_CST); - rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1)); - if (CONST_INT_P (lane_idx)) - neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp); - else - error ("%Klane index must be a constant immediate", exp); - /* Don't generate any RTL. 
*/ - return const0_rtx; - } - - neon_builtin_datum *d = - &neon_builtin_data[fcode - ARM_BUILTIN_NEON_PATTERN_START]; enum insn_code icode = d->code; builtin_arg args[SIMD_MAX_BUILTIN_ARGS + 1]; int num_args = insn_data[d->code].n_operands; @@ -2260,8 +2291,8 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target) /* We have four arrays of data, each indexed in a different fashion. qualifiers - element 0 always describes the function return type. operands - element 0 is either the operand for return value (if - the function has a non-void return type) or the operand for the - first argument. + the function has a non-void return type) or the operand for the + first argument. expr_args - element 0 always holds the first argument. args - element 0 is always used for the return type. */ int qualifiers_k = k; @@ -2283,7 +2314,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target) bool op_const_int_p = (CONST_INT_P (arg) && (*insn_data[icode].operand[operands_k].predicate) - (arg, insn_data[icode].operand[operands_k].mode)); + (arg, insn_data[icode].operand[operands_k].mode)); args[k] = op_const_int_p ? NEON_ARG_CONSTANT : NEON_ARG_COPY_TO_REG; } else if (d->qualifiers[qualifiers_k] & qualifier_pointer) @@ -2296,8 +2327,68 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target) /* The interface to arm_expand_neon_args expects a 0 if the function is void, and a 1 if it is not. */ return arm_expand_neon_args - (target, d->mode, fcode, icode, !is_void, exp, - &args[1]); + (target, d->mode, fcode, icode, !is_void, exp, + &args[1]); +} + +/* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds. + Most of these are "special" because they don't have symbolic + constants defined per-instruction or per instruction-variant. Instead, the + required info is looked up in the table neon_builtin_data. */ + +static rtx +arm_expand_neon_builtin (int fcode, tree exp, rtx target) +{ + if (fcode >= ARM_BUILTIN_NEON_BASE && ! TARGET_NEON) + { + fatal_error (input_location, + "You must enable NEON instructions" + " (e.g. -mfloat-abi=softfp -mfpu=neon)" + " to use these intrinsics."); + return const0_rtx; + } + + if (fcode == ARM_BUILTIN_NEON_LANE_CHECK) + { + /* Builtin is only to check bounds of the lane passed to some intrinsics + that are implemented with gcc vector extensions in arm_neon.h. */ + + tree nlanes = CALL_EXPR_ARG (exp, 0); + gcc_assert (TREE_CODE (nlanes) == INTEGER_CST); + rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1)); + if (CONST_INT_P (lane_idx)) + neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp); + else + error ("%Klane index must be a constant immediate", exp); + /* Don't generate any RTL. */ + return const0_rtx; + } + + neon_builtin_datum *d + = &neon_builtin_data[fcode - ARM_BUILTIN_NEON_PATTERN_START]; + + return arm_expand_neon_builtin_1 (fcode, exp, target, d); +} + +/* Expand a VFP builtin. These builtins are treated like + neon builtins except that the data is looked up in table + VFP_BUILTIN_DATA. */ + +static rtx +arm_expand_vfp_builtin (int fcode, tree exp, rtx target) +{ + if (fcode >= ARM_BUILTIN_VFP_BASE && ! 
TARGET_HARD_FLOAT) + { + fatal_error (input_location, + "You must enable VFP instructions" + " to use these intrinsics."); + return const0_rtx; + } + + neon_builtin_datum *d + = &vfp_builtin_data[fcode - ARM_BUILTIN_VFP_PATTERN_START]; + + return arm_expand_neon_builtin_1 (fcode, exp, target, d); } /* Expand an expression EXP that calls a built-in function, @@ -2337,13 +2428,18 @@ arm_expand_builtin (tree exp, if (fcode >= ARM_BUILTIN_NEON_BASE) return arm_expand_neon_builtin (fcode, exp, target); + if (fcode >= ARM_BUILTIN_VFP_BASE) + return arm_expand_vfp_builtin (fcode, exp, target); + /* Check in the context of the function making the call whether the builtin is supported. */ if (fcode >= ARM_BUILTIN_CRYPTO_BASE && (!TARGET_CRYPTO || !TARGET_HARD_FLOAT)) { fatal_error (input_location, - "You must enable crypto intrinsics (e.g. include -mfloat-abi=softfp -mfpu=crypto-neon...) to use these intrinsics."); + "You must enable crypto instructions" + " (e.g. include -mfloat-abi=softfp -mfpu=crypto-neon...)" + " to use these intrinsics."); return const0_rtx; } @@ -2368,6 +2464,12 @@ arm_expand_builtin (tree exp, emit_insn (pat); return target; + case ARM_BUILTIN_CMSE_NONSECURE_CALLER: + target = gen_reg_rtx (SImode); + op0 = arm_return_addr (0, NULL_RTX); + emit_insn (gen_addsi3 (target, op0, const1_rtx)); + return target; + case ARM_BUILTIN_TEXTRMSB: case ARM_BUILTIN_TEXTRMUB: case ARM_BUILTIN_TEXTRMSH: @@ -2995,7 +3097,7 @@ arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) tree new_fenv_var, reload_fenv, restore_fnenv; tree update_call, atomic_feraiseexcept, hold_fnclex; - if (!TARGET_VFP || !TARGET_HARD_FLOAT) + if (!TARGET_HARD_FLOAT) return; /* Generate the equivalent of : --- a/src/gcc/config/arm/arm-c.c +++ b/src/gcc/config/arm/arm-c.c @@ -76,6 +76,14 @@ arm_cpu_builtins (struct cpp_reader* pfile) def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT); + if (arm_arch8 && !arm_arch_notm) + { + if (arm_arch_cmse && use_cmse) + builtin_define_with_int_value ("__ARM_FEATURE_CMSE", 3); + else + builtin_define ("__ARM_FEATURE_CMSE"); + } + if (TARGET_ARM_FEATURE_LDREX) builtin_define_with_int_value ("__ARM_FEATURE_LDREX", TARGET_ARM_FEATURE_LDREX); @@ -86,6 +94,9 @@ arm_cpu_builtins (struct cpp_reader* pfile) ((TARGET_ARM_ARCH >= 5 && !TARGET_THUMB) || TARGET_ARM_ARCH_ISA_THUMB >=2)); + def_or_undef_macro (pfile, "__ARM_FEATURE_NUMERIC_MAXMIN", + TARGET_ARM_ARCH >= 8 && TARGET_NEON && TARGET_FPU_ARMV8); + def_or_undef_macro (pfile, "__ARM_FEATURE_SIMD32", TARGET_INT_SIMD); builtin_define_with_int_value ("__ARM_SIZEOF_MINIMAL_ENUM", @@ -128,17 +139,24 @@ arm_cpu_builtins (struct cpp_reader* pfile) if (TARGET_SOFT_FLOAT) builtin_define ("__SOFTFP__"); - def_or_undef_macro (pfile, "__VFP_FP__", TARGET_VFP); + builtin_define ("__VFP_FP__"); if (TARGET_ARM_FP) builtin_define_with_int_value ("__ARM_FP", TARGET_ARM_FP); else cpp_undef (pfile, "__ARM_FP"); - if (arm_fp16_format == ARM_FP16_FORMAT_IEEE) - builtin_define ("__ARM_FP16_FORMAT_IEEE"); - if (arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE) - builtin_define ("__ARM_FP16_FORMAT_ALTERNATIVE"); + def_or_undef_macro (pfile, "__ARM_FP16_FORMAT_IEEE", + arm_fp16_format == ARM_FP16_FORMAT_IEEE); + def_or_undef_macro (pfile, "__ARM_FP16_FORMAT_ALTERNATIVE", + arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE); + def_or_undef_macro (pfile, "__ARM_FP16_ARGS", + arm_fp16_format != ARM_FP16_FORMAT_NONE); + + def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", + TARGET_VFP_FP16INST); + def_or_undef_macro (pfile, 
"__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", + TARGET_NEON_FP16INST); def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA); def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON); --- a/src/gcc/config/arm/arm-cores.def +++ b/src/gcc/config/arm/arm-cores.def @@ -166,15 +166,21 @@ ARM_CORE("cortex-a15.cortex-a7", cortexa15cortexa7, cortexa7, 7A, ARM_FSET_MAKE_ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7, 7A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_THUMB_DIV | FL_ARM_DIV | FL_FOR_ARCH7A), cortex_a12) /* V8 Architecture Processors */ +ARM_CORE("cortex-m23", cortexm23, cortexm23, 8M_BASE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8M_BASE), v6m) ARM_CORE("cortex-a32", cortexa32, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35) +ARM_CORE("cortex-m33", cortexm33, cortexm33, 8M_MAIN, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN), v7m) ARM_CORE("cortex-a35", cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35) ARM_CORE("cortex-a53", cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53) ARM_CORE("cortex-a57", cortexa57, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57) ARM_CORE("cortex-a72", cortexa72, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57) +ARM_CORE("cortex-a73", cortexa73, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73) ARM_CORE("exynos-m1", exynosm1, exynosm1, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1) -ARM_CORE("qdf24xx", qdf24xx, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57) +ARM_CORE("qdf24xx", qdf24xx, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), qdf24xx) ARM_CORE("xgene1", xgene1, xgene1, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A), xgene1) /* V8 big.LITTLE implementations */ ARM_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57) ARM_CORE("cortex-a72.cortex-a53", cortexa72cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57) +ARM_CORE("cortex-a73.cortex-a35", cortexa73cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73) +ARM_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73) + --- /dev/null +++ b/src/gcc/config/arm/arm-flags.h @@ -0,0 +1,212 @@ +/* Flags used to identify the presence of processor capabilities. + + Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_ARM_FLAGS_H +#define GCC_ARM_FLAGS_H + +/* Flags used to identify the presence of processor capabilities. 
*/ + +/* Bit values used to identify processor capabilities. */ +#define FL_NONE (0U) /* No flags. */ +#define FL_ANY (0xffffffffU) /* All flags. */ +#define FL_CO_PROC (1U << 0) /* Has external co-processor bus. */ +#define FL_ARCH3M (1U << 1) /* Extended multiply. */ +#define FL_MODE26 (1U << 2) /* 26-bit mode support. */ +#define FL_MODE32 (1U << 3) /* 32-bit mode support. */ +#define FL_ARCH4 (1U << 4) /* Architecture rel 4. */ +#define FL_ARCH5 (1U << 5) /* Architecture rel 5. */ +#define FL_THUMB (1U << 6) /* Thumb aware. */ +#define FL_LDSCHED (1U << 7) /* Load scheduling necessary. */ +#define FL_STRONG (1U << 8) /* StrongARM. */ +#define FL_ARCH5E (1U << 9) /* DSP extensions to v5. */ +#define FL_XSCALE (1U << 10) /* XScale. */ +/* spare (1U << 11) */ +#define FL_ARCH6 (1U << 12) /* Architecture rel 6. Adds + media instructions. */ +#define FL_VFPV2 (1U << 13) /* Vector Floating Point V2. */ +#define FL_WBUF (1U << 14) /* Schedule for write buffer ops. + Note: ARM6 & 7 derivatives only. */ +#define FL_ARCH6K (1U << 15) /* Architecture rel 6 K extensions. */ +#define FL_THUMB2 (1U << 16) /* Thumb-2. */ +#define FL_NOTM (1U << 17) /* Instructions not present in the 'M' + profile. */ +#define FL_THUMB_DIV (1U << 18) /* Hardware divide (Thumb mode). */ +#define FL_VFPV3 (1U << 19) /* Vector Floating Point V3. */ +#define FL_NEON (1U << 20) /* Neon instructions. */ +#define FL_ARCH7EM (1U << 21) /* Instructions present in the ARMv7E-M + architecture. */ +#define FL_ARCH7 (1U << 22) /* Architecture 7. */ +#define FL_ARM_DIV (1U << 23) /* Hardware divide (ARM mode). */ +#define FL_ARCH8 (1U << 24) /* Architecture 8. */ +#define FL_CRC32 (1U << 25) /* ARMv8 CRC32 instructions. */ +#define FL_SMALLMUL (1U << 26) /* Small multiply supported. */ +#define FL_NO_VOLATILE_CE (1U << 27) /* No volatile memory in IT block. */ + +#define FL_IWMMXT (1U << 29) /* XScale v2 or "Intel Wireless MMX + technology". */ +#define FL_IWMMXT2 (1U << 30) /* "Intel Wireless MMX2 + technology". */ +#define FL_ARCH6KZ (1U << 31) /* ARMv6KZ architecture. */ + +#define FL2_ARCH8_1 (1U << 0) /* Architecture 8.1. */ +#define FL2_ARCH8_2 (1U << 1) /* Architecture 8.2. */ +#define FL2_FP16INST (1U << 2) /* FP16 Instructions for ARMv8.2 and + later. */ +#define FL2_CMSE (1U << 3) /* ARMv8-M Security Extensions. */ + +/* Flags that only effect tuning, not available instructions. 
*/ +#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \ + | FL_CO_PROC) + +#define FL_FOR_ARCH2 FL_NOTM +#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32) +#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M) +#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4) +#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB) +#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5) +#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB) +#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E) +#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB) +#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE +#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6) +#define FL_FOR_ARCH6J FL_FOR_ARCH6 +#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K) +#define FL_FOR_ARCH6Z FL_FOR_ARCH6 +#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K +#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ) +#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) +#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) +#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7) +#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) +#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV) +#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV) +#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV) +#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM) +#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8) +#define FL2_FOR_ARCH8_1A FL2_ARCH8_1 +#define FL2_FOR_ARCH8_2A (FL2_FOR_ARCH8_1A | FL2_ARCH8_2) +#define FL_FOR_ARCH8M_BASE (FL_FOR_ARCH6M | FL_ARCH8 | FL_THUMB_DIV) +#define FL_FOR_ARCH8M_MAIN (FL_FOR_ARCH7M | FL_ARCH8) + +/* There are too many feature bits to fit in a single word so the set of cpu and + fpu capabilities is a structure. A feature set is created and manipulated + with the ARM_FSET macros. */ + +typedef struct +{ + unsigned cpu[2]; +} arm_feature_set; + + +/* Initialize a feature set. */ + +#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } } + +#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE)) +#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2)) + +/* Accessors. */ + +#define ARM_FSET_CPU1(S) ((S).cpu[0]) +#define ARM_FSET_CPU2(S) ((S).cpu[1]) + +/* Useful combinations. */ + +#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE) +#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY) + +/* Tests for a specific CPU feature. */ + +#define ARM_FSET_HAS_CPU1(A, F) \ + (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F))) +#define ARM_FSET_HAS_CPU2(A, F) \ + (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F))) +#define ARM_FSET_HAS_CPU(A, F1, F2) \ + (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2))) + +/* Add a feature to a feature set. */ + +#define ARM_FSET_ADD_CPU1(DST, F) \ + do { \ + (DST).cpu[0] |= (F); \ + } while (0) + +#define ARM_FSET_ADD_CPU2(DST, F) \ + do { \ + (DST).cpu[1] |= (F); \ + } while (0) + +/* Remove a feature from a feature set. */ + +#define ARM_FSET_DEL_CPU1(DST, F) \ + do { \ + (DST).cpu[0] &= ~(F); \ + } while (0) + +#define ARM_FSET_DEL_CPU2(DST, F) \ + do { \ + (DST).cpu[1] &= ~(F); \ + } while (0) + +/* Union of feature sets. */ + +#define ARM_FSET_UNION(DST,F1,F2) \ + do { \ + (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \ + (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \ + } while (0) + +/* Intersection of feature sets. */ + +#define ARM_FSET_INTER(DST,F1,F2) \ + do { \ + (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \ + (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \ + } while (0) + +/* Exclusive disjunction. 
*/ + +#define ARM_FSET_XOR(DST,F1,F2) \ + do { \ + (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \ + (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \ + } while (0) + +/* Difference of feature sets: F1 excluding the elements of F2. */ + +#define ARM_FSET_EXCLUDE(DST,F1,F2) \ + do { \ + (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \ + (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \ + } while (0) + +/* Test for an empty feature set. */ + +#define ARM_FSET_IS_EMPTY(A) \ + (!((A).cpu[0]) && !((A).cpu[1])) + +/* Tests whether the cpu features of A are a subset of B. */ + +#define ARM_FSET_CPU_SUBSET(A,B) \ + ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \ + && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1])) + +#endif /* GCC_ARM_FLAGS_H */ --- a/src/gcc/config/arm/arm-fpus.def +++ b/src/gcc/config/arm/arm-fpus.def @@ -19,30 +19,31 @@ /* Before using #include to read this file, define a macro: - ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES) + ARM_FPU(NAME, REV, VFP_REGS, FEATURES) The arguments are the fields of struct arm_fpu_desc. genopt.sh assumes no whitespace up to the first "," in each entry. */ -ARM_FPU("vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, FPU_FL_NONE) -ARM_FPU("vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE) -ARM_FPU("vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_FP16) -ARM_FPU("vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_NONE) -ARM_FPU("vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_FP16) -ARM_FPU("vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_NONE) -ARM_FPU("vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_FP16) -ARM_FPU("neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON) -ARM_FPU("neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) -ARM_FPU("vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_FP16) -ARM_FPU("vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, FPU_FL_FP16) -ARM_FPU("fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, FPU_FL_FP16) -ARM_FPU("fpv5-sp-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_SINGLE, FPU_FL_FP16) -ARM_FPU("fpv5-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_D16, FPU_FL_FP16) -ARM_FPU("neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) -ARM_FPU("fp-armv8", ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_FP16) -ARM_FPU("neon-fp-armv8",ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) -ARM_FPU("crypto-neon-fp-armv8", - ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO) +ARM_FPU("vfp", 2, VFP_REG_D16, FPU_FL_NONE) +ARM_FPU("vfpv2", 2, VFP_REG_D16, FPU_FL_NONE) +ARM_FPU("vfpv3", 3, VFP_REG_D32, FPU_FL_NONE) +ARM_FPU("vfpv3-fp16", 3, VFP_REG_D32, FPU_FL_FP16) +ARM_FPU("vfpv3-d16", 3, VFP_REG_D16, FPU_FL_NONE) +ARM_FPU("vfpv3-d16-fp16", 3, VFP_REG_D16, FPU_FL_FP16) +ARM_FPU("vfpv3xd", 3, VFP_REG_SINGLE, FPU_FL_NONE) +ARM_FPU("vfpv3xd-fp16", 3, VFP_REG_SINGLE, FPU_FL_FP16) +ARM_FPU("neon", 3, VFP_REG_D32, FPU_FL_NEON) +ARM_FPU("neon-vfpv3", 3, VFP_REG_D32, FPU_FL_NEON) +ARM_FPU("neon-fp16", 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) +ARM_FPU("vfpv4", 4, VFP_REG_D32, FPU_FL_FP16) +ARM_FPU("vfpv4-d16", 4, VFP_REG_D16, FPU_FL_FP16) +ARM_FPU("fpv4-sp-d16", 4, VFP_REG_SINGLE, FPU_FL_FP16) +ARM_FPU("fpv5-sp-d16", 5, VFP_REG_SINGLE, FPU_FL_FP16) +ARM_FPU("fpv5-d16", 5, VFP_REG_D16, FPU_FL_FP16) +ARM_FPU("neon-vfpv4", 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) +ARM_FPU("fp-armv8", 8, VFP_REG_D32, FPU_FL_FP16) +ARM_FPU("neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16) +ARM_FPU("crypto-neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO) /* Compatibility aliases. 
*/ -ARM_FPU("vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE) +ARM_FPU("vfp3", 3, VFP_REG_D32, FPU_FL_NONE) --- a/src/gcc/config/arm/arm-modes.def +++ b/src/gcc/config/arm/arm-modes.def @@ -59,6 +59,7 @@ CC_MODE (CC_DGEU); CC_MODE (CC_DGTU); CC_MODE (CC_C); CC_MODE (CC_N); +CC_MODE (CC_V); /* Vector modes. */ VECTOR_MODES (INT, 4); /* V4QI V2HI */ --- a/src/gcc/config/arm/arm-opts.h +++ b/src/gcc/config/arm/arm-opts.h @@ -25,6 +25,8 @@ #ifndef ARM_OPTS_H #define ARM_OPTS_H +#include "arm-flags.h" + /* The various ARM cores. */ enum processor_type { --- a/src/gcc/config/arm/arm-protos.h +++ b/src/gcc/config/arm/arm-protos.h @@ -22,6 +22,8 @@ #ifndef GCC_ARM_PROTOS_H #define GCC_ARM_PROTOS_H +#include "arm-flags.h" + extern enum unwind_info_type arm_except_unwind_info (struct gcc_options *); extern int use_return_insn (int, rtx); extern bool use_simple_return_p (void); @@ -31,6 +33,7 @@ extern int arm_volatile_func (void); extern void arm_expand_prologue (void); extern void arm_expand_epilogue (bool); extern void arm_declare_function_name (FILE *, const char *, tree); +extern void arm_asm_declare_function_name (FILE *, const char *, tree); extern void thumb2_expand_return (bool); extern const char *arm_strip_name_encoding (const char *); extern void arm_asm_output_labelref (FILE *, const char *); @@ -50,8 +53,12 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED); extern void arm_init_builtins (void); extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update); - +extern rtx arm_simd_vect_par_cnst_half (machine_mode mode, bool high); +extern bool arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode, + bool high); #ifdef RTX_CODE +extern void arm_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode, + rtx label_ref); extern bool arm_vector_mode_supported_p (machine_mode); extern bool arm_small_register_classes_for_mode_p (machine_mode); extern int arm_hard_regno_mode_ok (unsigned int, machine_mode); @@ -130,6 +137,7 @@ extern int arm_const_double_inline_cost (rtx); extern bool arm_const_double_by_parts (rtx); extern bool arm_const_double_by_immediates (rtx); extern void arm_emit_call_insn (rtx, rtx, bool); +bool detect_cmse_nonsecure_call (tree); extern const char *output_call (rtx *); void arm_emit_movpair (rtx, rtx); extern const char *output_mov_long_double_arm_from_arm (rtx *); @@ -161,6 +169,7 @@ extern const char *arm_output_iwmmxt_shift_immediate (const char *, rtx *, bool) extern const char *arm_output_iwmmxt_tinsr (rtx *); extern unsigned int arm_sync_loop_insns (rtx , rtx *); extern int arm_attr_length_push_multi(rtx, rtx); +extern int arm_attr_length_pop_multi(rtx *, bool, bool); extern void arm_expand_compare_and_swap (rtx op[]); extern void arm_split_compare_and_swap (rtx op[]); extern void arm_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx); @@ -192,7 +201,6 @@ extern const char *thumb_call_via_reg (rtx); extern void thumb_expand_movmemqi (rtx *); extern rtx arm_return_addr (int, rtx); extern void thumb_reload_out_hi (rtx *); -extern void thumb_reload_in_hi (rtx *); extern void thumb_set_return_address (rtx, rtx); extern const char *thumb1_output_casesi (rtx *); extern const char *thumb2_output_casesi (rtx *); @@ -256,7 +264,6 @@ struct cpu_cost_table; struct tune_params { - bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool); const struct cpu_cost_table *insn_extra_cost; bool (*sched_adjust_cost) (rtx_insn *, rtx, rtx_insn *, int *); int (*branch_cost) (bool, bool); @@ -319,6 +326,7 @@ 
extern int vfp3_const_double_for_bits (rtx); extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, rtx); +extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); #endif /* RTX_CODE */ @@ -344,184 +352,6 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *); extern bool arm_is_constant_pool_ref (rtx); -/* Flags used to identify the presence of processor capabilities. */ - -/* Bit values used to identify processor capabilities. */ -#define FL_NONE (0) /* No flags. */ -#define FL_ANY (0xffffffff) /* All flags. */ -#define FL_CO_PROC (1 << 0) /* Has external co-processor bus */ -#define FL_ARCH3M (1 << 1) /* Extended multiply */ -#define FL_MODE26 (1 << 2) /* 26-bit mode support */ -#define FL_MODE32 (1 << 3) /* 32-bit mode support */ -#define FL_ARCH4 (1 << 4) /* Architecture rel 4 */ -#define FL_ARCH5 (1 << 5) /* Architecture rel 5 */ -#define FL_THUMB (1 << 6) /* Thumb aware */ -#define FL_LDSCHED (1 << 7) /* Load scheduling necessary */ -#define FL_STRONG (1 << 8) /* StrongARM */ -#define FL_ARCH5E (1 << 9) /* DSP extensions to v5 */ -#define FL_XSCALE (1 << 10) /* XScale */ -/* spare (1 << 11) */ -#define FL_ARCH6 (1 << 12) /* Architecture rel 6. Adds - media instructions. */ -#define FL_VFPV2 (1 << 13) /* Vector Floating Point V2. */ -#define FL_WBUF (1 << 14) /* Schedule for write buffer ops. - Note: ARM6 & 7 derivatives only. */ -#define FL_ARCH6K (1 << 15) /* Architecture rel 6 K extensions. */ -#define FL_THUMB2 (1 << 16) /* Thumb-2. */ -#define FL_NOTM (1 << 17) /* Instructions not present in the 'M' - profile. */ -#define FL_THUMB_DIV (1 << 18) /* Hardware divide (Thumb mode). */ -#define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */ -#define FL_NEON (1 << 20) /* Neon instructions. */ -#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M - architecture. */ -#define FL_ARCH7 (1 << 22) /* Architecture 7. */ -#define FL_ARM_DIV (1 << 23) /* Hardware divide (ARM mode). */ -#define FL_ARCH8 (1 << 24) /* Architecture 8. */ -#define FL_CRC32 (1 << 25) /* ARMv8 CRC32 instructions. */ - -#define FL_SMALLMUL (1 << 26) /* Small multiply supported. */ -#define FL_NO_VOLATILE_CE (1 << 27) /* No volatile memory in IT block. */ - -#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */ -#define FL_IWMMXT2 (1 << 30) /* "Intel Wireless MMX2 technology". */ -#define FL_ARCH6KZ (1 << 31) /* ARMv6KZ architecture. */ - -#define FL2_ARCH8_1 (1 << 0) /* Architecture 8.1. */ - -/* Flags that only effect tuning, not available instructions. 
*/ -#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \ - | FL_CO_PROC) - -#define FL_FOR_ARCH2 FL_NOTM -#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32) -#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M) -#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4) -#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB) -#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5) -#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB) -#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E) -#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB) -#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE -#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6) -#define FL_FOR_ARCH6J FL_FOR_ARCH6 -#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K) -#define FL_FOR_ARCH6Z FL_FOR_ARCH6 -#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ) -#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) -#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) -#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7) -#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) -#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV) -#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV) -#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV) -#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM) -#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8) -#define FL2_FOR_ARCH8_1A FL2_ARCH8_1 - -/* There are too many feature bits to fit in a single word so the set of cpu and - fpu capabilities is a structure. A feature set is created and manipulated - with the ARM_FSET macros. */ - -typedef struct -{ - unsigned long cpu[2]; -} arm_feature_set; - - -/* Initialize a feature set. */ - -#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } } - -#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE)) -#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2)) - -/* Accessors. */ - -#define ARM_FSET_CPU1(S) ((S).cpu[0]) -#define ARM_FSET_CPU2(S) ((S).cpu[1]) - -/* Useful combinations. */ - -#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE) -#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY) - -/* Tests for a specific CPU feature. */ - -#define ARM_FSET_HAS_CPU1(A, F) \ - (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F))) -#define ARM_FSET_HAS_CPU2(A, F) \ - (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F))) -#define ARM_FSET_HAS_CPU(A, F1, F2) \ - (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2))) - -/* Add a feature to a feature set. */ - -#define ARM_FSET_ADD_CPU1(DST, F) \ - do { \ - (DST).cpu[0] |= (F); \ - } while (0) - -#define ARM_FSET_ADD_CPU2(DST, F) \ - do { \ - (DST).cpu[1] |= (F); \ - } while (0) - -/* Remove a feature from a feature set. */ - -#define ARM_FSET_DEL_CPU1(DST, F) \ - do { \ - (DST).cpu[0] &= ~(F); \ - } while (0) - -#define ARM_FSET_DEL_CPU2(DST, F) \ - do { \ - (DST).cpu[1] &= ~(F); \ - } while (0) - -/* Union of feature sets. */ - -#define ARM_FSET_UNION(DST,F1,F2) \ - do { \ - (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \ - (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \ - } while (0) - -/* Intersection of feature sets. */ - -#define ARM_FSET_INTER(DST,F1,F2) \ - do { \ - (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \ - (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \ - } while (0) - -/* Exclusive disjunction. */ - -#define ARM_FSET_XOR(DST,F1,F2) \ - do { \ - (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \ - (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \ - } while (0) - -/* Difference of feature sets: F1 excluding the elements of F2. 
*/ - -#define ARM_FSET_EXCLUDE(DST,F1,F2) \ - do { \ - (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \ - (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \ - } while (0) - -/* Test for an empty feature set. */ - -#define ARM_FSET_IS_EMPTY(A) \ - (!((A).cpu[0]) && !((A).cpu[1])) - -/* Tests whether the cpu features of A are a subset of B. */ - -#define ARM_FSET_CPU_SUBSET(A,B) \ - ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \ - && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1])) - /* The bits in this mask specify which instructions we are allowed to generate. */ extern arm_feature_set insn_flags; @@ -601,6 +431,9 @@ extern int arm_tune_cortex_a9; interworking clean. */ extern int arm_cpp_interwork; +/* Nonzero if chip supports Thumb 1. */ +extern int arm_arch_thumb1; + /* Nonzero if chip supports Thumb 2. */ extern int arm_arch_thumb2; --- a/src/gcc/config/arm/arm-tables.opt +++ b/src/gcc/config/arm/arm-tables.opt @@ -307,9 +307,15 @@ EnumValue Enum(processor_type) String(cortex-a17.cortex-a7) Value(cortexa17cortexa7) EnumValue +Enum(processor_type) String(cortex-m23) Value(cortexm23) + +EnumValue Enum(processor_type) String(cortex-a32) Value(cortexa32) EnumValue +Enum(processor_type) String(cortex-m33) Value(cortexm33) + +EnumValue Enum(processor_type) String(cortex-a35) Value(cortexa35) EnumValue @@ -322,6 +328,9 @@ EnumValue Enum(processor_type) String(cortex-a72) Value(cortexa72) EnumValue +Enum(processor_type) String(cortex-a73) Value(cortexa73) + +EnumValue Enum(processor_type) String(exynos-m1) Value(exynosm1) EnumValue @@ -336,6 +345,12 @@ Enum(processor_type) String(cortex-a57.cortex-a53) Value(cortexa57cortexa53) EnumValue Enum(processor_type) String(cortex-a72.cortex-a53) Value(cortexa72cortexa53) +EnumValue +Enum(processor_type) String(cortex-a73.cortex-a35) Value(cortexa73cortexa35) + +EnumValue +Enum(processor_type) String(cortex-a73.cortex-a53) Value(cortexa73cortexa53) + Enum Name(arm_arch) Type(int) Known ARM architectures (for use with the -march= option): @@ -428,10 +443,25 @@ EnumValue Enum(arm_arch) String(armv8.1-a+crc) Value(28) EnumValue -Enum(arm_arch) String(iwmmxt) Value(29) +Enum(arm_arch) String(armv8.2-a) Value(29) + +EnumValue +Enum(arm_arch) String(armv8.2-a+fp16) Value(30) EnumValue -Enum(arm_arch) String(iwmmxt2) Value(30) +Enum(arm_arch) String(armv8-m.base) Value(31) + +EnumValue +Enum(arm_arch) String(armv8-m.main) Value(32) + +EnumValue +Enum(arm_arch) String(armv8-m.main+dsp) Value(33) + +EnumValue +Enum(arm_arch) String(iwmmxt) Value(34) + +EnumValue +Enum(arm_arch) String(iwmmxt2) Value(35) Enum Name(arm_fpu) Type(int) @@ -441,56 +471,62 @@ EnumValue Enum(arm_fpu) String(vfp) Value(0) EnumValue -Enum(arm_fpu) String(vfpv3) Value(1) +Enum(arm_fpu) String(vfpv2) Value(1) + +EnumValue +Enum(arm_fpu) String(vfpv3) Value(2) + +EnumValue +Enum(arm_fpu) String(vfpv3-fp16) Value(3) EnumValue -Enum(arm_fpu) String(vfpv3-fp16) Value(2) +Enum(arm_fpu) String(vfpv3-d16) Value(4) EnumValue -Enum(arm_fpu) String(vfpv3-d16) Value(3) +Enum(arm_fpu) String(vfpv3-d16-fp16) Value(5) EnumValue -Enum(arm_fpu) String(vfpv3-d16-fp16) Value(4) +Enum(arm_fpu) String(vfpv3xd) Value(6) EnumValue -Enum(arm_fpu) String(vfpv3xd) Value(5) +Enum(arm_fpu) String(vfpv3xd-fp16) Value(7) EnumValue -Enum(arm_fpu) String(vfpv3xd-fp16) Value(6) +Enum(arm_fpu) String(neon) Value(8) EnumValue -Enum(arm_fpu) String(neon) Value(7) +Enum(arm_fpu) String(neon-vfpv3) Value(9) EnumValue -Enum(arm_fpu) String(neon-fp16) Value(8) +Enum(arm_fpu) String(neon-fp16) Value(10) EnumValue -Enum(arm_fpu) String(vfpv4) 
Value(9) +Enum(arm_fpu) String(vfpv4) Value(11) EnumValue -Enum(arm_fpu) String(vfpv4-d16) Value(10) +Enum(arm_fpu) String(vfpv4-d16) Value(12) EnumValue -Enum(arm_fpu) String(fpv4-sp-d16) Value(11) +Enum(arm_fpu) String(fpv4-sp-d16) Value(13) EnumValue -Enum(arm_fpu) String(fpv5-sp-d16) Value(12) +Enum(arm_fpu) String(fpv5-sp-d16) Value(14) EnumValue -Enum(arm_fpu) String(fpv5-d16) Value(13) +Enum(arm_fpu) String(fpv5-d16) Value(15) EnumValue -Enum(arm_fpu) String(neon-vfpv4) Value(14) +Enum(arm_fpu) String(neon-vfpv4) Value(16) EnumValue -Enum(arm_fpu) String(fp-armv8) Value(15) +Enum(arm_fpu) String(fp-armv8) Value(17) EnumValue -Enum(arm_fpu) String(neon-fp-armv8) Value(16) +Enum(arm_fpu) String(neon-fp-armv8) Value(18) EnumValue -Enum(arm_fpu) String(crypto-neon-fp-armv8) Value(17) +Enum(arm_fpu) String(crypto-neon-fp-armv8) Value(19) EnumValue -Enum(arm_fpu) String(vfp3) Value(18) +Enum(arm_fpu) String(vfp3) Value(20) --- a/src/gcc/config/arm/arm-tune.md +++ b/src/gcc/config/arm/arm-tune.md @@ -32,8 +32,10 @@ cortexr4f,cortexr5,cortexr7, cortexr8,cortexm7,cortexm4, cortexm3,marvell_pj4,cortexa15cortexa7, - cortexa17cortexa7,cortexa32,cortexa35, - cortexa53,cortexa57,cortexa72, + cortexa17cortexa7,cortexm23,cortexa32, + cortexm33,cortexa35,cortexa53, + cortexa57,cortexa72,cortexa73, exynosm1,qdf24xx,xgene1, - cortexa57cortexa53,cortexa72cortexa53" + cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35, + cortexa73cortexa53" (const (symbol_ref "((enum attr_tune) arm_tune)"))) --- a/src/gcc/config/arm/arm.c +++ b/src/gcc/config/arm/arm.c @@ -27,6 +27,7 @@ #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "cfghooks.h" #include "df.h" #include "tm_p.h" @@ -61,6 +62,7 @@ #include "builtins.h" #include "tm-constrs.h" #include "rtl-iter.h" +#include "gimplify.h" /* This file should be included last. 
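
   Illustrative aside, not part of the patch: the Enum/EnumValue records in
   the arm-tables.opt hunks above are what the option machinery uses to map
   a -mfpu=/-march= string to an index, which is why inserting vfpv2 and
   neon-vfpv3 pushes every later Value() number up. A small sketch of the
   equivalent lookup over an abbreviated table:

   #include <stdio.h>
   #include <string.h>

   struct enum_value { const char *name; int value; };

   // Abbreviated arm_fpu table as it stands after this patch.
   static const struct enum_value arm_fpu_tbl[] = {
     { "vfp", 0 }, { "vfpv2", 1 }, { "vfpv3", 2 }, { "neon", 8 },
   };

   static int
   lookup (const struct enum_value *tbl, size_t n, const char *arg)
   {
     for (size_t i = 0; i < n; i++)
       if (strcmp (tbl[i].name, arg) == 0)
         return tbl[i].value;
     return -1; // unknown argument; the driver diagnoses this case
   }

   int
   main (void)
   {
     printf ("%d\n", lookup (arm_fpu_tbl, 4, "vfpv2")); // 1: newly inserted
     printf ("%d\n", lookup (arm_fpu_tbl, 4, "neon"));  // 8: shifted up
     return 0;
   }
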
*/ #include "target-def.h" @@ -104,7 +106,6 @@ static void arm_print_operand_address (FILE *, machine_mode, rtx); static bool arm_print_operand_punct_valid_p (unsigned char code); static const char *fp_const_from_val (REAL_VALUE_TYPE *); static arm_cc get_arm_condition_code (rtx); -static HOST_WIDE_INT int_log2 (HOST_WIDE_INT); static const char *output_multi_immediate (rtx *, const char *, const char *, int, HOST_WIDE_INT); static const char *shift_op (rtx, HOST_WIDE_INT *); @@ -135,6 +136,8 @@ static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *); #if TARGET_DLLIMPORT_DECL_ATTRIBUTES static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *); #endif +static tree arm_handle_cmse_nonsecure_entry (tree *, tree, tree, int, bool *); +static tree arm_handle_cmse_nonsecure_call (tree *, tree, tree, int, bool *); static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT); static void arm_output_function_prologue (FILE *, HOST_WIDE_INT); static int arm_comp_type_attributes (const_tree, const_tree); @@ -164,12 +167,6 @@ static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, static bool arm_have_conditional_execution (void); static bool arm_cannot_force_const_mem (machine_mode, rtx); static bool arm_legitimate_constant_p (machine_mode, rtx); -static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool); -static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *); -static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); -static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); -static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); -static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); static bool arm_rtx_costs (rtx, machine_mode, int, int, int *, bool); static int arm_address_cost (rtx, machine_mode, addr_space_t, bool); static int arm_register_move_cost (machine_mode, reg_class_t, reg_class_t); @@ -249,8 +246,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; static bool arm_output_addr_const_extra (FILE *, rtx); static bool arm_allocate_stack_slots_for_args (void); static bool arm_warn_func_return (tree); -static const char *arm_invalid_parameter_type (const_tree t); -static const char *arm_invalid_return_type (const_tree t); static tree arm_promoted_type (const_tree t); static tree arm_convert_to_type (tree type, tree expr); static bool arm_scalar_mode_supported_p (machine_mode); @@ -300,6 +295,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1, static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void); static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*); +static bool arm_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT, + const_tree); + /* Table of machine attributes. */ static const struct attribute_spec arm_attribute_table[] = @@ -343,6 +341,11 @@ static const struct attribute_spec arm_attribute_table[] = { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute, false }, #endif + /* ARMv8-M Security Extensions support. 
*/ + { "cmse_nonsecure_entry", 0, 0, true, false, false, + arm_handle_cmse_nonsecure_entry, false }, + { "cmse_nonsecure_call", 0, 0, true, false, false, + arm_handle_cmse_nonsecure_call, true }, { NULL, 0, 0, false, false, false, NULL, false } }; @@ -463,7 +466,7 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK -#define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall +#define TARGET_ASM_CAN_OUTPUT_MI_THUNK arm_can_output_mi_thunk #undef TARGET_RTX_COSTS #define TARGET_RTX_COSTS arm_rtx_costs @@ -654,12 +657,6 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_PREFERRED_RELOAD_CLASS #define TARGET_PREFERRED_RELOAD_CLASS arm_preferred_reload_class -#undef TARGET_INVALID_PARAMETER_TYPE -#define TARGET_INVALID_PARAMETER_TYPE arm_invalid_parameter_type - -#undef TARGET_INVALID_RETURN_TYPE -#define TARGET_INVALID_RETURN_TYPE arm_invalid_return_type - #undef TARGET_PROMOTED_TYPE #define TARGET_PROMOTED_TYPE arm_promoted_type @@ -820,6 +817,13 @@ int arm_arch8 = 0; /* Nonzero if this chip supports the ARMv8.1 extensions. */ int arm_arch8_1 = 0; +/* Nonzero if this chip supports the ARM Architecture 8.2 extensions. */ +int arm_arch8_2 = 0; + +/* Nonzero if this chip supports the FP16 instructions extension of ARM + Architecture 8.2. */ +int arm_fp16_inst = 0; + /* Nonzero if this chip can benefit from load scheduling. */ int arm_ld_sched = 0; @@ -852,6 +856,9 @@ int arm_tune_cortex_a9 = 0; interworking clean. */ int arm_cpp_interwork = 0; +/* Nonzero if chip supports Thumb 1. */ +int arm_arch_thumb1; + /* Nonzero if chip supports Thumb 2. */ int arm_arch_thumb2; @@ -892,6 +899,9 @@ int arm_condexec_masklen = 0; /* Nonzero if chip supports the ARMv8 CRC instructions. */ int arm_arch_crc = 0; +/* Nonzero if chip supports the ARMv8-M security extensions. */ +int arm_arch_cmse = 0; + /* Nonzero if the core has a very small, high-latency, multiply unit. */ int arm_m_profile_small_mul = 0; @@ -1684,8 +1694,7 @@ const struct cpu_cost_table v7m_extra_costs = const struct tune_params arm_slowmul_tune = { - arm_slowmul_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, @@ -1707,8 +1716,7 @@ const struct tune_params arm_slowmul_tune = const struct tune_params arm_fastmul_tune = { - arm_fastmul_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, @@ -1733,8 +1741,7 @@ const struct tune_params arm_fastmul_tune = const struct tune_params arm_strongarm_tune = { - arm_fastmul_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, @@ -1756,8 +1763,7 @@ const struct tune_params arm_strongarm_tune = const struct tune_params arm_xscale_tune = { - arm_xscale_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ xscale_sched_adjust_cost, arm_default_branch_cost, &arm_default_vec_cost, @@ -1779,8 +1785,7 @@ const struct tune_params arm_xscale_tune = const struct tune_params arm_9e_tune = { - arm_9e_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. 
*/ arm_default_branch_cost, &arm_default_vec_cost, @@ -1802,8 +1807,7 @@ const struct tune_params arm_9e_tune = const struct tune_params arm_marvell_pj4_tune = { - arm_9e_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, @@ -1825,8 +1829,7 @@ const struct tune_params arm_marvell_pj4_tune = const struct tune_params arm_v6t2_tune = { - arm_9e_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, @@ -1850,7 +1853,6 @@ const struct tune_params arm_v6t2_tune = /* Generic Cortex tuning. Use more specific tunings if appropriate. */ const struct tune_params arm_cortex_tune = { - arm_9e_rtx_costs, &generic_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1873,7 +1875,6 @@ const struct tune_params arm_cortex_tune = const struct tune_params arm_cortex_a8_tune = { - arm_9e_rtx_costs, &cortexa8_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1896,7 +1897,6 @@ const struct tune_params arm_cortex_a8_tune = const struct tune_params arm_cortex_a7_tune = { - arm_9e_rtx_costs, &cortexa7_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1919,7 +1919,6 @@ const struct tune_params arm_cortex_a7_tune = const struct tune_params arm_cortex_a15_tune = { - arm_9e_rtx_costs, &cortexa15_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1942,7 +1941,6 @@ const struct tune_params arm_cortex_a15_tune = const struct tune_params arm_cortex_a35_tune = { - arm_9e_rtx_costs, &cortexa53_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1965,7 +1963,6 @@ const struct tune_params arm_cortex_a35_tune = const struct tune_params arm_cortex_a53_tune = { - arm_9e_rtx_costs, &cortexa53_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -1988,7 +1985,6 @@ const struct tune_params arm_cortex_a53_tune = const struct tune_params arm_cortex_a57_tune = { - arm_9e_rtx_costs, &cortexa57_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -2011,7 +2007,6 @@ const struct tune_params arm_cortex_a57_tune = const struct tune_params arm_exynosm1_tune = { - arm_9e_rtx_costs, &exynosm1_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -2034,7 +2029,6 @@ const struct tune_params arm_exynosm1_tune = const struct tune_params arm_xgene1_tune = { - arm_9e_rtx_costs, &xgene1_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -2055,12 +2049,33 @@ const struct tune_params arm_xgene1_tune = tune_params::SCHED_AUTOPREF_OFF }; +const struct tune_params arm_qdf24xx_tune = +{ + &qdf24xx_extra_costs, + NULL, /* Scheduler cost adjustment. */ + arm_default_branch_cost, + &arm_default_vec_cost, /* Vectorizer costs. */ + 1, /* Constant limit. */ + 2, /* Max cond insns. */ + 8, /* Memset max inline. */ + 4, /* Issue rate. */ + ARM_PREFETCH_BENEFICIAL (0, -1, 64), + tune_params::PREF_CONST_POOL_FALSE, + tune_params::PREF_LDRD_TRUE, + tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* Thumb. */ + tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* ARM. */ + tune_params::DISPARAGE_FLAGS_ALL, + tune_params::PREF_NEON_64_FALSE, + tune_params::PREF_NEON_STRINGOPS_TRUE, + FUSE_OPS (tune_params::FUSE_MOVW_MOVT), + tune_params::SCHED_AUTOPREF_FULL +}; + /* Branches can be dual-issued on Cortex-A5, so conditional execution is less appealing. Set max_insns_skipped to a low value. 
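
   Illustrative aside, not part of the patch: the FUSE_OPS (...) initializers
   in the new qdf24xx table above (and the cortex-a73 table below) are bit
   masks consumed through arm_fusion_enabled_p, declared in the arm-protos.h
   hunk earlier. A stand-alone sketch of that pattern with locally invented
   names:

   #include <stdio.h>

   enum fuse_ops // mirrors the shape of tune_params::fuse_ops
   {
     FUSE_NOTHING = 0,
     FUSE_MOVW_MOVT = 1U << 0, // fuse movw/movt immediate pairs
     FUSE_AES_AESMC = 1U << 1  // fuse AES round instruction pairs
   };

   // The scheduler asks, per candidate pair, whether the current tuning
   // enables that kind of macro-fusion.
   static int
   fusion_enabled_p (unsigned int tune_fuse, enum fuse_ops op)
   {
     return (tune_fuse & op) != 0;
   }

   int
   main (void)
   {
     unsigned int qdf24xx_fuse = FUSE_MOVW_MOVT;
     unsigned int a73_fuse = FUSE_AES_AESMC | FUSE_MOVW_MOVT;

     printf ("%d %d\n", fusion_enabled_p (qdf24xx_fuse, FUSE_AES_AESMC),
             fusion_enabled_p (a73_fuse, FUSE_AES_AESMC)); // prints "0 1"
     return 0;
   }
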
*/ const struct tune_params arm_cortex_a5_tune = { - arm_9e_rtx_costs, &cortexa5_extra_costs, NULL, /* Sched adj cost. */ arm_cortex_a5_branch_cost, @@ -2083,7 +2098,6 @@ const struct tune_params arm_cortex_a5_tune = const struct tune_params arm_cortex_a9_tune = { - arm_9e_rtx_costs, &cortexa9_extra_costs, cortex_a9_sched_adjust_cost, arm_default_branch_cost, @@ -2106,7 +2120,6 @@ const struct tune_params arm_cortex_a9_tune = const struct tune_params arm_cortex_a12_tune = { - arm_9e_rtx_costs, &cortexa12_extra_costs, NULL, /* Sched adj cost. */ arm_default_branch_cost, @@ -2127,6 +2140,28 @@ const struct tune_params arm_cortex_a12_tune = tune_params::SCHED_AUTOPREF_OFF }; +const struct tune_params arm_cortex_a73_tune = +{ + &cortexa57_extra_costs, + NULL, /* Sched adj cost. */ + arm_default_branch_cost, + &arm_default_vec_cost, /* Vectorizer costs. */ + 1, /* Constant limit. */ + 2, /* Max cond insns. */ + 8, /* Memset max inline. */ + 2, /* Issue rate. */ + ARM_PREFETCH_NOT_BENEFICIAL, + tune_params::PREF_CONST_POOL_FALSE, + tune_params::PREF_LDRD_TRUE, + tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* Thumb. */ + tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* ARM. */ + tune_params::DISPARAGE_FLAGS_ALL, + tune_params::PREF_NEON_64_FALSE, + tune_params::PREF_NEON_STRINGOPS_TRUE, + FUSE_OPS (tune_params::FUSE_AES_AESMC | tune_params::FUSE_MOVW_MOVT), + tune_params::SCHED_AUTOPREF_FULL +}; + /* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single cycle to execute each. An LDR from the constant pool also takes two cycles to execute, but mildly increases pipelining opportunity (consecutive @@ -2136,7 +2171,6 @@ const struct tune_params arm_cortex_a12_tune = const struct tune_params arm_v7m_tune = { - arm_9e_rtx_costs, &v7m_extra_costs, NULL, /* Sched adj cost. */ arm_cortex_m_branch_cost, @@ -2161,7 +2195,6 @@ const struct tune_params arm_v7m_tune = const struct tune_params arm_cortex_m7_tune = { - arm_9e_rtx_costs, &v7m_extra_costs, NULL, /* Sched adj cost. */ arm_cortex_m7_branch_cost, @@ -2183,11 +2216,11 @@ const struct tune_params arm_cortex_m7_tune = }; /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than - arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus. */ + arm_v6t2_tune. It is used for cortex-m0, cortex-m1, cortex-m0plus and + cortex-m23. */ const struct tune_params arm_v6m_tune = { - arm_9e_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ NULL, /* Sched adj cost. */ arm_default_branch_cost, &arm_default_vec_cost, /* Vectorizer costs. */ @@ -2209,8 +2242,7 @@ const struct tune_params arm_v6m_tune = const struct tune_params arm_fa726te_tune = { - arm_9e_rtx_costs, - NULL, /* Insn extra costs. */ + &generic_extra_costs, /* Insn extra costs. */ fa726te_sched_adjust_cost, arm_default_branch_cost, &arm_default_vec_cost, @@ -2264,16 +2296,18 @@ static const struct processors *arm_selected_arch; static const struct processors *arm_selected_cpu; static const struct processors *arm_selected_tune; -/* The name of the preprocessor macro to define for this architecture. */ +/* The name of the preprocessor macro to define for this architecture. PROFILE + is replaced by the architecture name (eg. 8A) in arm_option_override () and + is thus chosen to be big enough to hold the longest architecture name. */ -char arm_arch_name[] = "__ARM_ARCH_0UNK__"; +char arm_arch_name[] = "__ARM_ARCH_PROFILE__"; /* Available values for -mfpu=. 
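
   Illustrative aside, not part of the patch: the arm_arch_name change
   above depends on the placeholder being at least as long as any real
   architecture suffix, so the buffer, sized by its initializer, can be
   rewritten in place during option processing. A sketch of the trick,
   assuming snprintf-style rewriting rather than the patch's actual code:

   #include <stdio.h>

   // Sized by its initializer; "PROFILE" leaves room for any suffix.
   static char arch_name[] = "__ARM_ARCH_PROFILE__";

   int
   main (void)
   {
     // Rewrite the buffer in place once the architecture is known.
     snprintf (arch_name, sizeof arch_name, "__ARM_ARCH_%s__", "8A");
     puts (arch_name); // prints "__ARM_ARCH_8A__"
     return 0;
   }
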
*/ const struct arm_fpu_desc all_fpus[] = { -#define ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES) \ - { NAME, MODEL, REV, VFP_REGS, FEATURES }, +#define ARM_FPU(NAME, REV, VFP_REGS, FEATURES) \ + { NAME, REV, VFP_REGS, FEATURES }, #include "arm-fpus.def" #undef ARM_FPU }; @@ -2752,8 +2786,8 @@ arm_option_check_internal (struct gcc_options *opts) const struct arm_fpu_desc *fpu_desc = &all_fpus[opts->x_arm_fpu_index]; /* iWMMXt and NEON are incompatible. */ - if (TARGET_IWMMXT && TARGET_VFP - && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON)) + if (TARGET_IWMMXT + && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON)) error ("iWMMXt and NEON are incompatible"); /* Make sure that the processor choice does not conflict with any of the @@ -2907,7 +2941,8 @@ arm_option_override_internal (struct gcc_options *opts, if (! opts_set->x_arm_restrict_it) opts->x_arm_restrict_it = arm_arch8; - if (!TARGET_THUMB2_P (opts->x_target_flags)) + /* ARM execution state and M profile don't have [restrict] IT. */ + if (!TARGET_THUMB2_P (opts->x_target_flags) || !arm_arch_notm) opts->x_arm_restrict_it = 0; /* Enable -munaligned-access by default for @@ -2918,7 +2953,8 @@ arm_option_override_internal (struct gcc_options *opts, Disable -munaligned-access by default for - all pre-ARMv6 architecture-based processors - - ARMv6-M architecture-based processors. */ + - ARMv6-M architecture-based processors + - ARMv8-M Baseline processors. */ if (! opts_set->x_unaligned_access) { @@ -3152,9 +3188,6 @@ arm_option_override (void) if (TARGET_APCS_REENT) warning (0, "APCS reentrant code not supported. Ignored"); - if (TARGET_APCS_FLOAT) - warning (0, "passing floating point arguments in fp regs not yet supported"); - /* Initialize boolean versions of the flags, for use in the arm.md file. */ arm_arch3m = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH3M); arm_arch4 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH4); @@ -3170,6 +3203,8 @@ arm_option_override (void) arm_arch7em = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH7EM); arm_arch8 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH8); arm_arch8_1 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_1); + arm_arch8_2 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_2); + arm_arch_thumb1 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB); arm_arch_thumb2 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB2); arm_arch_xscale = ARM_FSET_HAS_CPU1 (insn_flags, FL_XSCALE); @@ -3184,7 +3219,15 @@ arm_option_override (void) arm_arch_no_volatile_ce = ARM_FSET_HAS_CPU1 (insn_flags, FL_NO_VOLATILE_CE); arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0; arm_arch_crc = ARM_FSET_HAS_CPU1 (insn_flags, FL_CRC32); + arm_arch_cmse = ARM_FSET_HAS_CPU2 (insn_flags, FL2_CMSE); arm_m_profile_small_mul = ARM_FSET_HAS_CPU1 (insn_flags, FL_SMALLMUL); + arm_fp16_inst = ARM_FSET_HAS_CPU2 (insn_flags, FL2_FP16INST); + if (arm_fp16_inst) + { + if (arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE) + error ("selected fp16 options are incompatible."); + arm_fp16_format = ARM_FP16_FORMAT_IEEE; + } /* V5 code we generate is completely interworking capable, so we turn off TARGET_INTERWORK here to avoid many tests later on. */ @@ -3222,10 +3265,8 @@ arm_option_override (void) /* If soft-float is specified then don't use FPU. 
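
   Illustrative aside, not part of the patch: the FP16 handling a few hunks
   up encodes a hard constraint, namely that the ARMv8.2-A FP16 instructions
   implement only the IEEE format, so the alternative format is rejected and
   IEEE is forced. A compact stand-alone restatement with illustrative names:

   #include <stdio.h>
   #include <stdlib.h>

   enum fp16_format { FP16_NONE, FP16_IEEE, FP16_ALTERNATIVE };

   static enum fp16_format
   resolve_fp16 (int have_fp16_insns, enum fp16_format requested)
   {
     if (!have_fp16_insns)
       return requested; // software handling: either format works
     if (requested == FP16_ALTERNATIVE)
       {
         fprintf (stderr, "error: selected fp16 options are incompatible\n");
         exit (1);
       }
     return FP16_IEEE; // the hardware only does IEEE
   }

   int
   main (void)
   {
     printf ("%d\n", resolve_fp16 (1, FP16_NONE)); // prints "1" (forced IEEE)
     return 0;
   }
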
*/ if (TARGET_SOFT_FLOAT) arm_fpu_attr = FPU_NONE; - else if (TARGET_VFP) - arm_fpu_attr = FPU_VFP; else - gcc_unreachable(); + arm_fpu_attr = FPU_VFP; if (TARGET_AAPCS_BASED) { @@ -3245,15 +3286,14 @@ arm_option_override (void) if (arm_abi == ARM_ABI_IWMMXT) arm_pcs_default = ARM_PCS_AAPCS_IWMMXT; else if (arm_float_abi == ARM_FLOAT_ABI_HARD - && TARGET_HARD_FLOAT - && TARGET_VFP) + && TARGET_HARD_FLOAT) arm_pcs_default = ARM_PCS_AAPCS_VFP; else arm_pcs_default = ARM_PCS_AAPCS; } else { - if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP) + if (arm_float_abi == ARM_FLOAT_ABI_HARD) sorry ("-mfloat-abi=hard and VFP"); if (arm_abi == ARM_ABI_APCS) @@ -3298,6 +3338,20 @@ arm_option_override (void) } } + if (TARGET_VXWORKS_RTP) + { + if (!global_options_set.x_arm_pic_data_is_text_relative) + arm_pic_data_is_text_relative = 0; + } + else if (flag_pic + && !arm_pic_data_is_text_relative + && !(global_options_set.x_target_flags & MASK_SINGLE_PIC_BASE)) + /* When text & data segments don't have a fixed displacement, the + intended use is with a single, read only, pic base register. + Unless the user explicitly requested not to do that, set + it. */ + target_flags |= MASK_SINGLE_PIC_BASE; + /* If stack checking is disabled, we can use r10 as the PIC register, which keeps r9 available. The EABI specifies r9 as the PIC register. */ if (flag_pic && TARGET_SINGLE_PIC_BASE) @@ -3329,10 +3383,6 @@ arm_option_override (void) arm_pic_register = pic_register; } - if (TARGET_VXWORKS_RTP - && !global_options_set.x_arm_pic_data_is_text_relative) - arm_pic_data_is_text_relative = 0; - /* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */ if (fix_cm3_ldrd == 2) { @@ -3436,6 +3486,9 @@ arm_option_override (void) if (target_slow_flash_data) arm_disable_literal_pool = true; + if (use_cmse && !arm_arch_cmse) + error ("target CPU does not support ARMv8-M Security Extensions"); + /* Disable scheduling fusion by default if it's not armv7 processor or doesn't prefer ldrd/strd. */ if (flag_schedule_fusion == 2 @@ -3568,6 +3621,9 @@ arm_compute_func_type (void) else type |= arm_isr_value (TREE_VALUE (a)); + if (lookup_attribute ("cmse_nonsecure_entry", attr)) + type |= ARM_FT_CMSE_ENTRY; + return type; } @@ -3794,6 +3850,11 @@ use_return_insn (int iscond, rtx sibling) return 0; } + /* ARMv8-M nonsecure entry function need to use bxns to return and thus need + several instructions if anything needs to be popped. */ + if (saved_int_regs && IS_CMSE_ENTRY (func_type)) + return 0; + /* If there are saved registers but the LR isn't saved, then we need two instructions for the return. */ if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM))) @@ -3801,7 +3862,7 @@ use_return_insn (int iscond, rtx sibling) /* Can't be done if any of the VFP regs are pushed, since this also requires an insn. */ - if (TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_HARD_FLOAT) for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++) if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) return 0; @@ -3899,7 +3960,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code) { case SET: /* See if we can use movw. */ - if (arm_arch_thumb2 && (i & 0xffff0000) == 0) + if (TARGET_HAVE_MOVT && (i & 0xffff0000) == 0) return 1; else /* Otherwise, try mvn. */ @@ -4118,7 +4179,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val, yield a shorter sequence, we may as well use zero. 
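
   Illustrative aside, not part of the patch: the const_ok_for_op hunk above
   replaces the arm_arch_thumb2 test with TARGET_HAVE_MOVT, but the constant
   test itself is unchanged: movw can materialize exactly the constants
   whose top 16 bits are clear. A tiny stand-alone check:

   #include <stdio.h>

   // Matches the (i & 0xffff0000) == 0 test used with TARGET_HAVE_MOVT.
   static int
   movw_ok (unsigned int i)
   {
     return (i & 0xffff0000U) == 0;
   }

   int
   main (void)
   {
     printf ("%d %d %d\n",
             movw_ok (0x0000fffe),  // 1: fits in 16 bits
             movw_ok (0x00010000),  // 0: bit 16 set
             movw_ok (0xffff0000)); // 0: needs a movt as well
     return 0;
   }
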
*/ insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start); if (best_start != 0 - && ((((unsigned HOST_WIDE_INT) 1) << best_start) < val)) + && ((HOST_WIDE_INT_1U << best_start) < val)) { insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0); if (insns2 <= insns1) @@ -4949,7 +5010,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1, if (mode == VOIDmode) mode = GET_MODE (*op1); - maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1; + maxval = (HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (mode) - 1)) - 1; /* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either @@ -5255,7 +5316,6 @@ arm_function_value_regno_p (const unsigned int regno) if (regno == ARG_REGISTER (1) || (TARGET_32BIT && TARGET_AAPCS_BASED - && TARGET_VFP && TARGET_HARD_FLOAT && regno == FIRST_VFP_REGNUM) || (TARGET_IWMMXT_ABI @@ -5274,7 +5334,7 @@ arm_apply_result_size (void) if (TARGET_32BIT) { - if (TARGET_HARD_FLOAT_ABI && TARGET_VFP) + if (TARGET_HARD_FLOAT_ABI) size += 32; if (TARGET_IWMMXT_ABI) size += 8; @@ -5549,7 +5609,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep) { case REAL_TYPE: mode = TYPE_MODE (type); - if (mode != DFmode && mode != SFmode) + if (mode != DFmode && mode != SFmode && mode != HFmode) return -1; if (*modep == VOIDmode) @@ -5722,7 +5782,7 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double) if (pcs_variant != ARM_PCS_AAPCS_LOCAL) return false; - return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT && + return (TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_VFP_DOUBLE || !is_double)); } @@ -5797,11 +5857,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode, &pcum->aapcs_vfp_rcount); } +/* Implement the allocate field in aapcs_cp_arg_layout. See the comment there + for the behaviour of this function. */ + static bool aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode, const_tree type ATTRIBUTE_UNUSED) { - int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode); + int rmode_size + = MAX (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), GET_MODE_SIZE (SFmode)); + int shift = rmode_size / GET_MODE_SIZE (SFmode); unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1; int regno; @@ -5850,6 +5915,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode, return false; } +/* Implement the allocate_return_reg field in aapcs_cp_arg_layout. See the + comment there for the behaviour of this function. */ + static rtx aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED, machine_mode mode, @@ -5940,13 +6008,13 @@ static struct required for a return from FUNCTION_ARG. */ bool (*allocate) (CUMULATIVE_ARGS *, machine_mode, const_tree); - /* Return true if a result of mode MODE (or type TYPE if MODE is - BLKmode) is can be returned in this co-processor's registers. */ + /* Return true if a result of mode MODE (or type TYPE if MODE is BLKmode) can + be returned in this co-processor's registers. */ bool (*is_return_candidate) (enum arm_pcs, machine_mode, const_tree); - /* Allocate and return an RTX element to hold the return type of a - call, this routine must not fail and will only be called if - is_return_candidate returned true with the same parameters. */ + /* Allocate and return an RTX element to hold the return type of a call. This + routine must not fail and will only be called if is_return_candidate + returned true with the same parameters. 
*/ rtx (*allocate_return_reg) (enum arm_pcs, machine_mode, const_tree); /* Finish processing this argument and prepare to start processing @@ -6561,6 +6629,185 @@ arm_handle_notshared_attribute (tree *node, } #endif +/* This function returns true if a function with declaration FNDECL and type + FNTYPE uses the stack to pass arguments or return variables and false + otherwise. This is used for functions with the attributes + 'cmse_nonsecure_call' or 'cmse_nonsecure_entry' and this function will issue + diagnostic messages if the stack is used. NAME is the name of the attribute + used. */ + +static bool +cmse_func_args_or_return_in_stack (tree fndecl, tree name, tree fntype) +{ + function_args_iterator args_iter; + CUMULATIVE_ARGS args_so_far_v; + cumulative_args_t args_so_far; + bool first_param = true; + tree arg_type, prev_arg_type = NULL_TREE, ret_type; + + /* Error out if any argument is passed on the stack. */ + arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, fndecl); + args_so_far = pack_cumulative_args (&args_so_far_v); + FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter) + { + rtx arg_rtx; + machine_mode arg_mode = TYPE_MODE (arg_type); + + prev_arg_type = arg_type; + if (VOID_TYPE_P (arg_type)) + continue; + + if (!first_param) + arm_function_arg_advance (args_so_far, arg_mode, arg_type, true); + arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, true); + if (!arg_rtx + || arm_arg_partial_bytes (args_so_far, arg_mode, arg_type, true)) + { + error ("%qE attribute not available to functions with arguments " + "passed on the stack", name); + return true; + } + first_param = false; + } + + /* Error out for variadic functions since we cannot control how many + arguments will be passed and thus stack could be used. stdarg_p () is not + used for the checking to avoid browsing arguments twice. */ + if (prev_arg_type != NULL_TREE && !VOID_TYPE_P (prev_arg_type)) + { + error ("%qE attribute not available to functions with variable number " + "of arguments", name); + return true; + } + + /* Error out if return value is passed on the stack. */ + ret_type = TREE_TYPE (fntype); + if (arm_return_in_memory (ret_type, fntype)) + { + error ("%qE attribute not available to functions that return value on " + "the stack", name); + return true; + } + return false; +} + +/* Called upon detection of the use of the cmse_nonsecure_entry attribute, this + function will check whether the attribute is allowed here and will add the + attribute to the function declaration tree or otherwise issue a warning. */ + +static tree +arm_handle_cmse_nonsecure_entry (tree *node, tree name, + tree /* args */, + int /* flags */, + bool *no_add_attrs) +{ + tree fndecl; + + if (!use_cmse) + { + *no_add_attrs = true; + warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.", + name); + return NULL_TREE; + } + + /* Ignore attribute for function types. */ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + fndecl = *node; + + /* Warn for static linkage functions. 
*/ + if (!TREE_PUBLIC (fndecl)) + { + warning (OPT_Wattributes, "%qE attribute has no effect on functions " + "with static linkage", name); + *no_add_attrs = true; + return NULL_TREE; + } + + *no_add_attrs |= cmse_func_args_or_return_in_stack (fndecl, name, + TREE_TYPE (fndecl)); + return NULL_TREE; +} + + +/* Called upon detection of the use of the cmse_nonsecure_call attribute, this + function will check whether the attribute is allowed here and will add the + attribute to the function type tree or otherwise issue a diagnostic. The + reason we check this at declaration time is to only allow the use of the + attribute with declarations of function pointers and not function + declarations. This function checks NODE is of the expected type and issues + diagnostics otherwise using NAME. If it is not of the expected type + *NO_ADD_ATTRS will be set to true. */ + +static tree +arm_handle_cmse_nonsecure_call (tree *node, tree name, + tree /* args */, + int /* flags */, + bool *no_add_attrs) +{ + tree decl = NULL_TREE, fntype = NULL_TREE; + tree type; + + if (!use_cmse) + { + *no_add_attrs = true; + warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.", + name); + return NULL_TREE; + } + + if (TREE_CODE (*node) == VAR_DECL || TREE_CODE (*node) == TYPE_DECL) + { + decl = *node; + fntype = TREE_TYPE (decl); + } + + while (fntype != NULL_TREE && TREE_CODE (fntype) == POINTER_TYPE) + fntype = TREE_TYPE (fntype); + + if (!decl || TREE_CODE (fntype) != FUNCTION_TYPE) + { + warning (OPT_Wattributes, "%qE attribute only applies to base type of a " + "function pointer", name); + *no_add_attrs = true; + return NULL_TREE; + } + + *no_add_attrs |= cmse_func_args_or_return_in_stack (NULL, name, fntype); + + if (*no_add_attrs) + return NULL_TREE; + + /* Prevent trees being shared among function types with and without + cmse_nonsecure_call attribute. */ + type = TREE_TYPE (decl); + + type = build_distinct_type_copy (type); + TREE_TYPE (decl) = type; + fntype = type; + + while (TREE_CODE (fntype) != FUNCTION_TYPE) + { + type = fntype; + fntype = TREE_TYPE (fntype); + fntype = build_distinct_type_copy (fntype); + TREE_TYPE (type) = fntype; + } + + /* Construct a type attribute and add it to the function type. */ + tree attrs = tree_cons (get_identifier ("cmse_nonsecure_call"), NULL_TREE, + TYPE_ATTRIBUTES (fntype)); + TYPE_ATTRIBUTES (fntype) = attrs; + return NULL_TREE; +} + /* Return 0 if the attributes for two types are incompatible, 1 if they are compatible, and 2 if they are nearly compatible (which causes a warning to be generated). */ @@ -6601,6 +6848,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2) if (l1 != l2) return 0; + l1 = lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (type2)) != NULL; + + if (l1 != l2) + return 0; + return 1; } @@ -6711,7 +6966,7 @@ arm_function_ok_for_sibcall (tree decl, tree exp) may be used both as target of the call and base register for restoring the VFP registers */ if (TARGET_APCS_FRAME && TARGET_ARM - && TARGET_HARD_FLOAT && TARGET_VFP + && TARGET_HARD_FLOAT && decl && arm_is_long_call_p (decl)) return false; @@ -6727,6 +6982,20 @@ arm_function_ok_for_sibcall (tree decl, tree exp) if (IS_INTERRUPT (func_type)) return false; + /* ARMv8-M non-secure entry functions need to return with bxns which is only + generated for entry functions themselves. 
*/ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + return false; + + /* We do not allow ARMv8-M non-secure calls to be turned into sibling calls, + this would complicate matters for later code generation. */ + if (TREE_CODE (exp) == CALL_EXPR) + { + tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp))); + if (lookup_attribute ("cmse_nonsecure_call", TYPE_ATTRIBUTES (fntype))) + return false; + } + if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) { /* Check that the return value locations are the same. For @@ -7187,8 +7456,7 @@ arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer, return 1; use_ldrd = (TARGET_LDRD - && (mode == DImode - || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + && (mode == DImode || mode == DFmode)); if (code == POST_INC || code == PRE_DEC || ((code == PRE_INC || code == POST_DEC) @@ -7273,8 +7541,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p) return 1; use_ldrd = (TARGET_LDRD - && (mode == DImode - || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + && (mode == DImode || mode == DFmode)); if (code == POST_INC || code == PRE_DEC || ((code == PRE_INC || code == POST_DEC) @@ -7367,7 +7634,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer, /* Standard coprocessor addressing modes. */ if (TARGET_HARD_FLOAT - && TARGET_VFP && (mode == SFmode || mode == DFmode)) return (code == CONST_INT && INTVAL (index) < 1024 && INTVAL (index) > -1024 @@ -7487,7 +7753,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p) /* ??? Combine arm and thumb2 coprocessor addressing modes. */ /* Standard coprocessor addressing modes. */ if (TARGET_HARD_FLOAT - && TARGET_VFP && (mode == SFmode || mode == DFmode)) return (code == CONST_INT && INTVAL (index) < 1024 /* Thumb-2 allows only > -256 index range for it's core register @@ -8033,8 +8298,7 @@ arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode) /* VFP addressing modes actually allow greater offsets, but for now we just stick with the lowest common denominator. */ - if (mode == DImode - || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode)) + if (mode == DImode || mode == DFmode) { low_n = n & 0x0f; n &= ~0x0f; @@ -8226,6 +8490,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x) static bool thumb_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x) { + /* Splitters for TARGET_USE_MOVT call arm_emit_movpair which creates high + RTX. These RTX must therefore be allowed for Thumb-1 so that when run + for ARMv8-M Baseline or later the result is valid. */ + if (TARGET_HAVE_MOVT && GET_CODE (x) == HIGH) + x = XEXP (x, 0); + return (CONST_INT_P (x) || CONST_DOUBLE_P (x) || CONSTANT_ADDRESS_P (x) @@ -8312,7 +8582,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) case CONST_INT: if (outer == SET) { - if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + if (UINTVAL (x) < 256 + /* 16-bit constant. */ + || (TARGET_HAVE_MOVT && !(INTVAL (x) & 0xffff0000))) return 0; if (thumb_shiftable_const (INTVAL (x))) return COSTS_N_INSNS (2); @@ -8329,8 +8601,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) int i; /* This duplicates the tests in the andsi3 expander. 
*/ for (i = 9; i <= 31; i++) - if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) - || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (x) + || (HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (x)) return COSTS_N_INSNS (2); } else if (outer == ASHIFT || outer == ASHIFTRT @@ -8393,1006 +8665,162 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) } } -static inline bool -arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed) +/* Estimates the size cost of thumb1 instructions. + For now most of the code is copied from thumb1_rtx_costs. We need more + fine grain tuning when we have more related test cases. */ +static inline int +thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) { machine_mode mode = GET_MODE (x); - enum rtx_code subcode; - rtx operand; - enum rtx_code code = GET_CODE (x); - *total = 0; + int words, cost; switch (code) { - case MEM: - /* Memory costs quite a lot for the first word, but subsequent words - load at the equivalent of a single insn each. */ - *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode)); - return true; + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2); - case DIV: - case MOD: - case UDIV: - case UMOD: - if (TARGET_HARD_FLOAT && mode == SFmode) - *total = COSTS_N_INSNS (2); - else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE) - *total = COSTS_N_INSNS (4); - else - *total = COSTS_N_INSNS (20); - return false; + case PLUS: + case MINUS: + /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1 + defined by RTL expansion, especially for the expansion of + multiplication. */ + if ((GET_CODE (XEXP (x, 0)) == MULT + && power_of_two_operand (XEXP (XEXP (x,0),1), SImode)) + || (GET_CODE (XEXP (x, 1)) == MULT + && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))) + return COSTS_N_INSNS (2); + /* On purpose fall through for normal RTX. */ + case COMPARE: + case NEG: + case NOT: + return COSTS_N_INSNS (1); - case ROTATE: - if (REG_P (XEXP (x, 1))) - *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */ - else if (!CONST_INT_P (XEXP (x, 1))) - *total = rtx_cost (XEXP (x, 1), mode, code, 1, speed); + case MULT: + if (CONST_INT_P (XEXP (x, 1))) + { + /* Thumb1 mul instruction can't operate on const. We must Load it + into a register first. */ + int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET); + /* For the targets which have a very small and high-latency multiply + unit, we prefer to synthesize the mult with up to 5 instructions, + giving a good balance between size and performance. */ + if (arm_arch6m && arm_m_profile_small_mul) + return COSTS_N_INSNS (5); + else + return COSTS_N_INSNS (1) + const_size; + } + return COSTS_N_INSNS (1); - /* Fall through */ - case ROTATERT: - if (mode != SImode) - { - *total += COSTS_N_INSNS (4); - return true; - } + case SET: + /* A SET doesn't have a mode, so let's look at the SET_DEST to get + the mode. */ + words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x)))); + cost = COSTS_N_INSNS (words); + if (satisfies_constraint_J (SET_SRC (x)) + || satisfies_constraint_K (SET_SRC (x)) + /* Too big an immediate for a 2-byte mov, using MOVT. */ + || (CONST_INT_P (SET_SRC (x)) + && UINTVAL (SET_SRC (x)) >= 256 + && TARGET_HAVE_MOVT + && satisfies_constraint_j (SET_SRC (x))) + /* thumb1_movdi_insn. 
*/ + || ((words > 1) && MEM_P (SET_SRC (x)))) + cost += COSTS_N_INSNS (1); + return cost; - /* Fall through */ - case ASHIFT: case LSHIFTRT: case ASHIFTRT: - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - if (mode == DImode) - { - *total += COSTS_N_INSNS (3); - return true; - } + case CONST_INT: + if (outer == SET) + { + if (UINTVAL (x) < 256) + return COSTS_N_INSNS (1); + /* movw is 4byte long. */ + if (TARGET_HAVE_MOVT && !(INTVAL (x) & 0xffff0000)) + return COSTS_N_INSNS (2); + /* See split "TARGET_THUMB1 && satisfies_constraint_J". */ + if (INTVAL (x) >= -255 && INTVAL (x) <= -1) + return COSTS_N_INSNS (2); + /* See split "TARGET_THUMB1 && satisfies_constraint_K". */ + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } + else if ((outer == PLUS || outer == COMPARE) + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (x) + || (HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } + else if (outer == ASHIFT || outer == ASHIFTRT + || outer == LSHIFTRT) + return 0; + return COSTS_N_INSNS (2); - *total += COSTS_N_INSNS (1); - /* Increase the cost of complex shifts because they aren't any faster, - and reduce dual issue opportunities. */ - if (arm_tune_cortex_a9 - && outer != SET && !CONST_INT_P (XEXP (x, 1))) - ++*total; + case CONST: + case CONST_DOUBLE: + case LABEL_REF: + case SYMBOL_REF: + return COSTS_N_INSNS (3); - return true; + case UDIV: + case UMOD: + case DIV: + case MOD: + return 100; - case MINUS: - if (mode == DImode) - { - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - if (CONST_INT_P (XEXP (x, 0)) - && const_ok_for_arm (INTVAL (XEXP (x, 0)))) - { - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - } - - if (CONST_INT_P (XEXP (x, 1)) - && const_ok_for_arm (INTVAL (XEXP (x, 1)))) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - - return false; - } - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - if (CONST_DOUBLE_P (XEXP (x, 0)) - && arm_const_double_rtx (XEXP (x, 0))) - { - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - } - - if (CONST_DOUBLE_P (XEXP (x, 1)) - && arm_const_double_rtx (XEXP (x, 1))) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - - return false; - } - *total = COSTS_N_INSNS (20); - return false; - } - - *total = COSTS_N_INSNS (1); - if (CONST_INT_P (XEXP (x, 0)) - && const_ok_for_arm (INTVAL (XEXP (x, 0)))) - { - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - } - - subcode = GET_CODE (XEXP (x, 1)); - if (subcode == ASHIFT || subcode == ASHIFTRT - || subcode == LSHIFTRT - || subcode == ROTATE || subcode == ROTATERT) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed); - return true; - } - - /* A shift as a part of RSB costs no more than RSB itself. 
*/ - if (GET_CODE (XEXP (x, 0)) == MULT - && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) - { - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, speed); - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - } - - if (subcode == MULT - && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed); - return true; - } - - if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE - || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE) - { - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, - 0, speed); - if (REG_P (XEXP (XEXP (x, 1), 0)) - && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM) - *total += COSTS_N_INSNS (1); - - return true; - } - - /* Fall through */ - - case PLUS: - if (code == PLUS && arm_arch6 && mode == SImode - && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND - || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) - { - *total = COSTS_N_INSNS (1); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode, - GET_CODE (XEXP (x, 0)), 0, speed); - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - } - - /* MLA: All arguments must be registers. We filter out - multiplication by a power of two, so that we fall down into - the code below. */ - if (GET_CODE (XEXP (x, 0)) == MULT - && !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) - { - /* The cost comes from the cost of the multiply. */ - return false; - } - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - if (CONST_DOUBLE_P (XEXP (x, 1)) - && arm_const_double_rtx (XEXP (x, 1))) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - - return false; - } - - *total = COSTS_N_INSNS (20); - return false; - } - - if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE - || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE) - { - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), mode, code, - 1, speed); - if (REG_P (XEXP (XEXP (x, 0), 0)) - && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM) - *total += COSTS_N_INSNS (1); - return true; - } - - /* Fall through */ - - case AND: case XOR: case IOR: - - /* Normally the frame registers will be spilt into reg+const during - reload, so it is a bad idea to combine them with other instructions, - since then they might not be moved outside of loops. As a compromise - we allow integration with ops that have a constant as their second - operand. 
*/ - if (REG_OR_SUBREG_REG (XEXP (x, 0)) - && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) - && !CONST_INT_P (XEXP (x, 1))) - *total = COSTS_N_INSNS (1); - - if (mode == DImode) - { - *total += COSTS_N_INSNS (2); - if (CONST_INT_P (XEXP (x, 1)) - && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - - return false; - } - - *total += COSTS_N_INSNS (1); - if (CONST_INT_P (XEXP (x, 1)) - && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) - { - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - subcode = GET_CODE (XEXP (x, 0)); - if (subcode == ASHIFT || subcode == ASHIFTRT - || subcode == LSHIFTRT - || subcode == ROTATE || subcode == ROTATERT) - { - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed); - return true; - } - - if (subcode == MULT - && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) - { - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed); - return true; - } - - if (subcode == UMIN || subcode == UMAX - || subcode == SMIN || subcode == SMAX) - { - *total = COSTS_N_INSNS (3); - return true; - } - - return false; - - case MULT: - /* This should have been handled by the CPU specific routines. */ - gcc_unreachable (); - - case TRUNCATE: - if (arm_arch3m && mode == SImode - && GET_CODE (XEXP (x, 0)) == LSHIFTRT - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) - == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))) - && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND - || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND)) - { - *total = rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode, LSHIFTRT, - 0, speed); - return true; - } - *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */ - return false; - - case NEG: - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - return false; - } - *total = COSTS_N_INSNS (2); - return false; - } - - /* Fall through */ - case NOT: - *total = COSTS_N_INSNS (ARM_NUM_REGS(mode)); - if (mode == SImode && code == NOT) - { - subcode = GET_CODE (XEXP (x, 0)); - if (subcode == ASHIFT || subcode == ASHIFTRT - || subcode == LSHIFTRT - || subcode == ROTATE || subcode == ROTATERT - || (subcode == MULT - && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))) - { - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, - 0, speed); - /* Register shifts cost an extra cycle. 
*/ - if (!CONST_INT_P (XEXP (XEXP (x, 0), 1))) - *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1), - mode, subcode, - 1, speed); - return true; - } - } - - return false; - - case IF_THEN_ELSE: - if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) - { - *total = COSTS_N_INSNS (4); - return true; - } - - operand = XEXP (x, 0); - - if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE - || GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE) - && REG_P (XEXP (operand, 0)) - && REGNO (XEXP (operand, 0)) == CC_REGNUM)) - *total += COSTS_N_INSNS (1); - *total += rtx_cost (XEXP (x, 1), VOIDmode, code, 1, speed); - *total += rtx_cost (XEXP (x, 2), VOIDmode, code, 2, speed); - return true; - - case NE: - if (mode == SImode && XEXP (x, 1) == const0_rtx) - { - *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, - 0, speed); - return true; - } - goto scc_insn; - - case GE: - if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM) - && mode == SImode && XEXP (x, 1) == const0_rtx) - { - *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, - 0, speed); - return true; - } - goto scc_insn; - - case LT: - if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM) - && mode == SImode && XEXP (x, 1) == const0_rtx) - { - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, - 0, speed); - return true; - } - goto scc_insn; - - case EQ: - case GT: - case LE: - case GEU: - case LTU: - case GTU: - case LEU: - case UNORDERED: - case ORDERED: - case UNEQ: - case UNGE: - case UNLT: - case UNGT: - case UNLE: - scc_insn: - /* SCC insns. In the case where the comparison has already been - performed, then they cost 2 instructions. Otherwise they need - an additional comparison before them. */ - *total = COSTS_N_INSNS (2); - if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM) - { - return true; - } - - /* Fall through */ - case COMPARE: - if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM) - { - *total = 0; - return true; - } - - *total += COSTS_N_INSNS (1); - if (CONST_INT_P (XEXP (x, 1)) - && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) - { - *total += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed); - return true; - } - - subcode = GET_CODE (XEXP (x, 0)); - if (subcode == ASHIFT || subcode == ASHIFTRT - || subcode == LSHIFTRT - || subcode == ROTATE || subcode == ROTATERT) - { - mode = GET_MODE (XEXP (x, 0)); - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed); - return true; - } - - if (subcode == MULT - && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) - { - mode = GET_MODE (XEXP (x, 0)); - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed); - return true; - } - - return false; - - case UMIN: - case UMAX: - case SMIN: - case SMAX: - *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, 0, speed); - if (!CONST_INT_P (XEXP (x, 1)) - || !const_ok_for_arm (INTVAL (XEXP (x, 1)))) - *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); - return true; - - case ABS: - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - return false; - } - *total = COSTS_N_INSNS (20); - return false; - } - *total = COSTS_N_INSNS (1); - if (mode == DImode) - *total += COSTS_N_INSNS (3); - return false; - - case SIGN_EXTEND: - case ZERO_EXTEND: - *total = 0; - if 
(GET_MODE_CLASS (mode) == MODE_INT) - { - rtx op = XEXP (x, 0); - machine_mode opmode = GET_MODE (op); - - if (mode == DImode) - *total += COSTS_N_INSNS (1); - - if (opmode != SImode) - { - if (MEM_P (op)) - { - /* If !arm_arch4, we use one of the extendhisi2_mem - or movhi_bytes patterns for HImode. For a QImode - sign extension, we first zero-extend from memory - and then perform a shift sequence. */ - if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND)) - *total += COSTS_N_INSNS (2); - } - else if (arm_arch6) - *total += COSTS_N_INSNS (1); - - /* We don't have the necessary insn, so we need to perform some - other operation. */ - else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode) - /* An and with constant 255. */ - *total += COSTS_N_INSNS (1); - else - /* A shift sequence. Increase costs slightly to avoid - combining two shifts into an extend operation. */ - *total += COSTS_N_INSNS (2) + 1; - } - - return false; - } - - switch (GET_MODE (XEXP (x, 0))) - { - case V8QImode: - case V4HImode: - case V2SImode: - case V4QImode: - case V2HImode: - *total = COSTS_N_INSNS (1); - return false; - - default: - gcc_unreachable (); - } - gcc_unreachable (); - - case ZERO_EXTRACT: - case SIGN_EXTRACT: - mode = GET_MODE (XEXP (x, 0)); - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - - case CONST_INT: - if (const_ok_for_arm (INTVAL (x)) - || const_ok_for_arm (~INTVAL (x))) - *total = COSTS_N_INSNS (1); - else - *total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX, - INTVAL (x), NULL_RTX, - NULL_RTX, 0, 0)); - return true; - - case CONST: - case LABEL_REF: - case SYMBOL_REF: - *total = COSTS_N_INSNS (3); - return true; - - case HIGH: - *total = COSTS_N_INSNS (1); - return true; - - case LO_SUM: - *total = COSTS_N_INSNS (1); - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - - case CONST_DOUBLE: - if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x) - && (mode == SFmode || !TARGET_VFP_SINGLE)) - *total = COSTS_N_INSNS (1); - else - *total = COSTS_N_INSNS (4); - return true; - - case SET: - /* The vec_extract patterns accept memory operands that require an - address reload. Account for the cost of that reload to give the - auto-inc-dec pass an incentive to try to replace them. */ - if (TARGET_NEON && MEM_P (SET_DEST (x)) - && GET_CODE (SET_SRC (x)) == VEC_SELECT) - { - mode = GET_MODE (SET_DEST (x)); - *total = rtx_cost (SET_DEST (x), mode, code, 0, speed); - if (!neon_vector_mem_operand (SET_DEST (x), 2, true)) - *total += COSTS_N_INSNS (1); - return true; - } - /* Likewise for the vec_set patterns. */ - if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE - && GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE - && MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0))) - { - rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0); - mode = GET_MODE (SET_DEST (x)); - *total = rtx_cost (mem, mode, code, 0, speed); - if (!neon_vector_mem_operand (mem, 2, true)) - *total += COSTS_N_INSNS (1); - return true; - } - return false; - - case UNSPEC: - /* We cost this as high as our memory costs to allow this to - be hoisted from loops. 
*/ - if (XINT (x, 1) == UNSPEC_PIC_UNIFIED) - { - *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode)); - } - return true; - - case CONST_VECTOR: - if (TARGET_NEON - && TARGET_HARD_FLOAT - && outer == SET - && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)) - && neon_immediate_valid_for_move (x, mode, NULL, NULL)) - *total = COSTS_N_INSNS (1); - else - *total = COSTS_N_INSNS (4); - return true; - - default: - *total = COSTS_N_INSNS (4); - return false; - } -} - -/* Estimates the size cost of thumb1 instructions. - For now most of the code is copied from thumb1_rtx_costs. We need more - fine grain tuning when we have more related test cases. */ -static inline int -thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) -{ - machine_mode mode = GET_MODE (x); - int words; - - switch (code) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - case ROTATERT: - return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2); - - case PLUS: - case MINUS: - /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1 - defined by RTL expansion, especially for the expansion of - multiplication. */ - if ((GET_CODE (XEXP (x, 0)) == MULT - && power_of_two_operand (XEXP (XEXP (x,0),1), SImode)) - || (GET_CODE (XEXP (x, 1)) == MULT - && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))) - return COSTS_N_INSNS (2); - /* On purpose fall through for normal RTX. */ - case COMPARE: - case NEG: - case NOT: - return COSTS_N_INSNS (1); - - case MULT: - if (CONST_INT_P (XEXP (x, 1))) - { - /* Thumb1 mul instruction can't operate on const. We must Load it - into a register first. */ - int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET); - /* For the targets which have a very small and high-latency multiply - unit, we prefer to synthesize the mult with up to 5 instructions, - giving a good balance between size and performance. */ - if (arm_arch6m && arm_m_profile_small_mul) - return COSTS_N_INSNS (5); - else - return COSTS_N_INSNS (1) + const_size; - } - return COSTS_N_INSNS (1); - - case SET: - /* A SET doesn't have a mode, so let's look at the SET_DEST to get - the mode. */ - words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x)))); - return COSTS_N_INSNS (words) - + COSTS_N_INSNS (1) * (satisfies_constraint_J (SET_SRC (x)) - || satisfies_constraint_K (SET_SRC (x)) - /* thumb1_movdi_insn. */ - || ((words > 1) && MEM_P (SET_SRC (x)))); - - case CONST_INT: - if (outer == SET) - { - if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) - return COSTS_N_INSNS (1); - /* See split "TARGET_THUMB1 && satisfies_constraint_J". */ - if (INTVAL (x) >= -255 && INTVAL (x) <= -1) - return COSTS_N_INSNS (2); - /* See split "TARGET_THUMB1 && satisfies_constraint_K". */ - if (thumb_shiftable_const (INTVAL (x))) - return COSTS_N_INSNS (2); - return COSTS_N_INSNS (3); - } - else if ((outer == PLUS || outer == COMPARE) - && INTVAL (x) < 256 && INTVAL (x) > -256) - return 0; - else if ((outer == IOR || outer == XOR || outer == AND) - && INTVAL (x) < 256 && INTVAL (x) >= -256) - return COSTS_N_INSNS (1); - else if (outer == AND) - { - int i; - /* This duplicates the tests in the andsi3 expander. 
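Both copies of this test, the loop resumed just below and its HOST_WIDE_INT_1 version at the top of this section, recognize AND masks of the form (1 << i) - 1 and their complements: Thumb-1 needs no constant at all for these, because a left/right shift pair clears the unwanted bits, which is why they get a flat two-instruction cost. A host-side check of the equivalence the andsi3 expander relies on:

    #include <stdint.h>
    #include <stdio.h>

    /* x & ((1 << 23) - 1) needs no constant load on Thumb-1:
           lsls r0, r0, #9   @ push the high bits out
           lsrs r0, r0, #9   @ shift back, filling with zeros
       For the complement mask ~((1 << i) - 1) the pair is reversed:
       lsrs first, then lsls.  */
    static uint32_t
    and_low23 (uint32_t x)
    {
      return (x << 9) >> 9;
    }

    int
    main (void)
    {
      uint32_t x = 0xdeadbeefu;
      printf ("%08x %08x\n", and_low23 (x), x & ((1u << 23) - 1));
      return 0;
    }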
*/ - for (i = 9; i <= 31; i++) - if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) - || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) - return COSTS_N_INSNS (2); - } - else if (outer == ASHIFT || outer == ASHIFTRT - || outer == LSHIFTRT) - return 0; - return COSTS_N_INSNS (2); - - case CONST: - case CONST_DOUBLE: - case LABEL_REF: - case SYMBOL_REF: - return COSTS_N_INSNS (3); - - case UDIV: - case UMOD: - case DIV: - case MOD: - return 100; - - case TRUNCATE: - return 99; - - case AND: - case XOR: - case IOR: - return COSTS_N_INSNS (1); - - case MEM: - return (COSTS_N_INSNS (1) - + COSTS_N_INSNS (1) - * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) - + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) - ? COSTS_N_INSNS (1) : 0)); - - case IF_THEN_ELSE: - /* XXX a guess. */ - if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) - return 14; - return 2; - - case ZERO_EXTEND: - /* XXX still guessing. */ - switch (GET_MODE (XEXP (x, 0))) - { - case QImode: - return (1 + (mode == DImode ? 4 : 0) - + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - - case HImode: - return (4 + (mode == DImode ? 4 : 0) - + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - - case SImode: - return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - - default: - return 99; - } - - default: - return 99; - } -} - -/* RTX costs when optimizing for size. */ -static bool -arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, - int *total) -{ - machine_mode mode = GET_MODE (x); - if (TARGET_THUMB1) - { - *total = thumb1_size_rtx_costs (x, code, outer_code); - return true; - } - - /* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions. */ - switch (code) - { - case MEM: - /* A memory access costs 1 insn if the mode is small, or the address is - a single register, otherwise it costs one insn per word. */ - if (REG_P (XEXP (x, 0))) - *total = COSTS_N_INSNS (1); - else if (flag_pic - && GET_CODE (XEXP (x, 0)) == PLUS - && will_be_in_index_register (XEXP (XEXP (x, 0), 1))) - /* This will be split into two instructions. - See arm.md:calculate_pic_address. */ - *total = COSTS_N_INSNS (2); - else - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - return true; - - case DIV: - case MOD: - case UDIV: - case UMOD: - /* Needs a libcall, so it costs about this. */ - *total = COSTS_N_INSNS (2); - return false; - - case ROTATE: - if (mode == SImode && REG_P (XEXP (x, 1))) - { - *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, - 0, false); - return true; - } - /* Fall through */ - case ROTATERT: - case ASHIFT: - case LSHIFTRT: - case ASHIFTRT: - if (mode == DImode && CONST_INT_P (XEXP (x, 1))) - { - *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), mode, code, - 0, false); - return true; - } - else if (mode == SImode) - { - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, - 0, false); - /* Slightly disparage register shifts, but not by much. */ - if (!CONST_INT_P (XEXP (x, 1))) - *total += 1 + rtx_cost (XEXP (x, 1), mode, code, 1, false); - return true; - } - - /* Needs a libcall. 
*/ - *total = COSTS_N_INSNS (2); - return false; - - case MINUS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT - && (mode == SFmode || !TARGET_VFP_SINGLE)) - { - *total = COSTS_N_INSNS (1); - return false; - } - - if (mode == SImode) - { - enum rtx_code subcode0 = GET_CODE (XEXP (x, 0)); - enum rtx_code subcode1 = GET_CODE (XEXP (x, 1)); - - if (subcode0 == ROTATE || subcode0 == ROTATERT || subcode0 == ASHIFT - || subcode0 == LSHIFTRT || subcode0 == ASHIFTRT - || subcode1 == ROTATE || subcode1 == ROTATERT - || subcode1 == ASHIFT || subcode1 == LSHIFTRT - || subcode1 == ASHIFTRT) - { - /* It's just the cost of the two operands. */ - *total = 0; - return false; - } - - *total = COSTS_N_INSNS (1); - return false; - } - - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - return false; - - case PLUS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT - && (mode == SFmode || !TARGET_VFP_SINGLE)) - { - *total = COSTS_N_INSNS (1); - return false; - } - - /* A shift as a part of ADD costs nothing. */ - if (GET_CODE (XEXP (x, 0)) == MULT - && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) - { - *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1); - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, false); - *total += rtx_cost (XEXP (x, 1), mode, code, 1, false); - return true; - } - - /* Fall through */ - case AND: case XOR: case IOR: - if (mode == SImode) - { - enum rtx_code subcode = GET_CODE (XEXP (x, 0)); - - if (subcode == ROTATE || subcode == ROTATERT || subcode == ASHIFT - || subcode == LSHIFTRT || subcode == ASHIFTRT - || (code == AND && subcode == NOT)) - { - /* It's just the cost of the two operands. */ - *total = 0; - return false; - } - } - - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - return false; - - case MULT: - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - return false; - - case NEG: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT - && (mode == SFmode || !TARGET_VFP_SINGLE)) - { - *total = COSTS_N_INSNS (1); - return false; - } - - /* Fall through */ - case NOT: - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - - return false; + case TRUNCATE: + return 99; - case IF_THEN_ELSE: - *total = 0; - return false; + case AND: + case XOR: + case IOR: + return COSTS_N_INSNS (1); - case COMPARE: - if (cc_register (XEXP (x, 0), VOIDmode)) - * total = 0; - else - *total = COSTS_N_INSNS (1); - return false; + case MEM: + return (COSTS_N_INSNS (1) + + COSTS_N_INSNS (1) + * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? COSTS_N_INSNS (1) : 0)); - case ABS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT - && (mode == SFmode || !TARGET_VFP_SINGLE)) - *total = COSTS_N_INSNS (1); - else - *total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode)); - return false; + case IF_THEN_ELSE: + /* XXX a guess. */ + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + return 14; + return 2; - case SIGN_EXTEND: case ZERO_EXTEND: - return arm_rtx_costs_1 (x, outer_code, total, 0); - - case CONST_INT: - if (const_ok_for_arm (INTVAL (x))) - /* A multiplication by a constant requires another instruction - to load the constant to a register. */ - *total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT) - ? 1 : 0); - else if (const_ok_for_arm (~INTVAL (x))) - *total = COSTS_N_INSNS (outer_code == AND ? 
0 : 1); - else if (const_ok_for_arm (-INTVAL (x))) - { - if (outer_code == COMPARE || outer_code == PLUS - || outer_code == MINUS) - *total = 0; - else - *total = COSTS_N_INSNS (1); - } - else - *total = COSTS_N_INSNS (2); - return true; - - case CONST: - case LABEL_REF: - case SYMBOL_REF: - *total = COSTS_N_INSNS (2); - return true; - - case CONST_DOUBLE: - *total = COSTS_N_INSNS (4); - return true; + /* XXX still guessing. */ + switch (GET_MODE (XEXP (x, 0))) + { + case QImode: + return (1 + (mode == DImode ? 4 : 0) + + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - case CONST_VECTOR: - if (TARGET_NEON - && TARGET_HARD_FLOAT - && outer_code == SET - && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)) - && neon_immediate_valid_for_move (x, mode, NULL, NULL)) - *total = COSTS_N_INSNS (1); - else - *total = COSTS_N_INSNS (4); - return true; + case HImode: + return (4 + (mode == DImode ? 4 : 0) + + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - case HIGH: - case LO_SUM: - /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the - cost of these slightly. */ - *total = COSTS_N_INSNS (1) + 1; - return true; + case SImode: + return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0)); - case SET: - return false; + default: + return 99; + } default: - if (mode != VOIDmode) - *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - else - *total = COSTS_N_INSNS (4); /* How knows? */ - return false; + return 99; } } @@ -9519,7 +8947,7 @@ arm_unspec_cost (rtx x, enum rtx_code /* outer_code */, bool speed_p, int *cost) flags are live or not, and thus no realistic way to determine what the size will eventually be. */ static bool -arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, +arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code, const struct cpu_cost_table *extra_cost, int *cost, bool speed_p) { @@ -10771,8 +10199,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode) && MEM_P (XEXP (x, 0))) { - *cost = rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed_p); - if (mode == DImode) *cost += COSTS_N_INSNS (1); @@ -11164,390 +10590,70 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, /* Vector costs? */ } *cost = LIBCALL_COST (1); - return false; - - case FLOAT: - case UNSIGNED_FLOAT: - if (TARGET_HARD_FLOAT) - { - /* ??? Increase the cost to deal with transferring from CORE - -> FP registers? */ - if (speed_p) - *cost += extra_cost->fp[mode == DFmode].fromint; - return false; - } - *cost = LIBCALL_COST (1); - return false; - - case CALL: - return true; - - case ASM_OPERANDS: - { - /* Just a guess. Guess number of instructions in the asm - plus one insn per input. Always a minimum of COSTS_N_INSNS (1) - though (see PR60663). */ - int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x))); - int num_operands = ASM_OPERANDS_INPUT_LENGTH (x); - - *cost = COSTS_N_INSNS (asm_length + num_operands); - return true; - } - default: - if (mode != VOIDmode) - *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode)); - else - *cost = COSTS_N_INSNS (4); /* Who knows? */ - return false; - } -} - -#undef HANDLE_NARROW_SHIFT_ARITH - -/* RTX costs when optimizing for size. */ -static bool -arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, - int opno ATTRIBUTE_UNUSED, int *total, bool speed) -{ - bool result; - int code = GET_CODE (x); - - if (TARGET_OLD_RTX_COSTS - || (!current_tune->insn_extra_cost && !TARGET_NEW_GENERIC_COSTS)) - { - /* Old way. (Deprecated.) 
*/ - if (!speed) - result = arm_size_rtx_costs (x, (enum rtx_code) code, - (enum rtx_code) outer_code, total); - else - result = current_tune->rtx_costs (x, (enum rtx_code) code, - (enum rtx_code) outer_code, total, - speed); - } - else - { - /* New way. */ - if (current_tune->insn_extra_cost) - result = arm_new_rtx_costs (x, (enum rtx_code) code, - (enum rtx_code) outer_code, - current_tune->insn_extra_cost, - total, speed); - /* TARGET_NEW_GENERIC_COSTS && !TARGET_OLD_RTX_COSTS - && current_tune->insn_extra_cost != NULL */ - else - result = arm_new_rtx_costs (x, (enum rtx_code) code, - (enum rtx_code) outer_code, - &generic_extra_costs, total, speed); - } - - if (dump_file && (dump_flags & TDF_DETAILS)) - { - print_rtl_single (dump_file, x); - fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold", - *total, result ? "final" : "partial"); - } - return result; -} - -/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not - supported on any "slowmul" cores, so it can be ignored. */ - -static bool -arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, - int *total, bool speed) -{ - machine_mode mode = GET_MODE (x); - - if (TARGET_THUMB) - { - *total = thumb1_rtx_costs (x, code, outer_code); - return true; - } - - switch (code) - { - case MULT: - if (GET_MODE_CLASS (mode) == MODE_FLOAT - || mode == DImode) - { - *total = COSTS_N_INSNS (20); - return false; - } - - if (CONST_INT_P (XEXP (x, 1))) - { - unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1)) - & (unsigned HOST_WIDE_INT) 0xffffffff); - int cost, const_ok = const_ok_for_arm (i); - int j, booth_unit_size; - - /* Tune as appropriate. */ - cost = const_ok ? 4 : 8; - booth_unit_size = 2; - for (j = 0; i && j < 32; j += booth_unit_size) - { - i >>= booth_unit_size; - cost++; - } - - *total = COSTS_N_INSNS (cost); - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - return true; - } - - *total = COSTS_N_INSNS (20); - return false; - - default: - return arm_rtx_costs_1 (x, outer_code, total, speed);; - } -} - - -/* RTX cost for cores with a fast multiply unit (M variants). */ - -static bool -arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, - int *total, bool speed) -{ - machine_mode mode = GET_MODE (x); - - if (TARGET_THUMB1) - { - *total = thumb1_rtx_costs (x, code, outer_code); - return true; - } - - /* ??? should thumb2 use different costs? */ - switch (code) - { - case MULT: - /* There is no point basing this on the tuning, since it is always the - fast variant if it exists at all. */ - if (mode == DImode - && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) - && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND - || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) - { - *total = COSTS_N_INSNS(2); - return false; - } - - - if (mode == DImode) - { - *total = COSTS_N_INSNS (5); - return false; - } - - if (CONST_INT_P (XEXP (x, 1))) - { - unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1)) - & (unsigned HOST_WIDE_INT) 0xffffffff); - int cost, const_ok = const_ok_for_arm (i); - int j, booth_unit_size; - - /* Tune as appropriate. */ - cost = const_ok ? 
4 : 8; - booth_unit_size = 8; - for (j = 0; i && j < 32; j += booth_unit_size) - { - i >>= booth_unit_size; - cost++; - } - - *total = COSTS_N_INSNS(cost); - return false; - } - - if (mode == SImode) - { - *total = COSTS_N_INSNS (4); - return false; - } - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - return false; - } - } - - /* Requires a lib call */ - *total = COSTS_N_INSNS (20); - return false; - - default: - return arm_rtx_costs_1 (x, outer_code, total, speed); - } -} - - -/* RTX cost for XScale CPUs. Thumb-2 is not supported on any xscale cores, - so it can be ignored. */ - -static bool -arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, - int *total, bool speed) -{ - machine_mode mode = GET_MODE (x); - - if (TARGET_THUMB) - { - *total = thumb1_rtx_costs (x, code, outer_code); - return true; - } - - switch (code) - { - case COMPARE: - if (GET_CODE (XEXP (x, 0)) != MULT) - return arm_rtx_costs_1 (x, outer_code, total, speed); - - /* A COMPARE of a MULT is slow on XScale; the muls instruction - will stall until the multiplication is complete. */ - *total = COSTS_N_INSNS (3); - return false; - - case MULT: - /* There is no point basing this on the tuning, since it is always the - fast variant if it exists at all. */ - if (mode == DImode - && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) - && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND - || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) - { - *total = COSTS_N_INSNS (2); - return false; - } - - - if (mode == DImode) - { - *total = COSTS_N_INSNS (5); - return false; - } - - if (CONST_INT_P (XEXP (x, 1))) - { - /* If operand 1 is a constant we can more accurately - calculate the cost of the multiply. The multiplier can - retire 15 bits on the first cycle and a further 12 on the - second. We do, of course, have to load the constant into - a register first. */ - unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); - /* There's a general overhead of one cycle. */ - int cost = 1; - unsigned HOST_WIDE_INT masked_const; - - if (i & 0x80000000) - i = ~i; - - i &= (unsigned HOST_WIDE_INT) 0xffffffff; - - masked_const = i & 0xffff8000; - if (masked_const != 0) - { - cost++; - masked_const = i & 0xf8000000; - if (masked_const != 0) - cost++; - } - *total = COSTS_N_INSNS (cost); - return false; - } + return false; - if (mode == SImode) + case FLOAT: + case UNSIGNED_FLOAT: + if (TARGET_HARD_FLOAT) { - *total = COSTS_N_INSNS (3); + /* ??? Increase the cost to deal with transferring from CORE + -> FP registers? */ + if (speed_p) + *cost += extra_cost->fp[mode == DFmode].fromint; return false; } - - /* Requires a lib call */ - *total = COSTS_N_INSNS (20); + *cost = LIBCALL_COST (1); return false; + case CALL: + return true; + + case ASM_OPERANDS: + { + /* Just a guess. Guess number of instructions in the asm + plus one insn per input. Always a minimum of COSTS_N_INSNS (1) + though (see PR60663). */ + int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x))); + int num_operands = ASM_OPERANDS_INPUT_LENGTH (x); + + *cost = COSTS_N_INSNS (asm_length + num_operands); + return true; + } default: - return arm_rtx_costs_1 (x, outer_code, total, speed); + if (mode != VOIDmode) + *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + else + *cost = COSTS_N_INSNS (4); /* Who knows? */ + return false; } } +#undef HANDLE_NARROW_SHIFT_ARITH -/* RTX costs for 9e (and later) cores. */ +/* RTX costs entry point. 
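These hunks finish the move away from per-core cost callbacks (arm_slowmul_rtx_costs, arm_fastmul_rtx_costs, arm_xscale_rtx_costs and, below, arm_9e_rtx_costs): after this patch a single walker, arm_rtx_costs_internal, handles every tuning, and the per-CPU variation lives entirely in the insn_extra_cost table it is handed. Reduced to a toy, the arrangement looks like this (the struct fields and numbers are invented for illustration; the real tables are the cpu_cost_table instances in aarch-cost-tables.h and arm.c):

    #include <stdio.h>

    /* One generic cost walker, per-CPU numbers in a table.  */
    struct cost_table { int mult_si; int mem_load; };

    static const struct cost_table slowmul_tune = { 8, 2 };
    static const struct cost_table fastmul_tune = { 2, 2 };

    enum op_code { OP_MULT, OP_MEM };

    static int
    rtx_cost_internal (enum op_code code, const struct cost_table *extra)
    {
      switch (code)
        {
        case OP_MULT: return extra->mult_si;
        case OP_MEM:  return extra->mem_load;
        default:      return 1;
        }
    }

    int
    main (void)
    {
      printf ("mult cost: slow=%d fast=%d\n",
              rtx_cost_internal (OP_MULT, &slowmul_tune),
              rtx_cost_internal (OP_MULT, &fastmul_tune));
      return 0;
    }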
*/ static bool -arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, - int *total, bool speed) +arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, + int opno ATTRIBUTE_UNUSED, int *total, bool speed) { - machine_mode mode = GET_MODE (x); - - if (TARGET_THUMB1) - { - switch (code) - { - case MULT: - /* Small multiply: 32 cycles for an integer multiply inst. */ - if (arm_arch6m && arm_m_profile_small_mul) - *total = COSTS_N_INSNS (32); - else - *total = COSTS_N_INSNS (3); - return true; + bool result; + int code = GET_CODE (x); + gcc_assert (current_tune->insn_extra_cost); - default: - *total = thumb1_rtx_costs (x, code, outer_code); - return true; - } - } + result = arm_rtx_costs_internal (x, (enum rtx_code) code, + (enum rtx_code) outer_code, + current_tune->insn_extra_cost, + total, speed); - switch (code) + if (dump_file && (dump_flags & TDF_DETAILS)) { - case MULT: - /* There is no point basing this on the tuning, since it is always the - fast variant if it exists at all. */ - if (mode == DImode - && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) - && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND - || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) - { - *total = COSTS_N_INSNS (2); - return false; - } - - - if (mode == DImode) - { - *total = COSTS_N_INSNS (5); - return false; - } - - if (mode == SImode) - { - *total = COSTS_N_INSNS (2); - return false; - } - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - if (TARGET_HARD_FLOAT - && (mode == SFmode - || (mode == DFmode && !TARGET_VFP_SINGLE))) - { - *total = COSTS_N_INSNS (1); - return false; - } - } - - *total = COSTS_N_INSNS (20); - return false; - - default: - return arm_rtx_costs_1 (x, outer_code, total, speed); + print_rtl_single (dump_file, x); + fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold", + *total, result ? "final" : "partial"); } + return result; } + /* All address computations that can be done are free, but rtx cost returns the same for practically all of them. So we weight the different types of address here in the order (most pref first): @@ -12269,7 +11375,7 @@ vfp3_const_double_index (rtx x) /* We can permit four significant bits of mantissa only, plus a high bit which is always 1. */ - mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; + mask = (HOST_WIDE_INT_1U << (point_pos - 5)) - 1; if ((mantissa & mask) != 0) return -1; @@ -12423,6 +11529,12 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse, return 18; } + /* The tricks done in the code below apply for little-endian vector layout. + For big-endian vectors only allow vectors of the form { a, a, a..., a }. + FIXME: Implement logic for big-endian vectors. */ + if (BYTES_BIG_ENDIAN && vector && !const_vec_duplicate_p (op)) + return -1; + /* Splat vector constant out into a byte vector. */ for (i = 0; i < n_elts; i++) { @@ -13151,7 +12263,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb) { if (mode == HFmode) { - if (!TARGET_NEON_FP16) + if (!TARGET_NEON_FP16 && !TARGET_VFP_FP16INST) return GENERAL_REGS; if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true)) return NO_REGS; @@ -15988,14 +15100,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load, /* If the same input register is used in both stores when storing different constants, try to find a free register. 
For example, the code - mov r0, 0 - str r0, [r2] - mov r0, 1 - str r0, [r2, #4] + mov r0, 0 + str r0, [r2] + mov r0, 1 + str r0, [r2, #4] can be transformed into - mov r1, 0 - strd r1, r0, [r2] - in Thumb mode assuming that r1 is free. */ + mov r1, 0 + mov r0, 1 + strd r1, r0, [r2] + in Thumb mode assuming that r1 is free. + For ARM mode do the same but only if the starting register + can be made to be even. */ if (const_store && REGNO (operands[0]) == REGNO (operands[1]) && INTVAL (operands[4]) != INTVAL (operands[5])) @@ -16014,7 +15129,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load, } else if (TARGET_ARM) { - return false; int regno = REGNO (operands[0]); if (!peep2_reg_dead_p (4, operands[0])) { @@ -16368,7 +15482,7 @@ get_jump_table_size (rtx_jump_table_data *insn) { case 1: /* Round up size of TBB table to a halfword boundary. */ - size = (size + 1) & ~(HOST_WIDE_INT)1; + size = (size + 1) & ~HOST_WIDE_INT_1; break; case 2: /* No padding necessary for TBH. */ @@ -16837,35 +15951,37 @@ dump_minipool (rtx_insn *scan) fputc ('\n', dump_file); } + rtx val = copy_rtx (mp->value); + switch (GET_MODE_SIZE (mp->mode)) { #ifdef HAVE_consttable_1 case 1: - scan = emit_insn_after (gen_consttable_1 (mp->value), scan); + scan = emit_insn_after (gen_consttable_1 (val), scan); break; #endif #ifdef HAVE_consttable_2 case 2: - scan = emit_insn_after (gen_consttable_2 (mp->value), scan); + scan = emit_insn_after (gen_consttable_2 (val), scan); break; #endif #ifdef HAVE_consttable_4 case 4: - scan = emit_insn_after (gen_consttable_4 (mp->value), scan); + scan = emit_insn_after (gen_consttable_4 (val), scan); break; #endif #ifdef HAVE_consttable_8 case 8: - scan = emit_insn_after (gen_consttable_8 (mp->value), scan); + scan = emit_insn_after (gen_consttable_8 (val), scan); break; #endif #ifdef HAVE_consttable_16 case 16: - scan = emit_insn_after (gen_consttable_16 (mp->value), scan); + scan = emit_insn_after (gen_consttable_16 (val), scan); break; #endif @@ -17269,6 +16385,470 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes) return; } +/* This function computes the clear mask and PADDING_BITS_TO_CLEAR for structs + and unions in the context of ARMv8-M Security Extensions. It is used as a + helper function for both 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' + functions. The PADDING_BITS_TO_CLEAR pointer can be the base to either one + or four masks, depending on whether it is being computed for a + 'cmse_nonsecure_entry' return value or a 'cmse_nonsecure_call' argument + respectively. The tree for the type of the argument or a field within an + argument is passed in ARG_TYPE, the current register this argument or field + starts in is kept in the pointer REGNO and updated accordingly, the bit this + argument or field starts at is passed in STARTING_BIT and the last used bit + is kept in LAST_USED_BIT which is also updated accordingly. */ + +static unsigned HOST_WIDE_INT +comp_not_to_clear_mask_str_un (tree arg_type, int * regno, + uint32_t * padding_bits_to_clear, + unsigned starting_bit, int * last_used_bit) + +{ + unsigned HOST_WIDE_INT not_to_clear_reg_mask = 0; + + if (TREE_CODE (arg_type) == RECORD_TYPE) + { + unsigned current_bit = starting_bit; + tree field; + long int offset, size; + + + field = TYPE_FIELDS (arg_type); + while (field) + { + /* The offset within a structure is always an offset from + the start of that structure. Make sure we take that into the + calculation of the register based offset that we use here. 
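A concrete case makes the register and padding bookkeeping that follows easier to read: for a struct { char c; short s; } passed in r0, the char occupies bits 0-7, the short is aligned up to bit 16, so bits 8-15 are interior padding that the caller must clear before a non-secure call. A simplified recomputation of that mask, with the field layout hard-coded rather than walked via TYPE_FIELDS as the helper does:

    #include <stdint.h>
    #include <stdio.h>

    struct field_info { unsigned bit_offset, bit_size; };

    /* Layout of struct { char c; short s; } under AAPCS:
       c at bit 0 (8 bits), s at bit 16 (16 bits).  */
    static const struct field_info fields[] = { { 0, 8 }, { 16, 16 } };

    int
    main (void)
    {
      uint32_t used = 0;
      for (unsigned i = 0; i < sizeof fields / sizeof fields[0]; i++)
        {
          uint32_t w = fields[i].bit_size >= 32
                       ? 0xffffffffu
                       : (1u << fields[i].bit_size) - 1;
          used |= w << fields[i].bit_offset;
        }
      /* Everything not covered by a field is padding to clear.  */
      printf ("padding_bits_to_clear[r0] = %08x\n", ~used); /* 0000ff00 */
      return 0;
    }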
*/ + offset = starting_bit; + offset += TREE_INT_CST_ELT (DECL_FIELD_BIT_OFFSET (field), 0); + offset %= 32; + + /* This is the actual size of the field, for bitfields this is the + bitfield width and not the container size. */ + size = TREE_INT_CST_ELT (DECL_SIZE (field), 0); + + if (*last_used_bit != offset) + { + if (offset < *last_used_bit) + { + /* This field's offset is before the 'last_used_bit', that + means this field goes on the next register. So we need to + pad the rest of the current register and increase the + register number. */ + uint32_t mask; + mask = ((uint32_t)-1) - ((uint32_t) 1 << *last_used_bit); + mask++; + + padding_bits_to_clear[*regno] |= mask; + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + (*regno)++; + } + else + { + /* Otherwise we pad the bits between the last field's end and + the start of the new field. */ + uint32_t mask; + + mask = ((uint32_t)-1) >> (32 - offset); + mask -= ((uint32_t) 1 << *last_used_bit) - 1; + padding_bits_to_clear[*regno] |= mask; + } + current_bit = offset; + } + + /* Calculate further padding bits for inner structs/unions too. */ + if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field))) + { + *last_used_bit = current_bit; + not_to_clear_reg_mask + |= comp_not_to_clear_mask_str_un (TREE_TYPE (field), regno, + padding_bits_to_clear, offset, + last_used_bit); + } + else + { + /* Update 'current_bit' with this field's size. If the + 'current_bit' lies in a subsequent register, update 'regno' and + reset 'current_bit' to point to the current bit in that new + register. */ + current_bit += size; + while (current_bit >= 32) + { + current_bit-=32; + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + (*regno)++; + } + *last_used_bit = current_bit; + } + + field = TREE_CHAIN (field); + } + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + } + else if (TREE_CODE (arg_type) == UNION_TYPE) + { + tree field, field_t; + int i, regno_t, field_size; + int max_reg = -1; + int max_bit = -1; + uint32_t mask; + uint32_t padding_bits_to_clear_res[NUM_ARG_REGS] + = {-1, -1, -1, -1}; + + /* To compute the padding bits in a union we only consider bits as + padding bits if they are always either a padding bit or fall outside a + fields size for all fields in the union. */ + field = TYPE_FIELDS (arg_type); + while (field) + { + uint32_t padding_bits_to_clear_t[NUM_ARG_REGS] + = {0U, 0U, 0U, 0U}; + int last_used_bit_t = *last_used_bit; + regno_t = *regno; + field_t = TREE_TYPE (field); + + /* If the field's type is either a record or a union make sure to + compute their padding bits too. */ + if (RECORD_OR_UNION_TYPE_P (field_t)) + not_to_clear_reg_mask + |= comp_not_to_clear_mask_str_un (field_t, ®no_t, + &padding_bits_to_clear_t[0], + starting_bit, &last_used_bit_t); + else + { + field_size = TREE_INT_CST_ELT (DECL_SIZE (field), 0); + regno_t = (field_size / 32) + *regno; + last_used_bit_t = (starting_bit + field_size) % 32; + } + + for (i = *regno; i < regno_t; i++) + { + /* For all but the last register used by this field only keep the + padding bits that were padding bits in this field. */ + padding_bits_to_clear_res[i] &= padding_bits_to_clear_t[i]; + } + + /* For the last register, keep all padding bits that were padding + bits in this field and any padding bits that are still valid + as padding bits but fall outside of this field's size. 
*/ + mask = (((uint32_t) -1) - ((uint32_t) 1 << last_used_bit_t)) + 1; + padding_bits_to_clear_res[regno_t] + &= padding_bits_to_clear_t[regno_t] | mask; + + /* Update the maximum size of the fields in terms of registers used + ('max_reg') and the 'last_used_bit' in said register. */ + if (max_reg < regno_t) + { + max_reg = regno_t; + max_bit = last_used_bit_t; + } + else if (max_reg == regno_t && max_bit < last_used_bit_t) + max_bit = last_used_bit_t; + + field = TREE_CHAIN (field); + } + + /* Update the current padding_bits_to_clear using the intersection of the + padding bits of all the fields. */ + for (i=*regno; i < max_reg; i++) + padding_bits_to_clear[i] |= padding_bits_to_clear_res[i]; + + /* Do not keep trailing padding bits, we do not know yet whether this + is the end of the argument. */ + mask = ((uint32_t) 1 << max_bit) - 1; + padding_bits_to_clear[max_reg] + |= padding_bits_to_clear_res[max_reg] & mask; + + *regno = max_reg; + *last_used_bit = max_bit; + } + else + /* This function should only be used for structs and unions. */ + gcc_unreachable (); + + return not_to_clear_reg_mask; +} + +/* In the context of ARMv8-M Security Extensions, this function is used for both + 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' functions to compute what + registers are used when returning or passing arguments, which is then + returned as a mask. It will also compute a mask to indicate padding/unused + bits for each of these registers, and passes this through the + PADDING_BITS_TO_CLEAR pointer. The tree of the argument type is passed in + ARG_TYPE, the rtl representation of the argument is passed in ARG_RTX and + the starting register used to pass this argument or return value is passed + in REGNO. It makes use of 'comp_not_to_clear_mask_str_un' to compute these + for struct and union types. */ + +static unsigned HOST_WIDE_INT +compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno, + uint32_t * padding_bits_to_clear) + +{ + int last_used_bit = 0; + unsigned HOST_WIDE_INT not_to_clear_mask; + + if (RECORD_OR_UNION_TYPE_P (arg_type)) + { + not_to_clear_mask + = comp_not_to_clear_mask_str_un (arg_type, ®no, + padding_bits_to_clear, 0, + &last_used_bit); + + + /* If the 'last_used_bit' is not zero, that means we are still using a + part of the last 'regno'. In such cases we must clear the trailing + bits. Otherwise we are not using regno and we should mark it as to + clear. */ + if (last_used_bit != 0) + padding_bits_to_clear[regno] + |= ((uint32_t)-1) - ((uint32_t) 1 << last_used_bit) + 1; + else + not_to_clear_mask &= ~(HOST_WIDE_INT_1U << regno); + } + else + { + not_to_clear_mask = 0; + /* We are not dealing with structs nor unions. So these arguments may be + passed in floating point registers too. In some cases a BLKmode is + used when returning or passing arguments in multiple VFP registers. */ + if (GET_MODE (arg_rtx) == BLKmode) + { + int i, arg_regs; + rtx reg; + + /* This should really only occur when dealing with the hard-float + ABI. */ + gcc_assert (TARGET_HARD_FLOAT_ABI); + + for (i = 0; i < XVECLEN (arg_rtx, 0); i++) + { + reg = XEXP (XVECEXP (arg_rtx, 0, i), 0); + gcc_assert (REG_P (reg)); + + not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (reg); + + /* If we are dealing with DF mode, make sure we don't + clear either of the registers it addresses. 
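The arithmetic just below, repeated in the non-BLKmode branch, is the usual trick for marking a run of registers in a bitmask: (1 << (regno + n)) - (1 << regno) leaves exactly n consecutive bits set starting at regno. For a two-register DFmode value starting at a hypothetical register number 16:

    #include <stdio.h>

    int
    main (void)
    {
      unsigned regno = 16, nregs = 2;   /* e.g. a double in d0 = s0/s1 */
      unsigned long long mask = (1ULL << (regno + nregs))
                                - (1ULL << regno);
      printf ("%#llx\n", mask);         /* 0x30000: bits 16 and 17 set */
      return 0;
    }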
*/ + arg_regs = ARM_NUM_REGS (GET_MODE (reg)); + if (arg_regs > 1) + { + unsigned HOST_WIDE_INT mask; + mask = HOST_WIDE_INT_1U << (REGNO (reg) + arg_regs); + mask -= HOST_WIDE_INT_1U << REGNO (reg); + not_to_clear_mask |= mask; + } + } + } + else + { + /* Otherwise we can rely on the MODE to determine how many registers + are being used by this argument. */ + int arg_regs = ARM_NUM_REGS (GET_MODE (arg_rtx)); + not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (arg_rtx); + if (arg_regs > 1) + { + unsigned HOST_WIDE_INT + mask = HOST_WIDE_INT_1U << (REGNO (arg_rtx) + arg_regs); + mask -= HOST_WIDE_INT_1U << REGNO (arg_rtx); + not_to_clear_mask |= mask; + } + } + } + + return not_to_clear_mask; +} + +/* Saves callee saved registers, clears callee saved registers and caller saved + registers not used to pass arguments before a cmse_nonsecure_call. And + restores the callee saved registers after. */ + +static void +cmse_nonsecure_call_clear_caller_saved (void) +{ + basic_block bb; + + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + + FOR_BB_INSNS (bb, insn) + { + uint64_t to_clear_mask, float_mask; + rtx_insn *seq; + rtx pat, call, unspec, reg, cleared_reg, tmp; + unsigned int regno, maxregno; + rtx address; + CUMULATIVE_ARGS args_so_far_v; + cumulative_args_t args_so_far; + tree arg_type, fntype; + bool using_r4, first_param = true; + function_args_iterator args_iter; + uint32_t padding_bits_to_clear[4] = {0U, 0U, 0U, 0U}; + uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear[0]; + + if (!NONDEBUG_INSN_P (insn)) + continue; + + if (!CALL_P (insn)) + continue; + + pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL && XVECLEN (pat, 0) > 0); + call = XVECEXP (pat, 0, 0); + + /* Get the real call RTX if the insn sets a value, ie. returns. */ + if (GET_CODE (call) == SET) + call = SET_SRC (call); + + /* Check if it is a cmse_nonsecure_call. */ + unspec = XEXP (call, 0); + if (GET_CODE (unspec) != UNSPEC + || XINT (unspec, 1) != UNSPEC_NONSECURE_MEM) + continue; + + /* Determine the caller-saved registers we need to clear. */ + to_clear_mask = (1LL << (NUM_ARG_REGS)) - 1; + maxregno = NUM_ARG_REGS - 1; + /* Only look at the caller-saved floating point registers in case of + -mfloat-abi=hard. For -mfloat-abi=softfp we will be using the + lazy store and loads which clear both caller- and callee-saved + registers. */ + if (TARGET_HARD_FLOAT_ABI) + { + float_mask = (1LL << (D7_VFP_REGNUM + 1)) - 1; + float_mask &= ~((1LL << FIRST_VFP_REGNUM) - 1); + to_clear_mask |= float_mask; + maxregno = D7_VFP_REGNUM; + } + + /* Make sure the register used to hold the function address is not + cleared. */ + address = RTVEC_ELT (XVEC (unspec, 0), 0); + gcc_assert (MEM_P (address)); + gcc_assert (REG_P (XEXP (address, 0))); + to_clear_mask &= ~(1LL << REGNO (XEXP (address, 0))); + + /* Set basic block of call insn so that df rescan is performed on + insns inserted here. */ + set_block_for_insn (insn, bb); + df_set_flags (DF_DEFER_INSN_RESCAN); + start_sequence (); + + /* Make sure the scheduler doesn't schedule other insns beyond + here. */ + emit_insn (gen_blockage ()); + + /* Walk through all arguments and clear registers appropriately. 
+ */ + fntype = TREE_TYPE (MEM_EXPR (address)); + arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, + NULL_TREE); + args_so_far = pack_cumulative_args (&args_so_far_v); + FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter) + { + rtx arg_rtx; + machine_mode arg_mode = TYPE_MODE (arg_type); + + if (VOID_TYPE_P (arg_type)) + continue; + + if (!first_param) + arm_function_arg_advance (args_so_far, arg_mode, arg_type, + true); + + arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, + true); + gcc_assert (REG_P (arg_rtx)); + to_clear_mask + &= ~compute_not_to_clear_mask (arg_type, arg_rtx, + REGNO (arg_rtx), + padding_bits_to_clear_ptr); + + first_param = false; + } + + /* Clear padding bits where needed. */ + cleared_reg = XEXP (address, 0); + reg = gen_rtx_REG (SImode, IP_REGNUM); + using_r4 = false; + for (regno = R0_REGNUM; regno < NUM_ARG_REGS; regno++) + { + if (padding_bits_to_clear[regno] == 0) + continue; + + /* If this is a Thumb-1 target copy the address of the function + we are calling from 'r4' into 'ip' such that we can use r4 to + clear the unused bits in the arguments. */ + if (TARGET_THUMB1 && !using_r4) + { + using_r4 = true; + reg = cleared_reg; + emit_move_insn (gen_rtx_REG (SImode, IP_REGNUM), + reg); + } + + tmp = GEN_INT ((((~padding_bits_to_clear[regno]) << 16u) >> 16u)); + emit_move_insn (reg, tmp); + /* Also fill the top half of the negated + padding_bits_to_clear. */ + if (((~padding_bits_to_clear[regno]) >> 16) > 0) + { + tmp = GEN_INT ((~padding_bits_to_clear[regno]) >> 16); + emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg, + GEN_INT (16), + GEN_INT (16)), + tmp)); + } + + emit_insn (gen_andsi3 (gen_rtx_REG (SImode, regno), + gen_rtx_REG (SImode, regno), + reg)); + + } + if (using_r4) + emit_move_insn (cleared_reg, + gen_rtx_REG (SImode, IP_REGNUM)); + + /* We use right shift and left shift to clear the LSB of the address + we jump to instead of using bic, to avoid having to use an extra + register on Thumb-1. */ + tmp = gen_rtx_LSHIFTRT (SImode, cleared_reg, const1_rtx); + emit_insn (gen_rtx_SET (cleared_reg, tmp)); + tmp = gen_rtx_ASHIFT (SImode, cleared_reg, const1_rtx); + emit_insn (gen_rtx_SET (cleared_reg, tmp)); + + /* Clearing all registers that leak before doing a non-secure + call. */ + for (regno = R0_REGNUM; regno <= maxregno; regno++) + { + if (!(to_clear_mask & (1LL << regno))) + continue; + + /* If regno is an even vfp register and its successor is also to + be cleared, use vmov. */ + if (IS_VFP_REGNUM (regno)) + { + if (TARGET_VFP_DOUBLE + && VFP_REGNO_OK_FOR_DOUBLE (regno) + && to_clear_mask & (1LL << (regno + 1))) + emit_move_insn (gen_rtx_REG (DFmode, regno++), + CONST0_RTX (DFmode)); + else + emit_move_insn (gen_rtx_REG (SFmode, regno), + CONST0_RTX (SFmode)); + } + else + emit_move_insn (gen_rtx_REG (SImode, regno), cleared_reg); + } + + seq = get_insns (); + end_sequence (); + emit_insn_before (seq, insn); + + } + } +} + /* Rewrite move insn into subtract of 0 if the condition codes will be useful in next conditional jump insn. */ @@ -17569,6 +17149,8 @@ arm_reorg (void) HOST_WIDE_INT address = 0; Mfix * fix; + if (use_cmse) + cmse_nonsecure_call_clear_caller_saved (); if (TARGET_THUMB1) thumb1_reorg (); else if (TARGET_THUMB2) @@ -17941,6 +17523,23 @@ vfp_emit_fstmd (int base_reg, int count) return count * 8; } +/* Returns true if -mcmse has been passed and the function pointed to by 'addr' + has the cmse_nonsecure_call attribute and returns false otherwise. 
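The attribute tested here (the body of detect_cmse_nonsecure_call follows) is applied to function types in user code, typically through a pointer type; compiled with -mcmse, any call through such a pointer receives the register-clearing sequence inserted by cmse_nonsecure_call_clear_caller_saved above. A minimal use, following the ARMv8-M Security Extensions ACLE (compile for an ARMv8-M target, e.g. with -mcpu=cortex-m33 -mcmse):

    /* Calls through ns_func_t are non-secure calls: the compiler clears
       caller-saved registers not used to pass arguments, so no secure
       state leaks across the security boundary.  */
    typedef int (*ns_func_t) (int) __attribute__ ((cmse_nonsecure_call));

    int
    call_nonsecure (ns_func_t f, int x)
    {
      return f (x);
    }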
*/ + +bool +detect_cmse_nonsecure_call (tree addr) +{ + if (!addr) + return FALSE; + + tree fntype = TREE_TYPE (addr); + if (use_cmse && lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (fntype))) + return TRUE; + return FALSE; +} + + /* Emit a call instruction with pattern PAT. ADDR is the address of the call target. */ @@ -18600,6 +18199,8 @@ output_move_vfp (rtx *operands) rtx reg, mem, addr, ops[2]; int load = REG_P (operands[0]); int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8; + int sp = (!TARGET_VFP_FP16INST + || GET_MODE_SIZE (GET_MODE (operands[0])) == 4); int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT; const char *templ; char buff[50]; @@ -18612,8 +18213,10 @@ output_move_vfp (rtx *operands) gcc_assert (REG_P (reg)); gcc_assert (IS_VFP_REGNUM (REGNO (reg))); - gcc_assert (mode == SFmode + gcc_assert ((mode == HFmode && TARGET_HARD_FLOAT) + || mode == SFmode || mode == DFmode + || mode == HImode || mode == SImode || mode == DImode || (TARGET_NEON && VALID_NEON_DREG_MODE (mode))); @@ -18644,7 +18247,7 @@ output_move_vfp (rtx *operands) sprintf (buff, templ, load ? "ld" : "st", - dp ? "64" : "32", + dp ? "64" : sp ? "32" : "16", dp ? "P" : "", integer_p ? "\t%@ int" : ""); output_asm_insn (buff, ops); @@ -19070,7 +18673,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp) return NULL; } - *amountp = int_log2 (*amountp); + *amountp = exact_log2 (*amountp); + gcc_assert (IN_RANGE (*amountp, 0, 31)); return ARM_LSL_NAME; default: @@ -19102,22 +18706,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp) return mnem; } -/* Obtain the shift from the POWER of two. */ - -static HOST_WIDE_INT -int_log2 (HOST_WIDE_INT power) -{ - HOST_WIDE_INT shift = 0; - - while ((((HOST_WIDE_INT) 1 << shift) & power) == 0) - { - gcc_assert (shift <= 31); - shift++; - } - - return shift; -} - /* Output a .ascii pseudo-op, keeping track of lengths. This is because /bin/as is horribly restrictive. The judgement about whether or not each character is 'printable' (and can be output as @@ -19474,7 +19062,7 @@ arm_get_vfp_saved_size (void) saved = 0; /* Space for saved VFP registers. */ - if (TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_HARD_FLOAT) { count = 0; for (regno = FIRST_VFP_REGNUM; @@ -19563,6 +19151,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, (e.g. interworking) then we can load the return address directly into the PC. Otherwise we must load it into LR. */ if (really_return + && !IS_CMSE_ENTRY (func_type) && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK)) return_reg = reg_names[PC_REGNUM]; else @@ -19703,18 +19292,93 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, break; default: + if (IS_CMSE_ENTRY (func_type)) + { + /* Check if we have to clear the 'GE bits' which is only used if + parallel add and subtraction instructions are available. */ + if (TARGET_INT_SIMD) + snprintf (instr, sizeof (instr), + "msr%s\tAPSR_nzcvqg, %%|lr", conditional); + else + snprintf (instr, sizeof (instr), + "msr%s\tAPSR_nzcvq, %%|lr", conditional); + + output_asm_insn (instr, & operand); + if (TARGET_HARD_FLOAT && !TARGET_THUMB1) + { + /* Clear the cumulative exception-status bits (0-4,7) and the + condition code bits (28-31) of the FPSCR. We need to + remember to clear the first scratch register used (IP) and + save and restore the second (r4). 
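The two constants loaded just below form the AND mask for the FPSCR: movw supplies the low half, 65376 (0xff60), and movt the high half, 4095 (0x0fff), for a combined keep-mask of 0x0fffff60. Its zero bits are exactly 0-4 and 7 (the cumulative exception flags) plus 28-31 (N, Z, C, V), matching the comment above. Verifying the composition on the host:

    #include <stdio.h>

    int
    main (void)
    {
      unsigned mask = (4095u << 16) | 65376u; /* movt #4095, movw #65376 */
      printf ("keep  = %08x\n", mask);   /* 0fffff60 */
      printf ("clear = %08x\n", ~mask);  /* f000009f: bits 0-4, 7, 28-31 */
      return 0;
    }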
*/ + snprintf (instr, sizeof (instr), "push\t{%%|r4}"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "vmrs\t%%|ip, fpscr"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "movw\t%%|r4, #65376"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "movt\t%%|r4, #4095"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "and\t%%|ip, %%|r4"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "vmsr\tfpscr, %%|ip"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "pop\t{%%|r4}"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "mov\t%%|ip, %%|lr"); + output_asm_insn (instr, & operand); + } + snprintf (instr, sizeof (instr), "bxns\t%%|lr"); + } /* Use bx if it's available. */ - if (arm_arch5 || arm_arch4t) + else if (arm_arch5 || arm_arch4t) sprintf (instr, "bx%s\t%%|lr", conditional); else sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional); break; } - output_asm_insn (instr, & operand); + output_asm_insn (instr, & operand); + } + + return ""; +} + +/* Output in FILE asm statements needed to declare the NAME of the function + defined by its DECL node. */ + +void +arm_asm_declare_function_name (FILE *file, const char *name, tree decl) +{ + size_t cmse_name_len; + char *cmse_name = 0; + char cmse_prefix[] = "__acle_se_"; + + /* When compiling with ARMv8-M Security Extensions enabled, we should print an + extra function label for each function with the 'cmse_nonsecure_entry' + attribute. This extra function label should be prepended with + '__acle_se_', telling the linker that it needs to create secure gateway + veneers for this function. */ + if (use_cmse && lookup_attribute ("cmse_nonsecure_entry", + DECL_ATTRIBUTES (decl))) + { + cmse_name_len = sizeof (cmse_prefix) + strlen (name); + cmse_name = XALLOCAVEC (char, cmse_name_len); + snprintf (cmse_name, cmse_name_len, "%s%s", cmse_prefix, name); + targetm.asm_out.globalize_label (file, cmse_name); + + ARM_DECLARE_FUNCTION_NAME (file, cmse_name, decl); + ASM_OUTPUT_TYPE_DIRECTIVE (file, cmse_name, "function"); } - return ""; + ARM_DECLARE_FUNCTION_NAME (file, name, decl); + ASM_OUTPUT_TYPE_DIRECTIVE (file, name, "function"); + ASM_DECLARE_RESULT (file, DECL_RESULT (decl)); + ASM_OUTPUT_LABEL (file, name); + + if (cmse_name) + ASM_OUTPUT_LABEL (file, cmse_name); + + ARM_OUTPUT_FN_UNWIND (file, TRUE); } /* Write the function name into the code section, directly preceding @@ -19766,10 +19430,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) { unsigned long func_type; - /* ??? Do we want to print some of the below anyway? */ - if (TARGET_THUMB1) - return; - /* Sanity check. */ gcc_assert (!arm_ccfsm_state && !arm_target_insn); @@ -19804,6 +19464,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n"); if (IS_STACKALIGN (func_type)) asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n"); + if (IS_CMSE_ENTRY (func_type)) + asm_fprintf (f, "\t%@ Non-secure entry function: called from non-secure code.\n"); asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n", crtl->args.size, @@ -20473,7 +20135,7 @@ arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg) REG_NOTES (par) = dwarf; /* Make sure cfa doesn't leave with IP_REGNUM to allow unwinding fron FP. 
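This is the entry-side counterpart of the non-secure call support: a function marked cmse_nonsecure_entry gets the extra __acle_se_-prefixed label emitted by arm_asm_declare_function_name above (so the linker can build its secure-gateway veneer), has caller-saved registers scrubbed by cmse_nonsecure_entry_clear_before_return later in the patch, and returns via bxns. In source form, compiled with -mcmse on an ARMv8-M target:

    /* Non-secure code may call this function; on return, caller-saved
       registers that do not hold the result are cleared so secure
       state cannot leak.  */
    extern int key_table[8];        /* hypothetical secure-side data */

    int __attribute__ ((cmse_nonsecure_entry))
    get_key (int slot)
    {
      return key_table[slot & 7];
    }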
*/ - if (TARGET_VFP && REGNO (base_reg) == IP_REGNUM) + if (REGNO (base_reg) == IP_REGNUM) { RTX_FRAME_RELATED_P (par) = 1; add_reg_note (par, REG_CFA_DEF_CFA, hard_frame_pointer_rtx); @@ -20934,7 +20596,7 @@ arm_get_frame_offsets (void) func_type = arm_current_func_type (); /* Space for saved VFP registers. */ if (! IS_VOLATILE (func_type) - && TARGET_HARD_FLOAT && TARGET_VFP) + && TARGET_HARD_FLOAT) saved += arm_get_vfp_saved_size (); } else /* TARGET_THUMB1 */ @@ -21155,7 +20817,7 @@ arm_save_coproc_regs(void) saved_size += 8; } - if (TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_HARD_FLOAT) { start_reg = FIRST_VFP_REGNUM; @@ -22941,6 +22603,8 @@ maybe_get_arm_condition_code (rtx comparison) { case LTU: return ARM_CS; case GEU: return ARM_CC; + case NE: return ARM_CS; + case EQ: return ARM_CC; default: return ARM_NV; } @@ -22966,6 +22630,14 @@ maybe_get_arm_condition_code (rtx comparison) default: return ARM_NV; } + case CC_Vmode: + switch (comp_code) + { + case NE: return ARM_VS; + case EQ: return ARM_VC; + default: return ARM_NV; + } + case CCmode: switch (comp_code) { @@ -23396,7 +23068,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) { if (GET_MODE_CLASS (mode) == MODE_CC) return (regno == CC_REGNUM - || (TARGET_HARD_FLOAT && TARGET_VFP + || (TARGET_HARD_FLOAT && regno == VFPCC_REGNUM)); if (regno == CC_REGNUM && GET_MODE_CLASS (mode) != MODE_CC) @@ -23410,8 +23082,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) start of an even numbered register pair. */ return (ARM_NUM_REGS (mode) < 2) || (regno < LAST_LO_REGNUM); - if (TARGET_HARD_FLOAT && TARGET_VFP - && IS_VFP_REGNUM (regno)) + if (TARGET_HARD_FLOAT && IS_VFP_REGNUM (regno)) { if (mode == SFmode || mode == SImode) return VFP_REGNO_OK_FOR_SINGLE (regno); @@ -23419,10 +23090,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) if (mode == DFmode) return VFP_REGNO_OK_FOR_DOUBLE (regno); - /* VFP registers can hold HFmode values, but there is no point in - putting them there unless we have hardware conversion insns. */ if (mode == HFmode) - return TARGET_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno); + return VFP_REGNO_OK_FOR_SINGLE (regno); + + /* VFP registers can hold HImode values. */ + if (mode == HImode) + return VFP_REGNO_OK_FOR_SINGLE (regno); if (TARGET_NEON) return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno)) @@ -23626,26 +23299,6 @@ arm_debugger_arg_offset (int value, rtx addr) return value; } -/* Implement TARGET_INVALID_PARAMETER_TYPE. */ - -static const char * -arm_invalid_parameter_type (const_tree t) -{ - if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) - return N_("function parameters cannot have __fp16 type"); - return NULL; -} - -/* Implement TARGET_INVALID_PARAMETER_TYPE. */ - -static const char * -arm_invalid_return_type (const_tree t) -{ - if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) - return N_("functions cannot return __fp16 type"); - return NULL; -} - /* Implement TARGET_PROMOTED_TYPE. */ static tree @@ -23885,8 +23538,8 @@ thumb_pop (FILE *f, unsigned long mask) if (mask & (1 << PC_REGNUM)) { /* Catch popping the PC. */ - if (TARGET_INTERWORK || TARGET_BACKTRACE - || crtl->calls_eh_return) + if (TARGET_INTERWORK || TARGET_BACKTRACE || crtl->calls_eh_return + || IS_CMSE_ENTRY (arm_current_func_type ())) { /* The PC is never poped directly, instead it is popped into r3 and then BX is used. 
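The IS_CMSE_ENTRY check added above routes ARMv8-M secure entry functions down this same path, since their returns must go through BXNS rather than a pop straight into the PC.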
*/ @@ -23947,7 +23600,14 @@ thumb_exit (FILE *f, int reg_containing_return_addr) if (crtl->calls_eh_return) asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); - asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); + if (IS_CMSE_ENTRY (arm_current_func_type ())) + { + asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", + reg_containing_return_addr); + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + } + else + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); return; } /* Otherwise if we are not supporting interworking and we have not created @@ -23956,7 +23616,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr) else if (!TARGET_INTERWORK && !TARGET_BACKTRACE && !is_called_in_ARM_mode (current_function_decl) - && !crtl->calls_eh_return) + && !crtl->calls_eh_return + && !IS_CMSE_ENTRY (arm_current_func_type ())) { asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM); return; @@ -24179,7 +23840,21 @@ thumb_exit (FILE *f, int reg_containing_return_addr) asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); /* Return to caller. */ - asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); + if (IS_CMSE_ENTRY (arm_current_func_type ())) + { + /* This is for the cases where LR is not being used to contain the return + address. It may therefore contain information that we might not want + to leak, hence it must be cleared. The value in R0 will never be a + secret at this point, so it is safe to use it, see the clearing code + in 'cmse_nonsecure_entry_clear_before_return'. */ + if (reg_containing_return_addr != LR_REGNUM) + asm_fprintf (f, "\tmov\tlr, r0\n"); + + asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", reg_containing_return_addr); + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + } + else + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); } /* Scan INSN just before assembler is output for it. @@ -25044,6 +24719,149 @@ thumb1_expand_prologue (void) cfun->machine->lr_save_eliminated = 0; } +/* Clear caller saved registers not used to pass return values and leaked + condition flags before exiting a cmse_nonsecure_entry function. */ + +void +cmse_nonsecure_entry_clear_before_return (void) +{ + uint64_t to_clear_mask[2]; + uint32_t padding_bits_to_clear = 0; + uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear; + int regno, maxregno = IP_REGNUM; + tree result_type; + rtx result_rtl; + + to_clear_mask[0] = (1ULL << (NUM_ARG_REGS)) - 1; + to_clear_mask[0] |= (1ULL << IP_REGNUM); + + /* If we are not dealing with -mfloat-abi=soft we will need to clear VFP + registers. We also check that TARGET_HARD_FLOAT and !TARGET_THUMB1 hold + to make sure the instructions used to clear them are present. */ + if (TARGET_HARD_FLOAT && !TARGET_THUMB1) + { + uint64_t float_mask = (1ULL << (D7_VFP_REGNUM + 1)) - 1; + maxregno = LAST_VFP_REGNUM; + + float_mask &= ~((1ULL << FIRST_VFP_REGNUM) - 1); + to_clear_mask[0] |= float_mask; + + float_mask = (1ULL << (maxregno - 63)) - 1; + to_clear_mask[1] = float_mask; + + /* Make sure we don't clear the two scratch registers used to clear the + relevant FPSCR bits in output_return_instruction. */ + emit_use (gen_rtx_REG (SImode, IP_REGNUM)); + to_clear_mask[0] &= ~(1ULL << IP_REGNUM); + emit_use (gen_rtx_REG (SImode, 4)); + to_clear_mask[0] &= ~(1ULL << 4); + } + + /* If the user has defined registers to be caller saved, these are no longer + restored by the function before returning and must thus be cleared for + security purposes. 
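For example, a unit built with -fcall-used-r8 turns r8 into a call-clobbered register: the function no longer restores it, so whatever secure value it last held would be visible to the non-secure caller unless it is zeroed here.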
 */
+  for (regno = NUM_ARG_REGS; regno <= maxregno; regno++)
+    {
+      /* We do not touch registers that can be used to pass arguments as per
+	 the AAPCS, since these should never be made callee-saved by user
+	 options.  */
+      if (IN_RANGE (regno, FIRST_VFP_REGNUM, D7_VFP_REGNUM))
+	continue;
+      if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
+	continue;
+      if (call_used_regs[regno])
+	to_clear_mask[regno / 64] |= (1ULL << (regno % 64));
+    }
+
+  /* Make sure we do not clear the registers used to return the result in.  */
+  result_type = TREE_TYPE (DECL_RESULT (current_function_decl));
+  if (!VOID_TYPE_P (result_type))
+    {
+      result_rtl = arm_function_value (result_type, current_function_decl, 0);
+
+      /* No need to check that we return in registers, because we don't
+	 support returning on stack yet.  */
+      to_clear_mask[0]
+	&= ~compute_not_to_clear_mask (result_type, result_rtl, 0,
+				       padding_bits_to_clear_ptr);
+    }
+
+  if (padding_bits_to_clear != 0)
+    {
+      rtx reg_rtx;
+      /* Padding bits to clear is not 0 so we know we are dealing with
+	 returning a composite type, which only uses r0.  Let's make sure that
+	 r1-r3 is cleared too, we will use r1 as a scratch register.  */
+      gcc_assert ((to_clear_mask[0] & 0xe) == 0xe);
+
+      reg_rtx = gen_rtx_REG (SImode, R1_REGNUM);
+
+      /* Fill the lower half of the negated padding_bits_to_clear.  */
+      emit_move_insn (reg_rtx,
+		      GEN_INT ((((~padding_bits_to_clear) << 16u) >> 16u)));
+
+      /* Also fill the top half of the negated padding_bits_to_clear.  */
+      if (((~padding_bits_to_clear) >> 16) > 0)
+	emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg_rtx,
+						      GEN_INT (16),
+						      GEN_INT (16)),
+				GEN_INT ((~padding_bits_to_clear) >> 16)));
+
+      emit_insn (gen_andsi3 (gen_rtx_REG (SImode, R0_REGNUM),
+			     gen_rtx_REG (SImode, R0_REGNUM),
+			     reg_rtx));
+    }
+
+  for (regno = R0_REGNUM; regno <= maxregno; regno++)
+    {
+      if (!(to_clear_mask[regno / 64] & (1ULL << (regno % 64))))
+	continue;
+
+      if (IS_VFP_REGNUM (regno))
+	{
+	  /* If regno is an even vfp register and its successor is also to
+	     be cleared, use vmov.  */
+	  if (TARGET_VFP_DOUBLE
+	      && VFP_REGNO_OK_FOR_DOUBLE (regno)
+	      && to_clear_mask[regno / 64] & (1ULL << ((regno % 64) + 1)))
+	    {
+	      emit_move_insn (gen_rtx_REG (DFmode, regno),
+			      CONST0_RTX (DFmode));
+	      emit_use (gen_rtx_REG (DFmode, regno));
+	      regno++;
+	    }
+	  else
+	    {
+	      emit_move_insn (gen_rtx_REG (SFmode, regno),
+			      CONST0_RTX (SFmode));
+	      emit_use (gen_rtx_REG (SFmode, regno));
+	    }
+	}
+      else
+	{
+	  if (TARGET_THUMB1)
+	    {
+	      if (regno == R0_REGNUM)
+		emit_move_insn (gen_rtx_REG (SImode, regno),
+				const0_rtx);
+	      else
+		/* R0 has either been cleared before, see code above, or it
+		   holds a return value, either way it is not secret
+		   information.  */
+		emit_move_insn (gen_rtx_REG (SImode, regno),
+				gen_rtx_REG (SImode, R0_REGNUM));
+	      emit_use (gen_rtx_REG (SImode, regno));
+	    }
+	  else
+	    {
+	      emit_move_insn (gen_rtx_REG (SImode, regno),
+			      gen_rtx_REG (SImode, LR_REGNUM));
+	      emit_use (gen_rtx_REG (SImode, regno));
+	    }
+	}
+    }
+}
+
 /* Generate pattern *pop_multiple_with_stack_update_and_return if single
    POP instruction can be generated.  LR should be replaced by PC.  All
    the checks required are already done by USE_RETURN_INSN ().  Hence,
@@ -25065,6 +24883,12 @@ thumb2_expand_return (bool simple_return)
 
   if (!simple_return && saved_regs_mask)
     {
+      /* TODO: Verify that this path is never taken for cmse_nonsecure_entry
+	 functions or adapt code to handle according to ACLE.
This path should + not be reachable for cmse_nonsecure_entry functions though we prefer + to assert it for now to ensure that future code changes do not silently + change this behavior. */ + gcc_assert (!IS_CMSE_ENTRY (arm_current_func_type ())); if (num_regs == 1) { rtx par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2)); @@ -25087,6 +24911,8 @@ thumb2_expand_return (bool simple_return) } else { + if (IS_CMSE_ENTRY (arm_current_func_type ())) + cmse_nonsecure_entry_clear_before_return (); emit_jump_insn (simple_return_rtx); } } @@ -25145,6 +24971,10 @@ thumb1_expand_epilogue (void) if (! df_regs_ever_live_p (LR_REGNUM)) emit_use (gen_rtx_REG (SImode, LR_REGNUM)); + + /* Clear all caller-saved regs that are not used to return. */ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + cmse_nonsecure_entry_clear_before_return (); } /* Epilogue code for APCS frame. */ @@ -25179,7 +25009,7 @@ arm_expand_epilogue_apcs_frame (bool really_return) floats_from_frame += 4; } - if (TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_HARD_FLOAT) { int start_reg; rtx ip_rtx = gen_rtx_REG (SImode, IP_REGNUM); @@ -25425,7 +25255,7 @@ arm_expand_epilogue (bool really_return) } } - if (TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_HARD_FLOAT) { /* Generate VFP register multi-pop. */ int end_reg = LAST_VFP_REGNUM + 1; @@ -25482,6 +25312,7 @@ arm_expand_epilogue (bool really_return) if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL) + && !IS_CMSE_ENTRY (func_type) && !IS_STACKALIGN (func_type) && really_return && crtl->args.pretend_args_size == 0 @@ -25578,6 +25409,14 @@ arm_expand_epilogue (bool really_return) stack_pointer_rtx, stack_pointer_rtx); } + /* Clear all caller-saved regs that are not used to return. */ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + { + /* CMSE_ENTRY always returns. */ + gcc_assert (really_return); + cmse_nonsecure_entry_clear_before_return (); + } + if (!really_return) return; @@ -25874,13 +25713,6 @@ thumb_reload_out_hi (rtx *operands) emit_insn (gen_thumb_movhi_clobber (operands[0], operands[1], operands[2])); } -/* Handle reading a half-word from memory during reload. */ -void -thumb_reload_in_hi (rtx *operands ATTRIBUTE_UNUSED) -{ - gcc_unreachable (); -} - /* Return the length of a function name prefix that starts with the character 'c'. 
*/ static int @@ -25950,46 +25782,55 @@ arm_emit_eabi_attribute (const char *name, int num, int val) void arm_print_tune_info (void) { - asm_fprintf (asm_out_file, "\t@.tune parameters\n"); - asm_fprintf (asm_out_file, "\t\t@constant_limit:\t%d\n", + asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune parameters\n"); + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "constant_limit:\t%d\n", current_tune->constant_limit); - asm_fprintf (asm_out_file, "\t\t@max_insns_skipped:\t%d\n", - current_tune->max_insns_skipped); - asm_fprintf (asm_out_file, "\t\t@prefetch.num_slots:\t%d\n", - current_tune->prefetch.num_slots); - asm_fprintf (asm_out_file, "\t\t@prefetch.l1_cache_size:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "max_insns_skipped:\t%d\n", current_tune->max_insns_skipped); + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefetch.num_slots:\t%d\n", current_tune->prefetch.num_slots); + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefetch.l1_cache_size:\t%d\n", current_tune->prefetch.l1_cache_size); - asm_fprintf (asm_out_file, "\t\t@prefetch.l1_cache_line_size:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefetch.l1_cache_line_size:\t%d\n", current_tune->prefetch.l1_cache_line_size); - asm_fprintf (asm_out_file, "\t\t@prefer_constant_pool:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefer_constant_pool:\t%d\n", (int) current_tune->prefer_constant_pool); - asm_fprintf (asm_out_file, "\t\t@branch_cost:\t(s:speed, p:predictable)\n"); - asm_fprintf (asm_out_file, "\t\t\t\ts&p\tcost\n"); - asm_fprintf (asm_out_file, "\t\t\t\t00\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "branch_cost:\t(s:speed, p:predictable)\n"); + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\ts&p\tcost\n"); + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t00\t%d\n", current_tune->branch_cost (false, false)); - asm_fprintf (asm_out_file, "\t\t\t\t01\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t01\t%d\n", current_tune->branch_cost (false, true)); - asm_fprintf (asm_out_file, "\t\t\t\t10\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t10\t%d\n", current_tune->branch_cost (true, false)); - asm_fprintf (asm_out_file, "\t\t\t\t11\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t11\t%d\n", current_tune->branch_cost (true, true)); - asm_fprintf (asm_out_file, "\t\t@prefer_ldrd_strd:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefer_ldrd_strd:\t%d\n", (int) current_tune->prefer_ldrd_strd); - asm_fprintf (asm_out_file, "\t\t@logical_op_non_short_circuit:\t[%d,%d]\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "logical_op_non_short_circuit:\t[%d,%d]\n", (int) current_tune->logical_op_non_short_circuit_thumb, (int) current_tune->logical_op_non_short_circuit_arm); - asm_fprintf (asm_out_file, "\t\t@prefer_neon_for_64bits:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "prefer_neon_for_64bits:\t%d\n", (int) current_tune->prefer_neon_for_64bits); - asm_fprintf (asm_out_file, - "\t\t@disparage_flag_setting_t16_encodings:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "disparage_flag_setting_t16_encodings:\t%d\n", (int) current_tune->disparage_flag_setting_t16_encodings); - asm_fprintf (asm_out_file, "\t\t@string_ops_prefer_neon:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "string_ops_prefer_neon:\t%d\n", (int) current_tune->string_ops_prefer_neon); - asm_fprintf 
(asm_out_file, "\t\t@max_insns_inline_memset:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START + "max_insns_inline_memset:\t%d\n", current_tune->max_insns_inline_memset); - asm_fprintf (asm_out_file, "\t\t@fusible_ops:\t%u\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "fusible_ops:\t%u\n", current_tune->fusible_ops); - asm_fprintf (asm_out_file, "\t\t@sched_autopref:\t%d\n", + asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "sched_autopref:\t%d\n", (int) current_tune->sched_autopref); } @@ -26018,7 +25859,7 @@ arm_file_start (void) const char* pos = strchr (arm_selected_arch->name, '+'); if (pos) { - char buf[15]; + char buf[32]; gcc_assert (strlen (arm_selected_arch->name) <= sizeof (buf) / sizeof (*pos)); strncpy (buf, arm_selected_arch->name, @@ -26043,7 +25884,7 @@ arm_file_start (void) if (print_tune_info) arm_print_tune_info (); - if (! TARGET_SOFT_FLOAT && TARGET_VFP) + if (! TARGET_SOFT_FLOAT) { if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE) arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1); @@ -26160,11 +26001,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno) /* Output code to add DELTA to the first argument, and then jump to FUNCTION. Used for C++ multiple inheritance. */ + static void -arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, - HOST_WIDE_INT delta, - HOST_WIDE_INT vcall_offset ATTRIBUTE_UNUSED, - tree function) +arm_thumb1_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, + HOST_WIDE_INT, tree function) { static int thunk_label = 0; char label[256]; @@ -26305,6 +26145,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, final_end_function (); } +/* MI thunk handling for TARGET_32BIT. */ + +static void +arm32_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset, tree function) +{ + /* On ARM, this_regno is R0 or R1 depending on + whether the function returns an aggregate or not. + */ + int this_regno = (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)), + function) + ? R1_REGNUM : R0_REGNUM); + + rtx temp = gen_rtx_REG (Pmode, IP_REGNUM); + rtx this_rtx = gen_rtx_REG (Pmode, this_regno); + reload_completed = 1; + emit_note (NOTE_INSN_PROLOGUE_END); + + /* Add DELTA to THIS_RTX. */ + if (delta != 0) + arm_split_constant (PLUS, Pmode, NULL_RTX, + delta, this_rtx, this_rtx, false); + + /* Add *(*THIS_RTX + VCALL_OFFSET) to THIS_RTX. */ + if (vcall_offset != 0) + { + /* Load *THIS_RTX. */ + emit_move_insn (temp, gen_rtx_MEM (Pmode, this_rtx)); + /* Compute *THIS_RTX + VCALL_OFFSET. */ + arm_split_constant (PLUS, Pmode, NULL_RTX, vcall_offset, temp, temp, + false); + /* Compute *(*THIS_RTX + VCALL_OFFSET). */ + emit_move_insn (temp, gen_rtx_MEM (Pmode, temp)); + emit_insn (gen_add3_insn (this_rtx, this_rtx, temp)); + } + + /* Generate a tail call to the target function. */ + if (!TREE_USED (function)) + { + assemble_external (function); + TREE_USED (function) = 1; + } + rtx funexp = XEXP (DECL_RTL (function), 0); + funexp = gen_rtx_MEM (FUNCTION_MODE, funexp); + rtx_insn * insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX)); + SIBLING_CALL_P (insn) = 1; + + insn = get_insns (); + shorten_branches (insn); + final_start_function (insn, file, 1); + final (insn, file, 1); + final_end_function (); + + /* Stop pretending this is a post-reload pass. */ + reload_completed = 0; +} + +/* Output code to add DELTA to the first argument, and then jump + to FUNCTION. Used for C++ multiple inheritance. 
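Given, say, 'struct C : A, B' where C overrides a virtual function declared in B, the B-in-C vtable slot points at a thunk of this shape, which adjusts the incoming 'this' pointer by DELTA before tail-calling C's actual implementation.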
*/ + +static void +arm_output_mi_thunk (FILE *file, tree thunk, HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset, tree function) +{ + if (TARGET_32BIT) + arm32_output_mi_thunk (file, thunk, delta, vcall_offset, function); + else + arm_thumb1_mi_thunk (file, thunk, delta, vcall_offset, function); +} + int arm_emit_vector_const (FILE *file, rtx x) { @@ -27543,7 +27453,7 @@ arm_mangle_type (const_tree type) static const int thumb_core_reg_alloc_order[] = { 3, 2, 1, 0, 4, 5, 6, 7, - 14, 12, 8, 9, 10, 11 + 12, 14, 8, 9, 10, 11 }; /* Adjust register allocation order when compiling for Thumb. */ @@ -27689,7 +27599,7 @@ arm_conditional_register_usage (void) if (TARGET_THUMB1) fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1; - if (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP) + if (TARGET_32BIT && TARGET_HARD_FLOAT) { /* VFPv3 registers are disabled when earlier VFP versions are selected due to the definition of @@ -27760,7 +27670,7 @@ arm_preferred_rename_class (reg_class_t rclass) return NO_REGS; } -/* Compute the atrribute "length" of insn "*push_multi". +/* Compute the attribute "length" of insn "*push_multi". So this function MUST be kept in sync with that insn pattern. */ int arm_attr_length_push_multi(rtx parallel_op, rtx first_op) @@ -27777,6 +27687,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op) /* Thumb2 mode. */ regno = REGNO (first_op); + /* For PUSH/STM under Thumb2 mode, we can use 16-bit encodings if the register + list is 8-bit. Normally this means all registers in the list must be + LO_REGS, that is (R0 -R7). If any HI_REGS used, then we must use 32-bit + encodings. There is one exception for PUSH that LR in HI_REGS can be used + with 16-bit encoding. */ hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM); for (i = 1; i < num_saves && !hi_reg; i++) { @@ -27789,6 +27704,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op) return 4; } +/* Compute the attribute "length" of insn. Currently, this function is used + for "*load_multiple_with_writeback", "*pop_multiple_with_return" and + "*pop_multiple_with_writeback_and_return". OPERANDS is the toplevel PARALLEL + rtx, RETURN_PC is true if OPERANDS contains return insn. WRITE_BACK_P is + true if OPERANDS contains insn which explicit updates base register. */ + +int +arm_attr_length_pop_multi (rtx *operands, bool return_pc, bool write_back_p) +{ + /* ARM mode. */ + if (TARGET_ARM) + return 4; + /* Thumb1 mode. */ + if (TARGET_THUMB1) + return 2; + + rtx parallel_op = operands[0]; + /* Initialize to elements number of PARALLEL. */ + unsigned indx = XVECLEN (parallel_op, 0) - 1; + /* Initialize the value to base register. */ + unsigned regno = REGNO (operands[1]); + /* Skip return and write back pattern. + We only need register pop pattern for later analysis. */ + unsigned first_indx = 0; + first_indx += return_pc ? 1 : 0; + first_indx += write_back_p ? 1 : 0; + + /* A pop operation can be done through LDM or POP. If the base register is SP + and if it's with write back, then a LDM will be alias of POP. */ + bool pop_p = (regno == SP_REGNUM && write_back_p); + bool ldm_p = !pop_p; + + /* Check base register for LDM. */ + if (ldm_p && REGNO_REG_CLASS (regno) == HI_REGS) + return 4; + + /* Check each register in the list. */ + for (; indx >= first_indx; indx--) + { + regno = REGNO (XEXP (XVECEXP (parallel_op, 0, indx), 0)); + /* For POP, PC in HI_REGS can be used with 16-bit encoding. See similar + comment in arm_attr_length_push_multi. 
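Thus 'pop {r4-r7, pc}' keeps the 2-byte 16-bit encoding, while an LDM form, or any high register other than PC in the list, forces the 4-byte 32-bit encoding.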
 */
+      if (REGNO_REG_CLASS (regno) == HI_REGS
+	  && (regno != PC_REGNUM || ldm_p))
+	return 4;
+    }
+
+  return 2;
+}
+
 /* Compute the number of instructions emitted by output_move_double.  */
 int
 arm_count_output_move_double_insns (rtx *operands)
@@ -27820,7 +27785,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
 	  HOST_WIDE_INT value = real_to_integer (&r0);
 	  value = value & 0xffffffff;
 	  if ((value != 0) && ( (value & (value - 1)) == 0))
-	    return int_log2 (value);
+	    {
+	      int ret = exact_log2 (value);
+	      gcc_assert (IN_RANGE (ret, 0, 31));
+	      return ret;
+	    }
 	}
     }
   return 0;
@@ -27960,9 +27929,9 @@ emit_unlikely_jump (rtx insn)
 void
 arm_expand_compare_and_swap (rtx operands[])
 {
-  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
+  rtx bval, bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
   machine_mode mode;
-  rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+  rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
 
   bval = operands[0];
   rval = operands[1];
@@ -28019,43 +27988,54 @@ arm_expand_compare_and_swap (rtx operands[])
       gcc_unreachable ();
     }
 
-  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+  bdst = TARGET_THUMB1 ? bval : gen_rtx_REG (CCmode, CC_REGNUM);
+  emit_insn (gen (bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f));
 
   if (mode == QImode || mode == HImode)
     emit_move_insn (operands[1], gen_lowpart (mode, rval));
 
   /* In all cases, we arrange for success to be signaled by Z set.
      This arrangement allows for the boolean result to be used directly
-     in a subsequent branch, post optimization.  */
-  x = gen_rtx_REG (CCmode, CC_REGNUM);
-  x = gen_rtx_EQ (SImode, x, const0_rtx);
-  emit_insn (gen_rtx_SET (bval, x));
+     in a subsequent branch, post optimization.  For Thumb-1 targets, the
+     boolean negation of the result is also stored in bval because Thumb-1
+     backend lacks dependency tracking for CC flag due to flag-setting not
+     being represented at RTL level.  */
+  if (TARGET_THUMB1)
+    emit_insn (gen_cstoresi_eq0_thumb1 (bval, bdst));
+  else
+    {
+      x = gen_rtx_EQ (SImode, bdst, const0_rtx);
+      emit_insn (gen_rtx_SET (bval, x));
+    }
 }
 
 /* Split a compare and swap pattern.  It is IMPLEMENTATION DEFINED whether
    another memory store between the load-exclusive and store-exclusive can
    reset the monitor from Exclusive to Open state.  This means we must wait
    until after reload to split the pattern, lest we get a register spill in
-   the middle of the atomic sequence.  */
+   the middle of the atomic sequence.  Success of the compare and swap is
+   indicated by the Z flag set for 32bit targets and by neg_bval being zero
+   for Thumb-1 targets (i.e. negation of the boolean value returned by
+   atomic_compare_and_swap<mode> standard pattern in operand 0).  */
 void
 arm_split_compare_and_swap (rtx operands[])
 {
-  rtx rval, mem, oldval, newval, scratch;
+  rtx rval, mem, oldval, newval, neg_bval;
   machine_mode mode;
   enum memmodel mod_s, mod_f;
   bool is_weak;
   rtx_code_label *label1, *label2;
   rtx x, cond;
 
-  rval = operands[0];
-  mem = operands[1];
-  oldval = operands[2];
-  newval = operands[3];
-  is_weak = (operands[4] != const0_rtx);
-  mod_s = memmodel_from_int (INTVAL (operands[5]));
-  mod_f = memmodel_from_int (INTVAL (operands[6]));
-  scratch = operands[7];
+  rval = operands[1];
+  mem = operands[2];
+  oldval = operands[3];
+  newval = operands[4];
+  is_weak = (operands[5] != const0_rtx);
+  mod_s = memmodel_from_int (INTVAL (operands[6]));
+  mod_f = memmodel_from_int (INTVAL (operands[7]));
+  neg_bval = TARGET_THUMB1 ?
operands[0] : operands[8]; mode = GET_MODE (mem); bool is_armv8_sync = arm_arch8 && is_mm_sync (mod_s); @@ -28087,26 +28067,44 @@ arm_split_compare_and_swap (rtx operands[]) arm_emit_load_exclusive (mode, rval, mem, use_acquire); - cond = arm_gen_compare_reg (NE, rval, oldval, scratch); - x = gen_rtx_NE (VOIDmode, cond, const0_rtx); - x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, - gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); - emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + /* Z is set to 0 for 32bit targets (resp. rval set to 1) if oldval != rval, + as required to communicate with arm_expand_compare_and_swap. */ + if (TARGET_32BIT) + { + cond = arm_gen_compare_reg (NE, rval, oldval, neg_bval); + x = gen_rtx_NE (VOIDmode, cond, const0_rtx); + x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, + gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); + emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + } + else + { + emit_move_insn (neg_bval, const1_rtx); + cond = gen_rtx_NE (VOIDmode, rval, oldval); + if (thumb1_cmpneg_operand (oldval, SImode)) + emit_unlikely_jump (gen_cbranchsi4_scratch (neg_bval, rval, oldval, + label2, cond)); + else + emit_unlikely_jump (gen_cbranchsi4_insn (cond, rval, oldval, label2)); + } - arm_emit_store_exclusive (mode, scratch, mem, newval, use_release); + arm_emit_store_exclusive (mode, neg_bval, mem, newval, use_release); /* Weak or strong, we want EQ to be true for success, so that we match the flags that we got from the compare above. */ - cond = gen_rtx_REG (CCmode, CC_REGNUM); - x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx); - emit_insn (gen_rtx_SET (cond, x)); + if (TARGET_32BIT) + { + cond = gen_rtx_REG (CCmode, CC_REGNUM); + x = gen_rtx_COMPARE (CCmode, neg_bval, const0_rtx); + emit_insn (gen_rtx_SET (cond, x)); + } if (!is_weak) { - x = gen_rtx_NE (VOIDmode, cond, const0_rtx); - x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, - gen_rtx_LABEL_REF (Pmode, label1), pc_rtx); - emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + /* Z is set to boolean value of !neg_bval, as required to communicate + with arm_expand_compare_and_swap. */ + x = gen_rtx_NE (VOIDmode, neg_bval, const0_rtx); + emit_unlikely_jump (gen_cbranchsi4 (x, neg_bval, const0_rtx, label1)); } if (!is_mm_relaxed (mod_f)) @@ -28121,6 +28119,15 @@ arm_split_compare_and_swap (rtx operands[]) emit_label (label2); } +/* Split an atomic operation pattern. Operation is given by CODE and is one + of PLUS, MINUS, IOR, XOR, SET (for an exchange operation) or NOT (for a nand + operation). Operation is performed on the content at MEM and on VALUE + following the memory model MODEL_RTX. The content at MEM before and after + the operation is returned in OLD_OUT and NEW_OUT respectively while the + success of the operation is returned in COND. Using a scratch register or + an operand register for these determines what result is returned for that + pattern. */ + void arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, rtx value, rtx model_rtx, rtx cond) @@ -28129,6 +28136,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, machine_mode mode = GET_MODE (mem); machine_mode wmode = (mode == DImode ? DImode : SImode); rtx_code_label *label; + bool all_low_regs, bind_old_new; rtx x; bool is_armv8_sync = arm_arch8 && is_mm_sync (model); @@ -28163,6 +28171,28 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, arm_emit_load_exclusive (mode, old_out, mem, use_acquire); + /* Does the operation require destination and first operand to use the same + register? 
This is decided by register constraints of relevant insn + patterns in thumb1.md. */ + gcc_assert (!new_out || REG_P (new_out)); + all_low_regs = REG_P (value) && REGNO_REG_CLASS (REGNO (value)) == LO_REGS + && new_out && REGNO_REG_CLASS (REGNO (new_out)) == LO_REGS + && REGNO_REG_CLASS (REGNO (old_out)) == LO_REGS; + bind_old_new = + (TARGET_THUMB1 + && code != SET + && code != MINUS + && (code != PLUS || (!all_low_regs && !satisfies_constraint_L (value)))); + + /* We want to return the old value while putting the result of the operation + in the same register as the old value so copy the old value over to the + destination register and use that register for the operation. */ + if (old_out && bind_old_new) + { + emit_move_insn (new_out, old_out); + old_out = new_out; + } + switch (code) { case SET: @@ -28377,6 +28407,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d) case V8QImode: gen = gen_neon_vuzpv8qi_internal; break; case V8HImode: gen = gen_neon_vuzpv8hi_internal; break; case V4HImode: gen = gen_neon_vuzpv4hi_internal; break; + case V8HFmode: gen = gen_neon_vuzpv8hf_internal; break; + case V4HFmode: gen = gen_neon_vuzpv4hf_internal; break; case V4SImode: gen = gen_neon_vuzpv4si_internal; break; case V2SImode: gen = gen_neon_vuzpv2si_internal; break; case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break; @@ -28450,6 +28482,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) case V8QImode: gen = gen_neon_vzipv8qi_internal; break; case V8HImode: gen = gen_neon_vzipv8hi_internal; break; case V4HImode: gen = gen_neon_vzipv4hi_internal; break; + case V8HFmode: gen = gen_neon_vzipv8hf_internal; break; + case V4HFmode: gen = gen_neon_vzipv4hf_internal; break; case V4SImode: gen = gen_neon_vzipv4si_internal; break; case V2SImode: gen = gen_neon_vzipv2si_internal; break; case V2SFmode: gen = gen_neon_vzipv2sf_internal; break; @@ -28502,6 +28536,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d) case V8QImode: gen = gen_neon_vrev32v8qi; break; case V8HImode: gen = gen_neon_vrev64v8hi; break; case V4HImode: gen = gen_neon_vrev64v4hi; break; + case V8HFmode: gen = gen_neon_vrev64v8hf; break; + case V4HFmode: gen = gen_neon_vrev64v4hf; break; default: return false; } @@ -28585,6 +28621,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d) case V8QImode: gen = gen_neon_vtrnv8qi_internal; break; case V8HImode: gen = gen_neon_vtrnv8hi_internal; break; case V4HImode: gen = gen_neon_vtrnv4hi_internal; break; + case V8HFmode: gen = gen_neon_vtrnv8hf_internal; break; + case V4HFmode: gen = gen_neon_vtrnv4hf_internal; break; case V4SImode: gen = gen_neon_vtrnv4si_internal; break; case V2SImode: gen = gen_neon_vtrnv2si_internal; break; case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break; @@ -28660,6 +28698,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d) case V8HImode: gen = gen_neon_vextv8hi; break; case V2SImode: gen = gen_neon_vextv2si; break; case V4SImode: gen = gen_neon_vextv4si; break; + case V4HFmode: gen = gen_neon_vextv4hf; break; + case V8HFmode: gen = gen_neon_vextv8hf; break; case V2SFmode: gen = gen_neon_vextv2sf; break; case V4SFmode: gen = gen_neon_vextv4sf; break; case V2DImode: gen = gen_neon_vextv2di; break; @@ -29185,7 +29225,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2) { enum rtx_code code = GET_CODE (*comparison); int code_int; - machine_mode mode = (GET_MODE (*op1) == VOIDmode) + machine_mode mode = (GET_MODE (*op1) == VOIDmode) ? 
GET_MODE (*op2) : GET_MODE (*op1); gcc_assert (GET_MODE (*op1) != VOIDmode || GET_MODE (*op2) != VOIDmode); @@ -29213,11 +29253,19 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2) *op2 = force_reg (mode, *op2); return true; + case HFmode: + if (!TARGET_VFP_FP16INST) + break; + /* FP16 comparisons are done in SF mode. */ + mode = SFmode; + *op1 = convert_to_mode (mode, *op1, 1); + *op2 = convert_to_mode (mode, *op2, 1); + /* Fall through. */ case SFmode: case DFmode: - if (!arm_float_compare_operand (*op1, mode)) + if (!vfp_compare_operand (*op1, mode)) *op1 = force_reg (mode, *op1); - if (!arm_float_compare_operand (*op2, mode)) + if (!vfp_compare_operand (*op2, mode)) *op2 = force_reg (mode, *op2); return true; default: @@ -29759,11 +29807,57 @@ arm_macro_fusion_p (void) return current_tune->fusible_ops != tune_params::FUSE_NOTHING; } +/* Return true if the two back-to-back sets PREV_SET, CURR_SET are suitable + for MOVW / MOVT macro fusion. */ + +static bool +arm_sets_movw_movt_fusible_p (rtx prev_set, rtx curr_set) +{ + /* We are trying to fuse + movw imm / movt imm + instructions as a group that gets scheduled together. */ + + rtx set_dest = SET_DEST (curr_set); + + if (GET_MODE (set_dest) != SImode) + return false; + + /* We are trying to match: + prev (movw) == (set (reg r0) (const_int imm16)) + curr (movt) == (set (zero_extract (reg r0) + (const_int 16) + (const_int 16)) + (const_int imm16_1)) + or + prev (movw) == (set (reg r1) + (high (symbol_ref ("SYM")))) + curr (movt) == (set (reg r0) + (lo_sum (reg r1) + (symbol_ref ("SYM")))) */ + + if (GET_CODE (set_dest) == ZERO_EXTRACT) + { + if (CONST_INT_P (SET_SRC (curr_set)) + && CONST_INT_P (SET_SRC (prev_set)) + && REG_P (XEXP (set_dest, 0)) + && REG_P (SET_DEST (prev_set)) + && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set))) + return true; + + } + else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM + && REG_P (SET_DEST (curr_set)) + && REG_P (SET_DEST (prev_set)) + && GET_CODE (SET_SRC (prev_set)) == HIGH + && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set))) + return true; + + return false; +} static bool aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr) { - rtx set_dest; rtx prev_set = single_set (prev); rtx curr_set = single_set (curr); @@ -29781,54 +29875,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr) && aarch_crypto_can_dual_issue (prev, curr)) return true; - if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT) - { - /* We are trying to fuse - movw imm / movt imm - instructions as a group that gets scheduled together. 
 */
-
-      set_dest = SET_DEST (curr_set);
-
-      if (GET_MODE (set_dest) != SImode)
-        return false;
+  if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
+      && arm_sets_movw_movt_fusible_p (prev_set, curr_set))
+    return true;
 
-      /* We are trying to match:
-         prev (movw)  == (set (reg r0) (const_int imm16))
-         curr (movt) == (set (zero_extract (reg r0)
-                                           (const_int 16)
-                                           (const_int 16))
-                             (const_int imm16_1))
-         or
-         prev (movw) == (set (reg r1)
-                             (high (symbol_ref ("SYM"))))
-         curr (movt) == (set (reg r0)
-                             (lo_sum (reg r1)
-                                     (symbol_ref ("SYM"))))  */
-      if (GET_CODE (set_dest) == ZERO_EXTRACT)
-        {
-          if (CONST_INT_P (SET_SRC (curr_set))
-              && CONST_INT_P (SET_SRC (prev_set))
-              && REG_P (XEXP (set_dest, 0))
-              && REG_P (SET_DEST (prev_set))
-              && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
-            return true;
-        }
-      else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
-               && REG_P (SET_DEST (curr_set))
-               && REG_P (SET_DEST (prev_set))
-               && GET_CODE (SET_SRC (prev_set)) == HIGH
-               && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
-        return true;
-    }
 
   return false;
 }
 
+/* Return true iff the instruction fusion described by OP is enabled.  */
+
+bool
+arm_fusion_enabled_p (tune_params::fuse_ops op)
+{
+  return current_tune->fusible_ops & op;
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
 arm_asan_shadow_offset (void)
 {
-  return (unsigned HOST_WIDE_INT) 1 << 29;
+  return HOST_WIDE_INT_1U << 29;
 }
@@ -29853,9 +29919,9 @@ arm_const_not_ok_for_debug_p (rtx p)
 	  && GET_CODE (XEXP (p, 0)) == SYMBOL_REF
 	  && (decl_op0 = SYMBOL_REF_DECL (XEXP (p, 0))))
 	{
-	  if ((TREE_CODE (decl_op1) == VAR_DECL
+	  if ((VAR_P (decl_op1)
 	       || TREE_CODE (decl_op1) == CONST_DECL)
-	      && (TREE_CODE (decl_op0) == VAR_DECL
+	      && (VAR_P (decl_op0)
 		  || TREE_CODE (decl_op0) == CONST_DECL))
 	    return (get_variable_section (decl_op1, false)
 		    != get_variable_section (decl_op0, false));
@@ -29988,9 +30054,8 @@ arm_can_inline_p (tree caller, tree callee)
   if ((caller_fpu->features & callee_fpu->features) != callee_fpu->features)
     return false;
 
-  /* Need same model and regs.  */
-  if (callee_fpu->model != caller_fpu->model
-      || callee_fpu->regs != callee_fpu->regs)
+  /* Need same FPU regs.  */
+  if (callee_fpu->regs != caller_fpu->regs)
     return false;
 
   /* OK to inline between different modes.
@@ -30333,4 +30398,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
   return;
 }
+
+/* Construct and return a PARALLEL RTX vector with elements numbering the
+   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
+   the vector - from the perspective of the architecture.  This does not
+   line up with GCC's perspective on lane numbers, so we end up with
+   different masks depending on our target endian-ness.  The diagram
+   below may help.  We must draw the distinction when building masks
+   which select one half of the vector.  An instruction selecting
+   architectural low-lanes for a big-endian target, must be described using
+   a mask selecting GCC high-lanes.
+
+                 Big-Endian             Little-Endian
+
+GCC             0   1   2   3           3   2   1   0
+              | x | x | x | x |       | x | x | x | x |
+Architecture    3   2   1   0           3   2   1   0
+
+Low Mask:         { 2, 3 }                { 0, 1 }
+High Mask:        { 0, 1 }                { 2, 3 }
+*/
+
+rtx
+arm_simd_vect_par_cnst_half (machine_mode mode, bool high)
+{
+  int nunits = GET_MODE_NUNITS (mode);
+  rtvec v = rtvec_alloc (nunits / 2);
+  int high_base = nunits / 2;
+  int low_base = 0;
+  int base;
+  rtx t1;
+  int i;
+
+  if (BYTES_BIG_ENDIAN)
+    base = high ? low_base : high_base;
+  else
+    base = high ?
high_base : low_base; + + for (i = 0; i < nunits / 2; i++) + RTVEC_ELT (v, i) = GEN_INT (base + i); + + t1 = gen_rtx_PARALLEL (mode, v); + return t1; +} + +/* Check OP for validity as a PARALLEL RTX vector with elements + numbering the lanes of either the high (HIGH == TRUE) or low lanes, + from the perspective of the architecture. See the diagram above + arm_simd_vect_par_cnst_half_p for more details. */ + +bool +arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode, + bool high) +{ + rtx ideal = arm_simd_vect_par_cnst_half (mode, high); + HOST_WIDE_INT count_op = XVECLEN (op, 0); + HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0); + int i = 0; + + if (!VECTOR_MODE_P (mode)) + return false; + + if (count_op != count_ideal) + return false; + + for (i = 0; i < count_ideal; i++) + { + rtx elt_op = XVECEXP (op, 0, i); + rtx elt_ideal = XVECEXP (ideal, 0, i); + + if (!CONST_INT_P (elt_op) + || INTVAL (elt_ideal) != INTVAL (elt_op)) + return false; + } + return true; +} + +/* Can output mi_thunk for all cases except for non-zero vcall_offset + in Thumb1. */ +static bool +arm_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, + const_tree) +{ + /* For now, we punt and not handle this for TARGET_THUMB1. */ + if (vcall_offset && TARGET_THUMB1) + return false; + + /* Otherwise ok. */ + return true; +} + +/* Generate RTL for a conditional branch with rtx comparison CODE in + mode CC_MODE. The destination of the unlikely conditional branch + is LABEL_REF. */ + +void +arm_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode, + rtx label_ref) +{ + rtx x; + x = gen_rtx_fmt_ee (code, VOIDmode, + gen_rtx_REG (cc_mode, CC_REGNUM), + const0_rtx); + + x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, + gen_rtx_LABEL_REF (VOIDmode, label_ref), + pc_rtx); + emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); +} + #include "gt-arm.h" --- a/src/gcc/config/arm/arm.h +++ b/src/gcc/config/arm/arm.h @@ -80,11 +80,6 @@ extern arm_cc arm_current_cc; extern int arm_target_label; extern int arm_ccfsm_state; extern GTY(()) rtx arm_target_insn; -/* The label of the current constant pool. */ -extern rtx pool_vector_label; -/* Set to 1 when a return insn is output, this means that the epilogue - is not needed. */ -extern int return_used_this_function; /* Callback to output language specific object attributes. */ extern void (*arm_lang_output_object_attributes_hook)(void); @@ -139,7 +134,6 @@ extern void (*arm_lang_output_object_attributes_hook)(void); #define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT) /* Use hardware floating point calling convention. */ #define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD) -#define TARGET_VFP (TARGET_FPU_MODEL == ARM_FP_MODEL_VFP) #define TARGET_IWMMXT (arm_arch_iwmmxt) #define TARGET_IWMMXT2 (arm_arch_iwmmxt2) #define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT) @@ -177,50 +171,57 @@ extern void (*arm_lang_output_object_attributes_hook)(void); to be more careful with TARGET_NEON as noted below. */ /* FPU is has the full VFPv3/NEON register file of 32 D registers. */ -#define TARGET_VFPD32 (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_D32) +#define TARGET_VFPD32 (TARGET_FPU_REGS == VFP_REG_D32) /* FPU supports VFPv3 instructions. */ -#define TARGET_VFP3 (TARGET_VFP && TARGET_FPU_REV >= 3) +#define TARGET_VFP3 (TARGET_FPU_REV >= 3) /* FPU supports FPv5 instructions. */ -#define TARGET_VFP5 (TARGET_VFP && TARGET_FPU_REV >= 5) +#define TARGET_VFP5 (TARGET_FPU_REV >= 5) /* FPU only supports VFP single-precision instructions. 
*/ -#define TARGET_VFP_SINGLE (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_SINGLE) +#define TARGET_VFP_SINGLE (TARGET_FPU_REGS == VFP_REG_SINGLE) /* FPU supports VFP double-precision instructions. */ -#define TARGET_VFP_DOUBLE (TARGET_VFP && TARGET_FPU_REGS != VFP_REG_SINGLE) +#define TARGET_VFP_DOUBLE (TARGET_FPU_REGS != VFP_REG_SINGLE) /* FPU supports half-precision floating-point with NEON element load/store. */ -#define TARGET_NEON_FP16 \ - (TARGET_VFP \ - && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON | FPU_FL_FP16)) +#define TARGET_NEON_FP16 \ + (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON) \ + && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16)) /* FPU supports VFP half-precision floating-point. */ #define TARGET_FP16 \ - (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16)) + (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16)) /* FPU supports fused-multiply-add operations. */ -#define TARGET_FMA (TARGET_VFP && TARGET_FPU_REV >= 4) +#define TARGET_FMA (TARGET_FPU_REV >= 4) /* FPU is ARMv8 compatible. */ -#define TARGET_FPU_ARMV8 (TARGET_VFP && TARGET_FPU_REV >= 8) +#define TARGET_FPU_ARMV8 (TARGET_FPU_REV >= 8) /* FPU supports Crypto extensions. */ #define TARGET_CRYPTO \ - (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO)) + (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO)) /* FPU supports Neon instructions. The setting of this macro gets revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT and TARGET_HARD_FLOAT to ensure that NEON instructions are available. */ #define TARGET_NEON \ - (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP \ + (TARGET_32BIT && TARGET_HARD_FLOAT \ && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON)) /* FPU supports ARMv8.1 Adv.SIMD extensions. */ #define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1) +/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */ +#define TARGET_VFP_FP16INST \ + (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPU_ARMV8 && arm_fp16_inst) + +/* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later. */ +#define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA) + /* Q-bit is present. */ #define TARGET_ARM_QBIT \ (TARGET_32BIT && arm_arch5e && (arm_arch_notm || arm_arch7)) @@ -236,7 +237,7 @@ extern void (*arm_lang_output_object_attributes_hook)(void); /* Should MOVW/MOVT be used in preference to a constant pool. */ #define TARGET_USE_MOVT \ - (arm_arch_thumb2 \ + (TARGET_HAVE_MOVT \ && (arm_disable_literal_pool \ || (!optimize_size && !current_tune->prefer_constant_pool))) @@ -251,14 +252,18 @@ extern void (*arm_lang_output_object_attributes_hook)(void); #define TARGET_HAVE_MEMORY_BARRIER (TARGET_HAVE_DMB || TARGET_HAVE_DMB_MCR) /* Nonzero if this chip supports ldrex and strex */ -#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) || arm_arch7) +#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) \ + || arm_arch7 \ + || (arm_arch8 && !arm_arch_notm)) /* Nonzero if this chip supports LPAE. */ #define TARGET_HAVE_LPAE \ (arm_arch7 && ARM_FSET_HAS_CPU1 (insn_flags, FL_FOR_ARCH7VE)) /* Nonzero if this chip supports ldrex{bh} and strex{bh}. */ -#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) || arm_arch7) +#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) \ + || arm_arch7 \ + || (arm_arch8 && !arm_arch_notm)) /* Nonzero if this chip supports ldrexd and strexd. 
*/ #define TARGET_HAVE_LDREXD (((arm_arch6k && TARGET_ARM) \ @@ -267,9 +272,20 @@ extern void (*arm_lang_output_object_attributes_hook)(void); /* Nonzero if this chip supports load-acquire and store-release. */ #define TARGET_HAVE_LDACQ (TARGET_ARM_ARCH >= 8) +/* Nonzero if this chip supports LDAEXD and STLEXD. */ +#define TARGET_HAVE_LDACQEXD (TARGET_ARM_ARCH >= 8 \ + && TARGET_32BIT \ + && arm_arch_notm) + +/* Nonzero if this chip provides the MOVW and MOVT instructions. */ +#define TARGET_HAVE_MOVT (arm_arch_thumb2 || arm_arch8) + +/* Nonzero if this chip provides the CBZ and CBNZ instructions. */ +#define TARGET_HAVE_CBZ (arm_arch_thumb2 || arm_arch8) + /* Nonzero if integer division instructions supported. */ #define TARGET_IDIV ((TARGET_ARM && arm_arch_arm_hwdiv) \ - || (TARGET_THUMB2 && arm_arch_thumb_hwdiv)) + || (TARGET_THUMB && arm_arch_thumb_hwdiv)) /* Nonzero if disallow volatile memory access in IT block. */ #define TARGET_NO_VOLATILE_CE (arm_arch_no_volatile_ce) @@ -349,7 +365,6 @@ enum vfp_reg_type extern const struct arm_fpu_desc { const char *name; - enum arm_fp_model model; int rev; enum vfp_reg_type regs; arm_fpu_feature_set features; @@ -358,7 +373,6 @@ extern const struct arm_fpu_desc /* Accessors. */ #define TARGET_FPU_NAME (all_fpus[arm_fpu_index].name) -#define TARGET_FPU_MODEL (all_fpus[arm_fpu_index].model) #define TARGET_FPU_REV (all_fpus[arm_fpu_index].rev) #define TARGET_FPU_REGS (all_fpus[arm_fpu_index].regs) #define TARGET_FPU_FEATURES (all_fpus[arm_fpu_index].features) @@ -402,7 +416,9 @@ enum base_architecture BASE_ARCH_7R = 7, BASE_ARCH_7M = 7, BASE_ARCH_7EM = 7, - BASE_ARCH_8A = 8 + BASE_ARCH_8A = 8, + BASE_ARCH_8M_BASE = 8, + BASE_ARCH_8M_MAIN = 8 }; /* The major revision number of the ARM Architecture implemented by the target. */ @@ -447,6 +463,13 @@ extern int arm_arch8; /* Nonzero if this chip supports the ARM Architecture 8.1 extensions. */ extern int arm_arch8_1; +/* Nonzero if this chip supports the ARM Architecture 8.2 extensions. */ +extern int arm_arch8_2; + +/* Nonzero if this chip supports the FP16 instructions extension of ARM + Architecture 8.2. */ +extern int arm_fp16_inst; + /* Nonzero if this chip can benefit from load scheduling. */ extern int arm_ld_sched; @@ -478,6 +501,9 @@ extern int arm_tune_cortex_a9; interworking clean. */ extern int arm_cpp_interwork; +/* Nonzero if chip supports Thumb 1. */ +extern int arm_arch_thumb1; + /* Nonzero if chip supports Thumb 2. */ extern int arm_arch_thumb2; @@ -502,6 +528,9 @@ extern bool arm_disable_literal_pool; /* Nonzero if chip supports the ARMv8 CRC instructions. */ extern int arm_arch_crc; +/* Nonzero if chip supports the ARMv8-M Security Extensions. */ +extern int arm_arch_cmse; + #ifndef TARGET_DEFAULT #define TARGET_DEFAULT (MASK_APCS_FRAME) #endif @@ -1191,7 +1220,7 @@ enum reg_class the data layout happens to be consistent for big-endian, so we explicitly allow that case. */ #define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \ - (TARGET_VFP && TARGET_BIG_END \ + (TARGET_BIG_END \ && !(GET_MODE_SIZE (FROM) == 16 && GET_MODE_SIZE (TO) == 8) \ && (GET_MODE_SIZE (FROM) > UNITS_PER_WORD \ || GET_MODE_SIZE (TO) > UNITS_PER_WORD) \ @@ -1242,8 +1271,7 @@ enum reg_class NO_REGS is returned. */ #define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \ /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \ - ((TARGET_VFP && TARGET_HARD_FLOAT \ - && IS_VFP_CLASS (CLASS)) \ + ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \ ? 
coproc_secondary_reload_class (MODE, X, FALSE) \ : (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \ ? coproc_secondary_reload_class (MODE, X, TRUE) \ @@ -1255,8 +1283,7 @@ enum reg_class /* If we need to load shorts byte-at-a-time, then we need a scratch. */ #define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \ /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \ - ((TARGET_VFP && TARGET_HARD_FLOAT \ - && IS_VFP_CLASS (CLASS)) \ + ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \ ? coproc_secondary_reload_class (MODE, X, FALSE) : \ (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) ? \ coproc_secondary_reload_class (MODE, X, TRUE) : \ @@ -1363,6 +1390,7 @@ enum reg_class #define ARM_FT_VOLATILE (1 << 4) /* Does not return. */ #define ARM_FT_NESTED (1 << 5) /* Embedded inside another func. */ #define ARM_FT_STACKALIGN (1 << 6) /* Called with misaligned stack. */ +#define ARM_FT_CMSE_ENTRY (1 << 7) /* ARMv8-M non-secure entry function. */ /* Some macros to test these flags. */ #define ARM_FUNC_TYPE(t) (t & ARM_FT_TYPE_MASK) @@ -1371,6 +1399,7 @@ enum reg_class #define IS_NAKED(t) (t & ARM_FT_NAKED) #define IS_NESTED(t) (t & ARM_FT_NESTED) #define IS_STACKALIGN(t) (t & ARM_FT_STACKALIGN) +#define IS_CMSE_ENTRY(t) (t & ARM_FT_CMSE_ENTRY) /* Structure used to hold the function stack frame layout. Offsets are @@ -1516,7 +1545,7 @@ typedef struct On the ARM, r0-r3 are used to pass args. */ #define FUNCTION_ARG_REGNO_P(REGNO) \ (IN_RANGE ((REGNO), 0, 3) \ - || (TARGET_AAPCS_BASED && TARGET_VFP && TARGET_HARD_FLOAT \ + || (TARGET_AAPCS_BASED && TARGET_HARD_FLOAT \ && IN_RANGE ((REGNO), FIRST_VFP_REGNUM, FIRST_VFP_REGNUM + 15)) \ || (TARGET_IWMMXT_ABI \ && IN_RANGE ((REGNO), FIRST_IWMMXT_REGNUM, FIRST_IWMMXT_REGNUM + 9))) @@ -2187,13 +2216,9 @@ extern int making_const_table; #define TARGET_ARM_ARCH \ (arm_base_arch) \ -#define TARGET_ARM_V6M (!arm_arch_notm && !arm_arch_thumb2) -#define TARGET_ARM_V7M (!arm_arch_notm && arm_arch_thumb2) - /* The highest Thumb instruction set version supported by the chip. */ -#define TARGET_ARM_ARCH_ISA_THUMB \ - (arm_arch_thumb2 ? 2 \ - : ((TARGET_ARM_ARCH >= 5 || arm_arch4t) ? 1 : 0)) +#define TARGET_ARM_ARCH_ISA_THUMB \ + (arm_arch_thumb2 ? 2 : (arm_arch_thumb1 ? 1 : 0)) /* Expands to an upper-case char of the target's architectural profile. */ @@ -2245,13 +2270,18 @@ extern const char *arm_rewrite_mcpu (int argc, const char **argv); " :%{march=*:-march=%*}}" \ BIG_LITTLE_SPEC +extern const char *arm_target_thumb_only (int argc, const char **argv); +#define TARGET_MODE_SPEC_FUNCTIONS \ + { "target_mode_check", arm_target_thumb_only }, + /* -mcpu=native handling only makes sense with compiler running on an ARM chip. 
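host_detect_local_cpu performs the detection by parsing the host's /proc/cpuinfo, which is why the native specs below are only wired up when the compiler itself is built for __arm__.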
 */
 #if defined(__arm__)
 extern const char *host_detect_local_cpu (int argc, const char **argv);
 # define EXTRA_SPEC_FUNCTIONS						\
   { "local_cpu_detect", host_detect_local_cpu },			\
-  BIG_LITTLE_CPU_SPEC_FUNCTIONS
+  BIG_LITTLE_CPU_SPEC_FUNCTIONS					\
+  TARGET_MODE_SPEC_FUNCTIONS
 
 # define MCPU_MTUNE_NATIVE_SPECS					\
    " %{march=native:%<march=native %:local_cpu_detect(arch)}"		\
    " %{mcpu=native:%<mcpu=native %:local_cpu_detect(cpu)}"		\
    " %{mtune=native:%<mtune=native %:local_cpu_detect(tune)}"
--- a/src/gcc/config/arm/arm.md
+++ b/src/gcc/config/arm/arm.md
+(define_expand "addv<mode>4"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "register_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand 3 "")]
+  "TARGET_32BIT"
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+  arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "register_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand 3 "")]
+  "TARGET_32BIT"
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+  arm_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+
+  DONE;
+})
+
 (define_expand "addsi3"
   [(set (match_operand:SI 0 "s_register_operand" "")
 	(plus:SI (match_operand:SI 1 "s_register_operand" "")
@@ -617,6 +651,165 @@
 ]
 )
 
+(define_insn_and_split "adddi3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:TI
+	    (sign_extend:TI (match_operand:DI 1 "register_operand" "r"))
+	    (sign_extend:TI (match_operand:DI 2 "register_operand" "r")))
+	  (sign_extend:TI (plus:DI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:DI 0 "register_operand" "=&r")
+	(plus:DI (match_dup 1) (match_dup 2)))]
+  "TARGET_32BIT"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CC_C CC_REGNUM)
+		   (compare:CC_C (plus:SI (match_dup 1) (match_dup 2))
+				 (match_dup 1)))
+	      (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))])
+   (parallel [(set (reg:CC_V CC_REGNUM)
+		   (ne:CC_V
+		     (plus:DI (plus:DI
+				(sign_extend:DI (match_dup 4))
+				(sign_extend:DI (match_dup 5)))
+			      (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
+		     (plus:DI (sign_extend:DI
+				(plus:SI (match_dup 4) (match_dup 5)))
+			      (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
+	      (set (match_dup 3) (plus:SI (plus:SI
+					   (match_dup 4) (match_dup 5))
+					  (ltu:SI (reg:CC_C CC_REGNUM)
+						  (const_int 0))))])]
+  "
+  {
+    operands[3] = gen_highpart (SImode, operands[0]);
+    operands[0] = gen_lowpart (SImode, operands[0]);
+    operands[4] = gen_highpart (SImode, operands[1]);
+    operands[1] = gen_lowpart (SImode, operands[1]);
+    operands[5] = gen_highpart (SImode, operands[2]);
+    operands[2] = gen_lowpart (SImode, operands[2]);
+  }"
+  [(set_attr "conds" "set")
+   (set_attr "length" "8")
+   (set_attr "type" "multiple")]
+)
+
+(define_insn "addsi3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:DI
+	    (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
+	    (sign_extend:DI (match_operand:SI 2 "register_operand" "r")))
+	  (sign_extend:DI (plus:SI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI (match_dup 1) (match_dup 2)))]
+  "TARGET_32BIT"
+  "adds%?\\t%0, %1, %2"
+  [(set_attr "conds" "set")
+   (set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*addsi3_compareV_upper"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:DI
+	    (plus:DI
+	      (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
+	      (sign_extend:DI (match_operand:SI 2 "register_operand" "r")))
+	    (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
+	  (plus:DI (sign_extend:DI
+		     (plus:SI (match_dup 1) (match_dup 2)))
+		   (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
+   (set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI
+	  (plus:SI (match_dup 1) (match_dup 2))
+	  (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
+  "TARGET_32BIT"
+  "adcs%?\\t%0, %1, %2"
+  [(set_attr "conds" "set")
+   (set_attr "type" "adcs_reg")]
+)
+
+(define_insn_and_split "adddi3_compareC"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:TI
+	    (zero_extend:TI (match_operand:DI 1 "register_operand" "r"))
+	    (zero_extend:TI (match_operand:DI 2 "register_operand" "r")))
+	  (zero_extend:TI (plus:DI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:DI 0 "register_operand" "=&r")
+	(plus:DI (match_dup 1) (match_dup 2)))]
+  "TARGET_32BIT"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CC_C CC_REGNUM)
+		   (compare:CC_C (plus:SI (match_dup 1) (match_dup 2))
+				 (match_dup 1)))
+	      (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))])
+   (parallel [(set (reg:CC_C CC_REGNUM)
+		   (ne:CC_C
+		     (plus:DI (plus:DI
+				(zero_extend:DI (match_dup 4))
+				(zero_extend:DI (match_dup 5)))
+			      (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
+		     (plus:DI (zero_extend:DI
+				(plus:SI (match_dup 4) (match_dup 5)))
+			      (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
+	      (set (match_dup 3) (plus:SI
+				  (plus:SI (match_dup 4) (match_dup 5))
+				  (ltu:SI (reg:CC_C CC_REGNUM)
+					  (const_int 0))))])]
+  "
+  {
+    operands[3] = gen_highpart (SImode, operands[0]);
+    operands[0] = gen_lowpart (SImode, operands[0]);
+    operands[4] = gen_highpart (SImode, operands[1]);
+    operands[5] = gen_highpart (SImode, operands[2]);
+    operands[1] = gen_lowpart (SImode, operands[1]);
+    operands[2] = gen_lowpart (SImode, operands[2]);
+  }"
+  [(set_attr "conds" "set")
+   (set_attr "length" "8")
+   (set_attr "type" "multiple")]
+)
+
+(define_insn "*addsi3_compareC_upper"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:DI
+	    (plus:DI
+	      (zero_extend:DI (match_operand:SI 1 "register_operand" "r"))
+	      (zero_extend:DI (match_operand:SI 2 "register_operand" "r")))
+	    (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
+	  (plus:DI (zero_extend:DI
+		     (plus:SI (match_dup 1) (match_dup 2)))
+		   (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
+   (set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI
+	  (plus:SI (match_dup 1) (match_dup 2))
+	  (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
+  "TARGET_32BIT"
+  "adcs%?\\t%0, %1, %2"
+  [(set_attr "conds" "set")
+   (set_attr "type" "adcs_reg")]
+)
+
+(define_insn "addsi3_compareC"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:DI
+	    (zero_extend:DI (match_operand:SI 1 "register_operand" "r"))
+	    (zero_extend:DI (match_operand:SI 2 "register_operand" "r")))
+	  (zero_extend:DI
+	    (plus:SI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI (match_dup 1) (match_dup 2)))]
+  "TARGET_32BIT"
+  "adds%?\\t%0, %1, %2"
+  [(set_attr "conds" "set")
+   (set_attr "type" "alus_sreg")]
+)
+
 (define_insn "addsi3_compare0"
   [(set (reg:CC_NOOV CC_REGNUM)
 	(compare:CC_NOOV
@@ -866,20 +1059,90 @@
    (set_attr "type" "adcs_reg")]
 )
 
+(define_expand "subv<mode>4"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "register_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand 3 "")]
+  "TARGET_32BIT"
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "usubv<mode>4"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "register_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand 3 "")]
+  "TARGET_32BIT"
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  arm_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
+
+  DONE;
+})
+
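The addv<mode>4, uaddv<mode>4, subv<mode>4 and usubv<mode>4 expanders above use GCC's standard pattern names for overflow-checked arithmetic, so this hunk is what lets __builtin_add_overflow and friends expand to an ADDS/SUBS followed by a branch on the V or C flag rather than a widening compare sequence. A minimal sketch of the C-level effect (function names here are illustrative, not part of the patch):

#include <stdbool.h>

/* addv<mode>4: ADDS, then branch on V (signed overflow).  */
bool
add_overflows (int a, int b, int *res)
{
  return __builtin_add_overflow (a, b, res);
}

/* uaddv<mode>4: ADDS, then branch on C (unsigned carry-out).  */
bool
uadd_overflows (unsigned a, unsigned b, unsigned *res)
{
  return __builtin_add_overflow (a, b, res);
}

/* subv<mode>4 / usubv<mode>4: the SUBS-based analogues.  */
bool
sub_overflows (int a, int b, int *res)
{
  return __builtin_sub_overflow (a, b, res);
}

With the SIDI iterator the same expanders also cover DImode, where the define_insn_and_split patterns split the operation into a lower/upper register pair after reload.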
+(define_insn_and_split "subdi3_compare1" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_operand:DI 1 "register_operand" "r") + (match_operand:DI 2 "register_operand" "r"))) + (set (match_operand:DI 0 "register_operand" "=&r") + (minus:DI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "#" + "&& reload_completed" + [(parallel [(set (reg:CC CC_REGNUM) + (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 0) (minus:SI (match_dup 1) (match_dup 2)))]) + (parallel [(set (reg:CC CC_REGNUM) + (compare:CC (match_dup 4) (match_dup 5))) + (set (match_dup 3) (minus:SI (minus:SI (match_dup 4) (match_dup 5)) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))])] + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_highpart (SImode, operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + } + [(set_attr "conds" "set") + (set_attr "length" "8") + (set_attr "type" "multiple")] +) + +(define_insn "subsi3_compare1" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r"))) + (set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "subs%?\\t%0, %1, %2" + [(set_attr "conds" "set") + (set_attr "type" "alus_sreg")] +) + (define_insn "*subsi3_carryin" - [(set (match_operand:SI 0 "s_register_operand" "=r,r") - (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I") - (match_operand:SI 2 "s_register_operand" "r,r")) - (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I,Pz") + (match_operand:SI 2 "s_register_operand" "r,r,r")) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] "TARGET_32BIT" "@ sbc%?\\t%0, %1, %2 - rsc%?\\t%0, %2, %1" + rsc%?\\t%0, %2, %1 + sbc%?\\t%0, %2, %2, lsl #1" [(set_attr "conds" "use") - (set_attr "arch" "*,a") + (set_attr "arch" "*,a,t2") (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") - (set_attr "type" "adc_reg,adc_imm")] + (set_attr "type" "adc_reg,adc_imm,alu_shift_imm")] ) (define_insn "*subsi3_carryin_const" @@ -1895,7 +2158,7 @@ [(set (match_operand:SF 0 "s_register_operand" "") (div:SF (match_operand:SF 1 "s_register_operand" "") (match_operand:SF 2 "s_register_operand" "")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "") (define_expand "divdf3" @@ -2137,13 +2400,13 @@ for (i = 9; i <= 31; i++) { - if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (operands[2])) + if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (operands[2])) { emit_insn (gen_extzv (operands[0], operands[1], GEN_INT (i), const0_rtx)); DONE; } - else if ((((HOST_WIDE_INT) 1) << i) - 1 + else if ((HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (operands[2])) { rtx shift = GEN_INT (i); @@ -2442,7 +2705,7 @@ { int start_bit = INTVAL (operands[2]); int width = INTVAL (operands[1]); - HOST_WIDE_INT mask = (((HOST_WIDE_INT)1) << width) - 1; + HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << width) - 1; rtx target, subtarget; if (arm_arch_thumb2) @@ -3050,7 +3313,14 @@ (xor:DI (match_operand:DI 1 "s_register_operand" "") (match_operand:DI 2 "arm_xordi_operand" "")))] "TARGET_32BIT" - "" + { + /* The iWMMXt pattern for xordi3 accepts only register operands but we want + to reuse this expander for all TARGET_32BIT targets so just force 
the + constants into a register. Unlike for the anddi3 and iordi3 there are + no NEON instructions that take an immediate. */ + if (TARGET_IWMMXT && !REG_P (operands[2])) + operands[2] = force_reg (DImode, operands[2]); + } ) (define_insn_and_split "*xordi3_insn" @@ -3744,8 +4014,7 @@ { rtx scratch1, scratch2; - if (CONST_INT_P (operands[2]) - && (HOST_WIDE_INT) INTVAL (operands[2]) == 1) + if (operands[2] == CONST1_RTX (SImode)) { emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); DONE; @@ -3790,7 +4059,7 @@ "TARGET_EITHER" " if (CONST_INT_P (operands[2]) - && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + && (UINTVAL (operands[2])) > 31) { emit_insn (gen_movsi (operands[0], const0_rtx)); DONE; @@ -3818,8 +4087,7 @@ { rtx scratch1, scratch2; - if (CONST_INT_P (operands[2]) - && (HOST_WIDE_INT) INTVAL (operands[2]) == 1) + if (operands[2] == CONST1_RTX (SImode)) { emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1])); DONE; @@ -3864,7 +4132,7 @@ "TARGET_EITHER" " if (CONST_INT_P (operands[2]) - && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + && UINTVAL (operands[2]) > 31) operands[2] = GEN_INT (31); " ) @@ -3889,8 +4157,7 @@ { rtx scratch1, scratch2; - if (CONST_INT_P (operands[2]) - && (HOST_WIDE_INT) INTVAL (operands[2]) == 1) + if (operands[2] == CONST1_RTX (SImode)) { emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1])); DONE; @@ -3935,7 +4202,7 @@ "TARGET_EITHER" " if (CONST_INT_P (operands[2]) - && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + && (UINTVAL (operands[2])) > 31) { emit_insn (gen_movsi (operands[0], const0_rtx)); DONE; @@ -3969,7 +4236,7 @@ if (TARGET_32BIT) { if (CONST_INT_P (operands[2]) - && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + && UINTVAL (operands[2]) > 31) operands[2] = GEN_INT (INTVAL (operands[2]) % 32); } else /* TARGET_THUMB1 */ @@ -4300,9 +4567,11 @@ (define_insn "*extv_reg" [(set (match_operand:SI 0 "s_register_operand" "=r") (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") - (match_operand:SI 2 "const_int_M_operand" "M") - (match_operand:SI 3 "const_int_M_operand" "M")))] - "arm_arch_thumb2" + (match_operand:SI 2 "const_int_operand" "n") + (match_operand:SI 3 "const_int_operand" "n")))] + "arm_arch_thumb2 + && IN_RANGE (INTVAL (operands[3]), 0, 31) + && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))" "sbfx%?\t%0, %1, %3, %2" [(set_attr "length" "4") (set_attr "predicable" "yes") @@ -4313,9 +4582,11 @@ (define_insn "extzv_t2" [(set (match_operand:SI 0 "s_register_operand" "=r") (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r") - (match_operand:SI 2 "const_int_M_operand" "M") - (match_operand:SI 3 "const_int_M_operand" "M")))] - "arm_arch_thumb2" + (match_operand:SI 2 "const_int_operand" "n") + (match_operand:SI 3 "const_int_operand" "n")))] + "arm_arch_thumb2 + && IN_RANGE (INTVAL (operands[3]), 0, 31) + && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))" "ubfx%?\t%0, %1, %3, %2" [(set_attr "length" "4") (set_attr "predicable" "yes") @@ -4326,23 +4597,29 @@ ;; Division instructions (define_insn "divsi3" - [(set (match_operand:SI 0 "s_register_operand" "=r") - (div:SI (match_operand:SI 1 "s_register_operand" "r") - (match_operand:SI 2 "s_register_operand" "r")))] + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (div:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "s_register_operand" "r,r")))] "TARGET_IDIV" - "sdiv%?\t%0, %1, %2" - [(set_attr "predicable" "yes") + "@ + sdiv%?\t%0, %1, %2 + sdiv\t%0, 
%1, %2" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") (set_attr "type" "sdiv")] ) (define_insn "udivsi3" - [(set (match_operand:SI 0 "s_register_operand" "=r") - (udiv:SI (match_operand:SI 1 "s_register_operand" "r") - (match_operand:SI 2 "s_register_operand" "r")))] + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (udiv:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "s_register_operand" "r,r")))] "TARGET_IDIV" - "udiv%?\t%0, %1, %2" - [(set_attr "predicable" "yes") + "@ + udiv%?\t%0, %1, %2 + udiv\t%0, %1, %2" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") (set_attr "type" "udiv")] ) @@ -4350,6 +4627,63 @@ ;; Unary arithmetic insns +(define_expand "negvsi3" + [(match_operand:SI 0 "register_operand") + (match_operand:SI 1 "register_operand") + (match_operand 2 "")] + "TARGET_32BIT" +{ + emit_insn (gen_subsi3_compare (operands[0], const0_rtx, operands[1])); + arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[2]); + + DONE; +}) + +(define_expand "negvdi3" + [(match_operand:DI 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand 2 "")] + "TARGET_ARM" +{ + emit_insn (gen_negdi2_compare (operands[0], operands[1])); + arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[2]); + + DONE; +}) + + +(define_insn_and_split "negdi2_compare" + [(set (reg:CC CC_REGNUM) + (compare:CC + (const_int 0) + (match_operand:DI 1 "register_operand" "0,r"))) + (set (match_operand:DI 0 "register_operand" "=r,&r") + (minus:DI (const_int 0) (match_dup 1)))] + "TARGET_ARM" + "#" + "&& reload_completed" + [(parallel [(set (reg:CC CC_REGNUM) + (compare:CC (const_int 0) (match_dup 1))) + (set (match_dup 0) (minus:SI (const_int 0) + (match_dup 1)))]) + (parallel [(set (reg:CC CC_REGNUM) + (compare:CC (const_int 0) (match_dup 3))) + (set (match_dup 2) + (minus:SI + (minus:SI (const_int 0) (match_dup 3)) + (ltu:SI (reg:CC_C CC_REGNUM) + (const_int 0))))])] + { + operands[2] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[3] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + } + [(set_attr "conds" "set") + (set_attr "length" "8") + (set_attr "type" "multiple")] +) + (define_expand "negdi2" [(parallel [(set (match_operand:DI 0 "s_register_operand" "") @@ -4367,12 +4701,13 @@ ;; The constraints here are to prevent a *partial* overlap (where %Q0 == %R1). ;; The first alternative allows the common case of a *full* overlap. 
-(define_insn_and_split "*arm_negdi2" +(define_insn_and_split "*negdi2_insn" [(set (match_operand:DI 0 "s_register_operand" "=r,&r") (neg:DI (match_operand:DI 1 "s_register_operand" "0,r"))) (clobber (reg:CC CC_REGNUM))] - "TARGET_ARM" - "#" ; "rsbs\\t%Q0, %Q1, #0\;rsc\\t%R0, %R1, #0" + "TARGET_32BIT" + "#" ; rsbs %Q0, %Q1, #0; rsc %R0, %R1, #0 (ARM) + ; negs %Q0, %Q1 ; sbc %R0, %R1, %R1, lsl #1 (Thumb-2) "&& reload_completed" [(parallel [(set (reg:CC CC_REGNUM) (compare:CC (const_int 0) (match_dup 1))) @@ -4390,6 +4725,20 @@ (set_attr "type" "multiple")] ) +(define_insn "*negsi2_carryin_compare" + [(set (reg:CC CC_REGNUM) + (compare:CC (const_int 0) + (match_operand:SI 1 "s_register_operand" "r"))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (minus:SI (minus:SI (const_int 0) + (match_dup 1)) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] + "TARGET_ARM" + "rscs\\t%0, %1, #0" + [(set_attr "conds" "set") + (set_attr "type" "alus_imm")] +) + (define_expand "negsi2" [(set (match_operand:SI 0 "s_register_operand" "") (neg:SI (match_operand:SI 1 "s_register_operand" "")))] @@ -4412,7 +4761,7 @@ (define_expand "negsf2" [(set (match_operand:SF 0 "s_register_operand" "") (neg:SF (match_operand:SF 1 "s_register_operand" "")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "" ) @@ -4685,7 +5034,7 @@ (define_expand "sqrtsf2" [(set (match_operand:SF 0 "s_register_operand" "") (sqrt:SF (match_operand:SF 1 "s_register_operand" "")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "") (define_expand "sqrtdf2" @@ -4854,7 +5203,7 @@ "" ) -/* DFmode -> HFmode conversions have to go through SFmode. */ +;; DFmode to HFmode conversions have to go through SFmode. (define_expand "truncdfhf2" [(set (match_operand:HF 0 "general_operand" "") (float_truncate:HF @@ -5117,7 +5466,7 @@ (match_operator 5 "subreg_lowpart_operator" [(match_operand:SI 4 "s_register_operand" "")]))))] "TARGET_32BIT - && ((unsigned HOST_WIDE_INT) INTVAL (operands[3]) + && (UINTVAL (operands[3]) == (GET_MODE_MASK (GET_MODE (operands[5])) & (GET_MODE_MASK (GET_MODE (operands[5])) << (INTVAL (operands[2])))))" @@ -5361,7 +5710,7 @@ "" ) -/* HFmode -> DFmode conversions have to go through SFmode. */ +;; HFmode -> DFmode conversions have to go through SFmode. (define_expand "extendhfdf2" [(set (match_operand:DF 0 "general_operand" "") (float_extend:DF (match_operand:HF 1 "general_operand" "")))] @@ -5490,7 +5839,7 @@ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, r, q, m") (match_operand:DI 1 "di_operand" "rDa,Db,Dc,mi,q"))] "TARGET_32BIT - && !(TARGET_HARD_FLOAT && TARGET_VFP) + && !(TARGET_HARD_FLOAT) && !TARGET_IWMMXT && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode))" @@ -5699,12 +6048,15 @@ ;; LO_SUM adds in the high bits. Fortunately these are opaque operations ;; so this does not matter. 
(define_insn "*arm_movt" - [(set (match_operand:SI 0 "nonimmediate_operand" "=r") - (lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0") - (match_operand:SI 2 "general_operand" "i")))] - "arm_arch_thumb2 && arm_valid_symbolic_address_p (operands[2])" - "movt%?\t%0, #:upper16:%c2" - [(set_attr "predicable" "yes") + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r") + (lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0,0") + (match_operand:SI 2 "general_operand" "i,i")))] + "TARGET_HAVE_MOVT && arm_valid_symbolic_address_p (operands[2])" + "@ + movt%?\t%0, #:upper16:%c2 + movt\t%0, #:upper16:%c2" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") (set_attr "length" "4") (set_attr "type" "alu_sreg")] @@ -5713,8 +6065,7 @@ (define_insn "*arm_movsi_insn" [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m") (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk"))] - "TARGET_ARM && ! TARGET_IWMMXT - && !(TARGET_HARD_FLOAT && TARGET_VFP) + "TARGET_ARM && !TARGET_IWMMXT && !TARGET_HARD_FLOAT && ( register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" "@ @@ -5726,6 +6077,7 @@ str%?\\t%1, %0" [(set_attr "type" "mov_reg,mov_imm,mvn_imm,mov_imm,load1,store1") (set_attr "predicable" "yes") + (set_attr "arch" "*,*,*,v6t2,*,*") (set_attr "pool_range" "*,*,*,*,4096,*") (set_attr "neg_pool_range" "*,*,*,*,4084,*")] ) @@ -5762,7 +6114,8 @@ [(set (match_operand:SI 0 "arm_general_register_operand" "") (const:SI (plus:SI (match_operand:SI 1 "general_operand" "") (match_operand:SI 2 "const_int_operand" ""))))] - "TARGET_THUMB2 + "TARGET_THUMB + && TARGET_HAVE_MOVT && arm_disable_literal_pool && reload_completed && GET_CODE (operands[1]) == SYMBOL_REF" @@ -5793,8 +6146,7 @@ (define_split [(set (match_operand:SI 0 "arm_general_register_operand" "") (match_operand:SI 1 "general_operand" ""))] - "TARGET_32BIT - && TARGET_USE_MOVT && GET_CODE (operands[1]) == SYMBOL_REF + "TARGET_USE_MOVT && GET_CODE (operands[1]) == SYMBOL_REF && !flag_pic && !target_word_relocations && !arm_tls_referenced_p (operands[1])" [(clobber (const_int 0))] @@ -6362,7 +6714,7 @@ [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m,r") (match_operand:HI 1 "general_operand" "rIk,K,n,r,mi"))] "TARGET_ARM - && arm_arch4 + && arm_arch4 && !TARGET_HARD_FLOAT && (register_operand (operands[0], HImode) || register_operand (operands[1], HImode))" "@ @@ -6388,7 +6740,7 @@ (define_insn "*movhi_bytes" [(set (match_operand:HI 0 "s_register_operand" "=r,r,r") (match_operand:HI 1 "arm_rhs_operand" "I,rk,K"))] - "TARGET_ARM" + "TARGET_ARM && !TARGET_HARD_FLOAT" "@ mov%?\\t%0, %1\\t%@ movhi mov%?\\t%0, %1\\t%@ movhi @@ -6396,7 +6748,7 @@ [(set_attr "predicable" "yes") (set_attr "type" "mov_imm,mov_reg,mvn_imm")] ) - + ;; We use a DImode scratch because we may occasionally need an additional ;; temporary if the address isn't offsettable -- push_reload doesn't seem ;; to take any notice of the "o" constraints on reload_memory_operand operand. 
@@ -6518,7 +6870,7 @@ strb%?\\t%1, %0" [(set_attr "type" "mov_reg,mov_reg,mov_imm,mov_imm,mvn_imm,load1,store1,load1,store1") (set_attr "predicable" "yes") - (set_attr "predicable_short_it" "yes,yes,yes,no,no,no,no,no,no") + (set_attr "predicable_short_it" "yes,yes,no,yes,no,no,no,no,no") (set_attr "arch" "t2,any,any,t2,any,t2,t2,any,any") (set_attr "length" "2,4,4,2,4,2,2,4,4")] ) @@ -6548,7 +6900,7 @@ (define_insn "*arm32_movhf" [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r") (match_operand:HF 1 "general_operand" " m,r,r,F"))] - "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16) + "TARGET_32BIT && !TARGET_HARD_FLOAT && ( s_register_operand (operands[0], HFmode) || s_register_operand (operands[1], HFmode))" "* @@ -6892,7 +7244,7 @@ [(set (pc) (if_then_else (match_operator 0 "expandable_comparison_operator" [(match_operand:SF 1 "s_register_operand" "") - (match_operand:SF 2 "arm_float_compare_operand" "")]) + (match_operand:SF 2 "vfp_compare_operand" "")]) (label_ref (match_operand 3 "" "")) (pc)))] "TARGET_32BIT && TARGET_HARD_FLOAT" @@ -6904,7 +7256,7 @@ [(set (pc) (if_then_else (match_operator 0 "expandable_comparison_operator" [(match_operand:DF 1 "s_register_operand" "") - (match_operand:DF 2 "arm_float_compare_operand" "")]) + (match_operand:DF 2 "vfp_compare_operand" "")]) (label_ref (match_operand 3 "" "")) (pc)))] "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" @@ -7366,11 +7718,29 @@ DONE; }") +(define_expand "cstorehf4" + [(set (match_operand:SI 0 "s_register_operand") + (match_operator:SI 1 "expandable_comparison_operator" + [(match_operand:HF 2 "s_register_operand") + (match_operand:HF 3 "vfp_compare_operand")]))] + "TARGET_VFP_FP16INST" + { + if (!arm_validize_comparison (&operands[1], + &operands[2], + &operands[3])) + FAIL; + + emit_insn (gen_cstore_cc (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } +) + (define_expand "cstoresf4" [(set (match_operand:SI 0 "s_register_operand" "") (match_operator:SI 1 "expandable_comparison_operator" [(match_operand:SF 2 "s_register_operand" "") - (match_operand:SF 3 "arm_float_compare_operand" "")]))] + (match_operand:SF 3 "vfp_compare_operand" "")]))] "TARGET_32BIT && TARGET_HARD_FLOAT" "emit_insn (gen_cstore_cc (operands[0], operands[1], operands[2], operands[3])); DONE;" @@ -7380,7 +7750,7 @@ [(set (match_operand:SI 0 "s_register_operand" "") (match_operator:SI 1 "expandable_comparison_operator" [(match_operand:DF 2 "s_register_operand" "") - (match_operand:DF 3 "arm_float_compare_operand" "")]))] + (match_operand:DF 3 "vfp_compare_operand" "")]))] "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" "emit_insn (gen_cstore_cc (operands[0], operands[1], operands[2], operands[3])); DONE;" @@ -7418,9 +7788,31 @@ rtx ccreg; if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), - &XEXP (operands[1], 1))) + &XEXP (operands[1], 1))) FAIL; - + + code = GET_CODE (operands[1]); + ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), + XEXP (operands[1], 1), NULL_RTX); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + }" +) + +(define_expand "movhfcc" + [(set (match_operand:HF 0 "s_register_operand") + (if_then_else:HF (match_operand 1 "arm_cond_move_operator") + (match_operand:HF 2 "s_register_operand") + (match_operand:HF 3 "s_register_operand")))] + "TARGET_VFP_FP16INST" + " + { + enum rtx_code code = GET_CODE (operands[1]); + rtx ccreg; + + if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), + &XEXP (operands[1], 1))) + FAIL; + code = 
GET_CODE (operands[1]); ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), XEXP (operands[1], 1), NULL_RTX); @@ -7439,7 +7831,7 @@ enum rtx_code code = GET_CODE (operands[1]); rtx ccreg; - if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), + if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), &XEXP (operands[1], 1))) FAIL; @@ -7504,6 +7896,37 @@ (set_attr "type" "fcsel")] ) +(define_insn "*cmovhf" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (if_then_else:HF (match_operator 1 "arm_vsel_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]) + (match_operand:HF 3 "s_register_operand" "t") + (match_operand:HF 4 "s_register_operand" "t")))] + "TARGET_VFP_FP16INST" + "* + { + enum arm_cond_code code = maybe_get_arm_condition_code (operands[1]); + switch (code) + { + case ARM_GE: + case ARM_GT: + case ARM_EQ: + case ARM_VS: + return \"vsel%d1.f16\\t%0, %3, %4\"; + case ARM_LT: + case ARM_LE: + case ARM_NE: + case ARM_VC: + return \"vsel%D1.f16\\t%0, %4, %3\"; + default: + gcc_unreachable (); + } + return \"\"; + }" + [(set_attr "conds" "use") + (set_attr "type" "fcsel")] +) + (define_insn_and_split "*movsicc_insn" [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r") (if_then_else:SI @@ -7627,6 +8050,7 @@ " { rtx callee, pat; + tree addr = MEM_EXPR (operands[0]); /* In an untyped call, we can get NULL for operand 2. */ if (operands[2] == NULL_RTX) @@ -7641,8 +8065,17 @@ : !REG_P (callee)) XEXP (operands[0], 0) = force_reg (Pmode, callee); - pat = gen_call_internal (operands[0], operands[1], operands[2]); - arm_emit_call_insn (pat, XEXP (operands[0], 0), false); + if (detect_cmse_nonsecure_call (addr)) + { + pat = gen_nonsecure_call_internal (operands[0], operands[1], + operands[2]); + emit_call_insn (pat); + } + else + { + pat = gen_call_internal (operands[0], operands[1], operands[2]); + arm_emit_call_insn (pat, XEXP (operands[0], 0), false); + } DONE; }" ) @@ -7653,6 +8086,24 @@ (use (match_operand 2 "" "")) (clobber (reg:SI LR_REGNUM))])]) +(define_expand "nonsecure_call_internal" + [(parallel [(call (unspec:SI [(match_operand 0 "memory_operand" "")] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "general_operand" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (reg:SI 4))])] + "use_cmse" + " + { + rtx tmp; + tmp = copy_to_suggested_reg (XEXP (operands[0], 0), + gen_rtx_REG (SImode, 4), + SImode); + + operands[0] = replace_equiv_address (operands[0], tmp); + }") + (define_insn "*call_reg_armv5" [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) (match_operand 1 "" "")) @@ -7688,6 +8139,7 @@ " { rtx pat, callee; + tree addr = MEM_EXPR (operands[1]); /* In an untyped call, we can get NULL for operand 2. 
*/ if (operands[3] == 0) @@ -7702,9 +8154,18 @@ : !REG_P (callee)) XEXP (operands[1], 0) = force_reg (Pmode, callee); - pat = gen_call_value_internal (operands[0], operands[1], - operands[2], operands[3]); - arm_emit_call_insn (pat, XEXP (operands[1], 0), false); + if (detect_cmse_nonsecure_call (addr)) + { + pat = gen_nonsecure_call_value_internal (operands[0], operands[1], + operands[2], operands[3]); + emit_call_insn (pat); + } + else + { + pat = gen_call_value_internal (operands[0], operands[1], + operands[2], operands[3]); + arm_emit_call_insn (pat, XEXP (operands[1], 0), false); + } DONE; }" ) @@ -7716,6 +8177,25 @@ (use (match_operand 3 "" "")) (clobber (reg:SI LR_REGNUM))])]) +(define_expand "nonsecure_call_value_internal" + [(parallel [(set (match_operand 0 "" "") + (call (unspec:SI [(match_operand 1 "memory_operand" "")] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "general_operand" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (reg:SI 4))])] + "use_cmse" + " + { + rtx tmp; + tmp = copy_to_suggested_reg (XEXP (operands[1], 0), + gen_rtx_REG (SImode, 4), + SImode); + + operands[1] = replace_equiv_address (operands[1], tmp); + }") + (define_insn "*call_value_reg_armv5" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "s_register_operand" "r")) @@ -8153,8 +8633,8 @@ ) (define_insn "probe_stack" - [(set (match_operand 0 "memory_operand" "=m") - (unspec [(const_int 0)] UNSPEC_PROBE_STACK))] + [(set (match_operand:SI 0 "memory_operand" "=m") + (unspec:SI [(const_int 0)] UNSPEC_PROBE_STACK))] "TARGET_32BIT" "str%?\\tr0, %0" [(set_attr "type" "store1") @@ -10221,8 +10701,8 @@ (match_operand 1 "const_int_operand" ""))) (clobber (match_scratch:SI 2 ""))] "TARGET_ARM - && (((unsigned HOST_WIDE_INT) INTVAL (operands[1])) - == (((unsigned HOST_WIDE_INT) INTVAL (operands[1])) >> 24) << 24)" + && ((UINTVAL (operands[1])) + == ((UINTVAL (operands[1])) >> 24) << 24)" [(set (match_dup 2) (zero_extend:SI (match_dup 0))) (set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 1)))] " @@ -10562,7 +11042,11 @@ } " [(set_attr "type" "load4") - (set_attr "predicable" "yes")] + (set_attr "predicable" "yes") + (set (attr "length") + (symbol_ref "arm_attr_length_pop_multi (operands, + /*return_pc=*/false, + /*write_back_p=*/true)"))] ) ;; Pop with return (as used in epilogue RTL) @@ -10591,7 +11075,10 @@ } " [(set_attr "type" "load4") - (set_attr "predicable" "yes")] + (set_attr "predicable" "yes") + (set (attr "length") + (symbol_ref "arm_attr_length_pop_multi (operands, /*return_pc=*/true, + /*write_back_p=*/true)"))] ) (define_insn "*pop_multiple_with_return" @@ -10611,7 +11098,10 @@ } " [(set_attr "type" "load4") - (set_attr "predicable" "yes")] + (set_attr "predicable" "yes") + (set (attr "length") + (symbol_ref "arm_attr_length_pop_multi (operands, /*return_pc=*/true, + /*write_back_p=*/false)"))] ) ;; Load into PC and return @@ -10632,7 +11122,7 @@ (match_operand:SI 2 "const_int_I_operand" "I"))) (set (match_operand:DF 3 "vfp_hard_register_operand" "") (mem:DF (match_dup 1)))])] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "* { int num_regs = XVECLEN (operands[0], 0); @@ -10822,19 +11312,22 @@ (set_attr "predicable_short_it" "no") (set_attr "type" "clz")]) -(define_expand "ctzsi2" - [(set (match_operand:SI 0 "s_register_operand" "") - (ctz:SI (match_operand:SI 1 "s_register_operand" "")))] +;; Keep this as a CTZ expression until after reload and then split +;; into RBIT + CLZ. 
Since RBIT is represented as an UNSPEC it is unlikely +;; to fold with any other expression. + +(define_insn_and_split "ctzsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ctz:SI (match_operand:SI 1 "s_register_operand" "r")))] "TARGET_32BIT && arm_arch_thumb2" + "#" + "&& reload_completed" + [(const_int 0)] " - { - rtx tmp = gen_reg_rtx (SImode); - emit_insn (gen_rbitsi2 (tmp, operands[1])); - emit_insn (gen_clzsi2 (operands[0], tmp)); - } - DONE; - " -) + emit_insn (gen_rbitsi2 (operands[0], operands[1])); + emit_insn (gen_clzsi2 (operands[0], operands[0])); + DONE; +") ;; V5E instructions. @@ -10958,13 +11451,16 @@ ;; We only care about the lower 16 bits of the constant ;; being inserted into the upper 16 bits of the register. (define_insn "*arm_movtas_ze" - [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r") + [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r,r") (const_int 16) (const_int 16)) (match_operand:SI 1 "const_int_operand" ""))] - "arm_arch_thumb2" - "movt%?\t%0, %L1" - [(set_attr "predicable" "yes") + "TARGET_HAVE_MOVT" + "@ + movt%?\t%0, %L1 + movt\t%0, %L1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") (set_attr "length" "4") (set_attr "type" "alu_sreg")] --- a/src/gcc/config/arm/arm.opt +++ b/src/gcc/config/arm/arm.opt @@ -61,10 +61,6 @@ Generate a call to abort if a noreturn function returns. mapcs Target RejectNegative Mask(APCS_FRAME) Undocumented -mapcs-float -Target Report Mask(APCS_FLOAT) -Pass FP arguments in FP registers. - mapcs-frame Target Report Mask(APCS_FRAME) Generate APCS conformant stack frames. @@ -109,6 +105,10 @@ mfloat-abi= Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI) Specify if floating point hardware should be used. +mcmse +Target RejectNegative Var(use_cmse) +Specify that the compiler should target secure code as per ARMv8-M Security Extensions. + Enum Name(float_abi_type) Type(enum float_abi_type) Known floating-point ABIs (for use with the -mfloat-abi= option): @@ -253,14 +253,6 @@ mrestrict-it Target Report Var(arm_restrict_it) Init(2) Save Generate IT blocks appropriate for ARMv8. -mold-rtx-costs -Target Report Mask(OLD_RTX_COSTS) -Use the old RTX costing tables (transitional). - -mnew-generic-costs -Target Report Mask(NEW_GENERIC_COSTS) -Use the new generic RTX cost tables if new core-specific cost table not available (transitional). - mfix-cortex-m3-ldrd Target Report Var(fix_cm3_ldrd) Init(2) Avoid overlapping destination and address registers on LDRD instructions --- /dev/null +++ b/src/gcc/config/arm/arm_cmse.h @@ -0,0 +1,199 @@ +/* ARMv8-M Secure Extensions intrinsics include file. + + Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifndef _GCC_ARM_CMSE_H
+#define _GCC_ARM_CMSE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if __ARM_FEATURE_CMSE & 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __ARM_BIG_ENDIAN
+
+typedef union {
+  struct cmse_address_info {
+#if __ARM_FEATURE_CMSE & 2
+    unsigned idau_region:8;
+    unsigned idau_region_valid:1;
+    unsigned secure:1;
+    unsigned nonsecure_readwrite_ok:1;
+    unsigned nonsecure_read_ok:1;
+#else
+    unsigned :12;
+#endif
+    unsigned readwrite_ok:1;
+    unsigned read_ok:1;
+#if __ARM_FEATURE_CMSE & 2
+    unsigned sau_region_valid:1;
+#else
+    unsigned :1;
+#endif
+    unsigned mpu_region_valid:1;
+#if __ARM_FEATURE_CMSE & 2
+    unsigned sau_region:8;
+#else
+    unsigned :8;
+#endif
+    unsigned mpu_region:8;
+  } flags;
+  unsigned value;
+} cmse_address_info_t;
+
+#else
+
+typedef union {
+  struct cmse_address_info {
+    unsigned mpu_region:8;
+#if __ARM_FEATURE_CMSE & 2
+    unsigned sau_region:8;
+#else
+    unsigned :8;
+#endif
+    unsigned mpu_region_valid:1;
+#if __ARM_FEATURE_CMSE & 2
+    unsigned sau_region_valid:1;
+#else
+    unsigned :1;
+#endif
+    unsigned read_ok:1;
+    unsigned readwrite_ok:1;
+#if __ARM_FEATURE_CMSE & 2
+    unsigned nonsecure_read_ok:1;
+    unsigned nonsecure_readwrite_ok:1;
+    unsigned secure:1;
+    unsigned idau_region_valid:1;
+    unsigned idau_region:8;
+#else
+    unsigned :12;
+#endif
+  } flags;
+  unsigned value;
+} cmse_address_info_t;
+
+#endif /* __ARM_BIG_ENDIAN */
+
+#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p)))
+
+typedef void (*__cmse_fptr)(void);
+
+#define __CMSE_TT_ASM(flags) \
+{ \
+  cmse_address_info_t __result; \
+   __asm__ ("tt" # flags " %0,%1" \
+            : "=r"(__result) \
+            : "r"(__p) \
+            : "memory"); \
+  return __result; \
+}
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+__cmse_TT_fptr (__cmse_fptr __p)
+__CMSE_TT_ASM ()
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+cmse_TT (void *__p)
+__CMSE_TT_ASM ()
+
+#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p)))
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+__cmse_TTT_fptr (__cmse_fptr __p)
+__CMSE_TT_ASM (t)
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+cmse_TTT (void *__p)
+__CMSE_TT_ASM (t)
+
+#if __ARM_FEATURE_CMSE & 2
+
+#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p)))
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+__cmse_TTA_fptr (__cmse_fptr __p)
+__CMSE_TT_ASM (a)
+
+__extension__ static __inline __attribute__ ((__always_inline__))
+cmse_address_info_t
+cmse_TTA (void *__p)
+__CMSE_TT_ASM (a)
+
+#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p)))
+
+__extension__ static __inline cmse_address_info_t
+__attribute__ ((__always_inline__))
+__cmse_TTAT_fptr (__cmse_fptr __p)
+__CMSE_TT_ASM (at)
+
+__extension__ static __inline cmse_address_info_t
+__attribute__ ((__always_inline__))
+cmse_TTAT (void *__p)
+__CMSE_TT_ASM (at)
+
+/* FIXME: diagnose use outside cmse_nonsecure_entry functions.  */
+__extension__ static __inline int __attribute__ ((__always_inline__))
+cmse_nonsecure_caller (void)
+{
+  return __builtin_arm_cmse_nonsecure_caller ();
+}
+
+#define CMSE_AU_NONSECURE 2
+#define CMSE_MPU_NONSECURE 16
+#define CMSE_NONSECURE 18
+
+#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1))
+
+#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1))
+
+#endif /* __ARM_FEATURE_CMSE & 2 */
+
+#define CMSE_MPU_UNPRIV 4
+#define CMSE_MPU_READWRITE 1
+#define CMSE_MPU_READ 8
+
+__extension__ void *
+cmse_check_address_range (void *, size_t, int);
+
+#define cmse_check_pointed_object(p, f) \
+  ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f)))
+
+#endif /* __ARM_FEATURE_CMSE & 1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GCC_ARM_CMSE_H */
--- /dev/null
+++ b/src/gcc/config/arm/arm_fp16.h
@@ -0,0 +1,255 @@
+/* ARM FP16 intrinsics include file.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _GCC_ARM_FP16_H
+#define _GCC_ARM_FP16_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/* Intrinsics for FP16 instructions.
*/ +#pragma GCC push_options +#pragma GCC target ("fpu=fp-armv8") + +#if defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +typedef __fp16 float16_t; + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vabsh_f16 (float16_t __a) +{ + return __builtin_neon_vabshf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vaddh_f16 (float16_t __a, float16_t __b) +{ + return __a + __b; +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtah_s32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtahssi (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtah_u32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtahusi (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_s32 (int32_t __a) +{ + return __builtin_neon_vcvthshf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_f16_u32 (uint32_t __a) +{ + return __builtin_neon_vcvthuhf (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_s32 (int32_t __a, const int __b) +{ + return __builtin_neon_vcvths_nhf (__a, __b); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vcvth_n_f16_u32 (uint32_t __a, const int __b) +{ + return __builtin_neon_vcvthu_nhf ((int32_t)__a, __b); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvth_n_s32_f16 (float16_t __a, const int __b) +{ + return __builtin_neon_vcvths_nsi (__a, __b); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvth_n_u32_f16 (float16_t __a, const int __b) +{ + return (uint32_t)__builtin_neon_vcvthu_nsi (__a, __b); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvth_s32_f16 (float16_t __a) +{ + return __builtin_neon_vcvthssi (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvth_u32_f16 (float16_t __a) +{ + return __builtin_neon_vcvthusi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtmh_s32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtmhssi (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtmh_u32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtmhusi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtnh_s32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtnhssi (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtnh_u32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtnhusi (__a); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtph_s32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtphssi (__a); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtph_u32_f16 (float16_t __a) +{ + return __builtin_neon_vcvtphusi (__a); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vdivh_f16 (float16_t __a, float16_t __b) +{ + return __a / __b; +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vfmah_f16 (float16_t __a, float16_t __b, float16_t __c) +{ + return __builtin_neon_vfmahf (__a, __b, __c); +} + +__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c) +{ + return 
__builtin_neon_vfmshf (__a, __b, __c);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vmaxnmh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_neon_vmaxnmhf (__a, __b);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vminnmh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_neon_vminnmhf (__a, __b);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vmulh_f16 (float16_t __a, float16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vnegh_f16 (float16_t __a)
+{
+  return - __a;
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndah_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndahf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndh_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndhf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndih_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndihf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndmh_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndmhf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndnh_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndnhf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndph_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndphf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vrndxh_f16 (float16_t __a)
+{
+  return __builtin_neon_vrndxhf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vsqrth_f16 (float16_t __a)
+{
+  return __builtin_neon_vsqrthf (__a);
+}
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vsubh_f16 (float16_t __a, float16_t __b)
+{
+  return __a - __b;
+}
+
+#endif /* __ARM_FEATURE_FP16_SCALAR_ARITHMETIC */
+#pragma GCC pop_options
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/gcc/config/arm/arm_neon.h
+++ b/src/gcc/config/arm/arm_neon.h
@@ -38,6 +38,7 @@
 extern "C" {
 #endif
+#include <arm_fp16.h>
 #include <stdint.h>
 typedef __simd64_int8_t int8x8_t;
@@ -509,528 +510,614 @@ typedef struct poly64x2x4_t
 #pragma GCC pop_options

 /* vadd */
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vadd_s8 (int8x8_t __a, int8x8_t __b)
 {
   return __a + __b;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vadd_s16 (int16x4_t __a, int16x4_t __b)
 {
   return __a + __b;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vadd_s32 (int32x2_t __a, int32x2_t __b)
 {
   return __a + __b;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vadd_f32 (float32x2_t __a, float32x2_t __b)
 {
-#ifdef __FAST_MATH
+#ifdef __FAST_MATH__
   return __a + __b;
 #else
   return (float32x2_t) __builtin_neon_vaddv2sf (__a, __b);
 #endif
 }

-__extension__ static __inline uint8x8_t
__attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u8 (uint8x8_t __a, uint8x8_t __b) { return __a + __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u16 (uint16x4_t __a, uint16x4_t __b) { return __a + __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u32 (uint32x2_t __a, uint32x2_t __b) { return __a + __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_s64 (int64x1_t __a, int64x1_t __b) { return __a + __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vadd_u64 (uint64x1_t __a, uint64x1_t __b) { return __a + __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s8 (int8x16_t __a, int8x16_t __b) { return __a + __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s16 (int16x8_t __a, int16x8_t __b) { return __a + __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s32 (int32x4_t __a, int32x4_t __b) { return __a + __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_s64 (int64x2_t __a, int64x2_t __b) { return __a + __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_f32 (float32x4_t __a, float32x4_t __b) { -#ifdef __FAST_MATH +#ifdef __FAST_MATH__ return __a + __b; #else return (float32x4_t) __builtin_neon_vaddv4sf (__a, __b); #endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a + __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a + __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a + __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddq_u64 
(uint64x2_t __a, uint64x2_t __b) { return __a + __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s8 (int8x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_neon_vaddlsv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_neon_vaddlsv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_neon_vaddlsv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_neon_vaddluv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_neon_vaddluv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddl_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_neon_vaddluv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_s8 (int16x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_neon_vaddwsv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_s16 (int32x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_neon_vaddwsv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_s32 (int64x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_neon_vaddwsv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u8 (uint16x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_neon_vaddwuv8qi ((int16x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u16 (uint32x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_neon_vaddwuv4hi ((int32x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddw_u32 
(uint64x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_neon_vaddwuv2si ((int64x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vhaddsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vhaddsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vhaddsv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vhadduv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vhadduv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vhadduv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_neon_vhaddsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vhaddsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vhaddsv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vhadduv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vhadduv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vhaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vhadduv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vrhaddsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vrhaddsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vrhaddsv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vrhadduv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vrhadduv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vrhadduv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_neon_vrhaddsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vrhaddsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vrhaddsv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vrhadduv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vrhadduv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline 
uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vrhadduv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vqaddsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vqaddsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vqaddsv2si (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_s64 (int64x1_t __a, int64x1_t __b) { return (int64x1_t)__builtin_neon_vqaddsdi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vqadduv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vqadduv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vqadduv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqadd_u64 (uint64x1_t __a, uint64x1_t __b) { return (uint64x1_t)__builtin_neon_vqaddudi ((int64x1_t) __a, (int64x1_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_neon_vqaddsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vqaddsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vqaddsv4si (__a, 
__b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_s64 (int64x2_t __a, int64x2_t __b) { return (int64x2_t)__builtin_neon_vqaddsv2di (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vqadduv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vqadduv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vqadduv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqaddq_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint64x2_t)__builtin_neon_vqadduv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddhn_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vraddhn_s16 (int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_neon_vraddhnv8hi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_s32 (int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_neon_vraddhnv4si (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_s64 (int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_neon_vraddhnv2di (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_neon_vraddhnv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_neon_vraddhnv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vraddhn_u64 (uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_neon_vraddhnv2di ((int64x2_t) __a, (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s8 (int8x8_t __a, int8x8_t __b) { return __a * __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s16 (int16x4_t __a, int16x4_t __b) { return __a * __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_s32 (int32x2_t __a, int32x2_t __b) { return __a * __b; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_f32 (float32x2_t __a, float32x2_t __b) { -#ifdef __FAST_MATH +#ifdef __FAST_MATH__ return __a * __b; #else return (float32x2_t) __builtin_neon_vmulfv2sf (__a, __b); @@ -1038,493 +1125,574 @@ vmul_f32 (float32x2_t __a, float32x2_t __b) } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u8 (uint8x8_t __a, uint8x8_t __b) { return __a * __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u16 (uint16x4_t __a, uint16x4_t __b) { return __a * __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_u32 (uint32x2_t __a, 
uint32x2_t __b) { return __a * __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s8 (int8x16_t __a, int8x16_t __b) { return __a * __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s16 (int16x8_t __a, int16x8_t __b) { return __a * __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_s32 (int32x4_t __a, int32x4_t __b) { return __a * __b; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_f32 (float32x4_t __a, float32x4_t __b) { -#ifdef __FAST_MATH +#ifdef __FAST_MATH__ return __a * __b; #else return (float32x4_t) __builtin_neon_vmulfv4sf (__a, __b); #endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a * __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a * __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a * __b; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_p8 (poly8x8_t __a, poly8x8_t __b) { return (poly8x8_t)__builtin_neon_vmulpv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_p8 (poly8x16_t __a, poly8x16_t __b) { return (poly8x16_t)__builtin_neon_vmulpv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vqrdmulhv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vqrdmulhv2si (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vqrdmulhv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vqrdmulhv4si (__a, __b); } #ifdef __ARM_FEATURE_QRDMX -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { return (int16x4_t)__builtin_neon_vqrdmlahv4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { return (int32x2_t)__builtin_neon_vqrdmlahv2si (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { return (int16x8_t)__builtin_neon_vqrdmlahv8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { return (int32x4_t)__builtin_neon_vqrdmlahv4si (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { return (int16x4_t)__builtin_neon_vqrdmlshv4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { return (int32x2_t)__builtin_neon_vqrdmlshv2si (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { return (int16x8_t)__builtin_neon_vqrdmlshv8hi (__a, __b, __c); } 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { return (int32x4_t)__builtin_neon_vqrdmlshv4si (__a, __b, __c); } #endif -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_s8 (int8x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_neon_vmullsv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_neon_vmullsv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_neon_vmullsv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_neon_vmulluv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_neon_vmulluv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_neon_vmulluv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_p8 (poly8x8_t __a, poly8x8_t __b) { return (poly16x8_t)__builtin_neon_vmullpv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmull_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmull_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s16 (int16x4_t __a, int16x4_t __b, 
int16x4_t __c) { return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, 
uint16x8_t __c) { return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { return (int16x8_t)__builtin_neon_vmlalsv8qi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { return (int32x4_t)__builtin_neon_vmlalsv4hi (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { return (int64x2_t)__builtin_neon_vmlalsv2si (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint16x8_t)__builtin_neon_vmlaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint32x4_t)__builtin_neon_vmlaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint64x2_t)__builtin_neon_vmlaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_s16 
(int16x4_t __a, int16x4_t __b, int16x4_t __c) { return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_u16 
(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { return (int16x8_t)__builtin_neon_vmlslsv8qi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { return (int32x4_t)__builtin_neon_vmlslsv4hi (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { return (int64x2_t)__builtin_neon_vmlslsv2si (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint16x8_t)__builtin_neon_vmlsluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint32x4_t)__builtin_neon_vmlsluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint64x2_t)__builtin_neon_vmlsluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c); @@ -1532,25 +1700,29 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) #pragma GCC push_options #pragma GCC target ("fpu=neon-vfpv4") -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c); } 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
   return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
 {
   return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
   return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c);
@@ -1558,7 +1730,8 @@ vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 #pragma GCC pop_options

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndn_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintnv2sf (__a);
@@ -1566,7 +1739,8 @@ vrndn_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndnq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintnv4sf (__a);
@@ -1574,7 +1748,8 @@ vrndnq_f32 (float32x4_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrnda_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintav2sf (__a);
@@ -1582,7 +1757,8 @@ vrnda_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndaq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintav4sf (__a);
@@ -1590,7 +1766,8 @@ vrndaq_f32 (float32x4_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndp_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintpv2sf (__a);
@@ -1598,7 +1775,8 @@ vrndp_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndpq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintpv4sf (__a);
@@ -1606,7 +1784,8 @@ vrndpq_f32 (float32x4_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndm_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintmv2sf (__a);
@@ -1614,7 +1793,8 @@ vrndm_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndmq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintmv4sf (__a);
@@ -1623,7 +1803,8 @@ vrndmq_f32 (float32x4_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndx_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintxv2sf (__a);
@@ -1632,7 +1813,8 @@ vrndx_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndxq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintxv4sf (__a);
@@ -1641,7 +1823,8 @@ vrndxq_f32 (float32x4_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrnd_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrintzv2sf (__a);
@@ -1649,7 +1832,8 @@ vrnd_f32 (float32x2_t __a)
 #endif

 #if __ARM_ARCH >= 8
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrndq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrintzv4sf (__a);
@@ -1657,2907 +1841,3436 @@ vrndq_f32 (float32x4_t __a)
 #endif

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_s8 (int8x8_t __a, int8x8_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_s16 (int16x4_t __a, int16x4_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_s32 (int32x2_t __a, int32x2_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_f32 (float32x2_t __a, float32x2_t __b)
 {
-#ifdef __FAST_MATH
+#ifdef __FAST_MATH__
   return __a - __b;
 #else
   return (float32x2_t) __builtin_neon_vsubv2sf (__a, __b);
 #endif
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_s64 (int64x1_t __a, int64x1_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsub_u64 (uint64x1_t __a, uint64x1_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_f32 (float32x4_t __a, float32x4_t __b)
 {
-#ifdef __FAST_MATH
+#ifdef __FAST_MATH__
   return __a - __b;
 #else
   return (float32x4_t) __builtin_neon_vsubv4sf (__a, __b);
 #endif
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
   return __a - __b;
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vsublsv8qi (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vsublsv4hi (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vsublsv2si (__a, __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vsubluv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vsubluv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vsubluv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_s8 (int16x8_t __a, int8x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vsubwsv8qi (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_s16 (int32x4_t __a, int16x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vsubwsv4hi (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_s32 (int64x2_t __a, int32x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vsubwsv2si (__a, __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vsubwuv8qi ((int16x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vsubwuv4hi ((int32x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vsubwuv2si ((int64x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vhsubsv8qi (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vhsubsv4hi (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vhsubsv2si (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vhsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vhsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vhsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vhsubsv16qi (__a, __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vhsubsv8hi (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vhsubsv4si (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vhsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vhsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vhsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vqsubsv8qi (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vqsubsv4hi (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vqsubsv2si (__a, __b);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_s64 (int64x1_t __a, int64x1_t __b)
 {
   return (int64x1_t)__builtin_neon_vqsubsdi (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vqsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vqsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vqsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
 {
   return (uint64x1_t)__builtin_neon_vqsubudi ((int64x1_t) __a, (int64x1_t) __b);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vqsubsv16qi (__a, __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vqsubsv8hi (__a, __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vqsubsv4si (__a, __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vqsubsv2di (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vqsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vqsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vqsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vqsubuv2di ((int64x2_t) __a, (int64x2_t) __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vrsubhnv8hi (__a, __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vrsubhnv4si (__a, __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vrsubhnv2di (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vrsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vrsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vrsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceq_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_f32 (float32x4_t __a, float32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcge_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_f32 (float32x4_t __a, float32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcle_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcle_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__,
__gnu_inline__, __artificial__)) vcle_s32 (int32x2_t __a, int32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcle_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcle_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __b, (int8x8_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcle_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __b, (int16x4_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcle_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __b, (int32x2_t) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_s8 (int8x16_t __a, int8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_s16 (int16x8_t __a, int16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_s32 (int32x4_t __a, int32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __b, (int8x16_t) __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __b, (int16x8_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __b, (int32x4_t) __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_s8 (int8x8_t __a, int8x8_t __b) { return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_s16 (int16x4_t __a, int16x4_t __b) { return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_s32 (int32x2_t __a, int32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgt_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_s8 (int8x16_t __a, int8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_s16 (int16x8_t __a, int16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_s32 (int32x4_t __a, int32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_s8 (int8x8_t __a, int8x8_t __b) { return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_s16 (int16x4_t __a, int16x4_t __b) { return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_s32 (int32x2_t __a, int32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __b, (int8x8_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __b, (int16x4_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclt_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __b, (int32x2_t) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_s8 (int8x16_t __a, int8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_s16 (int16x8_t __a, int16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_s32 (int32x4_t __a, int32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a); } -__extension__ static 
__inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __b, (int8x16_t) __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __b, (int16x8_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __b, (int32x4_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcage_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcageq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcale_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaleq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagt_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagtq_f32 (float32x4_t __a, float32x4_t __b) { return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcalt_f32 (float32x2_t __a, float32x2_t __b) { return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaltq_f32 (float32x4_t __a, float32x4_t __b) { return 
(uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s8 (int8x8_t __a, int8x8_t __b) { return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s16 (int16x4_t __a, int16x4_t __b) { return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s32 (int32x2_t __a, int32x2_t __b) { return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p8 (poly8x8_t __a, poly8x8_t __b) { return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p16 (poly16x4_t __a, poly16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s8 (int8x16_t __a, int8x16_t __b) { return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s16 (int16x8_t __a, int16x8_t __b) { return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s32 (int32x4_t __a, int32x4_t __b) { return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, 
(int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_p8 (poly8x16_t __a, poly8x16_t __b) { return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vabdsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vabdsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vabdsv2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_f32 (float32x2_t __a, float32x2_t __b) { return (float32x2_t)__builtin_neon_vabdfv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vabduv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vabduv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vabduv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s8 (int8x16_t __a, int8x16_t __b) { return 
(int8x16_t)__builtin_neon_vabdsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vabdsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vabdsv4si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_f32 (float32x4_t __a, float32x4_t __b) { return (float32x4_t)__builtin_neon_vabdfv4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vabduv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vabduv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vabduv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdl_s8 (int8x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_neon_vabdlsv8qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdl_s16 (int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_neon_vabdlsv4hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdl_s32 (int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_neon_vabdlsv2si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdl_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_neon_vabdluv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdl_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_neon_vabdluv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vabdl_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_neon_vabdluv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { return (int8x8_t)__builtin_neon_vabasv8qi (__a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { return (int16x4_t)__builtin_neon_vabasv4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { return (int32x2_t)__builtin_neon_vabasv2si (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint8x8_t)__builtin_neon_vabauv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint16x4_t)__builtin_neon_vabauv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint32x2_t)__builtin_neon_vabauv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { return (int8x16_t)__builtin_neon_vabasv16qi (__a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { return (int16x8_t)__builtin_neon_vabasv8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { return (int32x4_t)__builtin_neon_vabasv4si (__a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return (uint8x16_t)__builtin_neon_vabauv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u16 (uint16x8_t 
__a, uint16x8_t __b, uint16x8_t __c) { return (uint16x8_t)__builtin_neon_vabauv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { return (uint32x4_t)__builtin_neon_vabauv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { return (int16x8_t)__builtin_neon_vabalsv8qi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { return (int32x4_t)__builtin_neon_vabalsv4hi (__a, __b, __c); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { return (int64x2_t)__builtin_neon_vabalsv2si (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint16x8_t)__builtin_neon_vabaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { return (uint32x4_t)__builtin_neon_vabaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { return (uint64x2_t)__builtin_neon_vabaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vmaxsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vmaxsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vmaxsv2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_f32 (float32x2_t __a, float32x2_t __b) { return 
(float32x2_t)__builtin_neon_vmaxfv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmax_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vmaxuv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_neon_vmaxsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vmaxsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vmaxsv4si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_f32 (float32x4_t __a, float32x4_t __b) { return (float32x4_t)__builtin_neon_vmaxfv4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +#pragma GCC push_options +#pragma GCC target ("fpu=neon-fp-armv8") +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnm_f32 (float32x2_t a, float32x2_t b) +{ + return (float32x2_t)__builtin_neon_vmaxnmv2sf (a, b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmq_f32 (float32x4_t a, float32x4_t b) +{ + return (float32x4_t)__builtin_neon_vmaxnmv4sf (a, b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnm_f32 (float32x2_t a, float32x2_t b) +{ + return (float32x2_t)__builtin_neon_vminnmv2sf (a, b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmq_f32 (float32x4_t a, float32x4_t b) +{ + return (float32x4_t)__builtin_neon_vminnmv4sf (a, b); +} +#pragma GCC pop_options + + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vmaxuv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline 
uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vmaxuv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vmaxuv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vminsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vminsv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vminsv2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_f32 (float32x2_t __a, float32x2_t __b) { return (float32x2_t)__builtin_neon_vminfv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vminuv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vminuv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmin_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vminuv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_s8 (int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_neon_vminsv16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vminsv8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_s32 (int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_neon_vminsv4si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_f32 (float32x4_t __a, float32x4_t __b) { return (float32x4_t)__builtin_neon_vminfv4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_u8 (uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_neon_vminuv16qi ((int8x16_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_u16 (uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_neon_vminuv8hi ((int16x8_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminq_u32 (uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_neon_vminuv4si ((int32x4_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_s16 (int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_s32 (int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_f32 (float32x2_t __a, float32x2_t __b) { return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadd_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s8 (int8x8_t __a) { return (int16x4_t)__builtin_neon_vpaddlsv8qi (__a); } -__extension__ static 
__inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s16 (int16x4_t __a) { return (int32x2_t)__builtin_neon_vpaddlsv4hi (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s32 (int32x2_t __a) { return (int64x1_t)__builtin_neon_vpaddlsv2si (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u8 (uint8x8_t __a) { return (uint16x4_t)__builtin_neon_vpaddluv8qi ((int8x8_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u16 (uint16x4_t __a) { return (uint32x2_t)__builtin_neon_vpaddluv4hi ((int16x4_t) __a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u32 (uint32x2_t __a) { return (uint64x1_t)__builtin_neon_vpaddluv2si ((int32x2_t) __a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s8 (int8x16_t __a) { return (int16x8_t)__builtin_neon_vpaddlsv16qi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s16 (int16x8_t __a) { return (int32x4_t)__builtin_neon_vpaddlsv8hi (__a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s32 (int32x4_t __a) { return (int64x2_t)__builtin_neon_vpaddlsv4si (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u8 (uint8x16_t __a) { return (uint16x8_t)__builtin_neon_vpaddluv16qi ((int8x16_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u16 (uint16x8_t __a) { return (uint32x4_t)__builtin_neon_vpaddluv8hi ((int16x8_t) __a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u32 (uint32x4_t __a) { return (uint64x2_t)__builtin_neon_vpaddluv4si ((int32x4_t) __a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s8 (int16x4_t __a, int8x8_t __b) { return (int16x4_t)__builtin_neon_vpadalsv8qi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s16 (int32x2_t __a, 
int16x4_t __b) { return (int32x2_t)__builtin_neon_vpadalsv4hi (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s32 (int64x1_t __a, int32x2_t __b) { return (int64x1_t)__builtin_neon_vpadalsv2si (__a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u8 (uint16x4_t __a, uint8x8_t __b) { return (uint16x4_t)__builtin_neon_vpadaluv8qi ((int16x4_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u16 (uint32x2_t __a, uint16x4_t __b) { return (uint32x2_t)__builtin_neon_vpadaluv4hi ((int32x2_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u32 (uint64x1_t __a, uint32x2_t __b) { return (uint64x1_t)__builtin_neon_vpadaluv2si ((int64x1_t) __a, (int32x2_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s8 (int16x8_t __a, int8x16_t __b) { return (int16x8_t)__builtin_neon_vpadalsv16qi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s16 (int32x4_t __a, int16x8_t __b) { return (int32x4_t)__builtin_neon_vpadalsv8hi (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s32 (int64x2_t __a, int32x4_t __b) { return (int64x2_t)__builtin_neon_vpadalsv4si (__a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) { return (uint16x8_t)__builtin_neon_vpadaluv16qi ((int16x8_t) __a, (int8x16_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) { return (uint32x4_t)__builtin_neon_vpadaluv8hi ((int32x4_t) __a, (int16x8_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) { return (uint64x2_t)__builtin_neon_vpadaluv4si ((int64x2_t) __a, (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpmax_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vpmaxsv8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vpmaxsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vpmaxsv2si (__a, __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (float32x2_t)__builtin_neon_vpmaxfv2sf (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vpmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vpmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vpmaxuv2si ((int32x2_t) __a, (int32x2_t) __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vpminsv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vpminsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vpminsv2si (__a, __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (float32x2_t)__builtin_neon_vpminfv2sf (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vpminuv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vpminuv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vpminuv2si ((int32x2_t) __a, (int32x2_t) __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecps_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
 {
   return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
 {
   return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
 {
   return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vshlsv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vshlsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vshlsv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_s64 (int64x1_t __a, int64x1_t __b)
 {
   return (int64x1_t)__builtin_neon_vshlsdi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_u8 (uint8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vshluv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_u16 (uint16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vshluv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_u32 (uint32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vshluv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_u64 (uint64x1_t __a, int64x1_t __b)
 {
   return (uint64x1_t)__builtin_neon_vshludi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vshlsv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vshlsv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vshlsv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vshlsv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_u8 (uint8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vshluv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_u16 (uint16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vshluv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_u32 (uint32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vshluv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_u64 (uint64x2_t __a, int64x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vshluv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vrshlsv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vrshlsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vrshlsv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_s64 (int64x1_t __a, int64x1_t __b)
 {
   return (int64x1_t)__builtin_neon_vrshlsdi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_u8 (uint8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vrshluv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_u16 (uint16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vrshluv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_u32 (uint32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vrshluv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshl_u64 (uint64x1_t __a, int64x1_t __b)
 {
   return (uint64x1_t)__builtin_neon_vrshludi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vrshlsv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vrshlsv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vrshlsv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vrshlsv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vrshluv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vrshluv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vrshluv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vrshluv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vqshlsv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vqshlsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vqshlsv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_s64 (int64x1_t __a, int64x1_t __b)
 {
   return (int64x1_t)__builtin_neon_vqshlsdi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_u8 (uint8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vqshluv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_u16 (uint16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vqshluv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_u32 (uint32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vqshluv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_u64 (uint64x1_t __a, int64x1_t __b)
 {
   return (uint64x1_t)__builtin_neon_vqshludi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vqshlsv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vqshlsv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vqshlsv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vqshlsv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vqshluv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vqshluv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vqshluv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vqshluv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_s8 (int8x8_t __a, int8x8_t __b)
 {
   return (int8x8_t)__builtin_neon_vqrshlsv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_s16 (int16x4_t __a, int16x4_t __b)
 {
   return (int16x4_t)__builtin_neon_vqrshlsv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_s32 (int32x2_t __a, int32x2_t __b)
 {
   return (int32x2_t)__builtin_neon_vqrshlsv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_s64 (int64x1_t __a, int64x1_t __b)
 {
   return (int64x1_t)__builtin_neon_vqrshlsdi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
 {
   return (uint8x8_t)__builtin_neon_vqrshluv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
 {
   return (uint16x4_t)__builtin_neon_vqrshluv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
 {
   return (uint32x2_t)__builtin_neon_vqrshluv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
 {
   return (uint64x1_t)__builtin_neon_vqrshludi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
 {
   return (int8x16_t)__builtin_neon_vqrshlsv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
 {
   return (int16x8_t)__builtin_neon_vqrshlsv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
 {
   return (int32x4_t)__builtin_neon_vqrshlsv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
 {
   return (int64x2_t)__builtin_neon_vqrshlsv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
 {
   return (uint8x16_t)__builtin_neon_vqrshluv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
 {
   return (uint16x8_t)__builtin_neon_vqrshluv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
 {
   return (uint32x4_t)__builtin_neon_vqrshluv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
 {
   return (uint64x2_t)__builtin_neon_vqrshluv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_s8 (int8x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vshrs_nv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_s16 (int16x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vshrs_nv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_s32 (int32x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vshrs_nv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_s64 (int64x1_t __a, const int __b)
 {
   return (int64x1_t)__builtin_neon_vshrs_ndi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u8 (uint8x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vshru_nv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u16 (uint16x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vshru_nv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u32 (uint32x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vshru_nv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u64 (uint64x1_t __a, const int __b)
 {
   return (uint64x1_t)__builtin_neon_vshru_ndi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_s8 (int8x16_t __a, const int __b)
 {
   return (int8x16_t)__builtin_neon_vshrs_nv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_s16 (int16x8_t __a, const int __b)
 {
   return (int16x8_t)__builtin_neon_vshrs_nv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_s32 (int32x4_t __a, const int __b)
 {
   return (int32x4_t)__builtin_neon_vshrs_nv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_s64 (int64x2_t __a, const int __b)
 {
   return (int64x2_t)__builtin_neon_vshrs_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u8 (uint8x16_t __a, const int __b)
 {
   return (uint8x16_t)__builtin_neon_vshru_nv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vshru_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vshru_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vshru_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_s8 (int8x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vrshrs_nv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_s16 (int16x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vrshrs_nv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_s32 (int32x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vrshrs_nv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_s64 (int64x1_t __a, const int __b)
 {
   return (int64x1_t)__builtin_neon_vrshrs_ndi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_u8 (uint8x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vrshru_nv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_u16 (uint16x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vrshru_nv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_u32 (uint32x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vrshru_nv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshr_n_u64 (uint64x1_t __a, const int __b)
 {
   return (uint64x1_t)__builtin_neon_vrshru_ndi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_s8 (int8x16_t __a, const int __b)
 {
   return (int8x16_t)__builtin_neon_vrshrs_nv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_s16 (int16x8_t __a, const int __b)
 {
   return (int16x8_t)__builtin_neon_vrshrs_nv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_s32 (int32x4_t __a, const int __b)
 {
   return (int32x4_t)__builtin_neon_vrshrs_nv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_s64 (int64x2_t __a, const int __b)
 {
   return (int64x2_t)__builtin_neon_vrshrs_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_u8 (uint8x16_t __a, const int __b)
 {
   return (uint8x16_t)__builtin_neon_vrshru_nv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vrshru_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vrshru_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrq_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vrshru_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_s16 (int16x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_s32 (int32x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_s64 (int64x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrn_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_s16 (int16x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vrshrn_nv8hi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_s32 (int32x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vrshrn_nv4si (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_s64 (int64x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vrshrn_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vrshrn_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vrshrn_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrshrn_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vrshrn_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_s16 (int16x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vqshrns_nv8hi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_s32 (int32x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vqshrns_nv4si (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_s64 (int64x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vqshrns_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqshrnu_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqshrnu_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrn_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqshrnu_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_s16 (int16x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vqrshrns_nv8hi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_s32 (int32x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vqrshrns_nv4si (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_s64 (int64x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vqrshrns_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqrshrnu_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqrshrnu_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrn_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqrshrnu_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrun_n_s16 (int16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrun_n_s32 (int32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshrun_n_s64 (int64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrun_n_s16 (int16x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqrshrun_nv8hi (__a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrun_n_s32 (int32x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqrshrun_nv4si (__a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrshrun_n_s64 (int64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqrshrun_nv2di (__a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s8 (int8x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s16 (int16x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s32 (int32x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s64 (int64x1_t __a, const int __b)
 {
   return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u8 (uint8x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u16 (uint16x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u32 (uint32x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u64 (uint64x1_t __a, const int __b)
 {
   return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s8 (int8x16_t __a, const int __b)
 {
   return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s16 (int16x8_t __a, const int __b)
 {
   return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s32 (int32x4_t __a, const int __b)
 {
   return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s64 (int64x2_t __a, const int __b)
 {
   return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u8 (uint8x16_t __a, const int __b)
 {
   return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_s8 (int8x8_t __a, const int __b)
 {
   return (int8x8_t)__builtin_neon_vqshl_s_nv8qi (__a, __b);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_s16 (int16x4_t __a, const int __b)
 {
   return (int16x4_t)__builtin_neon_vqshl_s_nv4hi (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_s32 (int32x2_t __a, const int __b)
 {
   return (int32x2_t)__builtin_neon_vqshl_s_nv2si (__a, __b);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_s64 (int64x1_t __a, const int __b)
 {
   return (int64x1_t)__builtin_neon_vqshl_s_ndi (__a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_u8 (uint8x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqshl_u_nv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_u16 (uint16x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqshl_u_nv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_u32 (uint32x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqshl_u_nv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshl_n_u64 (uint64x1_t __a, const int __b)
 {
   return (uint64x1_t)__builtin_neon_vqshl_u_ndi ((int64x1_t) __a, __b);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_s8 (int8x16_t __a, const int __b)
 {
   return (int8x16_t)__builtin_neon_vqshl_s_nv16qi (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_s16 (int16x8_t __a, const int __b)
 {
   return (int16x8_t)__builtin_neon_vqshl_s_nv8hi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_s32 (int32x4_t __a, const int __b)
 {
   return (int32x4_t)__builtin_neon_vqshl_s_nv4si (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_s64 (int64x2_t __a, const int __b)
 {
   return (int64x2_t)__builtin_neon_vqshl_s_nv2di (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_u8 (uint8x16_t __a, const int __b)
 {
   return (uint8x16_t)__builtin_neon_vqshl_u_nv16qi ((int8x16_t) __a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_u16 (uint16x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vqshl_u_nv8hi ((int16x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_u32 (uint32x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vqshl_u_nv4si ((int32x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlq_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vqshl_u_nv2di ((int64x2_t) __a, __b);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlu_n_s8 (int8x8_t __a, const int __b)
 {
   return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlu_n_s16 (int16x4_t __a, const int __b)
 {
   return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlu_n_s32 (int32x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshlu_n_s64 (int64x1_t __a, const int __b)
 {
   return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshluq_n_s8 (int8x16_t __a, const int __b)
 {
   return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshluq_n_s16 (int16x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshluq_n_s32 (int32x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqshluq_n_s64 (int64x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_s8 (int8x8_t __a, const int __b)
 {
   return (int16x8_t)__builtin_neon_vshlls_nv8qi (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_s16 (int16x4_t __a, const int __b)
 {
   return (int32x4_t)__builtin_neon_vshlls_nv4hi (__a, __b);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_s32 (int32x2_t __a, const int __b)
 {
   return (int64x2_t)__builtin_neon_vshlls_nv2si (__a, __b);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_u8 (uint8x8_t __a, const int __b)
 {
   return (uint16x8_t)__builtin_neon_vshllu_nv8qi ((int8x8_t) __a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_u16 (uint16x4_t __a, const int __b)
 {
   return (uint32x4_t)__builtin_neon_vshllu_nv4hi ((int16x4_t) __a, __b);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshll_n_u32 (uint32x2_t __a, const int __b)
 {
   return (uint64x2_t)__builtin_neon_vshllu_nv2si ((int32x2_t) __a, __b);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vsras_nv8qi (__a, __b, __c);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vsras_nv4hi (__a, __b, __c);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vsras_nv2si (__a, __b, __c);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vsras_ndi (__a, __b, __c);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
 }
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vsras_nv16qi (__a, __b, __c);
 }
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vsras_nv8hi (__a, __b, __c);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vsras_nv4si (__a, __b, __c);
 }
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vsras_nv2di (__a, __b, __c);
 }
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
 }
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vrsras_nv8qi (__a, __b, __c);
 }
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vrsras_nv4hi (__a, __b, __c);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vrsras_nv2si (__a, __b, __c);
 }
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vrsras_ndi (__a, __b, __c);
 }
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vrsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vrsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vrsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vrsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vrsras_nv16qi (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vrsras_nv8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vrsras_nv4si (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vrsras_nv2di (__a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vrsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vrsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vrsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vrsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
@@ -4565,68 +5278,79 @@ vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
 {
   return (poly64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
 {
   return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
 {
   return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
@@ -4634,68 +5358,79 @@ vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
 {
   return (poly64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
 {
   return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
 {
   return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
@@ -4703,68 +5438,79 @@ vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
 {
   return (poly64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
 {
   return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
 {
   return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
@@ -4772,530 +5518,618 @@ vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
 {
   return (poly64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
 {
   return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
 {
   return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabs_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vabsv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabs_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vabsv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabs_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vabsv2si (__a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabs_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vabsv2sf (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabsq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vabsv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabsq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vabsv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabsq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vabsv4si (__a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vabsq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vabsv4sf (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabs_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vqabsv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabs_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vqabsv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabs_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vqabsv2si (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabsq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vqabsv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabsq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vqabsv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqabsq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vqabsv4si (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vneg_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vnegv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vneg_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vnegv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vneg_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vnegv2si (__a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vneg_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vnegv2sf (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vnegq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vnegv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vnegq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vnegv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vnegq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vnegv4si (__a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vnegq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vnegv4sf (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqneg_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vqnegv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqneg_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vqnegv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqneg_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vqnegv2si (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqnegq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vqnegv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqnegq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vqnegv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqnegq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vqnegv4si (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vmvnv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vmvnv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vmvnv2si (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_u8 (uint8x8_t __a)
 {
   return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_u16 (uint16x4_t __a)
 {
   return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_u32 (uint32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvn_p8 (poly8x8_t __a)
 {
   return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vmvnv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vmvnv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vmvnv4si (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_u8 (uint8x16_t __a)
 {
   return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_u16 (uint16x8_t __a)
 {
   return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_u32 (uint32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmvnq_p8 (poly8x16_t __a)
 {
   return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcls_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vclsv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcls_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vclsv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcls_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vclsv2si (__a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclsq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vclsv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclsq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vclsv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclsq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vclsv4si (__a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vclzv8qi (__a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_s16 (int16x4_t __a)
 {
   return (int16x4_t)__builtin_neon_vclzv4hi (__a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_s32 (int32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vclzv2si (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_u8 (uint8x8_t __a)
 {
   return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_u16 (uint16x4_t __a)
 {
   return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclz_u32 (uint32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vclzv16qi (__a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_s16 (int16x8_t __a)
 {
   return (int16x8_t)__builtin_neon_vclzv8hi (__a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_s32 (int32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vclzv4si (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_u8 (uint8x16_t __a)
 {
   return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_u16 (uint16x8_t __a)
 {
   return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vclzq_u32 (uint32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcnt_s8 (int8x8_t __a)
 {
   return (int8x8_t)__builtin_neon_vcntv8qi (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcnt_u8 (uint8x8_t __a)
 {
   return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcnt_p8 (poly8x8_t __a)
 {
   return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcntq_s8 (int8x16_t __a)
 {
   return (int8x16_t)__builtin_neon_vcntv16qi (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcntq_u8 (uint8x16_t __a)
 {
   return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcntq_p8 (poly8x16_t __a)
 {
   return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecpe_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrecpev2sf (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecpe_u32 (uint32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecpeq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrecpev4sf (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrecpeq_u32 (uint32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrte_f32 (float32x2_t __a)
 {
   return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrte_u32 (uint32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrteq_f32 (float32x4_t __a)
 {
   return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrsqrteq_u32 (uint32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_s8 (int8x8_t __a, const int __b)
 {
   return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b);
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_s16 (int16x4_t __a, const int __b)
 {
   return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b);
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_s32 (int32x2_t __a, const int __b)
 {
   return (int32_t)__builtin_neon_vget_lanev2si (__a, __b);
@@ -5328,67 +6162,88 @@ vget_lane_s32 (int32x2_t __a, const int __b)
 })
 #endif

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
   return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b);
 }

-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_u8 (uint8x8_t __a, const int __b)
 {
   return (uint8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_u16 (uint16x4_t __a, const int __b)
 {
   return (uint16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
 }

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_u32 (uint32x2_t __a, const int __b)
 {
   return (uint32_t)__builtin_neon_vget_laneuv2si ((int32x2_t) __a, __b);
 }

-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_p8 (poly8x8_t __a, const int __b)
 {
   return (poly8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
 }

-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_p16 (poly16x4_t __a, const int __b)
 {
   return (poly16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
 }

-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_s64 (int64x1_t __a, const int __b)
 {
   return (int64_t)__builtin_neon_vget_lanedi (__a, __b);
 }

-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+#pragma GCC push_options
+#pragma GCC target ("fpu=crypto-neon-fp-armv8")
+__extension__ extern __inline poly64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_lane_p64 (poly64x1_t __a, const int __b)
+{
+  return (poly64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
+}
+
+#pragma GCC pop_options
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_lane_u64 (uint64x1_t __a, const int __b)
 {
   return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
 }

-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_s8 (int8x16_t __a, const int __b)
 {
   return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b);
 }

-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_s16 (int16x8_t __a, const int __b)
 {
   return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b);
 }

-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_s32 (int32x4_t __a, const int __b)
 {
   return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
@@ -5405,67 +6260,78 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
 })
 #endif

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_f32 (float32x4_t __a, const int __b)
 {
   return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b);
 }

-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_u8 (uint8x16_t __a, const int __b)
 {
   return (uint8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
 }

-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_u16 (uint16x8_t __a, const int __b)
 {
   return (uint16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
 }

-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_u32 (uint32x4_t __a, const int __b)
 {
   return (uint32_t)__builtin_neon_vget_laneuv4si ((int32x4_t) __a, __b);
 }

-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_p8 (poly8x16_t __a, const int __b)
 {
   return (poly8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
 }

-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_p16 (poly16x8_t __a, const int __b)
 {
   return (poly16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
 }

-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_s64 (int64x2_t __a, const int __b)
 {
   return (int64_t)__builtin_neon_vget_lanev2di (__a, __b);
 }

-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vgetq_lane_u64 (uint64x2_t __a, const int __b)
 {
   return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
@@ -5483,67 +6349,78 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
 })
 #endif

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
 {
   return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
 {
   return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
 {
   return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
@@ -5561,49 +6438,57 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
 })
 #endif

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
 {
   return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
 {
   return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
 {
   return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
@@ -5611,136 +6496,158 @@ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_p64 (uint64_t __a)
 {
   return (poly64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_s8 (uint64_t __a)
 {
   return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_s16 (uint64_t __a)
 {
   return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_s32 (uint64_t __a)
 {
   return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_s64 (uint64_t __a)
 {
   return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_f16 (uint64_t __a)
 {
   return (float16x4_t) __a;
 }
 #endif

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_f32 (uint64_t __a)
 {
   return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_u8 (uint64_t __a)
 {
   return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_u16 (uint64_t __a)
 {
   return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_u32 (uint64_t __a)
 {
   return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_u64 (uint64_t __a)
 {
   return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_p8 (uint64_t __a)
 {
   return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcreate_p16 (uint64_t __a)
 {
   return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s8 (int8_t __a)
 {
   return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s16 (int16_t __a)
 {
   return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s32 (int32_t __a)
 {
   return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_f32 (float32_t __a)
 {
   return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u8 (uint8_t __a)
 {
   return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u16 (uint16_t __a)
 {
   return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u32 (uint32_t __a)
 {
   return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p8 (poly8_t __a)
 {
   return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p16 (poly16_t __a)
 {
   return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
@@ -5748,20 +6655,23 @@ vdup_n_p16 (poly16_t __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p64 (poly64_t __a)
 {
   return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
 }

 #pragma GCC pop_options
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s64 (int64_t __a)
 {
   return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u64 (uint64_t __a)
 {
   return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
@@ -5769,260 +6679,303 @@ vdup_n_u64 (uint64_t __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p64 (poly64_t __a)
 {
   return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
 }

 #pragma GCC pop_options
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s8 (int8_t __a)
 {
   return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s16 (int16_t __a)
 {
   return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s32 (int32_t __a)
 {
   return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_f32 (float32_t __a)
 {
   return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u8 (uint8_t __a)
 {
   return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u16 (uint16_t __a)
 {
   return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u32 (uint32_t __a)
 {
   return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p8 (poly8_t __a)
 {
   return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p16 (poly16_t __a)
 {
   return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s64 (int64_t __a)
 {
   return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u64 (uint64_t __a)
 {
   return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s8 (int8_t __a)
 {
   return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s16 (int16_t __a)
 {
   return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s32 (int32_t __a)
 {
   return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_f32 (float32_t __a)
 {
   return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u8 (uint8_t __a)
 {
   return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u16 (uint16_t __a)
 {
   return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u32 (uint32_t __a)
 {
   return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_p8 (poly8_t __a)
 {
   return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_p16 (poly16_t __a)
 {
   return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s64 (int64_t __a)
 {
   return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u64 (uint64_t __a)
 {
   return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_s8 (int8_t __a)
 {
   return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
 }

-__extension__ static __inline int16x8_t __attribute__
((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s16 (int16_t __a) { return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s32 (int32_t __a) { return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_f32 (float32_t __a) { return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u8 (uint8_t __a) { return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u16 (uint16_t __a) { return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u32 (uint32_t __a) { return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p8 (poly8_t __a) { return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p16 (poly16_t __a) { return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s64 (int64_t __a) { return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u64 (uint64_t __a) { return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_s8 (int8x8_t __a, const int __b) { return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_s16 (int16x4_t __a, const int __b) { return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline 
int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_s32 (int32x2_t __a, const int __b) { return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_f32 (float32x2_t __a, const int __b) { return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_u8 (uint8x8_t __a, const int __b) { return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_u16 (uint16x4_t __a, const int __b) { return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_u32 (uint32x2_t __a, const int __b) { return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_p8 (poly8x8_t __a, const int __b) { return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_p16 (poly16x4_t __a, const int __b) { return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b); @@ -6030,74 +6983,86 @@ vdup_lane_p16 (poly16x4_t __a, const int __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_p64 (poly64x1_t __a, const int __b) { return (poly64x1_t)__builtin_neon_vdup_lanedi (__a, __b); } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_s64 (int64x1_t __a, const int __b) { return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_lane_u64 (uint64x1_t __a, const int __b) { return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_s8 (int8x8_t __a, const int __b) { return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vdupq_lane_s16 (int16x4_t __a, const int __b) { return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_s32 (int32x2_t __a, const int __b) { return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_f32 (float32x2_t __a, const int __b) { return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_u8 (uint8x8_t __a, const int __b) { return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_u16 (uint16x4_t __a, const int __b) { return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_u32 (uint32x2_t __a, const int __b) { return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_p8 (poly8x8_t __a, const int __b) { return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_p16 (poly16x4_t __a, const int __b) { return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b); @@ -6105,20 +7070,23 @@ vdupq_lane_p16 (poly16x4_t __a, const int __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_p64 (poly64x1_t __a, const int __b) { return (poly64x2_t)__builtin_neon_vdup_lanev2di (__a, __b); } #pragma GCC pop_options -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_s64 (int64x1_t __a, const int __b) { return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_lane_u64 (uint64x1_t __a, const int __b) { return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b); @@ -6126,82 +7094,95 @@ vdupq_lane_u64 (uint64x1_t __a, const int __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline 
poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_p64 (poly64x1_t __a, poly64x1_t __b) { return (poly64x2_t)__builtin_neon_vcombinedi (__a, __b); } #pragma GCC pop_options -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s8 (int8x8_t __a, int8x8_t __b) { return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s16 (int16x4_t __a, int16x4_t __b) { return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s32 (int32x2_t __a, int32x2_t __b) { return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s64 (int64x1_t __a, int64x1_t __b) { return (int64x2_t)__builtin_neon_vcombinedi (__a, __b); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_f16 (float16x4_t __a, float16x4_t __b) { return __builtin_neon_vcombinev4hf (__a, __b); } #endif -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_f32 (float32x2_t __a, float32x2_t __b) { return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_u16 (uint16x4_t __a, uint16x4_t __b) { return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_u32 (uint32x2_t __a, uint32x2_t __b) { return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_u64 (uint64x1_t __a, uint64x1_t __b) { return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vcombine_p8 (poly8x8_t __a, poly8x8_t __b) { return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_p16 (poly16x4_t __a, poly16x4_t __b) { return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b); @@ -6209,144 +7190,167 @@ vcombine_p16 (poly16x4_t __a, poly16x4_t __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p64 (poly64x2_t __a) { return (poly64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a); } #pragma GCC pop_options -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s8 (int8x16_t __a) { return (int8x8_t)__builtin_neon_vget_highv16qi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s16 (int16x8_t __a) { return (int16x4_t)__builtin_neon_vget_highv8hi (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s32 (int32x4_t __a) { return (int32x2_t)__builtin_neon_vget_highv4si (__a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s64 (int64x2_t __a) { return (int64x1_t)__builtin_neon_vget_highv2di (__a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f16 (float16x8_t __a) { return __builtin_neon_vget_highv8hf (__a); } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f32 (float32x4_t __a) { return (float32x2_t)__builtin_neon_vget_highv4sf (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u8 (uint8x16_t __a) { return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u16 (uint16x8_t __a) { return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u32 (uint32x4_t __a) { return (uint32x2_t)__builtin_neon_vget_highv4si 
((int32x4_t) __a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u64 (uint64x2_t __a) { return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p8 (poly8x16_t __a) { return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p16 (poly16x8_t __a) { return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s8 (int8x16_t __a) { return (int8x8_t)__builtin_neon_vget_lowv16qi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s16 (int16x8_t __a) { return (int16x4_t)__builtin_neon_vget_lowv8hi (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s32 (int32x4_t __a) { return (int32x2_t)__builtin_neon_vget_lowv4si (__a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f16 (float16x8_t __a) { return __builtin_neon_vget_lowv8hf (__a); } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f32 (float32x4_t __a) { return (float32x2_t)__builtin_neon_vget_lowv4sf (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u8 (uint8x16_t __a) { return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u16 (uint16x8_t __a) { return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u32 (uint32x4_t __a) { return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p8 (poly8x16_t __a) { return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a); } -__extension__ static __inline poly16x4_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p16 (poly16x8_t __a) { return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a); @@ -6354,68 +7358,79 @@ vget_low_p16 (poly16x8_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p64 (poly64x2_t __a) { return (poly64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a); } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s64 (int64x2_t __a) { return (int64x1_t)__builtin_neon_vget_lowv2di (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u64 (uint64x2_t __a) { return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_s32_f32 (float32x2_t __a) { return (int32x2_t)__builtin_neon_vcvtsv2sf (__a); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f32_s32 (int32x2_t __a) { return (float32x2_t)__builtin_neon_vcvtsv2si (__a); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f32_u32 (uint32x2_t __a) { return (float32x2_t)__builtin_neon_vcvtuv2si ((int32x2_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_u32_f32 (float32x2_t __a) { return (uint32x2_t)__builtin_neon_vcvtuv2sf (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_s32_f32 (float32x4_t __a) { return (int32x4_t)__builtin_neon_vcvtsv4sf (__a); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_f32_s32 (int32x4_t __a) { return (float32x4_t)__builtin_neon_vcvtsv4si (__a); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_f32_u32 (uint32x4_t __a) { return (float32x4_t)__builtin_neon_vcvtuv4si ((int32x4_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_u32_f32 (float32x4_t __a) { return (uint32x4_t)__builtin_neon_vcvtuv4sf (__a); @@ -6424,7 +7439,8 @@ vcvtq_u32_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC 
target ("fpu=neon-fp16") #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f16_f32 (float32x4_t __a) { return (float16x4_t)__builtin_neon_vcvtv4hfv4sf (__a); @@ -6432,7 +7448,8 @@ vcvt_f16_f32 (float32x4_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f32_f16 (float16x4_t __a) { return (float32x4_t)__builtin_neon_vcvtv4sfv4hf (__a); @@ -6440,1059 +7457,1232 @@ vcvt_f32_f16 (float16x4_t __a) #endif #pragma GCC pop_options -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_n_s32_f32 (float32x2_t __a, const int __b) { return (int32x2_t)__builtin_neon_vcvts_nv2sf (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_n_f32_s32 (int32x2_t __a, const int __b) { return (float32x2_t)__builtin_neon_vcvts_nv2si (__a, __b); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_n_f32_u32 (uint32x2_t __a, const int __b) { return (float32x2_t)__builtin_neon_vcvtu_nv2si ((int32x2_t) __a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_n_u32_f32 (float32x2_t __a, const int __b) { return (uint32x2_t)__builtin_neon_vcvtu_nv2sf (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_n_s32_f32 (float32x4_t __a, const int __b) { return (int32x4_t)__builtin_neon_vcvts_nv4sf (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_n_f32_s32 (int32x4_t __a, const int __b) { return (float32x4_t)__builtin_neon_vcvts_nv4si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_n_f32_u32 (uint32x4_t __a, const int __b) { return (float32x4_t)__builtin_neon_vcvtu_nv4si ((int32x4_t) __a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtq_n_u32_f32 (float32x4_t __a, const int __b) { return (uint32x4_t)__builtin_neon_vcvtu_nv4sf (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s16 (int16x8_t __a) { return 
(int8x8_t)__builtin_neon_vmovnv8hi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s32 (int32x4_t __a) { return (int16x4_t)__builtin_neon_vmovnv4si (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s64 (int64x2_t __a) { return (int32x2_t)__builtin_neon_vmovnv2di (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u16 (uint16x8_t __a) { return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u32 (uint32x4_t __a) { return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u64 (uint64x2_t __a) { return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_s16 (int16x8_t __a) { return (int8x8_t)__builtin_neon_vqmovnsv8hi (__a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_s32 (int32x4_t __a) { return (int16x4_t)__builtin_neon_vqmovnsv4si (__a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_s64 (int64x2_t __a) { return (int32x2_t)__builtin_neon_vqmovnsv2di (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_u16 (uint16x8_t __a) { return (uint8x8_t)__builtin_neon_vqmovnuv8hi ((int16x8_t) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_u32 (uint32x4_t __a) { return (uint16x4_t)__builtin_neon_vqmovnuv4si ((int32x4_t) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_u64 (uint64x2_t __a) { return (uint32x2_t)__builtin_neon_vqmovnuv2di ((int64x2_t) __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s16 (int16x8_t __a) { return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vqmovun_s32 (int32x4_t __a) { return (uint16x4_t)__builtin_neon_vqmovunv4si (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s64 (int64x2_t __a) { return (uint32x2_t)__builtin_neon_vqmovunv2di (__a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s8 (int8x8_t __a) { return (int16x8_t)__builtin_neon_vmovlsv8qi (__a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s16 (int16x4_t __a) { return (int32x4_t)__builtin_neon_vmovlsv4hi (__a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s32 (int32x2_t __a) { return (int64x2_t)__builtin_neon_vmovlsv2si (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u8 (uint8x8_t __a) { return (uint16x8_t)__builtin_neon_vmovluv8qi ((int8x8_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u16 (uint16x4_t __a) { return (uint32x4_t)__builtin_neon_vmovluv4hi ((int16x4_t) __a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u32 (uint32x2_t __a) { return (uint64x2_t)__builtin_neon_vmovluv2si ((int32x2_t) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_s8 (int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_u8 (uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_p8 (poly8x8_t __a, uint8x8_t __b) { return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_s8 (int8x8x2_t __a, int8x8_t __b) { union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b) { union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a 
}; return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b) { union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_s8 (int8x8x3_t __a, int8x8_t __b) { union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b) { union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b) { union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_s8 (int8x8x4_t __a, int8x8_t __b) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c) { return 
(poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c) { union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c) { union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c) { union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c) { union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c) { union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c) { union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; return 
(poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) { return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) { return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c); } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c) { return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) { return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) { return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c) { return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) { return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) { return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__))
 vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
 {
   return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
 {
   return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
 {
   return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
 {
   return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
 {
   return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
 {
   return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
 {
   return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vmlals_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int64x2_t)__builtin_neon_vmlals_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
 {
   return (uint32x4_t)__builtin_neon_vmlalu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
 {
   return (uint64x2_t)__builtin_neon_vmlalu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
 {
   return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
 {
   return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
 {
   return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
 {
   return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
 {
   return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
 {
   return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
 {
   return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vmlsls_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int64x2_t)__builtin_neon_vmlsls_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
 {
   return (uint32x4_t)__builtin_neon_vmlslu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
 {
   return (uint64x2_t)__builtin_neon_vmlslu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vmulls_lanev4hi (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vmulls_lanev2si (__a, __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vmullu_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vmullu_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vqrdmulh_lanev8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vqrdmulh_lanev4si (__a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vqrdmulh_lanev4hi (__a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vqrdmulh_lanev2si (__a, __b, __c);
 }

 #ifdef __ARM_FEATURE_QRDMX
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
 {
   return (int16x8_t)__builtin_neon_vqrdmlah_lanev8hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vqrdmlah_lanev4si (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int16x4_t)__builtin_neon_vqrdmlah_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int32x2_t)__builtin_neon_vqrdmlah_lanev2si (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
 {
   return (int16x8_t)__builtin_neon_vqrdmlsh_lanev8hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
 {
   return (int32x4_t)__builtin_neon_vqrdmlsh_lanev4si (__a, __b, __c, __d);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
 {
   return (int16x4_t)__builtin_neon_vqrdmlsh_lanev4hi (__a, __b, __c, __d);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
 {
   return (int32x2_t)__builtin_neon_vqrdmlsh_lanev2si (__a, __b, __c, __d);
 }
 #endif

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s16 (int16x4_t __a, int16_t __b)
 {
   return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s32 (int32x2_t __a, int32_t __b)
 {
   return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f32 (float32x2_t __a, float32_t __b)
 {
   return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u16 (uint16x4_t __a, uint16_t __b)
 {
   return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u32 (uint32x2_t __a, uint32_t __b)
 {
   return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s16 (int16x8_t __a, int16_t __b)
 {
   return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s32 (int32x4_t __a, int32_t __b)
 {
   return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f32 (float32x4_t __a, float32_t __b)
 {
   return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
   return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
   return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_s16 (int16x4_t __a, int16_t __b)
 {
   return (int32x4_t)__builtin_neon_vmulls_nv4hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_s32 (int32x2_t __a, int32_t __b)
 {
   return (int64x2_t)__builtin_neon_vmulls_nv2si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_u16 (uint16x4_t __a, uint16_t __b)
 {
   return (uint32x4_t)__builtin_neon_vmullu_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_u32 (uint32x2_t __a, uint32_t __b)
 {
   return (uint64x2_t)__builtin_neon_vmullu_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmull_n_s16 (int16x4_t __a, int16_t __b)
 {
   return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmull_n_s32 (int32x2_t __a, int32_t __b)
 {
   return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
 {
   return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
 {
   return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
 {
   return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
 {
   return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
 {
   return (int16x8_t)__builtin_neon_vqrdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
 {
   return (int32x4_t)__builtin_neon_vqrdmulh_nv4si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
 {
   return (int16x4_t)__builtin_neon_vqrdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
 {
   return (int32x2_t)__builtin_neon_vqrdmulh_nv2si (__a, (__builtin_neon_si) __b);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
 {
   return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
 {
   return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
 {
   return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
 {
   return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
 {
   return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
   return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
 {
   return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
 {
   return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int32x4_t)__builtin_neon_vmlals_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int64x2_t)__builtin_neon_vmlals_nv2si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
 {
   return (uint32x4_t)__builtin_neon_vmlalu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
 {
   return (uint64x2_t)__builtin_neon_vmlalu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
 {
   return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
 {
   return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
 {
   return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
 {
   return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
 {
   return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
   return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
 {
   return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
 {
   return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int32x4_t)__builtin_neon_vmlsls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int64x2_t)__builtin_neon_vmlsls_nv2si (__a, __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
 {
   return (uint32x4_t)__builtin_neon_vmlslu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
 {
   return (uint64x2_t)__builtin_neon_vmlslu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
   return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
   return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c);
@@ -7500,74 +8690,86 @@ vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
 {
   return (poly64x1_t)__builtin_neon_vextdi (__a, __b, __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
 {
   return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
 {
   return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
 {
   return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
 {
   return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
 {
   return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
 {
   return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
 {
   return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
 {
   return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
 {
   return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
 {
   return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
 {
   return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
@@ -7575,290 +8777,338 @@ vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
 {
   return (poly64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
   return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
 {
   return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
 {
   return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
 {
   return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
 {
   return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
 {
   return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
 {
   return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
 {
   return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
 {
   return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
 {
   return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
 {
   return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_s8 (int8x8_t __a)
 {
   return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_s16 (int16x4_t __a)
 {
   return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_s32 (int32x2_t __a)
 {
   return (int32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_f32 (float32x2_t __a)
 {
   return (float32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_u8 (uint8x8_t __a)
 {
   return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_u16 (uint16x4_t __a)
 {
   return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_u32 (uint32x2_t __a)
 {
   return (uint32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_p8 (poly8x8_t __a)
 {
   return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64_p16 (poly16x4_t __a)
 {
   return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_s8 (int8x16_t __a)
 {
   return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_s16 (int16x8_t __a)
 {
   return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_s32 (int32x4_t __a)
 {
   return (int32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_f32 (float32x4_t __a)
 {
   return (float32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_u8 (uint8x16_t __a)
 {
   return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_u16 (uint16x8_t __a)
 {
   return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_u32 (uint32x4_t __a)
 {
   return (uint32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_p8 (poly8x16_t __a)
 {
   return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev64q_p16 (poly16x8_t __a)
 {
   return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_s8 (int8x8_t __a)
 {
   return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_s16 (int16x4_t __a)
 {
   return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_u8 (uint8x8_t __a)
 {
   return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_u16 (uint16x4_t __a)
 {
   return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_p8 (poly8x8_t __a)
 {
   return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32_p16 (poly16x4_t __a)
 {
   return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_s8 (int8x16_t __a)
 {
   return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_s16 (int16x8_t __a)
 {
   return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_u8 (uint8x16_t __a)
 {
   return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_u16 (uint16x8_t __a)
 {
   return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_p8 (poly8x16_t __a)
 {
   return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev32q_p16 (poly16x8_t __a)
 {
   return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16_s8 (int8x8_t __a)
 {
   return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16_u8 (uint8x8_t __a)
 {
   return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16_p8 (poly8x8_t __a)
 {
   return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16q_s8 (int8x16_t __a)
 {
   return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16q_u8 (uint8x16_t __a)
 {
   return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vrev16q_p8 (poly8x16_t __a)
 {
   return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
@@ -7866,74 +9116,86 @@ vrev16q_p8 (poly8x16_t __a)
 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c)
 {
   return (poly64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
   return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
 }

-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
   return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
 }

-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
   return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
 }

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
 {
   return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
 }

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
 {
   return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
   return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
   return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
   return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
 {
   return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
 {
   return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
 }

-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
 {
   return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
@@ -7941,74 +9203,86 @@ vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c)
 {
   return (poly64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
   return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
 }

-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
   return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
 }

-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
   return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
 }

-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
 {
   return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
   return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
 {
   return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
   return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
   return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
 {
   return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
 {
   return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
 }

-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
 {
   return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
@@ -8025,7 +9299,8 @@ vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
    vector, and will itself be loaded in reverse order (again, relative to the
    neon intrinsics view, i.e. that would result from a "vld1"
    instruction). */
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
@@ -8043,7 +9318,8 @@ vtrn_s8 (int8x8_t __a, int8x8_t __b)
   return __rv;
 }

-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
@@ -8057,7 +9333,8 @@ vtrn_s16 (int16x4_t __a, int16x4_t __b)
   return __rv;
 }

-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
@@ -8075,7 +9352,8 @@ vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
   return __rv;
 }

-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
@@ -8089,7 +9367,8 @@ vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
   return __rv;
 }

-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
@@ -8107,7 +9386,8 @@ vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
   return __rv;
 }

-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
@@ -8121,7 +9401,8 @@ vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
   return __rv;
 }

-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_s32 (int32x2_t __a, int32x2_t __b)
 {
   int32x2x2_t __rv;
@@ -8135,7 +9416,8 @@ vtrn_s32 (int32x2_t __a, int32x2_t __b)
   return __rv;
 }

-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_f32 (float32x2_t __a, float32x2_t __b)
 {
   float32x2x2_t __rv;
@@ -8149,7 +9431,8 @@ vtrn_f32 (float32x2_t __a, float32x2_t __b)
   return __rv;
 }

-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   uint32x2x2_t __rv;
@@ -8163,7 +9446,8 @@ vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
   return __rv;
 }

-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
@@ -8181,7 +9465,8 @@ vtrnq_s8 (int8x16_t __a, int8x16_t __b)
   return __rv;
 }

-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
@@ -8199,7 +9484,8 @@ vtrnq_s16 (int16x8_t __a, int16x8_t __b)
   return __rv;
 }

-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
@@ -8213,7 +9499,8 @@ vtrnq_s32 (int32x4_t __a, int32x4_t __b)
   return __rv;
 }

-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
@@ -8227,7 +9514,8 @@ vtrnq_f32 (float32x4_t __a, float32x4_t __b)
   return __rv;
 }

-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
@@ -8245,7 +9533,8 @@ vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
   return __rv;
 }

-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
@@ -8263,7 +9552,8 @@ vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
   return __rv;
 }

-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
@@ -8277,7 +9567,8 @@ vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
   return __rv;
 }

-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
@@ -8295,7 +9586,8 @@ vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
   return __rv;
 }

-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
@@ -8313,7 +9605,8 @@ vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
   return __rv;
 }

-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vzip_s8 (int8x8_t __a, int8x8_t __b)
 {
int8x8x2_t __rv; @@ -8331,7 +9624,8 @@ vzip_s8 (int8x8_t __a, int8x8_t __b) return __rv; } -__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_s16 (int16x4_t __a, int16x4_t __b) { int16x4x2_t __rv; @@ -8345,7 +9639,8 @@ vzip_s16 (int16x4_t __a, int16x4_t __b) return __rv; } -__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_u8 (uint8x8_t __a, uint8x8_t __b) { uint8x8x2_t __rv; @@ -8363,7 +9658,8 @@ vzip_u8 (uint8x8_t __a, uint8x8_t __b) return __rv; } -__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_u16 (uint16x4_t __a, uint16x4_t __b) { uint16x4x2_t __rv; @@ -8377,7 +9673,8 @@ vzip_u16 (uint16x4_t __a, uint16x4_t __b) return __rv; } -__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_p8 (poly8x8_t __a, poly8x8_t __b) { poly8x8x2_t __rv; @@ -8395,7 +9692,8 @@ vzip_p8 (poly8x8_t __a, poly8x8_t __b) return __rv; } -__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_p16 (poly16x4_t __a, poly16x4_t __b) { poly16x4x2_t __rv; @@ -8409,7 +9707,8 @@ vzip_p16 (poly16x4_t __a, poly16x4_t __b) return __rv; } -__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_s32 (int32x2_t __a, int32x2_t __b) { int32x2x2_t __rv; @@ -8423,7 +9722,8 @@ vzip_s32 (int32x2_t __a, int32x2_t __b) return __rv; } -__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_f32 (float32x2_t __a, float32x2_t __b) { float32x2x2_t __rv; @@ -8437,7 +9737,8 @@ vzip_f32 (float32x2_t __a, float32x2_t __b) return __rv; } -__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip_u32 (uint32x2_t __a, uint32x2_t __b) { uint32x2x2_t __rv; @@ -8451,7 +9752,8 @@ vzip_u32 (uint32x2_t __a, uint32x2_t __b) return __rv; } -__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_s8 (int8x16_t __a, int8x16_t __b) { int8x16x2_t __rv; @@ -8469,7 +9771,8 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b) return __rv; } -__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_s16 (int16x8_t __a, int16x8_t __b) { int16x8x2_t __rv; @@ -8487,7 +9790,8 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b) return __rv; } -__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vzipq_s32 (int32x4_t __a, int32x4_t __b) { int32x4x2_t __rv; @@ -8501,7 +9805,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b) return __rv; } -__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_f32 (float32x4_t __a, float32x4_t __b) { float32x4x2_t __rv; @@ -8515,7 +9820,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b) return __rv; } -__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_u8 (uint8x16_t __a, uint8x16_t __b) { uint8x16x2_t __rv; @@ -8533,7 +9839,8 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b) return __rv; } -__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_u16 (uint16x8_t __a, uint16x8_t __b) { uint16x8x2_t __rv; @@ -8551,7 +9858,8 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b) return __rv; } -__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_u32 (uint32x4_t __a, uint32x4_t __b) { uint32x4x2_t __rv; @@ -8565,7 +9873,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b) return __rv; } -__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_p8 (poly8x16_t __a, poly8x16_t __b) { poly8x16x2_t __rv; @@ -8583,7 +9892,8 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b) return __rv; } -__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzipq_p16 (poly16x8_t __a, poly16x8_t __b) { poly16x8x2_t __rv; @@ -8601,7 +9911,8 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b) return __rv; } -__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_s8 (int8x8_t __a, int8x8_t __b) { int8x8x2_t __rv; @@ -8619,7 +9930,8 @@ vuzp_s8 (int8x8_t __a, int8x8_t __b) return __rv; } -__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_s16 (int16x4_t __a, int16x4_t __b) { int16x4x2_t __rv; @@ -8633,7 +9945,8 @@ vuzp_s16 (int16x4_t __a, int16x4_t __b) return __rv; } -__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_s32 (int32x2_t __a, int32x2_t __b) { int32x2x2_t __rv; @@ -8647,7 +9960,8 @@ vuzp_s32 (int32x2_t __a, int32x2_t __b) return __rv; } -__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_f32 (float32x2_t __a, float32x2_t __b) { float32x2x2_t __rv; @@ -8661,7 +9975,8 @@ vuzp_f32 (float32x2_t __a, float32x2_t __b) return __rv; } -__extension__ 
static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_u8 (uint8x8_t __a, uint8x8_t __b) { uint8x8x2_t __rv; @@ -8679,7 +9994,8 @@ vuzp_u8 (uint8x8_t __a, uint8x8_t __b) return __rv; } -__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_u16 (uint16x4_t __a, uint16x4_t __b) { uint16x4x2_t __rv; @@ -8693,7 +10009,8 @@ vuzp_u16 (uint16x4_t __a, uint16x4_t __b) return __rv; } -__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_u32 (uint32x2_t __a, uint32x2_t __b) { uint32x2x2_t __rv; @@ -8707,7 +10024,8 @@ vuzp_u32 (uint32x2_t __a, uint32x2_t __b) return __rv; } -__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_p8 (poly8x8_t __a, poly8x8_t __b) { poly8x8x2_t __rv; @@ -8725,7 +10043,8 @@ vuzp_p8 (poly8x8_t __a, poly8x8_t __b) return __rv; } -__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp_p16 (poly16x4_t __a, poly16x4_t __b) { poly16x4x2_t __rv; @@ -8739,7 +10058,8 @@ vuzp_p16 (poly16x4_t __a, poly16x4_t __b) return __rv; } -__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_s8 (int8x16_t __a, int8x16_t __b) { int8x16x2_t __rv; @@ -8757,7 +10077,8 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b) return __rv; } -__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_s16 (int16x8_t __a, int16x8_t __b) { int16x8x2_t __rv; @@ -8775,7 +10096,8 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b) return __rv; } -__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_s32 (int32x4_t __a, int32x4_t __b) { int32x4x2_t __rv; @@ -8789,7 +10111,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b) return __rv; } -__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_f32 (float32x4_t __a, float32x4_t __b) { float32x4x2_t __rv; @@ -8803,7 +10126,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b) return __rv; } -__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_u8 (uint8x16_t __a, uint8x16_t __b) { uint8x16x2_t __rv; @@ -8821,7 +10145,8 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b) return __rv; } -__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_u16 (uint16x8_t __a, uint16x8_t __b) 
{ uint16x8x2_t __rv; @@ -8839,7 +10164,8 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b) return __rv; } -__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_u32 (uint32x4_t __a, uint32x4_t __b) { uint32x4x2_t __rv; @@ -8853,7 +10179,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b) return __rv; } -__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_p8 (poly8x16_t __a, poly8x16_t __b) { poly8x16x2_t __rv; @@ -8871,7 +10198,8 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b) return __rv; } -__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzpq_p16 (poly16x8_t __a, poly16x8_t __b) { poly16x8x2_t __rv; @@ -8891,82 +10219,95 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p64 (const poly64_t * __a) { return (poly64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); } #pragma GCC pop_options -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s8 (const int8_t * __a) { return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s16 (const int16_t * __a) { return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s32 (const int32_t * __a) { return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s64 (const int64_t * __a) { return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f16 (const float16_t * __a) { return __builtin_neon_vld1v4hf (__a); } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f32 (const float32_t * __a) { return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u8 (const 
uint8_t * __a) { return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u16 (const uint16_t * __a) { return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u32 (const uint32_t * __a) { return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u64 (const uint64_t * __a) { return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p8 (const poly8_t * __a) { return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p16 (const poly16_t * __a) { return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); @@ -8974,144 +10315,167 @@ vld1_p16 (const poly16_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p64 (const poly64_t * __a) { return (poly64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); } #pragma GCC pop_options -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s8 (const int8_t * __a) { return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s16 (const int16_t * __a) { return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s32 (const int32_t * __a) { return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s64 (const int64_t * __a) { return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f16 (const 
float16_t * __a) { return __builtin_neon_vld1v8hf (__a); } #endif -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f32 (const float32_t * __a) { return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u8 (const uint8_t * __a) { return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u16 (const uint16_t * __a) { return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u32 (const uint32_t * __a) { return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u64 (const uint64_t * __a) { return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p8 (const poly8_t * __a) { return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p16 (const poly16_t * __a) { return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c) { return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c) { return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c) { return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_f16 (const float16_t * __a, 
float16x4_t __b, const int __c) { return vset_lane_f16 (*__a, __b, __c); } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c) { return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c) { return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c) { return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c) { return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c) { return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c) { return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c); @@ -9119,82 +10483,95 @@ vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_p64 (const poly64_t * __a, poly64x1_t __b, const int __c) { return (poly64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c); } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c) { return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c) { return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c); } -__extension__ static __inline int8x16_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c) { return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c) { return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c) { return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c) { return vsetq_lane_f16 (*__a, __b, __c); } #endif -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c) { return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c) { return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c) { return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c) { return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c) { return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c) { return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const 
__builtin_neon_hi *) __a, (int16x8_t) __b, __c); @@ -9202,45 +10579,52 @@ vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_p64 (const poly64_t * __a, poly64x2_t __b, const int __c) { return (poly64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c); } #pragma GCC pop_options -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c) { return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c) { return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_s8 (const int8_t * __a) { return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_s16 (const int16_t * __a) { return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_s32 (const int32_t * __a) { return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_f16 (const float16_t * __a) { float16_t __f = *__a; @@ -9248,37 +10632,43 @@ vld1_dup_f16 (const float16_t * __a) } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_f32 (const float32_t * __a) { return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_u8 (const uint8_t * __a) { return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_u16 (const uint16_t * __a) { return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const 
__builtin_neon_hi *) __a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_u32 (const uint32_t * __a) { return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_p8 (const poly8_t * __a) { return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_p16 (const poly16_t * __a) { return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a); @@ -9286,45 +10676,52 @@ vld1_dup_p16 (const poly16_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_p64 (const poly64_t * __a) { return (poly64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a); } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_s64 (const int64_t * __a) { return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_dup_u64 (const uint64_t * __a) { return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_s8 (const int8_t * __a) { return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_s16 (const int16_t * __a) { return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_s32 (const int32_t * __a) { return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_f16 (const float16_t * __a) { float16_t __f = *__a; @@ -9332,37 +10729,43 @@ vld1q_dup_f16 (const float16_t * __a) } #endif -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vld1q_dup_f32 (const float32_t * __a) { return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_u8 (const uint8_t * __a) { return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_u16 (const uint16_t * __a) { return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_u32 (const uint32_t * __a) { return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_p8 (const poly8_t * __a) { return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_p16 (const poly16_t * __a) { return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); @@ -9370,20 +10773,23 @@ vld1q_dup_p16 (const poly16_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_p64 (const poly64_t * __a) { return (poly64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a); } #pragma GCC pop_options -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_s64 (const int64_t * __a) { return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_dup_u64 (const uint64_t * __a) { return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a); @@ -9391,82 +10797,95 @@ vld1q_dup_u64 (const uint64_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_p64 (poly64_t * __a, poly64x1_t __b) { __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b); } #pragma GCC pop_options -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s8 (int8_t * __a, int8x8_t __b) { __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b); } -__extension__ static __inline void __attribute__ 
((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s16 (int16_t * __a, int16x4_t __b) { __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s32 (int32_t * __a, int32x2_t __b) { __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s64 (int64_t * __a, int64x1_t __b) { __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_f16 (float16_t * __a, float16x4_t __b) { __builtin_neon_vst1v4hf (__a, __b); } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_f32 (float32_t * __a, float32x2_t __b) { __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_u8 (uint8_t * __a, uint8x8_t __b) { __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_u16 (uint16_t * __a, uint16x4_t __b) { __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_u32 (uint32_t * __a, uint32x2_t __b) { __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_u64 (uint64_t * __a, uint64x1_t __b) { __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_p8 (poly8_t * __a, poly8x8_t __b) { __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_p16 (poly16_t * __a, poly16x4_t __b) { __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b); @@ -9474,144 +10893,167 @@ vst1_p16 (poly16_t * __a, poly16x4_t __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p64 (poly64_t * __a, poly64x2_t __b) { __builtin_neon_vst1v2di 
((__builtin_neon_di *) __a, (int64x2_t) __b); } #pragma GCC pop_options -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s8 (int8_t * __a, int8x16_t __b) { __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s16 (int16_t * __a, int16x8_t __b) { __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s32 (int32_t * __a, int32x4_t __b) { __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s64 (int64_t * __a, int64x2_t __b) { __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f16 (float16_t * __a, float16x8_t __b) { __builtin_neon_vst1v8hf (__a, __b); } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32 (float32_t * __a, float32x4_t __b) { __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u8 (uint8_t * __a, uint8x16_t __b) { __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u16 (uint16_t * __a, uint16x8_t __b) { __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u32 (uint32_t * __a, uint32x4_t __b) { __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u64 (uint64_t * __a, uint64x2_t __b) { __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p8 (poly8_t * __a, poly8x16_t __b) { __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p16 (poly16_t * __a, poly16x8_t __b) { __builtin_neon_vst1v8hi 
((__builtin_neon_hi *) __a, (int16x8_t) __b); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c) { __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c) { __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c) { __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c); } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c) { __builtin_neon_vst1_lanev4hf (__a, __b, __c); } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c) { __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c) { __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c) { __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c) { __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c) { __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c) { __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c); @@ -9619,82 +11061,95 @@ vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__))
 vst1_lane_p64 (poly64_t * __a, poly64x1_t __b, const int __c)
 {
   __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
 {
   __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
 {
   __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev8hf (__a, __b, __c);
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
@@ -9702,26 +11157,30 @@ vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_p64 (poly64_t * __a, poly64x2_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
 }
 #pragma GCC pop_options

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
 }

-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_s8 (const int8_t * __a)
 {
   union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9729,7 +11188,8 @@ vld2_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_s16 (const int16_t * __a)
 {
   union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9737,7 +11197,8 @@ vld2_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_s32 (const int32_t * __a)
 {
   union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9746,7 +11207,8 @@ vld2_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_f16 (const float16_t * __a)
 {
   union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9755,7 +11217,8 @@ vld2_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_f32 (const float32_t * __a)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9763,7 +11226,8 @@ vld2_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_u8 (const uint8_t * __a)
 {
   union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9771,7 +11235,8 @@ vld2_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_u16 (const uint16_t * __a)
 {
   union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9779,7 +11244,8 @@ vld2_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_u32 (const uint32_t * __a)
 {
   union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9787,7 +11253,8 @@ vld2_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_p8 (const poly8_t * __a)
 {
   union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9795,7 +11262,8 @@ vld2_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_p16 (const poly16_t * __a)
 {
   union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9805,7 +11273,8 @@ vld2_p16 (const poly16_t * __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_p64 (const poly64_t * __a)
 {
   union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9814,7 +11283,8 @@ vld2_p64 (const poly64_t * __a)
 }
 #pragma GCC pop_options

-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_s64 (const int64_t * __a)
 {
   union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9822,7 +11292,8 @@ vld2_s64 (const int64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_u64 (const uint64_t * __a)
 {
   union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -9830,7 +11301,8 @@ vld2_u64 (const uint64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_s8 (const int8_t * __a)
 {
   union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9838,7 +11310,8 @@ vld2q_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_s16 (const int16_t * __a)
 {
   union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9846,7 +11319,8 @@ vld2q_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_s32 (const int32_t * __a)
 {
   union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9855,7 +11329,8 @@ vld2q_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_f16 (const float16_t * __a)
 {
   union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9864,7 +11339,8 @@ vld2q_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_f32 (const float32_t * __a)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9872,7 +11348,8 @@ vld2q_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_u8 (const uint8_t * __a)
 {
   union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9880,7 +11357,8 @@ vld2q_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_u16 (const uint16_t * __a)
 {
   union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9888,7 +11366,8 @@ vld2q_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_u32 (const uint32_t * __a)
 {
   union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9896,7 +11375,8 @@ vld2q_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_p8 (const poly8_t * __a)
 {
   union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9904,7 +11384,8 @@ vld2q_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_p16 (const poly16_t * __a)
 {
   union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
@@ -9912,7 +11393,8 @@ vld2q_p16 (const poly16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
 {
   union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9921,7 +11403,8 @@ vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
 {
   union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9930,7 +11413,8 @@ vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
 {
   union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9940,7 +11424,8 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
 {
   union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9950,7 +11435,8 @@ vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
 }
 #endif

-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9959,7 +11445,8 @@ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
 {
   union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9968,7 +11455,8 @@ vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
 {
   union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9977,7 +11465,8 @@ vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
 {
   union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9986,7 +11475,8 @@ vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
 {
   union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9995,7 +11485,8 @@ vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
 {
   union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10004,7 +11495,8 @@ vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
 {
   union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10013,7 +11505,8 @@ vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
 {
   union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10023,7 +11516,8 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
 {
   union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10033,7 +11527,8 @@ vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
 }
 #endif

-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10042,7 +11537,8 @@ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
 {
   union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10051,7 +11547,8 @@ vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
 {
   union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10060,7 +11557,8 @@ vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
 {
   union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10069,7 +11567,8 @@ vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_s8 (const int8_t * __a)
 {
   union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10077,7 +11576,8 @@ vld2_dup_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_s16 (const int16_t * __a)
 {
   union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10085,7 +11585,8 @@ vld2_dup_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_s32 (const int32_t * __a)
 {
   union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10094,7 +11595,8 @@ vld2_dup_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_f16 (const float16_t * __a)
 {
   union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10103,7 +11605,8 @@ vld2_dup_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_f32 (const float32_t * __a)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10111,7 +11614,8 @@ vld2_dup_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_u8 (const uint8_t * __a)
 {
   union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10119,7 +11623,8 @@ vld2_dup_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_u16 (const uint16_t * __a)
 {
   union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10127,7 +11632,8 @@ vld2_dup_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_u32 (const uint32_t * __a)
 {
   union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10135,7 +11641,8 @@ vld2_dup_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_p8 (const poly8_t * __a)
 {
   union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10143,7 +11650,8 @@ vld2_dup_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_p16 (const poly16_t * __a)
 {
   union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10153,7 +11661,8 @@ vld2_dup_p16 (const poly16_t * __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_p64 (const poly64_t * __a)
 {
   union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10162,7 +11671,8 @@ vld2_dup_p64 (const poly64_t * __a)
 }
 #pragma GCC pop_options

-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_s64 (const int64_t * __a)
 {
   union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10170,7 +11680,8 @@ vld2_dup_s64 (const int64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_u64 (const uint64_t * __a)
 {
   union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
@@ -10178,21 +11689,24 @@ vld2_dup_u64 (const uint64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_s8 (int8_t * __a, int8x8x2_t __b)
 {
   union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_s16 (int16_t * __a, int16x4x2_t __b)
 {
   union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_s32 (int32_t * __a, int32x2x2_t __b)
 {
   union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10200,7 +11714,8 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_f16 (float16_t * __a, float16x4x2_t __b)
 {
   union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10208,42 +11723,48 @@ vst2_f16 (float16_t * __a, float16x4x2_t __b)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_f32 (float32_t * __a, float32x2x2_t __b)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
 {
   union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
 {
   union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
 {
   union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
 {
   union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
 {
   union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10252,7 +11773,8 @@ vst2_p16 (poly16_t * __a, poly16x4x2_t __b)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
 {
   union { poly64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10260,35 +11782,40 @@ vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
 }
 #pragma GCC pop_options

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_s64 (int64_t * __a, int64x1x2_t __b)
 {
   union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
 {
   union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_s8 (int8_t * __a, int8x16x2_t __b)
 {
   union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_s16 (int16_t * __a, int16x8x2_t __b)
 {
   union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_s32 (int32_t * __a, int32x4x2_t __b)
 {
   union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10296,7 +11823,8 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_f16 (float16_t * __a, float16x8x2_t __b)
 {
   union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10304,63 +11832,72 @@ vst2q_f16 (float16_t * __a, float16x8x2_t __b)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_f32 (float32_t * __a, float32x4x2_t __b)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
 {
   union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
 {
   union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
 {
   union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
 {
   union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
 {
   union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
 {
   union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
 {
   union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
 {
   union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10368,7 +11905,8 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
 {
   union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10376,56 +11914,64 @@ vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
 {
   union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
 {
   union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
 {
   union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
 {
   union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
 {
   union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
   __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
 {
   union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
 {
   union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10433,7 +11979,8 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
 {
   union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10441,35 +11988,40 @@ vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
 {
   union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
 {
   union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
 {
   union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
   __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_s8 (const int8_t * __a)
 {
   union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10477,7 +12029,8 @@ vld3_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_s16 (const int16_t * __a)
 {
   union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10485,7 +12038,8 @@ vld3_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_s32 (const int32_t * __a)
 {
   union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10494,7 +12048,8 @@ vld3_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_f16 (const float16_t * __a)
 {
   union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10503,7 +12058,8 @@ vld3_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_f32 (const float32_t * __a)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10511,7 +12067,8 @@ vld3_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_u8 (const uint8_t * __a)
 {
   union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10519,7 +12076,8 @@ vld3_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_u16 (const uint16_t * __a)
 {
   union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10527,7 +12085,8 @@ vld3_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_u32 (const uint32_t * __a)
 {
   union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10535,7 +12094,8 @@ vld3_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_p8 (const poly8_t * __a)
 {
   union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10543,7 +12103,8 @@ vld3_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_p16 (const poly16_t * __a)
 {
   union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10553,7 +12114,8 @@ vld3_p16 (const poly16_t * __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_p64 (const poly64_t * __a)
 {
   union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10562,7 +12124,8 @@ vld3_p64 (const poly64_t * __a)
 }
 #pragma GCC pop_options

-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_s64 (const int64_t * __a)
 {
   union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10570,7 +12133,8 @@ vld3_s64 (const int64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_u64 (const uint64_t * __a)
 {
   union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10578,7 +12142,8 @@ vld3_u64 (const uint64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_s8 (const int8_t * __a)
 {
   union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10586,7 +12151,8 @@ vld3q_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_s16 (const int16_t * __a)
 {
   union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10594,7 +12160,8 @@ vld3q_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_s32 (const int32_t * __a)
 {
   union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10603,7 +12170,8 @@ vld3q_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_f16 (const float16_t * __a)
 {
   union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10612,7 +12180,8 @@ vld3q_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_f32 (const float32_t * __a)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10620,7 +12189,8 @@ vld3q_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_u8 (const uint8_t * __a)
 {
   union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10628,7 +12198,8 @@ vld3q_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_u16 (const uint16_t * __a)
 {
   union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10636,7 +12207,8 @@ vld3q_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_u32 (const uint32_t * __a)
 {
   union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10644,7 +12216,8 @@ vld3q_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_p8 (const poly8_t * __a)
 {
   union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10652,7 +12225,8 @@ vld3q_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_p16 (const poly16_t * __a)
 {
   union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
@@ -10660,7 +12234,8 @@ vld3q_p16 (const poly16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
 {
   union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10669,7 +12244,8 @@ vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
 {
   union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10678,7 +12254,8 @@ vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
 {
   union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10688,7 +12265,8 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
 {
   union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10698,7 +12276,8 @@ vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
 }
 #endif

-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10707,7 +12286,8 @@ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
 {
   union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10716,7 +12296,8 @@ vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
 {
   union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10725,7 +12306,8 @@ vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
 {
   union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10734,7 +12316,8 @@ vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
 {
   union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10743,7 +12326,8 @@ vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
 {
   union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10752,7 +12336,8 @@ vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
 {
   union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10761,7 +12346,8 @@ vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
 {
   union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10771,7 +12357,8 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
 {
   union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10781,7 +12368,8 @@ vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
 }
 #endif

-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10790,7 +12378,8 @@ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
 {
   union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10799,7 +12388,8 @@ vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
 {
   union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10808,7 +12398,8 @@ vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
 {
   union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10817,7 +12408,8 @@ vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
   return __rv.__i;
 }

-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_s8 (const int8_t * __a)
 {
   union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10825,7 +12417,8 @@ vld3_dup_s8 (const int8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_s16 (const int16_t * __a)
 {
   union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10833,7 +12426,8 @@ vld3_dup_s16 (const int16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_s32 (const int32_t * __a)
 {
   union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10842,7 +12436,8 @@ vld3_dup_s32 (const int32_t * __a)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_f16 (const float16_t * __a)
 {
   union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10851,7 +12446,8 @@ vld3_dup_f16 (const float16_t * __a)
 }
 #endif

-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_f32 (const float32_t * __a)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10859,7 +12455,8 @@ vld3_dup_f32 (const float32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_u8 (const uint8_t * __a)
 {
   union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10867,7 +12464,8 @@ vld3_dup_u8 (const uint8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_u16 (const uint16_t * __a)
 {
   union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10875,7 +12473,8 @@ vld3_dup_u16 (const uint16_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_u32 (const uint32_t * __a)
 {
   union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10883,7 +12482,8 @@ vld3_dup_u32 (const uint32_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_p8 (const poly8_t * __a)
 {
   union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10891,7 +12491,8 @@ vld3_dup_p8 (const poly8_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_p16 (const poly16_t * __a)
 {
   union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10901,7 +12502,8 @@ vld3_dup_p16 (const poly16_t * __a)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_p64 (const poly64_t * __a)
 {
   union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10910,7 +12512,8 @@ vld3_dup_p64 (const poly64_t * __a)
 }
 #pragma GCC pop_options

-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_s64 (const int64_t * __a)
 {
   union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10918,7 +12521,8 @@ vld3_dup_s64 (const int64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_dup_u64 (const uint64_t * __a)
 {
   union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
@@ -10926,21 +12530,24 @@ vld3_dup_u64 (const uint64_t * __a)
   return __rv.__i;
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_s8 (int8_t * __a, int8x8x3_t __b)
 {
   union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_s16 (int16_t * __a, int16x4x3_t __b)
 {
   union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_s32 (int32_t * __a, int32x2x3_t __b)
 {
   union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10948,7 +12555,8 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_f16 (float16_t * __a, float16x4x3_t __b)
 {
   union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10956,42 +12564,48 @@ vst3_f16 (float16_t * __a, float16x4x3_t __b)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_f32 (float32_t * __a, float32x2x3_t __b)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
 {
   union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
 {
   union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
 {
   union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
 {
   union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
 {
   union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -11000,7 +12614,8 @@ vst3_p16 (poly16_t * __a, poly16x4x3_t __b)

 #pragma GCC push_options
 #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
 {
   union { poly64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -11008,35 +12623,40 @@ vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
 }
 #pragma GCC pop_options

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_s64 (int64_t * __a, int64x1x3_t __b)
 {
   union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
 {
   union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_s8 (int8_t * __a, int8x16x3_t __b)
 {
   union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_s16 (int16_t * __a, int16x8x3_t __b)
 {
   union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_s32 (int32_t * __a, int32x4x3_t __b)
 {
   union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -11044,7 +12664,8 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_f16 (float16_t * __a, float16x8x3_t __b)
 {
   union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -11052,63 +12673,72 @@ vst3q_f16 (float16_t * __a, float16x8x3_t __b)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_f32 (float32_t * __a, float32x4x3_t __b)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
 {
   union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
 {
   union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
 {
   union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
 {
   union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
 {
   union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
   __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
 {
   union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
 {
   union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
 {
   union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -11116,7 +12746,8 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
 }

 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
 {
   union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -11124,56 +12755,64 @@ vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
 }
 #endif

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
 {
   union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
 {
   union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
 {
   union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
 {
   union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
   __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
 {
   union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
__builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c) { union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c) { union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; @@ -11181,7 +12820,8 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c) { union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; @@ -11189,35 +12829,40 @@ vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c) } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c) { union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c) { union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c) { union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c) { union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_s8 (const int8_t * __a) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11225,7 +12870,8 @@ vld4_s8 (const int8_t * __a) return __rv.__i; } -__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_s16 (const int16_t * __a) { union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11233,7 +12879,8 @@ 
vld4_s16 (const int16_t * __a) return __rv.__i; } -__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_s32 (const int32_t * __a) { union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11242,7 +12889,8 @@ vld4_s32 (const int32_t * __a) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_f16 (const float16_t * __a) { union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11251,7 +12899,8 @@ vld4_f16 (const float16_t * __a) } #endif -__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_f32 (const float32_t * __a) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11259,7 +12908,8 @@ vld4_f32 (const float32_t * __a) return __rv.__i; } -__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_u8 (const uint8_t * __a) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11267,7 +12917,8 @@ vld4_u8 (const uint8_t * __a) return __rv.__i; } -__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_u16 (const uint16_t * __a) { union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11275,7 +12926,8 @@ vld4_u16 (const uint16_t * __a) return __rv.__i; } -__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_u32 (const uint32_t * __a) { union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11283,7 +12935,8 @@ vld4_u32 (const uint32_t * __a) return __rv.__i; } -__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_p8 (const poly8_t * __a) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11291,7 +12944,8 @@ vld4_p8 (const poly8_t * __a) return __rv.__i; } -__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_p16 (const poly16_t * __a) { union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11301,7 +12955,8 @@ vld4_p16 (const poly16_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_p64 (const poly64_t * __a) { union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11310,7 +12965,8 @@ vld4_p64 (const poly64_t * __a) } #pragma GCC pop_options -__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vld4_s64 (const int64_t * __a) { union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11318,7 +12974,8 @@ vld4_s64 (const int64_t * __a) return __rv.__i; } -__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_u64 (const uint64_t * __a) { union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11326,7 +12983,8 @@ vld4_u64 (const uint64_t * __a) return __rv.__i; } -__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_s8 (const int8_t * __a) { union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11334,7 +12992,8 @@ vld4q_s8 (const int8_t * __a) return __rv.__i; } -__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_s16 (const int16_t * __a) { union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11342,7 +13001,8 @@ vld4q_s16 (const int16_t * __a) return __rv.__i; } -__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_s32 (const int32_t * __a) { union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11351,7 +13011,8 @@ vld4q_s32 (const int32_t * __a) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_f16 (const float16_t * __a) { union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11360,7 +13021,8 @@ vld4q_f16 (const float16_t * __a) } #endif -__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_f32 (const float32_t * __a) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11368,7 +13030,8 @@ vld4q_f32 (const float32_t * __a) return __rv.__i; } -__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_u8 (const uint8_t * __a) { union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11376,7 +13039,8 @@ vld4q_u8 (const uint8_t * __a) return __rv.__i; } -__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_u16 (const uint16_t * __a) { union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11384,7 +13048,8 @@ vld4q_u16 (const uint16_t * __a) return __rv.__i; } -__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_u32 (const uint32_t * __a) { union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11392,7 +13057,8 @@ vld4q_u32 (const uint32_t * __a) return __rv.__i; } -__extension__ static __inline poly8x16x4_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_p8 (const poly8_t * __a) { union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11400,7 +13066,8 @@ vld4q_p8 (const poly8_t * __a) return __rv.__i; } -__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_p16 (const poly16_t * __a) { union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv; @@ -11408,7 +13075,8 @@ vld4q_p16 (const poly16_t * __a) return __rv.__i; } -__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11417,7 +13085,8 @@ vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c) { union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11426,7 +13095,8 @@ vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c) { union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11436,7 +13106,8 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c) { union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11447,7 +13118,8 @@ vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c) } #endif -__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11456,7 +13128,8 @@ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11465,7 +13138,8 @@ vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c) { union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11474,7 +13148,8 @@ vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c) { union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11483,7 +13158,8 @@ vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11492,7 +13168,8 @@ vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c) { union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11501,7 +13178,8 @@ vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c) { union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11510,7 +13188,8 @@ vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c) { union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11520,7 +13199,8 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c) { union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11531,7 +13211,8 @@ vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c) } #endif -__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11540,7 +13221,8 @@ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c) { union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11549,7 +13231,8 @@ vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c) { union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11558,7 +13241,8 @@ vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c) { union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11567,7 +13251,8 @@ vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c) return __rv.__i; } -__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_s8 (const int8_t * __a) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11575,7 +13260,8 @@ vld4_dup_s8 (const int8_t * __a) return __rv.__i; } -__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_s16 (const int16_t * __a) { union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11583,7 +13269,8 @@ vld4_dup_s16 (const int16_t * __a) return __rv.__i; } -__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_s32 (const int32_t * __a) { union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11592,7 +13279,8 @@ vld4_dup_s32 (const int32_t * __a) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_f16 (const float16_t * __a) { union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11601,7 +13289,8 @@ vld4_dup_f16 (const float16_t * __a) } #endif -__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_f32 (const float32_t * __a) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11609,7 +13298,8 @@ vld4_dup_f32 (const float32_t * __a) return __rv.__i; } -__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_u8 (const uint8_t * __a) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11617,7 +13307,8 @@ vld4_dup_u8 (const uint8_t * __a) return __rv.__i; } -__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern 
__inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_u16 (const uint16_t * __a) { union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11625,7 +13316,8 @@ vld4_dup_u16 (const uint16_t * __a) return __rv.__i; } -__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_u32 (const uint32_t * __a) { union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11633,7 +13325,8 @@ vld4_dup_u32 (const uint32_t * __a) return __rv.__i; } -__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_p8 (const poly8_t * __a) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11641,7 +13334,8 @@ vld4_dup_p8 (const poly8_t * __a) return __rv.__i; } -__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_p16 (const poly16_t * __a) { union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11651,7 +13345,8 @@ vld4_dup_p16 (const poly16_t * __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_p64 (const poly64_t * __a) { union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11660,7 +13355,8 @@ vld4_dup_p64 (const poly64_t * __a) } #pragma GCC pop_options -__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_s64 (const int64_t * __a) { union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11668,7 +13364,8 @@ vld4_dup_s64 (const int64_t * __a) return __rv.__i; } -__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld4_dup_u64 (const uint64_t * __a) { union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; @@ -11676,21 +13373,24 @@ vld4_dup_u64 (const uint64_t * __a) return __rv.__i; } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_s8 (int8_t * __a, int8x8x4_t __b) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_s16 (int16_t * __a, int16x4x4_t __b) { union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_s32 (int32_t * __a, int32x2x4_t __b) { union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11698,7 +13398,8 @@ vst4_s32 (int32_t * __a, 
int32x2x4_t __b) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_f16 (float16_t * __a, float16x4x4_t __b) { union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11706,42 +13407,48 @@ vst4_f16 (float16_t * __a, float16x4x4_t __b) } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_f32 (float32_t * __a, float32x2x4_t __b) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_u8 (uint8_t * __a, uint8x8x4_t __b) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_u16 (uint16_t * __a, uint16x4x4_t __b) { union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_u32 (uint32_t * __a, uint32x2x4_t __b) { union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_p8 (poly8_t * __a, poly8x8x4_t __b) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_p16 (poly16_t * __a, poly16x4x4_t __b) { union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11750,7 +13457,8 @@ vst4_p16 (poly16_t * __a, poly16x4x4_t __b) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_p64 (poly64_t * __a, poly64x1x4_t __b) { union { poly64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11758,35 +13466,40 @@ vst4_p64 (poly64_t * __a, poly64x1x4_t __b) } #pragma GCC pop_options -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_s64 (int64_t * __a, int64x1x4_t __b) { union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vst4_u64 (uint64_t * __a, uint64x1x4_t __b) { union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s8 (int8_t * __a, int8x16x4_t __b) { union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s16 (int16_t * __a, int16x8x4_t __b) { union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s32 (int32_t * __a, int32x4x4_t __b) { union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11794,7 +13507,8 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_f16 (float16_t * __a, float16x8x4_t __b) { union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11802,63 +13516,72 @@ vst4q_f16 (float16_t * __a, float16x8x4_t __b) } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_f32 (float32_t * __a, float32x4x4_t __b) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u8 (uint8_t * __a, uint8x16x4_t __b) { union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u16 (uint16_t * __a, uint16x8x4_t __b) { union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u32 (uint32_t * __a, uint32x4x4_t __b) { union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_p8 (poly8_t * __a, poly8x16x4_t __b) { union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); } -__extension__ static __inline void 
__attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_p16 (poly16_t * __a, poly16x8x4_t __b) { union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c) { union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c) { union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c) { union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11866,7 +13589,8 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c) { union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; @@ -11874,56 +13598,64 @@ vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c) } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c) { union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c) { union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c) { union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); } -__extension__ static __inline void 
__attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c) { union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c) { union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c) { union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c) { union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11931,7 +13663,8 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c) } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c) { union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; @@ -11939,529 +13672,616 @@ vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c) } #endif -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c) { union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c) { union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c) { union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); } 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s8 (int8x8_t __a, int8x8_t __b) { return __a & __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s16 (int16x4_t __a, int16x4_t __b) { return __a & __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s32 (int32x2_t __a, int32x2_t __b) { return __a & __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u8 (uint8x8_t __a, uint8x8_t __b) { return __a & __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u16 (uint16x4_t __a, uint16x4_t __b) { return __a & __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u32 (uint32x2_t __a, uint32x2_t __b) { return __a & __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_s64 (int64x1_t __a, int64x1_t __b) { return __a & __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vand_u64 (uint64x1_t __a, uint64x1_t __b) { return __a & __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s8 (int8x16_t __a, int8x16_t __b) { return __a & __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s16 (int16x8_t __a, int16x8_t __b) { return __a & __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s32 (int32x4_t __a, int32x4_t __b) { return __a & __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_s64 (int64x2_t __a, int64x2_t __b) { return __a & __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a & __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a & __b; } -__extension__ static __inline uint32x4_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a & __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vandq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a & __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s8 (int8x8_t __a, int8x8_t __b) { return __a | __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s16 (int16x4_t __a, int16x4_t __b) { return __a | __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s32 (int32x2_t __a, int32x2_t __b) { return __a | __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u8 (uint8x8_t __a, uint8x8_t __b) { return __a | __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u16 (uint16x4_t __a, uint16x4_t __b) { return __a | __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u32 (uint32x2_t __a, uint32x2_t __b) { return __a | __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_s64 (int64x1_t __a, int64x1_t __b) { return __a | __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorr_u64 (uint64x1_t __a, uint64x1_t __b) { return __a | __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s8 (int8x16_t __a, int8x16_t __b) { return __a | __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s16 (int16x8_t __a, int16x8_t __b) { return __a | __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s32 (int32x4_t __a, int32x4_t __b) { return __a | __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_s64 (int64x2_t __a, int64x2_t __b) { return __a | __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a | __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a | __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a | __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorrq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a | __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s8 (int8x8_t __a, int8x8_t __b) { return __a ^ __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s16 (int16x4_t __a, int16x4_t __b) { return __a ^ __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s32 (int32x2_t __a, int32x2_t __b) { return __a ^ __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u8 (uint8x8_t __a, uint8x8_t __b) { return __a ^ __b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u16 (uint16x4_t __a, uint16x4_t __b) { return __a ^ __b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u32 (uint32x2_t __a, uint32x2_t __b) { return __a ^ __b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_s64 (int64x1_t __a, int64x1_t __b) { return __a ^ __b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veor_u64 (uint64x1_t __a, uint64x1_t __b) { return __a ^ __b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s8 (int8x16_t __a, int8x16_t __b) { return __a ^ __b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s16 (int16x8_t __a, int16x8_t __b) { return __a ^ __b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s32 (int32x4_t __a, int32x4_t __b) { return __a ^ __b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_s64 (int64x2_t __a, int64x2_t __b) { return __a ^ __b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a ^ __b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a ^ __b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a ^ __b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) veorq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a ^ __b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s8 (int8x8_t __a, int8x8_t __b) { return __a & ~__b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s16 (int16x4_t __a, int16x4_t __b) { return __a & ~__b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s32 (int32x2_t __a, int32x2_t __b) { return __a & ~__b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u8 (uint8x8_t __a, uint8x8_t __b) { return __a & ~__b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u16 (uint16x4_t __a, uint16x4_t __b) { return __a & ~__b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u32 (uint32x2_t __a, uint32x2_t __b) { return __a & ~__b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_s64 (int64x1_t __a, int64x1_t __b) { return __a & ~__b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbic_u64 (uint64x1_t __a, uint64x1_t __b) { return __a & ~__b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s8 (int8x16_t __a, int8x16_t __b) { return __a & ~__b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s16 (int16x8_t __a, int16x8_t __b) { return __a & ~__b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s32 (int32x4_t __a, int32x4_t __b) { return __a & ~__b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_s64 (int64x2_t __a, int64x2_t __b) { return __a & ~__b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a & ~__b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a & ~__b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a & ~__b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbicq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a & ~__b; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s8 (int8x8_t __a, int8x8_t __b) { return __a | ~__b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s16 (int16x4_t __a, int16x4_t __b) { return __a | ~__b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_s32 (int32x2_t __a, int32x2_t __b) { return __a | ~__b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u8 (uint8x8_t __a, uint8x8_t __b) { return __a | ~__b; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u16 (uint16x4_t __a, uint16x4_t __b) { return __a | ~__b; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u32 (uint32x2_t __a, uint32x2_t __b) { return __a | ~__b; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vorn_s64 (int64x1_t __a, int64x1_t __b) { return __a | ~__b; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vorn_u64 (uint64x1_t __a, uint64x1_t __b) { return __a | ~__b; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s8 (int8x16_t __a, int8x16_t __b) { return __a | ~__b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s16 (int16x8_t __a, int16x8_t __b) { return __a | ~__b; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s32 (int32x4_t __a, int32x4_t __b) { return __a | ~__b; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_s64 (int64x2_t __a, int64x2_t __b) { return __a | ~__b; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u8 (uint8x16_t __a, uint8x16_t __b) { return __a | ~__b; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u16 (uint16x8_t __a, uint16x8_t __b) { return __a | ~__b; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u32 (uint32x4_t __a, uint32x4_t __b) { return __a | ~__b; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vornq_u64 (uint64x2_t __a, uint64x2_t __b) { return __a | ~__b; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_p16 (poly16x4_t __a) { return (poly8x8_t) __a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_f16 (float16x4_t __a) { return (poly8x8_t) __a; } #endif -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_f32 (float32x2_t __a) { return (poly8x8_t)__a; @@ -12469,76 +14289,88 @@ vreinterpret_p8_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vreinterpret_p8_p64 (poly64x1_t __a) { return (poly8x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s64 (int64x1_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u64 (uint64x1_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s8 (int8x8_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s16 (int16x4_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_s32 (int32x2_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u8 (uint8x8_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u16 (uint16x4_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p8_u32 (uint32x2_t __a) { return (poly8x8_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_p8 (poly8x8_t __a) { return (poly16x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_f16 (float16x4_t __a) { return (poly16x4_t) __a; } #endif -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_f32 (float32x2_t __a) { return (poly16x4_t)__a; @@ -12546,63 +14378,73 @@ vreinterpret_p16_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_p64 (poly64x1_t __a) { return (poly16x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vreinterpret_p16_s64 (int64x1_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u64 (uint64x1_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s8 (int8x8_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s16 (int16x4_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_s32 (int32x2_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u8 (uint8x8_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u16 (uint16x4_t __a) { return (poly16x4_t)__a; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p16_u32 (uint32x2_t __a) { return (poly16x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_p8 (poly8x8_t __a) { return (float16x4_t) __a; @@ -12610,7 +14452,8 @@ vreinterpret_f16_p8 (poly8x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_p16 (poly16x4_t __a) { return (float16x4_t) __a; @@ -12618,7 +14461,8 @@ vreinterpret_f16_p16 (poly16x4_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_f32 (float32x2_t __a) { return (float16x4_t) __a; @@ -12628,7 +14472,8 @@ vreinterpret_f16_f32 (float32x2_t __a) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_p64 (poly64x1_t __a) { return (float16x4_t) __a; @@ -12637,7 +14482,8 @@ vreinterpret_f16_p64 (poly64x1_t __a) #endif #if defined 
(__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s64 (int64x1_t __a) { return (float16x4_t) __a; @@ -12645,7 +14491,8 @@ vreinterpret_f16_s64 (int64x1_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u64 (uint64x1_t __a) { return (float16x4_t) __a; @@ -12653,7 +14500,8 @@ vreinterpret_f16_u64 (uint64x1_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s8 (int8x8_t __a) { return (float16x4_t) __a; @@ -12661,7 +14509,8 @@ vreinterpret_f16_s8 (int8x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s16 (int16x4_t __a) { return (float16x4_t) __a; @@ -12669,7 +14518,8 @@ vreinterpret_f16_s16 (int16x4_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_s32 (int32x2_t __a) { return (float16x4_t) __a; @@ -12677,7 +14527,8 @@ vreinterpret_f16_s32 (int32x2_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u8 (uint8x8_t __a) { return (float16x4_t) __a; @@ -12685,7 +14536,8 @@ vreinterpret_f16_u8 (uint8x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u16 (uint16x4_t __a) { return (float16x4_t) __a; @@ -12693,27 +14545,31 @@ vreinterpret_f16_u16 (uint16x4_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f16_u32 (uint32x2_t __a) { return (float16x4_t) __a; } #endif -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_p8 (poly8x8_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_p16 (poly16x4_t __a) { return (float32x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_f16 (float16x4_t __a) { return (float32x2_t) __a; @@ -12722,56 +14578,65 @@ vreinterpret_f32_f16 (float16x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_p64 (poly64x1_t __a) { return (float32x2_t)__a; } #pragma GCC pop_options -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s64 (int64x1_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u64 (uint64x1_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s8 (int8x8_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s16 (int16x4_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_s32 (int32x2_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u8 (uint8x8_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u16 (uint16x4_t __a) { return (float32x2_t)__a; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_f32_u32 (uint32x2_t __a) { return (float32x2_t)__a; @@ -12779,102 +14644,118 @@ vreinterpret_f32_u32 (uint32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_p8 (poly8x8_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_p16 (poly16x4_t __a) { 
return (poly64x1_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_f16 (float16x4_t __a) { return (poly64x1_t) __a; } #endif -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_f32 (float32x2_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_s64 (int64x1_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_u64 (uint64x1_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_s8 (int8x8_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_s16 (int16x4_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_s32 (int32x2_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_u8 (uint8x8_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_u16 (uint16x4_t __a) { return (poly64x1_t)__a; } -__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_p64_u32 (uint32x2_t __a) { return (poly64x1_t)__a; } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_p8 (poly8x8_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_p16 (poly16x4_t __a) { return (int64x1_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_f16 (float16x4_t __a) { return (int64x1_t) __a; } #endif -__extension__ static 
__inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_f32 (float32x2_t __a) { return (int64x1_t)__a; @@ -12882,76 +14763,88 @@ vreinterpret_s64_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_p64 (poly64x1_t __a) { return (int64x1_t)__a; } #pragma GCC pop_options -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u64 (uint64x1_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s8 (int8x8_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s16 (int16x4_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_s32 (int32x2_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u8 (uint8x8_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u16 (uint16x4_t __a) { return (int64x1_t)__a; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s64_u32 (uint32x2_t __a) { return (int64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_p8 (poly8x8_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_p16 (poly16x4_t __a) { return (uint64x1_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_f16 (float16x4_t __a) { return (uint64x1_t) __a; } #endif -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_f32 (float32x2_t __a) { return (uint64x1_t)__a; @@ -12959,76 +14852,88 @@ vreinterpret_u64_f32 
(float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_p64 (poly64x1_t __a) { return (uint64x1_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s64 (int64x1_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s8 (int8x8_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s16 (int16x4_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_s32 (int32x2_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u8 (uint8x8_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u16 (uint16x4_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u64_u32 (uint32x2_t __a) { return (uint64x1_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_p8 (poly8x8_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_p16 (poly16x4_t __a) { return (int8x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_f16 (float16x4_t __a) { return (int8x8_t) __a; } #endif -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_f32 (float32x2_t __a) { return (int8x8_t)__a; @@ -13036,76 +14941,88 @@ vreinterpret_s8_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vreinterpret_s8_p64 (poly64x1_t __a) { return (int8x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s64 (int64x1_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u64 (uint64x1_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s16 (int16x4_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_s32 (int32x2_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u8 (uint8x8_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u16 (uint16x4_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s8_u32 (uint32x2_t __a) { return (int8x8_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_p8 (poly8x8_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_p16 (poly16x4_t __a) { return (int16x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_f16 (float16x4_t __a) { return (int16x4_t) __a; } #endif -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_f32 (float32x2_t __a) { return (int16x4_t)__a; @@ -13113,76 +15030,88 @@ vreinterpret_s16_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_p64 (poly64x1_t __a) { return (int16x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s64 (int64x1_t __a) { 
return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u64 (uint64x1_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s8 (int8x8_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_s32 (int32x2_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u8 (uint8x8_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u16 (uint16x4_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s16_u32 (uint32x2_t __a) { return (int16x4_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_p8 (poly8x8_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_p16 (poly16x4_t __a) { return (int32x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_f16 (float16x4_t __a) { return (int32x2_t) __a; } #endif -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_f32 (float32x2_t __a) { return (int32x2_t)__a; @@ -13190,76 +15119,88 @@ vreinterpret_s32_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_p64 (poly64x1_t __a) { return (int32x2_t)__a; } #pragma GCC pop_options -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s64 (int64x1_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u64 (uint64x1_t __a) { return (int32x2_t)__a; } 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s8 (int8x8_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_s16 (int16x4_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u8 (uint8x8_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u16 (uint16x4_t __a) { return (int32x2_t)__a; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_s32_u32 (uint32x2_t __a) { return (int32x2_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_p8 (poly8x8_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_p16 (poly16x4_t __a) { return (uint8x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_f16 (float16x4_t __a) { return (uint8x8_t) __a; } #endif -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_f32 (float32x2_t __a) { return (uint8x8_t)__a; @@ -13267,76 +15208,88 @@ vreinterpret_u8_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_p64 (poly64x1_t __a) { return (uint8x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s64 (int64x1_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u64 (uint64x1_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s8 (int8x8_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s16 (int16x4_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_s32 (int32x2_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u16 (uint16x4_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u8_u32 (uint32x2_t __a) { return (uint8x8_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_p8 (poly8x8_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_p16 (poly16x4_t __a) { return (uint16x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_f16 (float16x4_t __a) { return (uint16x4_t) __a; } #endif -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_f32 (float32x2_t __a) { return (uint16x4_t)__a; @@ -13344,76 +15297,88 @@ vreinterpret_u16_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_p64 (poly64x1_t __a) { return (uint16x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_s64 (int64x1_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_u64 (uint64x1_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_s8 (int8x8_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_s16 (int16x4_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_s32 (int32x2_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_u8 (uint8x8_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u16_u32 (uint32x2_t __a) { return (uint16x4_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_p8 (poly8x8_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_p16 (poly16x4_t __a) { return (uint32x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_f16 (float16x4_t __a) { return (uint32x2_t) __a; } #endif -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_f32 (float32x2_t __a) { return (uint32x2_t)__a; @@ -13421,70 +15386,81 @@ vreinterpret_u32_f32 (float32x2_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_p64 (poly64x1_t __a) { return (uint32x2_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_s64 (int64x1_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_u64 (uint64x1_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_s8 (int8x8_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_s16 (int16x4_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_s32 (int32x2_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_u8 (uint8x8_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpret_u32_u16 (uint16x4_t __a) { return (uint32x2_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_p16 (poly16x8_t __a) { return (poly8x16_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_f16 (float16x8_t __a) { return (poly8x16_t) __a; } #endif -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_f32 (float32x4_t __a) { return (poly8x16_t)__a; @@ -13492,83 +15468,96 @@ vreinterpretq_p8_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_p64 (poly64x2_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_p128 (poly128_t __a) { return (poly8x16_t)__a; } #pragma GCC pop_options -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s64 (int64x2_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u64 (uint64x2_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s8 (int8x16_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s16 (int16x8_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_s32 (int32x4_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u8 (uint8x16_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u16 (uint16x8_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p8_u32 (uint32x4_t __a) { return (poly8x16_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_p8 (poly8x16_t __a) { return (poly16x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_f16 (float16x8_t __a) { return (poly16x8_t) __a; } #endif -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_f32 (float32x4_t __a) { return (poly16x8_t)__a; @@ -13576,69 +15565,80 @@ vreinterpretq_p16_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_p64 (poly64x2_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_p128 (poly128_t __a) { return (poly16x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s64 (int64x2_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u64 (uint64x2_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s8 (int8x16_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s16 (int16x8_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_s32 (int32x4_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u8 (uint8x16_t __a) { return (poly16x8_t)__a; } -__extension__ static 
__inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u16 (uint16x8_t __a) { return (poly16x8_t)__a; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p16_u32 (uint32x4_t __a) { return (poly16x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p8 (poly8x16_t __a) { return (float16x8_t) __a; @@ -13646,7 +15646,8 @@ vreinterpretq_f16_p8 (poly8x16_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p16 (poly16x8_t __a) { return (float16x8_t) __a; @@ -13654,7 +15655,8 @@ vreinterpretq_f16_p16 (poly16x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_f32 (float32x4_t __a) { return (float16x8_t) __a; @@ -13665,7 +15667,8 @@ vreinterpretq_f16_f32 (float32x4_t __a) #pragma GCC target ("fpu=crypto-neon-fp-armv8") #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p64 (poly64x2_t __a) { return (float16x8_t) __a; @@ -13673,7 +15676,8 @@ vreinterpretq_f16_p64 (poly64x2_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_p128 (poly128_t __a) { return (float16x8_t) __a; @@ -13683,7 +15687,8 @@ vreinterpretq_f16_p128 (poly128_t __a) #pragma GCC pop_options #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s64 (int64x2_t __a) { return (float16x8_t) __a; @@ -13691,7 +15696,8 @@ vreinterpretq_f16_s64 (int64x2_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u64 (uint64x2_t __a) { return (float16x8_t) __a; @@ -13699,7 +15705,8 @@ vreinterpretq_f16_u64 (uint64x2_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s8 (int8x16_t __a) { return (float16x8_t) __a; @@ -13707,7 +15714,8 @@ vreinterpretq_f16_s8 (int8x16_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s16 (int16x8_t __a) { return (float16x8_t) __a; @@ -13715,7 +15723,8 @@ vreinterpretq_f16_s16 (int16x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_s32 (int32x4_t __a) { return (float16x8_t) __a; @@ -13723,7 +15732,8 @@ vreinterpretq_f16_s32 (int32x4_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u8 (uint8x16_t __a) { return (float16x8_t) __a; @@ -13731,7 +15741,8 @@ vreinterpretq_f16_u8 (uint8x16_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u16 (uint16x8_t __a) { return (float16x8_t) __a; @@ -13739,27 +15750,31 @@ vreinterpretq_f16_u16 (uint16x8_t __a) #endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f16_u32 (uint32x4_t __a) { return (float16x8_t) __a; } #endif -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p8 (poly8x16_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p16 (poly16x8_t __a) { return (float32x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_f16 (float16x8_t __a) { return (float32x4_t) __a; @@ -13768,62 +15783,72 @@ vreinterpretq_f32_f16 (float16x8_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p64 (poly64x2_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ 
((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_p128 (poly128_t __a) { return (float32x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s64 (int64x2_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u64 (uint64x2_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s8 (int8x16_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s16 (int16x8_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_s32 (int32x4_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u8 (uint8x16_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u16 (uint16x8_t __a) { return (float32x4_t)__a; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_f32_u32 (uint32x4_t __a) { return (float32x4_t)__a; @@ -13831,188 +15856,218 @@ vreinterpretq_f32_u32 (uint32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_p8 (poly8x16_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_p16 (poly16x8_t __a) { return (poly64x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_f16 (float16x8_t __a) { return (poly64x2_t) __a; } #endif -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_f32 (float32x4_t __a) { return (poly64x2_t)__a; } 
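Every hunk in this file applies the same mechanical rewrite: each intrinsic moves from static __inline with only __always_inline__ to the GNU extern __inline idiom. With __gnu_inline__, an extern inline definition is used only for inlining and the compiler never emits a standalone copy of the function, so including the header in many translation units cannot produce duplicate or stray local symbols, while __artificial__ marks the body as a small wrapper so debuggers attribute the inlined instructions to the call site. A minimal sketch of the idiom, using a made-up function rather than one from the patch:

__extension__ extern __inline int
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
my_add_one (int __x)  /* hypothetical example, not in arm_neon.h */
{
  /* Used only for inlining; no out-of-line my_add_one is emitted.  */
  return __x + 1;
}
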
-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_p128 (poly128_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_s64 (int64x2_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_u64 (uint64x2_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_s8 (int8x16_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_s16 (int16x8_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_s32 (int32x4_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_u8 (uint8x16_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_u16 (uint16x8_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p64_u32 (uint32x4_t __a) { return (poly64x2_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_p8 (poly8x16_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_p16 (poly16x8_t __a) { return (poly128_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_f16 (float16x8_t __a) { return (poly128_t) __a; } #endif -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_f32 (float32x4_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_p64 (poly64x2_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_s64 (int64x2_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_u64 (uint64x2_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_s8 (int8x16_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_s16 (int16x8_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_s32 (int32x4_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_u8 (uint8x16_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_u16 (uint16x8_t __a) { return (poly128_t)__a; } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_p128_u32 (uint32x4_t __a) { return (poly128_t)__a; } #pragma GCC pop_options -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p8 (poly8x16_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p16 (poly16x8_t __a) { return (int64x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_f16 (float16x8_t __a) { return (int64x2_t) __a; } #endif -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_f32 (float32x4_t __a) { return (int64x2_t)__a; @@ -14020,82 +16075,95 @@ vreinterpretq_s64_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p64 (poly64x2_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_p128 (poly128_t __a) { return (int64x2_t)__a; } #pragma GCC pop_options -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u64 (uint64x2_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s8 (int8x16_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s16 (int16x8_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_s32 (int32x4_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u8 (uint8x16_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u16 (uint16x8_t __a) { return (int64x2_t)__a; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s64_u32 (uint32x4_t __a) { return (int64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p8 (poly8x16_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p16 (poly16x8_t __a) { return (uint64x2_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_f16 (float16x8_t __a) { return (uint64x2_t) __a; } #endif -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_f32 (float32x4_t __a) { return (uint64x2_t)__a; @@ -14103,82 +16171,95 @@ vreinterpretq_u64_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p64 (poly64x2_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_p128 (poly128_t __a) { return (uint64x2_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s64 (int64x2_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s8 (int8x16_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s16 (int16x8_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_s32 (int32x4_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u8 (uint8x16_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u16 (uint16x8_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u64_u32 (uint32x4_t __a) { return (uint64x2_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p8 (poly8x16_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p16 (poly16x8_t __a) { return (int8x16_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_f16 (float16x8_t __a) { return (int8x16_t) __a; } #endif -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_f32 (float32x4_t __a) { return (int8x16_t)__a; @@ -14186,82 +16267,95 @@ vreinterpretq_s8_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p64 (poly64x2_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_p128 (poly128_t __a) { return (int8x16_t)__a; } #pragma GCC pop_options -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s64 (int64x2_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u64 (uint64x2_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s16 (int16x8_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_s32 (int32x4_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u8 (uint8x16_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u16 (uint16x8_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s8_u32 (uint32x4_t __a) { return (int8x16_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_p8 (poly8x16_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_p16 (poly16x8_t __a) { return (int16x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_f16 (float16x8_t __a) { return (int16x8_t) __a; } #endif -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_f32 (float32x4_t __a) { return (int16x8_t)__a; @@ -14269,82 +16363,95 @@ vreinterpretq_s16_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vreinterpretq_s16_p64 (poly64x2_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_p128 (poly128_t __a) { return (int16x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s64 (int64x2_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u64 (uint64x2_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s8 (int8x16_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_s32 (int32x4_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u8 (uint8x16_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u16 (uint16x8_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s16_u32 (uint32x4_t __a) { return (int16x8_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_p8 (poly8x16_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_p16 (poly16x8_t __a) { return (int32x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_f16 (float16x8_t __a) { return (int32x4_t)__a; } #endif -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_f32 (float32x4_t __a) { return (int32x4_t)__a; @@ -14352,82 +16459,95 @@ vreinterpretq_s32_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vreinterpretq_s32_p64 (poly64x2_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_p128 (poly128_t __a) { return (int32x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s64 (int64x2_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u64 (uint64x2_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s8 (int8x16_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_s16 (int16x8_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u8 (uint8x16_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u16 (uint16x8_t __a) { return (int32x4_t)__a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_s32_u32 (uint32x4_t __a) { return (int32x4_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_p8 (poly8x16_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_p16 (poly16x8_t __a) { return (uint8x16_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_f16 (float16x8_t __a) { return (uint8x16_t) __a; } #endif -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_f32 (float32x4_t __a) { return (uint8x16_t)__a; @@ -14435,82 +16555,95 @@ vreinterpretq_u8_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vreinterpretq_u8_p64 (poly64x2_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_p128 (poly128_t __a) { return (uint8x16_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s64 (int64x2_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_u64 (uint64x2_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s8 (int8x16_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s16 (int16x8_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_s32 (int32x4_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_u16 (uint16x8_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u8_u32 (uint32x4_t __a) { return (uint8x16_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_p8 (poly8x16_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_p16 (poly16x8_t __a) { return (uint16x8_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_f16 (float16x8_t __a) { return (uint16x8_t) __a; } #endif -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_f32 (float32x4_t __a) { return (uint16x8_t)__a; @@ -14518,82 +16651,95 @@ vreinterpretq_u16_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) vreinterpretq_u16_p64 (poly64x2_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_p128 (poly128_t __a) { return (uint16x8_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_s64 (int64x2_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_u64 (uint64x2_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_s8 (int8x16_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_s16 (int16x8_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_s32 (int32x4_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_u8 (uint8x16_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u16_u32 (uint32x4_t __a) { return (uint16x8_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_p8 (poly8x16_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_p16 (poly16x8_t __a) { return (uint32x4_t)__a; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_f16 (float16x8_t __a) { return (uint32x4_t) __a; } #endif -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_f32 (float32x4_t __a) { return (uint32x4_t)__a; @@ -14601,56 +16747,65 @@ vreinterpretq_u32_f32 (float32x4_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_p64 (poly64x2_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_p128 (poly128_t __a) { return (uint32x4_t)__a; } #pragma GCC pop_options -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_s64 (int64x2_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_u64 (uint64x2_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_s8 (int8x16_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_s16 (int16x8_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_s32 (int32x4_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_u8 (uint8x16_t __a) { return (uint32x4_t)__a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vreinterpretq_u32_u16 (uint16x8_t __a) { return (uint32x4_t)__a; @@ -14659,7 +16814,8 @@ vreinterpretq_u32_u16 (uint16x8_t __a) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vldrq_p128 (poly128_t const * __ptr) { #ifdef __ARM_BIG_ENDIAN @@ -14672,7 +16828,8 @@ vldrq_p128 (poly128_t const * __ptr) #endif } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vstrq_p128 (poly128_t * __ptr, poly128_t __val) { #ifdef __ARM_BIG_ENDIAN @@ -14695,7 +16852,8 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val) If the result is all zeroes for any half then the whole result is zeroes. This is what the pairwise min reduction achieves. 
*/ -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_p64 (poly64x1_t __a, poly64x1_t __b) { uint32x2_t __t_a = vreinterpret_u32_p64 (__a); @@ -14710,7 +16868,8 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b) a reduction with max since if any two corresponding bits in the two poly64_t's match, then the whole result must be all ones. */ -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p64 (poly64x1_t __a, poly64x1_t __b) { uint32x2_t __t_a = vreinterpret_u32_p64 (__a); @@ -14720,31 +16879,36 @@ vtst_p64 (poly64x1_t __a, poly64x1_t __b) return vreinterpret_u64_u32 (__m); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) { return __builtin_arm_crypto_aese (__data, __key); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaesdq_u8 (uint8x16_t __data, uint8x16_t __key) { return __builtin_arm_crypto_aesd (__data, __key); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaesmcq_u8 (uint8x16_t __data) { return __builtin_arm_crypto_aesmc (__data); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaesimcq_u8 (uint8x16_t __data) { return __builtin_arm_crypto_aesimc (__data); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1h_u32 (uint32_t __hash_e) { uint32x4_t __t = vdupq_n_u32 (0); @@ -14753,7 +16917,8 @@ vsha1h_u32 (uint32_t __hash_e) return vgetq_lane_u32 (__t, 0); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { uint32x4_t __t = vdupq_n_u32 (0); @@ -14761,7 +16926,8 @@ vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { uint32x4_t __t = vdupq_n_u32 (0); @@ -14769,7 +16935,8 @@ vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { uint32x4_t __t = vdupq_n_u32 (0); @@ -14777,49 
+16944,57 @@ vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11) { return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15) { return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) { return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) { return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7) { return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) { return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15); } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_p64 (poly64_t __a, poly64_t __b) { return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b); } -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) { poly64_t __t1 = vget_high_p64 (__a); @@ -14830,6 +17005,984 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) #pragma GCC pop_options + /* Intrinsics for FP16 instructions. 
*/ +#pragma GCC push_options +#pragma GCC target ("fpu=neon-fp-armv8") +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabd_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vabdv4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabdq_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vabdv8hf (__a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_f16 (float16x4_t __a) +{ + return __builtin_neon_vabsv4hf (__a); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_f16 (float16x8_t __a) +{ + return __builtin_neon_vabsv8hf (__a); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vaddv4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vaddv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcage_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcagev4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcageq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcagev8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagt_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcagtv4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcagtq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcagtv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcale_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcalev4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaleq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcalev8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcalt_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcaltv4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaltq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcaltv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceq_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vceqv4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_f16 (float16x8_t __a, float16x8_t __b) +{ + return 
(uint16x8_t)__builtin_neon_vceqv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vceqzv4hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vceqzv8hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcge_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgev4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgeq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgev8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgez_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcgezv4hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgezq_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcgezv8hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgt_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgtv4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgtq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgtv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgtz_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcgtzv4hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcgtzq_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcgtzv8hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcle_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vclev4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcleq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vclev8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclez_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vclezv4hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclezq_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vclezv8hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclt_f16 (float16x4_t __a, float16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcltv4hf (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltq_f16 (float16x8_t __a, float16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcltv8hf (__a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vcltz_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcltzv4hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcltzq_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcltzv8hf (__a); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_f16_s16 (int16x4_t __a) +{ + return (float16x4_t)__builtin_neon_vcvtsv4hi (__a); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_f16_u16 (uint16x4_t __a) +{ + return (float16x4_t)__builtin_neon_vcvtuv4hi ((int16x4_t)__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_s16_f16 (float16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vcvtsv4hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_u16_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcvtuv4hf (__a); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_f16_s16 (int16x8_t __a) +{ + return (float16x8_t)__builtin_neon_vcvtsv8hi (__a); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_f16_u16 (uint16x8_t __a) +{ + return (float16x8_t)__builtin_neon_vcvtuv8hi ((int16x8_t)__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_s16_f16 (float16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vcvtsv8hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_u16_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcvtuv8hf (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvta_s16_f16 (float16x4_t __a) +{ + return __builtin_neon_vcvtasv4hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvta_u16_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcvtauv4hf (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtaq_s16_f16 (float16x8_t __a) +{ + return __builtin_neon_vcvtasv8hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtaq_u16_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcvtauv8hf (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtm_s16_f16 (float16x4_t __a) +{ + return __builtin_neon_vcvtmsv4hf (__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtm_u16_f16 (float16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vcvtmuv4hf (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtmq_s16_f16 (float16x8_t __a) +{ + return __builtin_neon_vcvtmsv8hf (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtmq_u16_f16 (float16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vcvtmuv8hf (__a); +} + 
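The vcvta/vcvtm/vcvtn/vcvtp conversions added in this hunk encode the rounding mode in the mnemonic: 'a' rounds to nearest with ties away from zero, 'm' toward minus infinity, 'n' to nearest even, and 'p' toward plus infinity. A hypothetical usage sketch, not part of the patch, assuming a toolchain and target where __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined (ARMv8.2-A with the FP16 vector extension):

#include <arm_neon.h>

#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
/* Round each half-precision lane toward minus infinity (a floor
   operation) and convert the results to signed 16-bit integers.  */
int16x4_t
floor_lanes_s16 (float16x4_t __v)
{
  return vcvtm_s16_f16 (__v);
}
#endif
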
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtnsv4hf (__a);
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtnuv4hf (__a);
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtnsv8hf (__a);
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtnuv8hf (__a);
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtpsv4hf (__a);
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtpuv4hf (__a);
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtpsv8hf (__a);
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtpuv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv4hi (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvtu_nv4hi ((int16x4_t)__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv8hi (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvtu_nv8hi ((int16x8_t)__a, __b);
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv4hf (__a, __b);
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vcvtu_nv4hf (__a, __b);
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv8hf (__a, __b);
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vcvtu_nv8hf (__a, __b);
+}
+
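The _n conversions just added treat the integer side as fixed-point data with __b fractional bits, so the value is scaled by a power of two during the convert; __b must be a compile-time constant (for 16-bit element types ACLE allows 1 to 16). A hedged sketch, same target assumption as above:

    #include <arm_neon.h>

    /* Interpret Q12.4 fixed-point lanes (4 fractional bits) as half
       precision: the raw value 16 (0x0010) becomes 1.0.  */
    float16x4_t
    q4_to_f16 (int16x4_t fixed)
    {
      return vcvt_n_f16_s16 (fixed, 4);
    }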
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_neon_vfmav4hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_neon_vfmav8hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_neon_vfmsv4hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_neon_vfmsv8hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmaxfv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmaxfv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmaxnmv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmaxnmv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vminfv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vminfv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vminnmv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vminnmv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmulfv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __c)
+{
+  return __builtin_neon_vmul_lanev4hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_f16 (float16x4_t __a, float16_t __b)
+{
+  return __builtin_neon_vmul_nv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmulfv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __c)
+{
+  return __builtin_neon_vmul_lanev8hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_f16 (float16x8_t __a, float16_t __b)
+{
+  return __builtin_neon_vmul_nv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vnegv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vnegv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpaddv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpmaxfv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpminfv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrecpev4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrecpev8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnd_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnda_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndav4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndaq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndav8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndm_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndmv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndmq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndmv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndn_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndnv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndnq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndnv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndp_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndpv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndpq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndpv8hf (__a);
+}
+
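vfma_f16 above maps to a fused multiply-add: the product feeds the addition at full precision and the result is rounded once, unlike a separate vmul_f16/vadd_f16 pair, which rounds twice. A small sketch under the same armv8.2-a+fp16 assumption as the earlier examples:

    #include <arm_neon.h>

    /* acc + x * y with a single rounding per lane (fused).  */
    float16x8_t
    fma_step (float16x8_t acc, float16x8_t x, float16x8_t y)
    {
      return vfmaq_f16 (acc, x, y);
    }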
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndx_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndxv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndxq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndxv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrsqrtev4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrsqrtev8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecps_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vrecpsv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vrecpsv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrts_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vrsqrtsv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vrsqrtsv8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsub_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vsubv4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsubq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vsubv8hf (__a, __b);
+}
+
+#endif /* __ARM_FEATURE_VECTOR_FP16_ARITHMETIC.  */
+#pragma GCC pop_options
+
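The pop_options above closes the target pragma that enabled the FP16 arithmetic block, so the definitions that follow only require a defined __fp16 storage format, not the arithmetic extension. User code can key on the same ACLE macros; a hedged guard sketch (the fallback assumes the NEON half-precision converts, e.g. -mfpu=neon-fp16, are available):

    #include <arm_neon.h>

    float16x4_t
    scale_by_two (float16x4_t v)
    {
    #if defined (__ARM_FEATURE_VECTOR_FP16_ARITHMETIC)
      /* Native half-precision arithmetic (ARMv8.2-A FP16).  */
      return vmul_n_f16 (v, (float16_t) 2.0);
    #else
      /* Storage-only targets: widen to float32, multiply, narrow.  */
      return vcvt_f16_f32 (vmulq_n_f32 (vcvt_f32_f16 (v), 2.0f));
    #endif
    }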
+/* Half-precision data processing intrinsics.  */
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_neon_vbslv4hf ((int16x4_t)__a, __b, __c);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_neon_vbslv8hf ((int16x8_t)__a, __b, __c);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_f16 (float16_t __a)
+{
+  return __builtin_neon_vdup_nv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_f16 (float16_t __a)
+{
+  return __builtin_neon_vdup_nv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev4hf (__a, __b);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev8hf (__a, __b);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vext_f16 (float16x4_t __a, float16x4_t __b, const int __c)
+{
+  return __builtin_neon_vextv4hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, const int __c)
+{
+  return __builtin_neon_vextv8hf (__a, __b, __c);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_f16 (float16_t __a)
+{
+  return __builtin_neon_vdup_nv4hf (__a);
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_f16 (float16_t __a)
+{
+  return __builtin_neon_vdup_nv8hf (__a);
+}
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_f16 (float16x4_t __a)
+{
+  return (float16x4_t)__builtin_shuffle (__a, (uint16x4_t){ 3, 2, 1, 0 });
+}
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_f16 (float16x8_t __a)
+{
+  return
+    (float16x8_t)__builtin_shuffle (__a,
+                                    (uint16x8_t){ 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn_f16 (float16x4_t __a, float16x4_t __b)
+{
+  float16x4x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 5, 1, 7, 3 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 0, 6, 2 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 4, 2, 6 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 1, 5, 3, 7 });
+#endif
+  return __rv;
+}
+
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  float16x8x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 9, 1, 11, 3, 13, 5, 15, 7 });
+  __rv.val[1] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 8, 0, 10, 2, 12, 4, 14, 6 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 1, 9, 3, 11, 5, 13, 7, 15 });
+#endif
+  return __rv;
+}
+
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp_f16 (float16x4_t __a, float16x4_t __b)
+{
+  float16x4x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 5, 7, 1, 3 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 6, 0, 2 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 2, 4, 6 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 1, 3, 5, 7 });
+#endif
+  return __rv;
+}
+
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzpq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  float16x8x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
+                                   { 5, 7, 1, 3, 13, 15, 9, 11 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
+                                   { 4, 6, 0, 2, 12, 14, 8, 10 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
+#endif
+  return __rv;
+}
+
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip_f16 (float16x4_t __a, float16x4_t __b)
+{
+  float16x4x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 6, 2, 7, 3 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 0, 5, 1 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 4, 1, 5 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 2, 6, 3, 7 });
+#endif
+  return __rv;
+}
+
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzipq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  float16x8x2_t __rv;
+#ifdef __ARM_BIG_ENDIAN
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
+                                   { 10, 2, 11, 3, 8, 0, 9, 1 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
+                                   { 14, 6, 15, 7, 12, 4, 13, 5 });
+#else
+  __rv.val[0] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = __builtin_shuffle (__a, __b,
+                                   (uint16x8_t){ 4, 12, 5, 13, 6, 14, 7, 15 });
+#endif
+  return __rv;
+}
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif
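The data-processing block above (vbsl, vdup, vext, vrev64, vtrn/vuzp/vzip) only moves or replicates 16-bit lanes, so it is guarded by the __fp16 storage-format macros rather than the arithmetic extension. A usage sketch combining both groups, assumptions as in the earlier examples:

    #include <arm_neon.h>

    /* Clamp negative lanes to zero (a ReLU): the comparison needs the
       FP16 arithmetic extension, the select and dup do not.  */
    float16x4_t
    relu_f16 (float16x4_t v)
    {
      uint16x4_t negative = vcltz_f16 (v);
      return vbsl_f16 (negative, vdup_n_f16 ((float16_t) 0.0), v);
    }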
--- a/src/gcc/config/arm/arm_neon_builtins.def
+++ b/src/gcc/config/arm/arm_neon_builtins.def
@@ -19,6 +19,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 VAR2 (BINOP, vadd, v2sf, v4sf)
+VAR2 (BINOP, vadd, v8hf, v4hf)
 VAR3 (BINOP, vaddls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vaddlu, v8qi, v4hi, v2si)
 VAR3 (BINOP, vaddws, v8qi, v4hi, v2si)
@@ -32,12 +33,15 @@ VAR8 (BINOP, vqaddu, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR3 (BINOP, vaddhn, v8hi, v4si, v2di)
 VAR3 (BINOP, vraddhn, v8hi, v4si, v2di)
 VAR2 (BINOP, vmulf, v2sf, v4sf)
+VAR2 (BINOP, vmulf, v8hf, v4hf)
 VAR2 (BINOP, vmulp, v8qi, v16qi)
 VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR3 (TERNOP, vmlals, v8qi, v4hi, v2si)
 VAR3 (TERNOP, vmlalu, v8qi, v4hi, v2si)
 VAR2 (TERNOP, vfma, v2sf, v4sf)
+VAR2 (TERNOP, vfma, v4hf, v8hf)
 VAR2 (TERNOP, vfms, v2sf, v4sf)
+VAR2 (TERNOP, vfms, v4hf, v8hf)
 VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR3 (TERNOP, vmlsls, v8qi, v4hi, v2si)
 VAR3 (TERNOP, vmlslu, v8qi, v4hi, v2si)
@@ -94,6 +98,7 @@ VAR8 (TERNOP_IMM, vsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (TERNOP_IMM, vrsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (TERNOP_IMM, vrsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR2 (BINOP, vsub, v2sf, v4sf)
+VAR2 (BINOP, vsub, v8hf, v4hf)
 VAR3 (BINOP, vsubls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vsublu, v8qi, v4hi, v2si)
 VAR3 (BINOP, vsubws, v8qi, v4hi, v2si)
@@ -111,12 +116,27 @@ VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR6 (BINOP, vcgtu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vcage, v2sf, v4sf)
 VAR2 (BINOP, vcagt, v2sf, v4sf)
+VAR2 (BINOP, vcage, v4hf, v8hf)
+VAR2 (BINOP, vcagt, v4hf, v8hf)
+VAR2 (BINOP, vcale, v4hf, v8hf)
+VAR2 (BINOP, vcalt, v4hf, v8hf)
+VAR2 (BINOP, vceq, v4hf, v8hf)
+VAR2 (BINOP, vcge, v4hf, v8hf)
+VAR2 (BINOP, vcgt, v4hf, v8hf)
+VAR2 (BINOP, vcle, v4hf, v8hf)
+VAR2 (BINOP, vclt, v4hf, v8hf)
+VAR2 (UNOP, vceqz, v4hf, v8hf)
+VAR2 (UNOP, vcgez, v4hf, v8hf)
+VAR2 (UNOP, vcgtz, v4hf, v8hf)
+VAR2 (UNOP, vclez, v4hf, v8hf)
+VAR2 (UNOP, vcltz, v4hf, v8hf)
 VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vabds, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vabdu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vabdf, v2sf, v4sf)
 VAR3 (BINOP, vabdls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vabdlu, v8qi, v4hi, v2si)
+VAR2 (BINOP, vabd, v8hf, v4hf)
 
 VAR6 (TERNOP, vabas, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (TERNOP, vabau, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
@@ -126,27 +146,38 @@ VAR3 (TERNOP, vabalu, v8qi, v4hi, v2si)
 VAR6 (BINOP, vmaxs, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vmaxu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vmaxf, v2sf, v4sf)
+VAR2 (BINOP, vmaxf, v8hf, v4hf)
+VAR4 (BINOP, vmaxnm, v2sf, v4sf, v4hf, v8hf)
 VAR6 (BINOP, vmins, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vminu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vminf, v2sf, v4sf)
+VAR2 (BINOP, vminf, v4hf, v8hf)
+VAR4 (BINOP, vminnm, v2sf, v4sf, v8hf, v4hf)
 
 VAR3 (BINOP, vpmaxs, v8qi, v4hi, v2si)
 VAR3 (BINOP, vpmaxu, v8qi, v4hi, v2si)
 VAR1 (BINOP, vpmaxf, v2sf)
+VAR1 (BINOP, vpmaxf, v4hf)
 VAR3 (BINOP, vpmins, v8qi, v4hi, v2si)
 VAR3 (BINOP, vpminu, v8qi, v4hi, v2si)
 VAR1 (BINOP, vpminf, v2sf)
+VAR1 (BINOP, vpminf, v4hf)
 
 VAR4 (BINOP, vpadd, v8qi, v4hi, v2si, v2sf)
+VAR1 (BINOP, vpadd, v4hf)
 VAR6 (UNOP, vpaddls, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (UNOP, vpaddlu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vpadals, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vpadalu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vrecps, v2sf, v4sf)
 VAR2 (BINOP, vrsqrts, v2sf, v4sf)
+VAR2 (BINOP, vrecps, v4hf, v8hf)
+VAR2 (BINOP, vrsqrts, v4hf, v8hf)
 VAR8 (TERNOP_IMM, vsri_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (TERNOP_IMM, vsli_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (UNOP, vabs, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
+VAR2 (UNOP, vabs, v8hf, v4hf)
+VAR2 (UNOP, vneg, v8hf, v4hf)
 VAR6 (UNOP, vqabs, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR8 (UNOP, vneg, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR6 (UNOP, vqneg, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
@@ -155,8 +186,16 @@ VAR6 (UNOP, vclz, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR5 (BSWAP, bswap, v4hi, v8hi, v2si, v4si, v2di)
 VAR2 (UNOP, vcnt, v8qi, v16qi)
 VAR4 (UNOP, vrecpe, v2si, v2sf, v4si, v4sf)
+VAR2 (UNOP, vrecpe, v8hf, v4hf)
 VAR4 (UNOP, vrsqrte, v2si, v2sf, v4si, v4sf)
+VAR2 (UNOP, vrsqrte, v4hf, v8hf)
 VAR6 (UNOP, vmvn, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
+VAR2 (UNOP, vrnd, v8hf, v4hf)
+VAR2 (UNOP, vrnda, v8hf, v4hf)
+VAR2 (UNOP, vrndm, v8hf, v4hf)
+VAR2 (UNOP, vrndn, v8hf, v4hf)
+VAR2 (UNOP, vrndp, v8hf, v4hf)
+VAR2 (UNOP, vrndx, v8hf, v4hf)
 /* FIXME: vget_lane supports more variants than this!  */
 VAR10 (GETLANE, vget_lane,
        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
@@ -166,8 +205,10 @@ VAR10 (SETLANE, vset_lane,
 VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
 VAR10 (UNOP, vdup_n,
        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR2 (UNOP, vdup_n, v8hf, v4hf)
 VAR10 (GETLANE, vdup_lane,
        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
 VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
 VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
 VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
@@ -177,7 +218,7 @@ VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovun, v8hi, v4si, v2di)
 VAR3 (UNOP, vmovls, v8qi, v4hi, v2si)
 VAR3 (UNOP, vmovlu, v8qi, v4hi, v2si)
-VAR6 (SETLANE, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR8 (SETLANE, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf, v4hf, v8hf)
 VAR6 (MAC_LANE, vmla_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
 VAR2 (MAC_LANE, vmlals_lane, v4hi, v2si)
 VAR2 (MAC_LANE, vmlalu_lane, v4hi, v2si)
@@ -186,7 +227,7 @@ VAR6 (MAC_LANE, vmls_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
 VAR2 (MAC_LANE, vmlsls_lane, v4hi, v2si)
 VAR2 (MAC_LANE, vmlslu_lane, v4hi, v2si)
 VAR2 (MAC_LANE, vqdmlsl_lane, v4hi, v2si)
-VAR6 (BINOP, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR8 (BINOP, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf, v4hf, v8hf)
 VAR6 (MAC_N, vmla_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
 VAR2 (MAC_N, vmlals_n, v4hi, v2si)
 VAR2 (MAC_N, vmlalu_n, v4hi, v2si)
@@ -197,17 +238,27 @@ VAR2 (MAC_N, vmlslu_n, v4hi, v2si)
 VAR2 (MAC_N, vqdmlsl_n, v4hi, v2si)
 VAR10 (SETLANE, vext,
        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR2 (SETLANE, vext, v8hf, v4hf)
 VAR8 (UNOP, vrev64, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR4 (UNOP, vrev32, v8qi, v4hi, v16qi, v8hi)
 VAR2 (UNOP, vrev16, v8qi, v16qi)
 VAR4 (UNOP, vcvts, v2si, v2sf, v4si, v4sf)
+VAR2 (UNOP, vcvts, v4hi, v8hi)
+VAR2 (UNOP, vcvts, v4hf, v8hf)
+VAR2 (UNOP, vcvtu, v4hi, v8hi)
+VAR2 (UNOP, vcvtu, v4hf, v8hf)
 VAR4 (UNOP, vcvtu, v2si, v2sf, v4si, v4sf)
 VAR4 (BINOP, vcvts_n, v2si, v2sf, v4si, v4sf)
 VAR4 (BINOP, vcvtu_n, v2si, v2sf, v4si, v4sf)
+VAR2 (BINOP, vcvts_n, v4hf, v8hf)
+VAR2 (BINOP, vcvtu_n, v4hi, v8hi)
+VAR2 (BINOP, vcvts_n, v4hi, v8hi)
+VAR2 (BINOP, vcvtu_n, v4hf, v8hf)
 VAR1 (UNOP, vcvtv4sf, v4hf)
 VAR1 (UNOP, vcvtv4hf, v4sf)
 VAR10 (TERNOP, vbsl,
        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR2 (TERNOP, vbsl, v8hf, v4hf)
 VAR2 (UNOP, copysignf, v2sf, v4sf)
 VAR2 (UNOP, vrintn, v2sf, v4sf)
 VAR2 (UNOP, vrinta, v2sf, v4sf)
@@ -219,6 +270,14 @@ VAR1 (UNOP, vcvtav2sf, v2si)
 VAR1 (UNOP, vcvtav4sf, v4si)
 VAR1 (UNOP, vcvtauv2sf, v2si)
 VAR1 (UNOP, vcvtauv4sf, v4si)
+VAR2 (UNOP, vcvtas, v4hf, v8hf)
+VAR2 (UNOP, vcvtau, v4hf, v8hf)
+VAR2 (UNOP, vcvtms, v4hf, v8hf)
+VAR2 (UNOP, vcvtmu, v4hf, v8hf)
+VAR2 (UNOP, vcvtns, v4hf, v8hf)
+VAR2 (UNOP, vcvtnu, v4hf, v8hf)
+VAR2 (UNOP, vcvtps, v4hf, v8hf)
+VAR2 (UNOP, vcvtpu, v4hf, v8hf)
 VAR1 (UNOP, vcvtpv2sf, v2si)
 VAR1 (UNOP, vcvtpv4sf, v4si)
 VAR1 (UNOP, vcvtpuv2sf, v2si)
--- /dev/null
+++ b/src/gcc/config/arm/arm_vfp_builtins.def
@@ -0,0 +1,51 @@
+/* VFP instruction builtin definitions.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file lists the builtins that may be available when VFP is enabled
+   but NEON is not enabled.  The entries otherwise have the same
+   requirements and generate the same structures as those in
+   arm_neon_builtins.def.  */
+
+/* FP16 Arithmetic instructions.  */
+VAR1 (UNOP, vabs, hf)
+VAR2 (UNOP, vcvths, hf, si)
+VAR2 (UNOP, vcvthu, hf, si)
+VAR1 (UNOP, vcvtahs, si)
+VAR1 (UNOP, vcvtahu, si)
+VAR1 (UNOP, vcvtmhs, si)
+VAR1 (UNOP, vcvtmhu, si)
+VAR1 (UNOP, vcvtnhs, si)
+VAR1 (UNOP, vcvtnhu, si)
+VAR1 (UNOP, vcvtphs, si)
+VAR1 (UNOP, vcvtphu, si)
+VAR1 (UNOP, vrnd, hf)
+VAR1 (UNOP, vrnda, hf)
+VAR1 (UNOP, vrndi, hf)
+VAR1 (UNOP, vrndm, hf)
+VAR1 (UNOP, vrndn, hf)
+VAR1 (UNOP, vrndp, hf)
+VAR1 (UNOP, vrndx, hf)
+VAR1 (UNOP, vsqrt, hf)
+
+VAR2 (BINOP, vcvths_n, hf, si)
+VAR2 (BINOP, vcvthu_n, hf, si)
+VAR1 (BINOP, vmaxnm, hf)
+VAR1 (BINOP, vminnm, hf)
+
+VAR1 (TERNOP, vfma, hf)
+VAR1 (TERNOP, vfms, hf)
--- a/src/gcc/config/arm/bpabi.h
+++ b/src/gcc/config/arm/bpabi.h
@@ -75,6 +75,9 @@
    |mcpu=cortex-a57.cortex-a53 \
    |mcpu=cortex-a72 \
    |mcpu=cortex-a72.cortex-a53 \
+   |mcpu=cortex-a73 \
+   |mcpu=cortex-a73.cortex-a35 \
+   |mcpu=cortex-a73.cortex-a53 \
    |mcpu=exynos-m1 \
    |mcpu=qdf24xx \
    |mcpu=xgene1 \
@@ -90,6 +93,11 @@
    |march=armv8-a+crc \
    |march=armv8.1-a \
    |march=armv8.1-a+crc \
+   |march=armv8.2-a \
+   |march=armv8.2-a+fp16 \
+   |march=armv8-m.base|mcpu=cortex-m23 \
+   |march=armv8-m.main \
+   |march=armv8-m.main+dsp|mcpu=cortex-m33 \
    :%{!r:--be8}}}"
 #else
 #define BE8_LINK_SPEC \
@@ -105,6 +113,9 @@
    |mcpu=cortex-a57.cortex-a53 \
    |mcpu=cortex-a72 \
    |mcpu=cortex-a72.cortex-a53 \
+   |mcpu=cortex-a73 \
+   |mcpu=cortex-a73.cortex-a35 \
+   |mcpu=cortex-a73.cortex-a53 \
    |mcpu=exynos-m1 \
    |mcpu=qdf24xx \
    |mcpu=xgene1 \
@@ -121,6 +132,11 @@
    |march=armv8-a+crc \
    |march=armv8.1-a \
    |march=armv8.1-a+crc \
+   |march=armv8.2-a \
+   |march=armv8.2-a+fp16 \
+   |march=armv8-m.base|mcpu=cortex-m23 \
+   |march=armv8-m.main \
+   |march=armv8-m.main+dsp|mcpu=cortex-m33 \
    :%{!r:--be8}}}"
 #endif
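For orientation between the two .def files: each VARn line is expanded by gcc/config/arm/arm-builtins.c once per listed mode, producing one __builtin_neon_<name><mode> entry point, which is how the VAR2 (BINOP, vadd, v8hf, v4hf) entry above backs the __builtin_neon_vaddv8hf call used by vaddq_f16. The following is a hypothetical sketch of that fan-out, not code from the patch; the real macro bodies build enum and type tables rather than strings:

    /* Hypothetical illustration: a VARn entry describes one builtin
       per mode suffix.  */
    #define VAR1(T, N, A) "__builtin_neon_" #N #A,
    #define VAR2(T, N, A, B) VAR1 (T, N, A) VAR1 (T, N, B)

    static const char *const vadd_fp16_builtins[] = {
      VAR2 (BINOP, vadd, v8hf, v4hf)
      /* => "__builtin_neon_vaddv8hf", "__builtin_neon_vaddv4hf", */
    };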
--- a/src/gcc/config/arm/constraints.md
+++ b/src/gcc/config/arm/constraints.md
@@ -34,11 +34,13 @@
 ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, Dl, DL, Do, Dv, Dy, Di, Dt, Dp, Dz
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
 ;; in Thumb-2 state: Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py
+;; in all states: Pf
 
 ;; The following memory constraints have been used:
-;; in ARM/Thumb-2 state: Q, Uh, Ut, Uv, Uy, Un, Um, Us
+;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us
 ;; in ARM state: Uq
 ;; in Thumb state: Uu, Uw
+;; in all states: Q
 
 (define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
@@ -66,7 +68,7 @@
 (define_constraint "j"
  "A constant suitable for a MOVW instruction. (ARM/Thumb-2)"
- (and (match_test "TARGET_32BIT && arm_arch_thumb2")
+ (and (match_test "TARGET_HAVE_MOVT")
      (ior (and (match_code "high")
                (match_test "arm_valid_symbolic_address_p (XEXP (op, 0))"))
           (and (match_code "const_int")
@@ -180,6 +182,13 @@
 (and (match_code "const_int")
      (match_test "TARGET_THUMB1 && ival >= 256 && ival <= 510")))
 
+(define_constraint "Pf"
+  "Memory models except relaxed, consume or release ones."
+  (and (match_code "const_int")
+       (match_test "!is_mm_relaxed (memmodel_from_int (ival))
+                    && !is_mm_consume (memmodel_from_int (ival))
+                    && !is_mm_release (memmodel_from_int (ival))")))
+
 (define_constraint "Ps"
  "@internal In Thumb-2 state a constant in the range -255 to +255"
  (and (match_code "const_int")
@@ -333,13 +342,13 @@
 "@internal
  In ARM/ Thumb2 a const_double which can be used with a vcvt.f32.s32 with fract bits operation"
 (and (match_code "const_double")
-      (match_test "TARGET_32BIT && TARGET_VFP && vfp3_const_double_for_fract_bits (op)")))
+      (match_test "TARGET_32BIT && vfp3_const_double_for_fract_bits (op)")))
 
 (define_constraint "Dp"
 "@internal
  In ARM/ Thumb2 a const_double which can be used with a vcvt.s32.f32 with bits operation"
 (and (match_code "const_double")
-      (match_test "TARGET_32BIT && TARGET_VFP
+      (match_test "TARGET_32BIT
                    && vfp3_const_double_for_bits (op) > 0")))
 
 (define_register_constraint "Ts" "(arm_restrict_it) ? LO_REGS : GENERAL_REGS"
@@ -407,7 +416,7 @@
 
 (define_memory_constraint "Q"
  "@internal
-  In ARM/Thumb-2 state an address that is a single base register."
+  An address that is a single base register."
 (and (match_code "mem")
      (match_test "REG_P (XEXP (op, 0))")))
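The new Pf constraint accepts a const_int memory-model operand unless the model is relaxed, consume or release; acquire, acq_rel and seq_cst all qualify, which lets a single pattern select a barrier-carrying encoding only for the models that need it on the load side. A hedged C11 illustration of which user-level orderings land on each side of the constraint (the function names are illustrative):

    #include <stdatomic.h>

    int
    load_acquire (atomic_int *p)
    {
      /* memory_order_acquire satisfies "Pf".  */
      return atomic_load_explicit (p, memory_order_acquire);
    }

    int
    load_relaxed (atomic_int *p)
    {
      /* memory_order_relaxed is excluded by "Pf".  */
      return atomic_load_explicit (p, memory_order_relaxed);
    }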
--- a/src/gcc/config/arm/cortex-a53.md
+++ b/src/gcc/config/arm/cortex-a53.md
@@ -30,6 +30,7 @@
 
 (define_cpu_unit "cortex_a53_slot0" "cortex_a53")
 (define_cpu_unit "cortex_a53_slot1" "cortex_a53")
+(final_presence_set "cortex_a53_slot1" "cortex_a53_slot0")
 
 (define_reservation "cortex_a53_slot_any"
                     "cortex_a53_slot0\
@@ -71,41 +72,43 @@
 
 (define_insn_reservation "cortex_a53_shift" 2
   (and (eq_attr "tune" "cortexa53")
-       (eq_attr "type" "adr,shift_imm,shift_reg,mov_imm,mvn_imm"))
+       (eq_attr "type" "adr,shift_imm,mov_imm,mvn_imm,mov_shift"))
   "cortex_a53_slot_any")
 
-(define_insn_reservation "cortex_a53_alu_rotate_imm" 2
+(define_insn_reservation "cortex_a53_shift_reg" 2
   (and (eq_attr "tune" "cortexa53")
-       (eq_attr "type" "rotate_imm"))
-  "(cortex_a53_slot1)
-   | (cortex_a53_single_issue)")
+       (eq_attr "type" "shift_reg,mov_shift_reg"))
+  "cortex_a53_slot_any+cortex_a53_hazard")
 
 (define_insn_reservation "cortex_a53_alu" 3
   (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
                        alu_sreg,alus_sreg,logic_reg,logics_reg,
                        adc_imm,adcs_imm,adc_reg,adcs_reg,
-                       bfm,csel,clz,rbit,rev,alu_dsp_reg,
-                       mov_reg,mvn_reg,
-                       mrs,multiple,no_insn"))
+                       csel,clz,rbit,rev,alu_dsp_reg,
+                       mov_reg,mvn_reg,mrs,multiple,no_insn"))
   "cortex_a53_slot_any")
 
 (define_insn_reservation "cortex_a53_alu_shift" 3
   (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,
                        crc,logic_shift_imm,logics_shift_imm,
-                       alu_ext,alus_ext,
-                       extend,mov_shift,mvn_shift"))
+                       alu_ext,alus_ext,bfm,bfx,extend,mvn_shift"))
   "cortex_a53_slot_any")
 
 (define_insn_reservation "cortex_a53_alu_shift_reg" 3
   (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_reg,alus_shift_reg,
                        logic_shift_reg,logics_shift_reg,
-                       mov_shift_reg,mvn_shift_reg"))
+                       mvn_shift_reg"))
   "cortex_a53_slot_any+cortex_a53_hazard")
 
-(define_insn_reservation "cortex_a53_mul" 3
+(define_insn_reservation "cortex_a53_alu_extr" 3
+  (and (eq_attr "tune" "cortexa53")
+       (eq_attr "type" "rotate_imm"))
+  "cortex_a53_slot1|cortex_a53_single_issue")
+
+(define_insn_reservation "cortex_a53_mul" 4
  (and (eq_attr "tune" "cortexa53")
      (ior (eq_attr "mul32" "yes")
           (eq_attr "mul64" "yes")))
@@ -189,49 +192,43 @@
 (define_insn_reservation "cortex_a53_branch" 0
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "type" "branch,call"))
-  "cortex_a53_slot_any,cortex_a53_branch")
+  "cortex_a53_slot_any+cortex_a53_branch")
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; General-purpose register bypasses
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; Model bypasses for unshifted operands to ALU instructions.
+;; Model bypasses for ALU to ALU instructions.
+
+(define_bypass 0 "cortex_a53_shift*"
+               "cortex_a53_alu")
 
-(define_bypass 1 "cortex_a53_shift"
-               "cortex_a53_shift")
+(define_bypass 1 "cortex_a53_shift*"
+               "cortex_a53_shift*,cortex_a53_alu_*")
 
-(define_bypass 1 "cortex_a53_alu,
-                  cortex_a53_alu_shift*,
-                  cortex_a53_alu_rotate_imm,
-                  cortex_a53_shift"
+(define_bypass 1 "cortex_a53_alu*"
                "cortex_a53_alu")
 
-(define_bypass 2 "cortex_a53_alu,
-                  cortex_a53_alu_shift*"
+(define_bypass 1 "cortex_a53_alu*"
                "cortex_a53_alu_shift*"
                "aarch_forward_to_shift_is_not_shifted_reg")
 
-;; In our model, we allow any general-purpose register operation to
-;; bypass to the accumulator operand of an integer MADD-like operation.
+(define_bypass 2 "cortex_a53_alu*"
+               "cortex_a53_alu_*,cortex_a53_shift*")
+
+;; Model a bypass from MUL/MLA to MLA instructions.
 
-(define_bypass 1 "cortex_a53_alu*,
-                  cortex_a53_load*,
-                  cortex_a53_mul"
+(define_bypass 1 "cortex_a53_mul"
                "cortex_a53_mul"
                "aarch_accumulator_forwarding")
 
-;; Model a bypass from MLA/MUL to many ALU instructions.
+;; Model a bypass from MUL/MLA to ALU instructions.
 
 (define_bypass 2 "cortex_a53_mul"
-               "cortex_a53_alu,
-                cortex_a53_alu_shift*")
-
-;; We get neater schedules by allowing an MLA/MUL to feed an
-;; early load address dependency to a load.
+               "cortex_a53_alu")
 
-(define_bypass 2 "cortex_a53_mul"
-               "cortex_a53_load*"
-               "arm_early_load_addr_dep")
+(define_bypass 3 "cortex_a53_mul"
+               "cortex_a53_alu_*,cortex_a53_shift*")
 
 ;; Model bypasses for loads which are to be consumed by the ALU.
 
@@ -239,47 +236,46 @@
                "cortex_a53_alu")
 
 (define_bypass 3 "cortex_a53_load1"
-               "cortex_a53_alu_shift*")
+               "cortex_a53_alu_*,cortex_a53_shift*")
+
+(define_bypass 3 "cortex_a53_load2"
+               "cortex_a53_alu")
 
 ;; Model a bypass for ALU instructions feeding stores.
 
-(define_bypass 1 "cortex_a53_alu*"
-               "cortex_a53_store1,
-                cortex_a53_store2,
-                cortex_a53_store3plus"
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
+               "cortex_a53_store*"
                "arm_no_early_store_addr_dep")
 
 ;; Model a bypass for load and multiply instructions feeding stores.
 
-(define_bypass 2 "cortex_a53_mul,
-                  cortex_a53_load1,
-                  cortex_a53_load2,
-                  cortex_a53_load3plus"
-               "cortex_a53_store1,
-                cortex_a53_store2,
-                cortex_a53_store3plus"
+(define_bypass 1 "cortex_a53_mul,
+                  cortex_a53_load*"
+               "cortex_a53_store*"
                "arm_no_early_store_addr_dep")
 
 ;; Model a GP->FP register move as similar to stores.
 
-(define_bypass 1 "cortex_a53_alu*"
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
                "cortex_a53_r2f")
 
-(define_bypass 2 "cortex_a53_mul,
+(define_bypass 1 "cortex_a53_mul,
                   cortex_a53_load1,
-                  cortex_a53_load2,
-                  cortex_a53_load3plus"
+                  cortex_a53_load2"
                "cortex_a53_r2f")
 
-;; Shifts feeding Load/Store addresses may not be ready in time.
+(define_bypass 2 "cortex_a53_alu*"
+               "cortex_a53_r2f_cvt")
 
-(define_bypass 3 "cortex_a53_shift"
-               "cortex_a53_load*"
-               "arm_early_load_addr_dep")
+(define_bypass 3 "cortex_a53_mul,
+                  cortex_a53_load1,
+                  cortex_a53_load2"
+               "cortex_a53_r2f_cvt")
 
-(define_bypass 3 "cortex_a53_shift"
-               "cortex_a53_store*"
-               "arm_early_store_addr_dep")
+;; Model flag forwarding to branches.
+
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
+               "cortex_a53_branch")
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Floating-point/Advanced SIMD.
@@ -535,19 +531,25 @@
 ;; Floating-point to/from core transfers.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_insn_reservation "cortex_a53_r2f" 6
+(define_insn_reservation "cortex_a53_r2f" 2
   (and (eq_attr "tune" "cortexa53")
-       (eq_attr "type" "f_mcr,f_mcrr,f_cvti2f,
-                        neon_from_gp, neon_from_gp_q"))
-  "cortex_a53_slot_any,cortex_a53_store,
-   nothing,cortex_a53_fp_alu")
+       (eq_attr "type" "f_mcr,f_mcrr"))
+  "cortex_a53_slot_any,cortex_a53_fp_alu")
 
-(define_insn_reservation "cortex_a53_f2r" 6
+(define_insn_reservation "cortex_a53_f2r" 4
   (and (eq_attr "tune" "cortexa53")
-       (eq_attr "type" "f_mrc,f_mrrc,f_cvtf2i,
-                        neon_to_gp, neon_to_gp_q"))
-  "cortex_a53_slot_any,cortex_a53_fp_alu,
-   nothing,cortex_a53_store")
+       (eq_attr "type" "f_mrc,f_mrrc"))
+  "cortex_a53_slot_any,cortex_a53_fp_alu")
+
+(define_insn_reservation "cortex_a53_r2f_cvt" 4
+  (and (eq_attr "tune" "cortexa53")
+       (eq_attr "type" "f_cvti2f, neon_from_gp, neon_from_gp_q"))
+  "cortex_a53_slot_any,cortex_a53_fp_alu")
+
+(define_insn_reservation "cortex_a53_f2r_cvt" 5
+  (and (eq_attr "tune" "cortexa53")
+       (eq_attr "type" "f_cvtf2i, neon_to_gp, neon_to_gp_q"))
+  "cortex_a53_slot_any,cortex_a53_fp_alu")
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Floating-point flag transfer.
--- a/src/gcc/config/arm/cortex-a57.md
+++ b/src/gcc/config/arm/cortex-a57.md
@@ -297,7 +297,7 @@
   (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                    alu_sreg,alus_sreg,logic_reg,logics_reg,\
                    adc_imm,adcs_imm,adc_reg,adcs_reg,\
-                   adr,bfm,clz,rbit,rev,alu_dsp_reg,\
+                   adr,bfx,extend,clz,rbit,rev,alu_dsp_reg,\
                    rotate_imm,shift_imm,shift_reg,\
                    mov_imm,mov_reg,\
                    mvn_imm,mvn_reg,\
@@ -307,7 +307,7 @@
 ;; ALU ops with immediate shift
 (define_insn_reservation "cortex_a57_alu_shift" 3
   (and (eq_attr "tune" "cortexa57")
-       (eq_attr "type" "extend,\
+       (eq_attr "type" "bfm,\
                         alu_shift_imm,alus_shift_imm,\
                         crc,logic_shift_imm,logics_shift_imm,\
                         mov_shift,mvn_shift"))
@@ -726,7 +726,7 @@
 
 (define_insn_reservation "cortex_a57_fp_cpys" 4
   (and (eq_attr "tune" "cortexa57")
-       (eq_attr "type" "fmov"))
+       (eq_attr "type" "fmov,fcsel"))
   "(ca57_cx1|ca57_cx2)")
 
 (define_insn_reservation "cortex_a57_fp_divs" 12
--- a/src/gcc/config/arm/cortex-a8-neon.md
+++ b/src/gcc/config/arm/cortex-a8-neon.md
@@ -357,30 +357,34 @@
   (eq_attr "type" "fmuls"))
  "cortex_a8_vfp,cortex_a8_vfplite*11")
 
+;; Don't model a reservation for more than 15 cycles as this explodes the
+;; state space of the automaton for little gain.  It is unlikely that the
+;; scheduler will find enough instructions to hide the full latency of the
+;; instructions.
 (define_insn_reservation "cortex_a8_vfp_muld" 17
   (and (eq_attr "tune" "cortexa8")
        (eq_attr "type" "fmuld"))
-  "cortex_a8_vfp,cortex_a8_vfplite*16")
+  "cortex_a8_vfp,cortex_a8_vfplite*15")
 
 (define_insn_reservation "cortex_a8_vfp_macs" 21
   (and (eq_attr "tune" "cortexa8")
        (eq_attr "type" "fmacs,ffmas"))
-  "cortex_a8_vfp,cortex_a8_vfplite*20")
+  "cortex_a8_vfp,cortex_a8_vfplite*15")
 
 (define_insn_reservation "cortex_a8_vfp_macd" 26
   (and (eq_attr "tune" "cortexa8")
        (eq_attr "type" "fmacd,ffmad"))
-  "cortex_a8_vfp,cortex_a8_vfplite*25")
+  "cortex_a8_vfp,cortex_a8_vfplite*15")
 
 (define_insn_reservation "cortex_a8_vfp_divs" 37
   (and (eq_attr "tune" "cortexa8")
        (eq_attr "type" "fdivs, fsqrts"))
-  "cortex_a8_vfp,cortex_a8_vfplite*36")
+  "cortex_a8_vfp,cortex_a8_vfplite*15")
 
 (define_insn_reservation "cortex_a8_vfp_divd" 65
   (and (eq_attr "tune" "cortexa8")
        (eq_attr "type" "fdivd, fsqrtd"))
-  "cortex_a8_vfp,cortex_a8_vfplite*64")
+  "cortex_a8_vfp,cortex_a8_vfplite*15")
 
 ;; Comparisons can actually take 7 cycles sometimes instead of four,
 ;; but given all the other instructions lumped into type=ffarith that
--- a/src/gcc/config/arm/crypto.md
+++ b/src/gcc/config/arm/crypto.md
@@ -18,14 +18,27 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
+
+;; When AES/AESMC fusion is enabled we want the register allocation to
+;; look like:
+;;    AESE Vn, _
+;;    AESMC Vn, Vn
+;; So prefer to tie operand 1 to operand 0 when fusing.
+
 (define_insn "crypto_<crypto_pattern>"
-  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
-        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
-                       "register_operand" "w")]
+  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
+        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
+                       "register_operand" "0,w")]
          CRYPTO_UNARY))]
   "TARGET_CRYPTO"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
-  [(set_attr "type" "<crypto_type>")]
+  [(set_attr "type" "<crypto_type>")
+   (set_attr_alternative "enabled"
+     [(if_then_else (match_test
+                       "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
+                     (const_string "yes" )
+                     (const_string "no"))
+      (const_string "yes")])]
 )
 
 (define_insn "crypto_<crypto_pattern>"
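The crypto.md change above ties operand 1 to operand 0 (the "0,w" alternative) when the tuning enables FUSE_AES_AESMC, so the AESE result lands in the register that AESMC reads. The C-level pair that benefits is the standard AES round; a sketch assuming a crypto-capable target (e.g. -mfpu=crypto-neon-fp-armv8):

    #include <arm_neon.h>

    /* AESE (AddRoundKey + SubBytes + ShiftRows) feeding AESMC
       (MixColumns): cores with AES/AESMC fusion issue these as a pair
       when the destination register is reused.  */
    uint8x16_t
    aes_round (uint8x16_t state, uint8x16_t round_key)
    {
      return vaesmcq_u8 (vaeseq_u8 (state, round_key));
    }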
--- a/src/gcc/config/arm/driver-arm.c
+++ b/src/gcc/config/arm/driver-arm.c
@@ -46,6 +46,12 @@ static struct vendor_cpu arm_cpu_table[] = {
     {"0xc0d", "armv7ve", "cortex-a12"},
     {"0xc0e", "armv7ve", "cortex-a17"},
     {"0xc0f", "armv7ve", "cortex-a15"},
+    {"0xd01", "armv8-a+crc", "cortex-a32"},
+    {"0xd04", "armv8-a+crc", "cortex-a35"},
+    {"0xd03", "armv8-a+crc", "cortex-a53"},
+    {"0xd07", "armv8-a+crc", "cortex-a57"},
+    {"0xd08", "armv8-a+crc", "cortex-a72"},
+    {"0xd09", "armv8-a+crc", "cortex-a73"},
     {"0xc14", "armv7-r", "cortex-r4"},
     {"0xc15", "armv7-r", "cortex-r5"},
     {"0xc20", "armv6-m", "cortex-m0"},
--- a/src/gcc/config/arm/elf.h
+++ b/src/gcc/config/arm/elf.h
@@ -75,16 +75,7 @@
 
 /* We might need an ARM specific header to function declarations.  */
 #undef ASM_DECLARE_FUNCTION_NAME
-#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL)	\
-  do							\
-    {							\
-      ARM_DECLARE_FUNCTION_NAME (FILE, NAME, DECL);	\
-      ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "function");	\
-      ASM_DECLARE_RESULT (FILE, DECL_RESULT (DECL));	\
-      ASM_OUTPUT_LABEL(FILE, NAME);			\
-      ARM_OUTPUT_FN_UNWIND (FILE, TRUE);		\
-    }							\
-  while (0)
+#define ASM_DECLARE_FUNCTION_NAME arm_asm_declare_function_name
 
 /* We might need an ARM specific trailer for function declarations.  */
 #undef ASM_DECLARE_FUNCTION_SIZE
@@ -148,8 +139,9 @@
   while (0)
 
 /* Horrible hack: We want to prevent some libgcc routines being included
-   for some multilibs.  */
-#ifndef __ARM_ARCH_6M__
+   for some multilibs.  The condition should match the one in
+   libgcc/config/arm/lib1funcs.S.  */
+#if __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1
 #undef L_fixdfsi
 #undef L_fixunsdfsi
 #undef L_truncdfsf2
--- a/src/gcc/config/arm/exynos-m1.md
+++ b/src/gcc/config/arm/exynos-m1.md
@@ -358,7 +358,7 @@
   (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
                    alu_sreg, alus_sreg, logic_reg, logics_reg,\
                    adc_imm, adcs_imm, adc_reg, adcs_reg,\
-                   adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+                   adr, bfm, bfx, clz, rbit, rev, csel, alu_dsp_reg,\
                    shift_imm, shift_reg, rotate_imm, extend,\
                    mov_imm, mov_reg,\
                    mvn_imm, mvn_reg,\
@@ -372,7 +372,7 @@
   (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
                    alu_sreg, alus_sreg, logic_reg, logics_reg,\
                    adc_imm, adcs_imm, adc_reg, adcs_reg,\
-                   adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+                   adr, bfm, bfx, clz, rbit, rev, alu_dsp_reg,\
                    shift_imm, shift_reg, rotate_imm, extend,\
                    mov_imm, mov_reg,\
                    mvn_imm, mvn_reg,\
--- a/src/gcc/config/arm/iterators.md
+++ b/src/gcc/config/arm/iterators.md
@@ -46,7 +46,7 @@ (define_mode_iterator SIDI [SI DI])
 
 ;; A list of modes which the VFP unit can handle
-(define_mode_iterator SDF [(SF "TARGET_VFP") (DF "TARGET_VFP_DOUBLE")])
+(define_mode_iterator SDF [(SF "") (DF "TARGET_VFP_DOUBLE")])
 
 ;; Integer element sizes implemented by IWMMXT.
 (define_mode_iterator VMMX [V2SI V4HI V8QI])
@@ -119,6 +119,10 @@
 ;; All supported vector modes (except those with 64-bit integer elements).
 (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
 
+;; All supported vector modes including 16-bit float modes.
+(define_mode_iterator VDQWH [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF
+                             V8HF V4HF])
+
 ;; Supported integer vector modes (not 64 bit elements).
 (define_mode_iterator VDQIW [V8QI V16QI V4HI V8HI V2SI V4SI])
 
@@ -141,6 +145,9 @@
 ;; Vector modes for int->float conversions.
 (define_mode_iterator VCVTI [V2SI V4SI])
 
+;; Vector modes for int->half conversions.
+(define_mode_iterator VCVTHI [V4HI V8HI])
+
 ;; Vector modes for doubleword multiply-accumulate, etc. insns.
 (define_mode_iterator VMD [V4HI V2SI V2SF])
 
@@ -174,6 +181,9 @@
 ;; Modes with 8-bit, 16-bit and 32-bit elements.
 (define_mode_iterator VU [V16QI V8HI V4SI])
 
+;; Vector modes for 16-bit floating-point support.
+(define_mode_iterator VH [V8HF V4HF])
+
 ;; Iterators used for fixed-point support.
 (define_mode_iterator FIXED [QQ HQ SQ UQQ UHQ USQ HA SA UHA USA])
 
@@ -192,14 +202,17 @@
 ;; Code iterators
 ;;----------------------------------------------------------------------------
 
-;; A list of condition codes used in compare instructions where 
-;; the carry flag from the addition is used instead of doing the 
+;; A list of condition codes used in compare instructions where
+;; the carry flag from the addition is used instead of doing the
 ;; compare a second time.
 (define_code_iterator LTUGEU [ltu geu])
 
 ;; The signed gt, ge comparisons
 (define_code_iterator GTGE [gt ge])
 
+;; The signed gt, ge, lt, le comparisons
+(define_code_iterator GLTE [gt ge lt le])
+
 ;; The unsigned gt, ge comparisons
 (define_code_iterator GTUGEU [gtu geu])
 
@@ -228,6 +241,12 @@
 ;; Binary operators whose second operand can be shifted.
 (define_code_iterator SHIFTABLE_OPS [plus minus ior xor and])
 
+;; Operations on the sign of a number.
+(define_code_iterator ABSNEG [abs neg])
+
+;; Conversions.
+(define_code_iterator FCVT [unsigned_float float])
+
 ;; plus and minus are the only SHIFTABLE_OPS for which Thumb2 allows
 ;; a stack pointer operand.  The minus operation is a candidate for an rsub
 ;; and hence only plus is supported.
@@ -251,10 +270,14 @@
 (define_int_iterator VRINT [UNSPEC_VRINTZ UNSPEC_VRINTP UNSPEC_VRINTM
                             UNSPEC_VRINTR UNSPEC_VRINTX UNSPEC_VRINTA])
 
-(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE UNSPEC_VCLT UNSPEC_VCLE])
+(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE
+                                UNSPEC_VCLT UNSPEC_VCLE])
 
 (define_int_iterator NEON_VACMP [UNSPEC_VCAGE UNSPEC_VCAGT])
 
+(define_int_iterator NEON_VAGLTE [UNSPEC_VCAGE UNSPEC_VCAGT
+                                  UNSPEC_VCALE UNSPEC_VCALT])
+
 (define_int_iterator VCVT [UNSPEC_VRINTP UNSPEC_VRINTM UNSPEC_VRINTA])
 
 (define_int_iterator NEON_VRINT [UNSPEC_NVRINTP UNSPEC_NVRINTZ UNSPEC_NVRINTM
@@ -323,6 +346,22 @@
 
 (define_int_iterator VCVT_US_N [UNSPEC_VCVT_S_N UNSPEC_VCVT_U_N])
 
+(define_int_iterator VCVT_HF_US_N [UNSPEC_VCVT_HF_S_N UNSPEC_VCVT_HF_U_N])
+
+(define_int_iterator VCVT_SI_US_N [UNSPEC_VCVT_SI_S_N UNSPEC_VCVT_SI_U_N])
+
+(define_int_iterator VCVT_HF_US [UNSPEC_VCVTA_S UNSPEC_VCVTA_U
+                                 UNSPEC_VCVTM_S UNSPEC_VCVTM_U
+                                 UNSPEC_VCVTN_S UNSPEC_VCVTN_U
+                                 UNSPEC_VCVTP_S UNSPEC_VCVTP_U])
+
+(define_int_iterator VCVTH_US [UNSPEC_VCVTH_S UNSPEC_VCVTH_U])
+
+;; Operators for FP16 instructions.
+(define_int_iterator FP16_RND [UNSPEC_VRND UNSPEC_VRNDA
+                               UNSPEC_VRNDM UNSPEC_VRNDN
+                               UNSPEC_VRNDP UNSPEC_VRNDX])
+
 (define_int_iterator VQMOVN [UNSPEC_VQMOVN_S UNSPEC_VQMOVN_U])
 
 (define_int_iterator VMOVL [UNSPEC_VMOVL_S UNSPEC_VMOVL_U])
@@ -366,6 +405,8 @@
 
 (define_int_iterator VQRDMLH_AS [UNSPEC_VQRDMLAH UNSPEC_VQRDMLSH])
 
+(define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -384,6 +425,10 @@
 (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
                            (V4SI "v4sf") (V4SF "v4si")])
 
+;; (Opposite) mode to convert to/from for vector-half mode conversions.
+(define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
+                            (V8HI "V8HF") (V8HF "V8HI")])
+
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
                           (V4HI "HI") (V8HI "HI")
@@ -427,12 +472,13 @@
 
 ;; Register width from element mode
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
-                         (V4HI "P") (V8HI "q")
-                         (V4HF "P") (V8HF "q")
-                         (V2SI "P") (V4SI "q")
-                         (V2SF "P") (V4SF "q")
-                         (DI "P") (V2DI "q")
-                         (SF "") (DF "P")])
+                         (V4HI "P") (V8HI "q")
+                         (V4HF "P") (V8HF "q")
+                         (V2SI "P") (V4SI "q")
+                         (V2SF "P") (V4SF "q")
+                         (DI "P") (V2DI "q")
+                         (SF "") (DF "P")
+                         (HF "")])
 
 ;; Wider modes with the same number of elements.
 (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
@@ -448,7 +494,7 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
                           (V8HF "V4HF") (V4SI "V2SI")
                           (V4SF "V2SF") (V2DF "DF")
-                          (V2DI "DI")])
+                          (V2DI "DI") (V4HF "HF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -475,9 +521,10 @@
 ;; Used for neon_vdup_lane, where the second operand is double-sized
 ;; even when the first one is quad.
 (define_mode_attr V_double_vector_mode [(V16QI "V8QI") (V8HI "V4HI")
-                                        (V4SI "V2SI") (V4SF "V2SF")
-                                        (V8QI "V8QI") (V4HI "V4HI")
-                                        (V2SI "V2SI") (V2SF "V2SF")])
+                                        (V4SI "V2SI") (V4SF "V2SF")
+                                        (V8QI "V8QI") (V4HI "V4HI")
+                                        (V2SI "V2SI") (V2SF "V2SF")
+                                        (V8HF "V4HF") (V4HF "V4HF")])
 
 ;; Mode of result of comparison operations (and bit-select operand 1).
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
@@ -496,18 +543,22 @@
 
 ;; Get element type from double-width mode, for operations where we
 ;; don't care about signedness.
 (define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8")
-                             (V4HI "i16") (V8HI "i16")
-                             (V2SI "i32") (V4SI "i32")
-                             (DI "i64") (V2DI "i64")
-                             (V2SF "f32") (V4SF "f32")
-                             (SF "f32") (DF "f64")])
+                             (V4HI "i16") (V8HI "i16")
+                             (V2SI "i32") (V4SI "i32")
+                             (DI "i64") (V2DI "i64")
+                             (V2SF "f32") (V4SF "f32")
+                             (SF "f32") (DF "f64")
+                             (HF "f16") (V4HF "f16")
+                             (V8HF "f16")])
 
 ;; Same, but for operations which work on signed values.
 (define_mode_attr V_s_elem [(V8QI "s8") (V16QI "s8")
-                            (V4HI "s16") (V8HI "s16")
-                            (V2SI "s32") (V4SI "s32")
-                            (DI "s64") (V2DI "s64")
-                            (V2SF "f32") (V4SF "f32")])
+                            (V4HI "s16") (V8HI "s16")
+                            (V2SI "s32") (V4SI "s32")
+                            (DI "s64") (V2DI "s64")
+                            (V2SF "f32") (V4SF "f32")
+                            (HF "f16") (V4HF "f16")
+                            (V8HF "f16")])
 
 ;; Same, but for operations which work on unsigned values.
 (define_mode_attr V_u_elem [(V8QI "u8") (V16QI "u8")
@@ -524,17 +575,22 @@
                             (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
-                             (V4HI "16") (V8HI "16")
-                             (V2SI "32") (V4SI "32")
-                             (DI "64") (V2DI "64")
+                             (V4HI "16") (V8HI "16")
+                             (V2SI "32") (V4SI "32")
+                             (DI "64") (V2DI "64")
                              (V4HF "16") (V8HF "16")
-                             (V2SF "32") (V4SF "32")])
+                             (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
-                             (V4HI "h") (V8HI "h")
-                             (V2SI "s") (V4SI "s")
-                             (DI "d") (V2DI "d")
-                             (V2SF "s") (V4SF "s")])
+                             (V4HI "h") (V8HI "h")
+                             (V2SI "s") (V4SI "s")
+                             (DI "d") (V2DI "d")
+                             (V2SF "s") (V4SF "s")
+                             (V2SF "s") (V4SF "s")])
+
+(define_mode_attr VH_elem_ch [(V4HI "s") (V8HI "s")
+                              (V4HF "s") (V8HF "s")
+                              (HF "s")])
 
 ;; Element sizes for duplicating ARM registers to all elements of a vector.
 (define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")])
@@ -570,29 +626,30 @@
 
 ;; This mode attribute is used to obtain the correct register constraints.
 (define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t")
-                                         (V8HI "x") (V4SI "t") (V4SF "t")])
+                                         (V8HI "x") (V4SI "t") (V4SF "t")
+                                         (V8HF "x") (V4HF "x")])
 
 ;; Predicates used for setting type for neon instructions
 (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
-                                 (V4HI "false") (V8HI "false")
-                                 (V2SI "false") (V4SI "false")
-                                 (V4HF "true") (V8HF "true")
-                                 (V2SF "true") (V4SF "true")
-                                 (DI "false") (V2DI "false")])
+                                 (V4HI "false") (V8HI "false")
+                                 (V2SI "false") (V4SI "false")
+                                 (V4HF "true") (V8HF "true")
+                                 (V2SF "true") (V4SF "true")
+                                 (DI "false") (V2DI "false")])
 
 (define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true")
-                                   (V4HI "true") (V8HI "true")
-                                   (V2SI "false") (V4SI "false")
-                                   (V2SF "false") (V4SF "false")
-                                   (DI "false") (V2DI "false")])
-
+                                   (V4HI "true") (V8HI "true")
+                                   (V2SI "false") (V4SI "false")
+                                   (V2SF "false") (V4SF "false")
+                                   (DI "false") (V2DI "false")])
 
 (define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false")
-                            (V4HI "true") (V8HI "false")
-                            (V2SI "true") (V4SI "false")
-                            (V2SF "true") (V4SF "false")
-                            (DI "true") (V2DI "false")])
+                            (V4HI "true") (V8HI "false")
+                            (V2SI "true") (V4SI "false")
+                            (V2SF "true") (V4SF "false")
+                            (DI "true") (V2DI "false")
+                            (V4HF "true") (V8HF "false")])
 
 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
                                  (V4HF "4") (V8HF "8")
@@ -637,12 +694,14 @@
 
 ;; Mode attribute used to build the "type" attribute.
 (define_mode_attr q [(V8QI "") (V16QI "_q")
-                     (V4HI "") (V8HI "_q")
-                     (V2SI "") (V4SI "_q")
+                     (V4HI "") (V8HI "_q")
+                     (V2SI "") (V4SI "_q")
+                     (V4HF "") (V8HF "_q")
+                     (V2SF "") (V4SF "_q")
                      (V4HF "") (V8HF "_q")
-                     (V2SF "") (V4SF "_q")
-                     (DI "") (V2DI "_q")
-                     (DF "") (V2DF "_q")])
+                     (DI "") (V2DI "_q")
+                     (DF "") (V2DF "_q")
+                     (HF "")])
 
 (define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")])
 
@@ -679,6 +738,16 @@
 (define_code_attr shift [(ashiftrt "ashr") (lshiftrt "lshr")])
 (define_code_attr shifttype [(ashiftrt "signed") (lshiftrt "unsigned")])
 
+;; String representations of operations on the sign of a number.
+(define_code_attr absneg_str [(abs "abs") (neg "neg")])
+
+;; Conversions.
+(define_code_attr FCVTI32typename [(unsigned_float "u32") (float "s32")])
+
+(define_code_attr float_sup [(unsigned_float "u") (float "s")])
+
+(define_code_attr float_SUP [(unsigned_float "U") (float "S")])
+
 ;;----------------------------------------------------------------------------
 ;; Int attributes
 ;;----------------------------------------------------------------------------
@@ -710,7 +779,13 @@
   (UNSPEC_VPMAX "s") (UNSPEC_VPMAX_U "u")
   (UNSPEC_VPMIN "s") (UNSPEC_VPMIN_U "u")
   (UNSPEC_VCVT_S "s") (UNSPEC_VCVT_U "u")
+  (UNSPEC_VCVTA_S "s") (UNSPEC_VCVTA_U "u")
+  (UNSPEC_VCVTM_S "s") (UNSPEC_VCVTM_U "u")
+  (UNSPEC_VCVTN_S "s") (UNSPEC_VCVTN_U "u")
+  (UNSPEC_VCVTP_S "s") (UNSPEC_VCVTP_U "u")
   (UNSPEC_VCVT_S_N "s") (UNSPEC_VCVT_U_N "u")
+  (UNSPEC_VCVT_HF_S_N "s") (UNSPEC_VCVT_HF_U_N "u")
+  (UNSPEC_VCVT_SI_S_N "s") (UNSPEC_VCVT_SI_U_N "u")
   (UNSPEC_VQMOVN_S "s") (UNSPEC_VQMOVN_U "u")
   (UNSPEC_VMOVL_S "s") (UNSPEC_VMOVL_U "u")
   (UNSPEC_VSHL_S "s") (UNSPEC_VSHL_U "u")
@@ -725,13 +800,30 @@
   (UNSPEC_VSHLL_S_N "s") (UNSPEC_VSHLL_U_N "u")
   (UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u")
   (UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u")
-
+  (UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u")
 ])
 
+(define_int_attr vcvth_op
+ [(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a")
+  (UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m")
+  (UNSPEC_VCVTN_S "n") (UNSPEC_VCVTN_U "n")
+  (UNSPEC_VCVTP_S "p") (UNSPEC_VCVTP_U "p")])
+
+(define_int_attr fp16_rnd_str
+  [(UNSPEC_VRND "rnd") (UNSPEC_VRNDA "rnda")
+   (UNSPEC_VRNDM "rndm") (UNSPEC_VRNDN "rndn")
+   (UNSPEC_VRNDP "rndp") (UNSPEC_VRNDX "rndx")])
+
+(define_int_attr fp16_rnd_insn
+  [(UNSPEC_VRND "vrintz") (UNSPEC_VRNDA "vrinta")
+   (UNSPEC_VRNDM "vrintm") (UNSPEC_VRNDN "vrintn")
+   (UNSPEC_VRNDP "vrintp") (UNSPEC_VRNDX "vrintx")])
+
 (define_int_attr cmp_op_unsp [(UNSPEC_VCEQ "eq") (UNSPEC_VCGT "gt")
-                              (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le")
-                              (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge")
-                              (UNSPEC_VCAGT "gt")])
+                              (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le")
+                              (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge")
+                              (UNSPEC_VCAGT "gt") (UNSPEC_VCALE "le")
+                              (UNSPEC_VCALT "lt")])
 
 (define_int_attr r [
   (UNSPEC_VRHADD_S "r") (UNSPEC_VRHADD_U "r")
@@ -847,3 +939,7 @@
 
 ;; Attributes for VQRDMLAH/VQRDMLSH
 (define_int_attr neon_rdma_as [(UNSPEC_VQRDMLAH "a") (UNSPEC_VQRDMLSH "s")])
+
+;; Attributes for VFMA_LANE/ VFMS_LANE
+(define_int_attr neon_vfm_lane_as
+ [(UNSPEC_VFMA_LANE "a") (UNSPEC_VFMS_LANE "s")])
--- a/src/gcc/config/arm/neon-testgen.ml
+++ b/src//dev/null
@@ -1,324 +0,0 @@
-(* Auto-generate ARM Neon intrinsics tests.
-   Copyright (C) 2006-2016 Free Software Foundation, Inc.
-   Contributed by CodeSourcery.
-
-   This file is part of GCC.
- - GCC is free software; you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3, or (at your option) any later - version. - - GCC is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - . - - This is an O'Caml program. The O'Caml compiler is available from: - - http://caml.inria.fr/ - - Or from your favourite OS's friendly packaging system. Tested with version - 3.09.2, though other versions will probably work too. - - Compile with: - ocamlc -c neon.ml - ocamlc -o neon-testgen neon.cmo neon-testgen.ml - - Run with: - cd /path/to/gcc/testsuite/gcc.target/arm/neon - /path/to/neon-testgen -*) - -open Neon - -type c_type_flags = Pointer | Const - -(* Open a test source file. *) -let open_test_file dir name = - try - open_out (dir ^ "/" ^ name ^ ".c") - with Sys_error str -> - failwith ("Could not create test source file " ^ name ^ ": " ^ str) - -(* Emit prologue code to a test source file. *) -let emit_prologue chan test_name effective_target compile_test_optim = - Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name; - Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n"; - Printf.fprintf chan "/* { dg-do assemble } */\n"; - Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n" - effective_target; - Printf.fprintf chan "/* { dg-options \"-save-temps %s\" } */\n" compile_test_optim; - Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target; - Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n" - -(* Emit declarations of variables that are going to be passed - to an intrinsic, together with one to take a returned value if needed. *) -let emit_variables chan c_types features spaces = - let emit () = - ignore ( - List.fold_left (fun arg_number -> fun (flags, ty) -> - let pointer_bit = - if List.mem Pointer flags then "*" else "" - in - (* Const arguments to builtins are directly - written in as constants. *) - if not (List.mem Const flags) then - Printf.fprintf chan "%s%s %sarg%d_%s;\n" - spaces ty pointer_bit arg_number ty; - arg_number + 1) - 0 (List.tl c_types)) - in - match c_types with - (_, return_ty) :: tys -> - if return_ty <> "void" then begin - (* The intrinsic returns a value. We need to do explicit register - allocation for vget_low tests or they fail because of copy - elimination. *) - ((if List.mem Fixed_vector_reg features then - Printf.fprintf chan "%sregister %s out_%s asm (\"d18\");\n" - spaces return_ty return_ty - else if List.mem Fixed_core_reg features then - Printf.fprintf chan "%sregister %s out_%s asm (\"r0\");\n" - spaces return_ty return_ty - else - Printf.fprintf chan "%s%s out_%s;\n" spaces return_ty return_ty); - emit ()) - end else - (* The intrinsic does not return a value. *) - emit () - | _ -> assert false - -(* Emit code to call an intrinsic. 
*) -let emit_call chan const_valuator c_types name elt_ty = - (if snd (List.hd c_types) <> "void" then - Printf.fprintf chan " out_%s = " (snd (List.hd c_types)) - else - Printf.fprintf chan " "); - Printf.fprintf chan "%s_%s (" (intrinsic_name name) (string_of_elt elt_ty); - let print_arg chan arg_number (flags, ty) = - (* If the argument is of const type, then directly write in the - constant now. *) - if List.mem Const flags then - match const_valuator with - None -> - if List.mem Pointer flags then - Printf.fprintf chan "0" - else - Printf.fprintf chan "1" - | Some f -> Printf.fprintf chan "%s" (string_of_int (f arg_number)) - else - Printf.fprintf chan "arg%d_%s" arg_number ty - in - let rec print_args arg_number tys = - match tys with - [] -> () - | [ty] -> print_arg chan arg_number ty - | ty::tys -> - print_arg chan arg_number ty; - Printf.fprintf chan ", "; - print_args (arg_number + 1) tys - in - print_args 0 (List.tl c_types); - Printf.fprintf chan ");\n" - -(* Emit epilogue code to a test source file. *) -let emit_epilogue chan features regexps = - let no_op = List.exists (fun feature -> feature = No_op) features in - Printf.fprintf chan "}\n\n"; - if not no_op then - List.iter (fun regexp -> - Printf.fprintf chan - "/* { dg-final { scan-assembler \"%s\" } } */\n" regexp) - regexps - else - () - - -(* Check a list of C types to determine which ones are pointers and which - ones are const. *) -let check_types tys = - let tys' = - List.map (fun ty -> - let len = String.length ty in - if len > 2 && String.get ty (len - 2) = ' ' - && String.get ty (len - 1) = '*' - then ([Pointer], String.sub ty 0 (len - 2)) - else ([], ty)) tys - in - List.map (fun (flags, ty) -> - if String.length ty > 6 && String.sub ty 0 6 = "const " - then (Const :: flags, String.sub ty 6 ((String.length ty) - 6)) - else (flags, ty)) tys' - -(* Work out what the effective target should be. *) -let effective_target features = - try - match List.find (fun feature -> - match feature with Requires_feature _ -> true - | Requires_arch _ -> true - | Requires_FP_bit 1 -> true - | _ -> false) - features with - Requires_feature "FMA" -> "arm_neonv2" - | Requires_feature "CRYPTO" -> "arm_crypto" - | Requires_arch 8 -> "arm_v8_neon" - | Requires_FP_bit 1 -> "arm_neon_fp16" - | _ -> assert false - with Not_found -> "arm_neon" - -(* Work out what the testcase optimization level should be, default to -O0. *) -let compile_test_optim features = - try - match List.find (fun feature -> - match feature with Compiler_optim _ -> true - | _ -> false) - features with - Compiler_optim opt -> opt - | _ -> assert false - with Not_found -> "-O0" - -(* Given an intrinsic shape, produce a regexp that will match - the right-hand sides of instructions generated by an intrinsic of - that shape. 
*) -let rec analyze_shape shape = - let rec n_things n thing = - match n with - 0 -> [] - | n -> thing :: (n_things (n - 1) thing) - in - let rec analyze_shape_elt elt = - match elt with - Dreg -> "\\[dD\\]\\[0-9\\]+" - | Qreg -> "\\[qQ\\]\\[0-9\\]+" - | Corereg -> "\\[rR\\]\\[0-9\\]+" - | Immed -> "#\\[0-9\\]+" - | VecArray (1, elt) -> - let elt_regexp = analyze_shape_elt elt in - "((\\\\\\{" ^ elt_regexp ^ "\\\\\\})|(" ^ elt_regexp ^ "))" - | VecArray (n, elt) -> - let elt_regexp = analyze_shape_elt elt in - let alt1 = elt_regexp ^ "-" ^ elt_regexp in - let alt2 = commas (fun x -> x) (n_things n elt_regexp) "" in - "\\\\\\{((" ^ alt1 ^ ")|(" ^ alt2 ^ "))\\\\\\}" - | (PtrTo elt | CstPtrTo elt) -> - "\\\\\\[" ^ (analyze_shape_elt elt) ^ "\\(:\\[0-9\\]+\\)?\\\\\\]" - | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]" - | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]" - | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]" - | Alternatives (elts) -> "(" ^ (String.concat "|" (List.map analyze_shape_elt elts)) ^ ")" - in - match shape with - All (n, elt) -> commas analyze_shape_elt (n_things n elt) "" - | Long -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Dreg) ^ - ", " ^ (analyze_shape_elt Dreg) - | Long_noreg elt -> (analyze_shape_elt elt) ^ ", " ^ (analyze_shape_elt elt) - | Wide -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Qreg) ^ - ", " ^ (analyze_shape_elt Dreg) - | Wide_noreg elt -> analyze_shape (Long_noreg elt) - | Narrow -> (analyze_shape_elt Dreg) ^ ", " ^ (analyze_shape_elt Qreg) ^ - ", " ^ (analyze_shape_elt Qreg) - | Use_operands elts -> commas analyze_shape_elt (Array.to_list elts) "" - | By_scalar Dreg -> - analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |]) - | By_scalar Qreg -> - analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |]) - | By_scalar _ -> assert false - | Wide_lane -> - analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) - | Wide_scalar -> - analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) - | Pair_result elt -> - let elt_regexp = analyze_shape_elt elt in - elt_regexp ^ ", " ^ elt_regexp - | Unary_scalar _ -> "FIXME Unary_scalar" - | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |]) - | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |]) - | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |]) - -(* Generate tests for one intrinsic. *) -let test_intrinsic dir opcode features shape name munge elt_ty = - (* Open the test source file. *) - let test_name = name ^ (string_of_elt elt_ty) in - let chan = open_test_file dir test_name in - (* Work out what argument and return types the intrinsic has. *) - let c_arity, new_elt_ty = munge shape elt_ty in - let c_types = check_types (strings_of_arity c_arity) in - (* Extract any constant valuator (a function specifying what constant - values are to be written into the intrinsic call) from the features - list. *) - let const_valuator = - try - match (List.find (fun feature -> match feature with - Const_valuator _ -> true - | _ -> false) features) with - Const_valuator f -> Some f - | _ -> assert false - with Not_found -> None - in - (* Work out what instruction name(s) to expect. *) - let insns = get_insn_names features name in - let no_suffix = (new_elt_ty = NoElts) in - let insns = - if no_suffix then insns - else List.map (fun insn -> - let suffix = string_of_elt_dots new_elt_ty in - insn ^ "\\." 
^ suffix) insns - in - (* Construct a regexp to match against the expected instruction name(s). *) - let insn_regexp = - match insns with - [] -> assert false - | [insn] -> insn - | _ -> - let rec calc_regexp insns cur_regexp = - match insns with - [] -> cur_regexp - | [insn] -> cur_regexp ^ "(" ^ insn ^ "))" - | insn::insns -> calc_regexp insns (cur_regexp ^ "(" ^ insn ^ ")|") - in calc_regexp insns "(" - in - (* Construct regexps to match against the instructions that this - intrinsic expands to. Watch out for any writeback character and - comments after the instruction. *) - let regexps = List.map (fun regexp -> insn_regexp ^ "\\[ \t\\]+" ^ regexp ^ - "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n") - (analyze_all_shapes features shape analyze_shape) - in - let effective_target = effective_target features in - let compile_test_optim = compile_test_optim features - in - (* Emit file and function prologues. *) - emit_prologue chan test_name effective_target compile_test_optim; - - if (compare compile_test_optim "-O0") <> 0 then - (* Emit variable declarations. *) - emit_variables chan c_types features ""; - - Printf.fprintf chan "void test_%s (void)\n{\n" test_name; - - if compare compile_test_optim "-O0" = 0 then - (* Emit variable declarations. *) - emit_variables chan c_types features " "; - - Printf.fprintf chan "\n"; - (* Emit the call to the intrinsic. *) - emit_call chan const_valuator c_types name elt_ty; - (* Emit the function epilogue and the DejaGNU scan-assembler directives. *) - emit_epilogue chan features regexps; - (* Close the test file. *) - close_out chan - -(* Generate tests for one element of the "ops" table. *) -let test_intrinsic_group dir (opcode, features, shape, name, munge, types) = - List.iter (test_intrinsic dir opcode features shape name munge) types - -(* Program entry point. *) -let _ = - let directory = if Array.length Sys.argv <> 1 then Sys.argv.(1) else "." 
in - List.iter (test_intrinsic_group directory) (reinterp @ reinterpq @ ops) - --- a/src/gcc/config/arm/neon.md +++ b/src/gcc/config/arm/neon.md @@ -406,7 +406,7 @@ (match_operand:SI 2 "immediate_operand" "")] "TARGET_NEON" { - HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); + HOST_WIDE_INT elem = HOST_WIDE_INT_1 << INTVAL (operands[2]); emit_insn (gen_vec_set_internal (operands[0], operands[1], GEN_INT (elem), operands[0])); DONE; @@ -505,6 +505,20 @@ (const_string "neon_add")))] ) +(define_insn "add3_fp16" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (plus:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vadd.\t%0, %1, %2" + [(set (attr "type") + (if_then_else (match_test "") + (const_string "neon_fp_addsub_s") + (const_string "neon_add")))] +) + (define_insn "adddi3_neon" [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?w,?&r,?&r,?&r") (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,w,r,0,r") @@ -543,6 +557,17 @@ (const_string "neon_sub")))] ) +(define_insn "sub3_fp16" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (minus:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vsub.\t%0, %1, %2" + [(set_attr "type" "neon_sub")] +) + (define_insn "subdi3_neon" [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r,?w") (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0,w") @@ -591,6 +616,16 @@ (const_string "neon_mla_")))] ) +(define_insn "mul3add_neon" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w") + (match_operand:VH 3 "s_register_operand" "w")) + (match_operand:VH 1 "s_register_operand" "0")))] + "TARGET_NEON_FP16INST && (! || flag_unsafe_math_optimizations)" + "vmla.f16\t%0, %2, %3" + [(set_attr "type" "neon_fp_mla_s")] +) + (define_insn "mul3negadd_neon" [(set (match_operand:VDQW 0 "s_register_operand" "=w") (minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0") @@ -629,6 +664,19 @@ [(set_attr "type" "neon_fp_mla_s")] ) +;; There is limited support for unsafe-math optimizations using the NEON FP16 +;; arithmetic instructions, so only the intrinsic is currently supported. 
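Together with add3_fp16 and sub3_fp16 above, the VH variant of mul3add_neon gives vmla.f16, but only under -funsafe-math-optimizations, as its condition spells out; the comment just above introduces the FMA pattern that follows, which is reachable only through the intrinsic. A short sketch of how the three surface in source code, assuming the ACLE FP16 intrinsics (available when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined; names are ACLE, not from this diff):

  #include <arm_neon.h>

  float16x4_t
  madd (float16x4_t acc, float16x4_t a, float16x4_t b)
  {
    acc = vfma_f16 (acc, a, b);  /* acc += a * b, fused: fma4_intrinsic.  */
    acc = vadd_f16 (acc, a);     /* vadd.f16: add3_fp16.  */
    return vsub_f16 (acc, b);    /* vsub.f16: sub3_fp16.  */
  }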
+(define_insn "fma4_intrinsic" + [(set (match_operand:VH 0 "register_operand" "=w") + (fma:VH + (match_operand:VH 1 "register_operand" "w") + (match_operand:VH 2 "register_operand" "w") + (match_operand:VH 3 "register_operand" "0")))] + "TARGET_NEON_FP16INST" + "vfma.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] +) + (define_insn "*fmsub4" [(set (match_operand:VCVTF 0 "register_operand" "=w") (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) @@ -640,13 +688,25 @@ ) (define_insn "fmsub4_intrinsic" - [(set (match_operand:VCVTF 0 "register_operand" "=w") - (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) - (match_operand:VCVTF 2 "register_operand" "w") - (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA" - "vfms%?.\\t%0, %1, %2" - [(set_attr "type" "neon_fp_mla_s")] + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (fma:VCVTF + (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) + (match_operand:VCVTF 2 "register_operand" "w") + (match_operand:VCVTF 3 "register_operand" "0")))] + "TARGET_NEON && TARGET_FMA" + "vfms%?.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] +) + +(define_insn "fmsub4_intrinsic" + [(set (match_operand:VH 0 "register_operand" "=w") + (fma:VH + (neg:VH (match_operand:VH 1 "register_operand" "w")) + (match_operand:VH 2 "register_operand" "w") + (match_operand:VH 3 "register_operand" "0")))] + "TARGET_NEON_FP16INST" + "vfms.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] ) (define_insn "neon_vrint" @@ -860,6 +920,44 @@ "" ) +(define_insn "2" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (ABSNEG:VH (match_operand:VH 1 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "v.\t%0, %1" + [(set_attr "type" "neon_abs")] +) + +(define_expand "neon_v" + [(set + (match_operand:VH 0 "s_register_operand") + (ABSNEG:VH (match_operand:VH 1 "s_register_operand")))] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_v" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w")] + FP16_RND))] + "TARGET_NEON_FP16INST" + ".\t%0, %1" + [(set_attr "type" "neon_fp_round_s")] +) + +(define_insn "neon_vrsqrte" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w")] + UNSPEC_VRSQRTE))] + "TARGET_NEON_FP16INST" + "vrsqrte.f16\t%0, %1" + [(set_attr "type" "neon_fp_rsqrte_s")] +) + (define_insn "*umin3_neon" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") @@ -1208,16 +1306,133 @@ ;; Widening operations +(define_expand "widen_ssum3" + [(set (match_operand: 0 "s_register_operand" "") + (plus: + (sign_extend: + (match_operand:VQI 1 "s_register_operand" "")) + (match_operand: 2 "s_register_operand" "")))] + "TARGET_NEON" + { + machine_mode mode = GET_MODE (operands[1]); + rtx p1, p2; + + p1 = arm_simd_vect_par_cnst_half (mode, false); + p2 = arm_simd_vect_par_cnst_half (mode, true); + + if (operands[0] != operands[2]) + emit_move_insn (operands[0], operands[2]); + + emit_insn (gen_vec_sel_widen_ssum_lo3 (operands[0], + operands[1], + p1, + operands[0])); + emit_insn (gen_vec_sel_widen_ssum_hi3 (operands[0], + operands[1], + p2, + operands[0])); + DONE; + } +) + +(define_insn "vec_sel_widen_ssum_lo3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: + (sign_extend: + (vec_select:VW + (match_operand:VQI 1 "s_register_operand" "%w") + 
(match_operand:VQI 2 "vect_par_constant_low" ""))) + (match_operand: 3 "s_register_operand" "0")))] + "TARGET_NEON" +{ + return BYTES_BIG_ENDIAN ? "vaddw.\t%q0, %q3, %f1" : + "vaddw.\t%q0, %q3, %e1"; +} + [(set_attr "type" "neon_add_widen")]) + +(define_insn "vec_sel_widen_ssum_hi3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: + (sign_extend: + (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w") + (match_operand:VQI 2 "vect_par_constant_high" ""))) + (match_operand: 3 "s_register_operand" "0")))] + "TARGET_NEON" +{ + return BYTES_BIG_ENDIAN ? "vaddw.\t%q0, %q3, %e1" : + "vaddw.\t%q0, %q3, %f1"; +} + [(set_attr "type" "neon_add_widen")]) + (define_insn "widen_ssum3" [(set (match_operand: 0 "s_register_operand" "=w") - (plus: (sign_extend: - (match_operand:VW 1 "s_register_operand" "%w")) - (match_operand: 2 "s_register_operand" "w")))] + (plus: + (sign_extend: + (match_operand:VW 1 "s_register_operand" "%w")) + (match_operand: 2 "s_register_operand" "w")))] "TARGET_NEON" "vaddw.\t%q0, %q2, %P1" [(set_attr "type" "neon_add_widen")] ) +(define_expand "widen_usum3" + [(set (match_operand: 0 "s_register_operand" "") + (plus: + (zero_extend: + (match_operand:VQI 1 "s_register_operand" "")) + (match_operand: 2 "s_register_operand" "")))] + "TARGET_NEON" + { + machine_mode mode = GET_MODE (operands[1]); + rtx p1, p2; + + p1 = arm_simd_vect_par_cnst_half (mode, false); + p2 = arm_simd_vect_par_cnst_half (mode, true); + + if (operands[0] != operands[2]) + emit_move_insn (operands[0], operands[2]); + + emit_insn (gen_vec_sel_widen_usum_lo3 (operands[0], + operands[1], + p1, + operands[0])); + emit_insn (gen_vec_sel_widen_usum_hi3 (operands[0], + operands[1], + p2, + operands[0])); + DONE; + } +) + +(define_insn "vec_sel_widen_usum_lo3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: + (zero_extend: + (vec_select:VW + (match_operand:VQI 1 "s_register_operand" "%w") + (match_operand:VQI 2 "vect_par_constant_low" ""))) + (match_operand: 3 "s_register_operand" "0")))] + "TARGET_NEON" +{ + return BYTES_BIG_ENDIAN ? "vaddw.\t%q0, %q3, %f1" : + "vaddw.\t%q0, %q3, %e1"; +} + [(set_attr "type" "neon_add_widen")]) + +(define_insn "vec_sel_widen_usum_hi3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: + (zero_extend: + (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w") + (match_operand:VQI 2 "vect_par_constant_high" ""))) + (match_operand: 3 "s_register_operand" "0")))] + "TARGET_NEON" +{ + return BYTES_BIG_ENDIAN ? 
"vaddw.\t%q0, %q3, %e1" : + "vaddw.\t%q0, %q3, %f1"; +} + [(set_attr "type" "neon_add_widen")]) + (define_insn "widen_usum3" [(set (match_operand: 0 "s_register_operand" "=w") (plus: (zero_extend: @@ -1488,6 +1703,17 @@ (const_string "neon_reduc_add")))] ) +(define_insn "neon_vpaddv4hf" + [(set + (match_operand:V4HF 0 "s_register_operand" "=w") + (unspec:V4HF [(match_operand:V4HF 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" "w")] + UNSPEC_VPADD))] + "TARGET_NEON_FP16INST" + "vpadd.f16\t%P0, %P1, %P2" + [(set_attr "type" "neon_reduc_add")] +) + (define_insn "neon_vpsmin" [(set (match_operand:VD 0 "s_register_operand" "=w") (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") @@ -1836,6 +2062,26 @@ DONE; }) +(define_expand "neon_vadd" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_add3_fp16 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vsub" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_sub3_fp16 (operands[0], operands[1], operands[2])); + DONE; +}) + ; Note that NEON operations don't support the full IEEE 754 standard: in ; particular, denormal values are flushed to zero. This means that GCC cannot ; use those instructions for autovectorization, etc. unless @@ -1927,6 +2173,17 @@ (const_string "neon_mul_")))] ) +(define_insn "neon_vmulf" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (mult:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vmul.f16\t%0, %1, %2" + [(set_attr "type" "neon_mul_")] +) + (define_expand "neon_vmla" [(match_operand:VDQW 0 "s_register_operand" "=w") (match_operand:VDQW 1 "s_register_operand" "0") @@ -1955,6 +2212,18 @@ DONE; }) +(define_expand "neon_vfma" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand") + (match_operand:VH 3 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_fma4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + (define_expand "neon_vfms" [(match_operand:VCVTF 0 "s_register_operand") (match_operand:VCVTF 1 "s_register_operand") @@ -1967,6 +2236,18 @@ DONE; }) +(define_expand "neon_vfms" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand") + (match_operand:VH 3 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_fmsub4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + ; Used for intrinsics when flag_unsafe_math_optimizations is false. (define_insn "neon_vmla_unspec" @@ -2267,6 +2548,72 @@ [(set_attr "type" "neon_fp_compare_s")] ) +(define_expand "neon_vc" + [(match_operand: 0 "s_register_operand") + (neg: + (COMPARISONS:VH + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "reg_or_zero_operand")))] + "TARGET_NEON_FP16INST" +{ + /* For FP comparisons use UNSPECS unless -funsafe-math-optimizations + are enabled. 
*/ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && !flag_unsafe_math_optimizations) + emit_insn + (gen_neon_vc_fp16insn_unspec + (operands[0], operands[1], operands[2])); + else + emit_insn + (gen_neon_vc_fp16insn + (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "neon_vc_fp16insn" + [(set (match_operand: 0 "s_register_operand" "=w,w") + (neg: + (COMPARISONS: + (match_operand:VH 1 "s_register_operand" "w,w") + (match_operand:VH 2 "reg_or_zero_operand" "w,Dz"))))] + "TARGET_NEON_FP16INST + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && !flag_unsafe_math_optimizations)" +{ + char pattern[100]; + sprintf (pattern, "vc.%s%%#\t%%0," + " %%1, %s", + GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + ? "f" : "", + which_alternative == 0 + ? "%2" : "#0"); + output_asm_insn (pattern, operands); + return ""; +} + [(set (attr "type") + (if_then_else (match_operand 2 "zero_operand") + (const_string "neon_compare_zero") + (const_string "neon_compare")))]) + +(define_insn "neon_vc_fp16insn_unspec" + [(set + (match_operand: 0 "s_register_operand" "=w,w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w,w") + (match_operand:VH 2 "reg_or_zero_operand" "w,Dz")] + NEON_VCMP))] + "TARGET_NEON_FP16INST" +{ + char pattern[100]; + sprintf (pattern, "vc.f%%#\t%%0," + " %%1, %s", + which_alternative == 0 + ? "%2" : "#0"); + output_asm_insn (pattern, operands); + return ""; +} + [(set_attr "type" "neon_fp_compare_s")]) + (define_insn "neon_vcu" [(set (match_operand: 0 "s_register_operand" "=w") (neg: @@ -2318,6 +2665,60 @@ [(set_attr "type" "neon_fp_compare_s")] ) +(define_expand "neon_vca" + [(set + (match_operand: 0 "s_register_operand") + (neg: + (GLTE: + (abs:VH (match_operand:VH 1 "s_register_operand")) + (abs:VH (match_operand:VH 2 "s_register_operand")))))] + "TARGET_NEON_FP16INST" +{ + if (flag_unsafe_math_optimizations) + emit_insn (gen_neon_vca_fp16insn + (operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vca_fp16insn_unspec + (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "neon_vca_fp16insn" + [(set + (match_operand: 0 "s_register_operand" "=w") + (neg: + (GLTE: + (abs:VH (match_operand:VH 1 "s_register_operand" "w")) + (abs:VH (match_operand:VH 2 "s_register_operand" "w")))))] + "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" + "vac.\t%0, %1, %2" + [(set_attr "type" "neon_fp_compare_s")] +) + +(define_insn "neon_vca_fp16insn_unspec" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + NEON_VAGLTE))] + "TARGET_NEON" + "vac.\t%0, %1, %2" + [(set_attr "type" "neon_fp_compare_s")] +) + +(define_expand "neon_vcz" + [(set + (match_operand: 0 "s_register_operand") + (COMPARISONS: + (match_operand:VH 1 "s_register_operand") + (const_int 0)))] + "TARGET_NEON_FP16INST" + { + emit_insn (gen_neon_vc (operands[0], operands[1], + CONST0_RTX (mode))); + DONE; +}) + (define_insn "neon_vtst" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") @@ -2338,6 +2739,16 @@ [(set_attr "type" "neon_abd")] ) +(define_insn "neon_vabd" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VABD_F))] + "TARGET_NEON_FP16INST" + "vabd.\t%0, %1, %2" + [(set_attr "type" "neon_abd")] +) + (define_insn "neon_vabdf" [(set (match_operand:VCVTF 0 
"s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -2400,6 +2811,51 @@ [(set_attr "type" "neon_fp_minmax_s")] ) +(define_insn "neon_vf" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + VMAXMINF))] + "TARGET_NEON_FP16INST" + "v.\t%0, %1, %2" + [(set_attr "type" "neon_fp_minmax_s")] +) + +(define_insn "neon_vpfv4hf" + [(set (match_operand:V4HF 0 "s_register_operand" "=w") + (unspec:V4HF + [(match_operand:V4HF 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" "w")] + VPMAXMINF))] + "TARGET_NEON_FP16INST" + "vp.f16\t%P0, %P1, %P2" + [(set_attr "type" "neon_reduc_minmax")] +) + +(define_insn "neon_" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + VMAXMINFNM))] + "TARGET_NEON_FP16INST" + ".\t%0, %1, %2" + [(set_attr "type" "neon_fp_minmax_s")] +) + +;; vnm intrinsics. +(define_insn "neon_" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:VCVTF 2 "s_register_operand" "w")] + VMAXMINFNM))] + "TARGET_NEON && TARGET_FPU_ARMV8" + ".\t%0, %1, %2" + [(set_attr "type" "neon_fp_minmax_s")] +) + ;; Vector forms for the IEEE-754 fmax()/fmin() functions (define_insn "3" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") @@ -2471,6 +2927,17 @@ [(set_attr "type" "neon_fp_recps_s")] ) +(define_insn "neon_vrecps" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VRECPS))] + "TARGET_NEON_FP16INST" + "vrecps.\t%0, %1, %2" + [(set_attr "type" "neon_fp_recps_s")] +) + (define_insn "neon_vrsqrts" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -2481,6 +2948,17 @@ [(set_attr "type" "neon_fp_rsqrts_s")] ) +(define_insn "neon_vrsqrts" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VRSQRTS))] + "TARGET_NEON_FP16INST" + "vrsqrts.\t%0, %1, %2" + [(set_attr "type" "neon_fp_rsqrts_s")] +) + (define_expand "neon_vabs" [(match_operand:VDQW 0 "s_register_operand" "") (match_operand:VDQW 1 "s_register_operand" "")] @@ -2596,6 +3074,15 @@ }) (define_insn "neon_vrecpe" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")] + UNSPEC_VRECPE))] + "TARGET_NEON_FP16INST" + "vrecpe.f16\t%0, %1" + [(set_attr "type" "neon_fp_recpe_s")] +) + +(define_insn "neon_vrecpe" [(set (match_operand:V32 0 "s_register_operand" "=w") (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")] UNSPEC_VRECPE))] @@ -2932,6 +3419,28 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_dup")] ) +(define_insn "neon_vdup_lane_internal" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (vec_duplicate:VH + (vec_select: + (match_operand: 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON && TARGET_FP16" +{ + if (BYTES_BIG_ENDIAN) + { + int elt = INTVAL (operands[2]); + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + } + if () + return "vdup.\t%P0, %P1[%c2]"; + else + return "vdup.\t%q0, %P1[%c2]"; 
+} + [(set_attr "type" "neon_dup")] +) + (define_expand "neon_vdup_lane" [(match_operand:VDQW 0 "s_register_operand" "=w") (match_operand: 1 "s_register_operand" "w") @@ -2951,6 +3460,25 @@ if (BYTES_BIG_ENDIAN) DONE; }) +(define_expand "neon_vdup_lane" + [(match_operand:VH 0 "s_register_operand") + (match_operand: 1 "s_register_operand") + (match_operand:SI 2 "immediate_operand")] + "TARGET_NEON && TARGET_FP16" +{ + if (BYTES_BIG_ENDIAN) + { + unsigned int elt = INTVAL (operands[2]); + unsigned int reg_nelts + = 64 / GET_MODE_UNIT_BITSIZE (mode); + elt ^= reg_nelts - 1; + operands[2] = GEN_INT (elt); + } + emit_insn (gen_neon_vdup_lane_internal (operands[0], operands[1], + operands[2])); + DONE; +}) + ; Scalar index is ignored, since only zero is valid here. (define_expand "neon_vdup_lanedi" [(match_operand:DI 0 "s_register_operand" "=w") @@ -3097,6 +3625,28 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_fp_cvt_narrow_s_q")] ) +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VCVTHI 1 "s_register_operand" "w")] + VCVT_US))] + "TARGET_NEON_FP16INST" + "vcvt.f16.%#16\t%0, %1" + [(set_attr "type" "neon_int_to_fp_")] +) + +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w")] + VCVT_US))] + "TARGET_NEON_FP16INST" + "vcvt.%#16.f16\t%0, %1" + [(set_attr "type" "neon_fp_to_int_")] +) + (define_insn "neon_vcvt_n" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -3111,6 +3661,20 @@ if (BYTES_BIG_ENDIAN) ) (define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_US_N))] + "TARGET_NEON_FP16INST" +{ + neon_const_bounds (operands[2], 0, 17); + return "vcvt.%#16.f16\t%0, %1, %2"; +} + [(set_attr "type" "neon_fp_to_int_")] +) + +(define_insn "neon_vcvt_n" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VCVTI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i")] @@ -3123,6 +3687,31 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_int_to_fp_")] ) +(define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VCVTHI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_US_N))] + "TARGET_NEON_FP16INST" +{ + neon_const_bounds (operands[2], 0, 17); + return "vcvt.f16.%#16\t%0, %1, %2"; +} + [(set_attr "type" "neon_int_to_fp_")] +) + +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w")] + VCVT_HF_US))] + "TARGET_NEON_FP16INST" + "vcvt.%#16.f16\t%0, %1" + [(set_attr "type" "neon_fp_to_int_")] +) + (define_insn "neon_vmovn" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VN 1 "s_register_operand" "w")] @@ -3193,6 +3782,18 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_mul__scalar")))] ) +(define_insn "neon_vmul_lane" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMUL_LANE))] + "TARGET_NEON_FP16INST" + "vmul.f16\t%0, %1, %P2[%c3]" + [(set_attr "type" "neon_fp_mul_s_scalar")] +) + (define_insn "neon_vmull_lane" [(set (match_operand: 0 "s_register_operand" "=w") 
(unspec: [(match_operand:VMDI 1 "s_register_operand" "w") @@ -3447,6 +4048,19 @@ if (BYTES_BIG_ENDIAN) DONE; }) +(define_expand "neon_vmul_n" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand: 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + rtx tmp = gen_reg_rtx (V4HFmode); + emit_insn (gen_neon_vset_lanev4hf (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp, + const0_rtx)); + DONE; +}) + (define_expand "neon_vmulls_n" [(match_operand: 0 "s_register_operand" "") (match_operand:VMDI 1 "s_register_operand" "") @@ -4168,25 +4782,25 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vtrn_internal" [(parallel - [(set (match_operand:VDQW 0 "s_register_operand" "") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "") - (match_operand:VDQW 2 "s_register_operand" "")] + [(set (match_operand:VDQWH 0 "s_register_operand") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")] UNSPEC_VTRN1)) - (set (match_operand:VDQW 3 "s_register_operand" "") - (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])] + (set (match_operand:VDQWH 3 "s_register_operand") + (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])] "TARGET_NEON" "" ) ;; Note: Different operand numbering to handle tied registers correctly. (define_insn "*neon_vtrn_insn" - [(set (match_operand:VDQW 0 "s_register_operand" "=&w") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") - (match_operand:VDQW 3 "s_register_operand" "2")] - UNSPEC_VTRN1)) - (set (match_operand:VDQW 2 "s_register_operand" "=&w") - (unspec:VDQW [(match_dup 1) (match_dup 3)] - UNSPEC_VTRN2))] + [(set (match_operand:VDQWH 0 "s_register_operand" "=&w") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0") + (match_operand:VDQWH 3 "s_register_operand" "2")] + UNSPEC_VTRN1)) + (set (match_operand:VDQWH 2 "s_register_operand" "=&w") + (unspec:VDQWH [(match_dup 1) (match_dup 3)] + UNSPEC_VTRN2))] "TARGET_NEON" "vtrn.\t%0, %2" [(set_attr "type" "neon_permute")] @@ -4194,25 +4808,25 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vzip_internal" [(parallel - [(set (match_operand:VDQW 0 "s_register_operand" "") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "") - (match_operand:VDQW 2 "s_register_operand" "")] - UNSPEC_VZIP1)) - (set (match_operand:VDQW 3 "s_register_operand" "") - (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])] + [(set (match_operand:VDQWH 0 "s_register_operand") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")] + UNSPEC_VZIP1)) + (set (match_operand:VDQWH 3 "s_register_operand") + (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])] "TARGET_NEON" "" ) ;; Note: Different operand numbering to handle tied registers correctly. 
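The FP16 hunks above follow a single scheme: plain RTL codes (plus, mult, abs, neg, fma) where NEON's flush-to-zero semantics are acceptable, and UNSPEC forms otherwise, with the neon_vc expander choosing the unspec comparisons unless -funsafe-math-optimizations is given. The lane and scalar multiplies, the fixed-point conversions with their neon_const_bounds check on the fraction-bit immediate, and the widening of vtrn/vzip/vuzp from VDQW to VDQWH all surface as ordinary intrinsics. A sketch under the assumption that the ACLE FP16 names are available (illustrative, not taken from this diff):

  #include <arm_neon.h>

  uint16x4_t
  demo (float16x4_t a, float16x4_t b, int16x4_t i)
  {
    float16x4_t f = vcvt_n_f16_s16 (i, 8);     /* fixed-point convert, 8 fraction bits  */
    f = vmul_n_f16 (f, vget_lane_f16 (b, 0));  /* neon_vmul_n: scalar into lane 0, lane mul  */
    f = vmul_lane_f16 (f, b, 1);               /* neon_vmul_lane: vmul.f16 against lane 1  */
    float16x4x2_t t = vtrn_f16 (f, a);         /* vtrn.16 on V4HF via the VDQWH iterator  */
    return vcage_f16 (t.val[0], b);            /* vacge.f16, unspec unless fast-math  */
  }

The vzip pattern that the tied-register note above refers to follows.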
(define_insn "*neon_vzip_insn" - [(set (match_operand:VDQW 0 "s_register_operand" "=&w") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") - (match_operand:VDQW 3 "s_register_operand" "2")] - UNSPEC_VZIP1)) - (set (match_operand:VDQW 2 "s_register_operand" "=&w") - (unspec:VDQW [(match_dup 1) (match_dup 3)] - UNSPEC_VZIP2))] + [(set (match_operand:VDQWH 0 "s_register_operand" "=&w") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0") + (match_operand:VDQWH 3 "s_register_operand" "2")] + UNSPEC_VZIP1)) + (set (match_operand:VDQWH 2 "s_register_operand" "=&w") + (unspec:VDQWH [(match_dup 1) (match_dup 3)] + UNSPEC_VZIP2))] "TARGET_NEON" "vzip.\t%0, %2" [(set_attr "type" "neon_zip")] @@ -4220,25 +4834,25 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vuzp_internal" [(parallel - [(set (match_operand:VDQW 0 "s_register_operand" "") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "") - (match_operand:VDQW 2 "s_register_operand" "")] + [(set (match_operand:VDQWH 0 "s_register_operand") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")] UNSPEC_VUZP1)) - (set (match_operand:VDQW 3 "s_register_operand" "") - (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])] + (set (match_operand:VDQWH 3 "s_register_operand" "") + (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])] "TARGET_NEON" "" ) ;; Note: Different operand numbering to handle tied registers correctly. (define_insn "*neon_vuzp_insn" - [(set (match_operand:VDQW 0 "s_register_operand" "=&w") - (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") - (match_operand:VDQW 3 "s_register_operand" "2")] - UNSPEC_VUZP1)) - (set (match_operand:VDQW 2 "s_register_operand" "=&w") - (unspec:VDQW [(match_dup 1) (match_dup 3)] - UNSPEC_VUZP2))] + [(set (match_operand:VDQWH 0 "s_register_operand" "=&w") + (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0") + (match_operand:VDQWH 3 "s_register_operand" "2")] + UNSPEC_VUZP1)) + (set (match_operand:VDQWH 2 "s_register_operand" "=&w") + (unspec:VDQWH [(match_dup 1) (match_dup 3)] + UNSPEC_VUZP2))] "TARGET_NEON" "vuzp.\t%0, %2" [(set_attr "type" "neon_zip")] --- a/src/gcc/config/arm/neon.ml +++ b/src//dev/null @@ -1,2357 +0,0 @@ -(* Common code for ARM NEON header file, documentation and test case - generators. - - Copyright (C) 2006-2016 Free Software Foundation, Inc. - Contributed by CodeSourcery. - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3, or (at your option) any later - version. - - GCC is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - . *) - -(* Shorthand types for vector elements. *) -type elts = S8 | S16 | S32 | S64 | F16 | F32 | U8 | U16 | U32 | U64 | P8 | P16 - | P64 | P128 | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts - | Cast of elts * elts | NoElts - -type eltclass = Signed | Unsigned | Float | Poly | Int | Bits - | ConvClass of eltclass * eltclass | NoType - -(* These vector types correspond directly to C types. 
*) -type vectype = T_int8x8 | T_int8x16 - | T_int16x4 | T_int16x8 - | T_int32x2 | T_int32x4 - | T_int64x1 | T_int64x2 - | T_uint8x8 | T_uint8x16 - | T_uint16x4 | T_uint16x8 - | T_uint32x2 | T_uint32x4 - | T_uint64x1 | T_uint64x2 - | T_float16x4 - | T_float32x2 | T_float32x4 - | T_poly8x8 | T_poly8x16 - | T_poly16x4 | T_poly16x8 - | T_immediate of int * int - | T_int8 | T_int16 - | T_int32 | T_int64 - | T_uint8 | T_uint16 - | T_uint32 | T_uint64 - | T_poly8 | T_poly16 - | T_poly64 | T_poly64x1 - | T_poly64x2 | T_poly128 - | T_float16 | T_float32 - | T_arrayof of int * vectype - | T_ptrto of vectype | T_const of vectype - | T_void | T_intQI - | T_intHI | T_intSI - | T_intDI | T_intTI - | T_floatHF | T_floatSF - -(* The meanings of the following are: - TImode : "Tetra", two registers (four words). - EImode : "hExa", three registers (six words). - OImode : "Octa", four registers (eight words). - CImode : "dodeCa", six registers (twelve words). - XImode : "heXadeca", eight registers (sixteen words). -*) - -type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode - -type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt - | PtrTo of shape_elt | CstPtrTo of shape_elt - (* These next ones are used only in the test generator. *) - | Element_of_dreg (* Used for "lane" variants. *) - | Element_of_qreg (* Likewise. *) - | All_elements_of_dreg (* Used for "dup" variants. *) - | Alternatives of shape_elt list (* Used for multiple valid operands *) - -type shape_form = All of int * shape_elt - | Long - | Long_noreg of shape_elt - | Wide - | Wide_noreg of shape_elt - | Narrow - | Long_imm - | Narrow_imm - | Binary_imm of shape_elt - | Use_operands of shape_elt array - | By_scalar of shape_elt - | Unary_scalar of shape_elt - | Wide_lane - | Wide_scalar - | Pair_result of shape_elt - -type arity = Arity0 of vectype - | Arity1 of vectype * vectype - | Arity2 of vectype * vectype * vectype - | Arity3 of vectype * vectype * vectype * vectype - | Arity4 of vectype * vectype * vectype * vectype * vectype - -type vecmode = V8QI | V4HI | V4HF |V2SI | V2SF | DI - | V16QI | V8HI | V4SI | V4SF | V2DI | TI - | QI | HI | SI | SF - -type opcode = - (* Binary ops. *) - Vadd - | Vmul - | Vmla - | Vmls - | Vfma - | Vfms - | Vsub - | Vceq - | Vcge - | Vcgt - | Vcle - | Vclt - | Vcage - | Vcagt - | Vcale - | Vcalt - | Vtst - | Vabd - | Vaba - | Vmax - | Vmin - | Vpadd - | Vpada - | Vpmax - | Vpmin - | Vrecps - | Vrsqrts - | Vshl - | Vshr_n - | Vshl_n - | Vsra_n - | Vsri - | Vsli - (* Logic binops. *) - | Vand - | Vorr - | Veor - | Vbic - | Vorn - | Vbsl - (* Ops with scalar. *) - | Vmul_lane - | Vmla_lane - | Vmls_lane - | Vmul_n - | Vmla_n - | Vmls_n - | Vmull_n - | Vmull_lane - | Vqdmull_n - | Vqdmull_lane - | Vqdmulh_n - | Vqdmulh_lane - (* Unary ops. *) - | Vrintn - | Vrinta - | Vrintp - | Vrintm - | Vrintz - | Vabs - | Vneg - | Vcls - | Vclz - | Vcnt - | Vrecpe - | Vrsqrte - | Vmvn - (* Vector extract. *) - | Vext - (* Reverse elements. *) - | Vrev64 - | Vrev32 - | Vrev16 - (* Transposition ops. *) - | Vtrn - | Vzip - | Vuzp - (* Loads and stores (VLD1/VST1/VLD2...), elements and structures. *) - | Vldx of int - | Vstx of int - | Vldx_lane of int - | Vldx_dup of int - | Vstx_lane of int - (* Set/extract lanes from a vector. *) - | Vget_lane - | Vset_lane - (* Initialize vector from bit pattern. *) - | Vcreate - (* Set all lanes to same value. *) - | Vdup_n - | Vmov_n (* Is this the same? *) - (* Duplicate scalar to all lanes of vector. *) - | Vdup_lane - (* Combine vectors. 
*) - | Vcombine - (* Get quadword high/low parts. *) - | Vget_high - | Vget_low - (* Convert vectors. *) - | Vcvt - | Vcvt_n - (* Narrow/lengthen vectors. *) - | Vmovn - | Vmovl - (* Table lookup. *) - | Vtbl of int - | Vtbx of int - (* Reinterpret casts. *) - | Vreinterp - -let rev_elems revsize elsize nelts _ = - let mask = (revsize / elsize) - 1 in - let arr = Array.init nelts - (fun i -> i lxor mask) in - Array.to_list arr - -let permute_range i stride nelts increment = - let rec build i = function - 0 -> [] - | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in - build i nelts - -(* Generate a list of integers suitable for vzip. *) -let zip_range i stride nelts = permute_range i stride nelts 1 - -(* Generate a list of integers suitable for vunzip. *) -let uzip_range i stride nelts = permute_range i stride nelts 4 - -(* Generate a list of integers suitable for trn. *) -let trn_range i stride nelts = permute_range i stride nelts 2 - -let zip_elems _ nelts part = - match part with - `lo -> zip_range 0 nelts (nelts / 2) - | `hi -> zip_range (nelts / 2) nelts (nelts / 2) - -let uzip_elems _ nelts part = - match part with - `lo -> uzip_range 0 2 (nelts / 2) - | `hi -> uzip_range 1 2 (nelts / 2) - -let trn_elems _ nelts part = - match part with - `lo -> trn_range 0 nelts (nelts / 2) - | `hi -> trn_range 1 nelts (nelts / 2) - -(* Features used for documentation, to distinguish between some instruction - variants, and to signal special requirements (e.g. swapping arguments). *) - -type features = - Halving - | Rounding - | Saturating - | Dst_unsign - | High_half - | Doubling - | Flipped of string (* Builtin name to use with flipped arguments. *) - | InfoWord (* Pass an extra word for signage/rounding etc. (always passed - for All _, Long, Wide, Narrow shape_forms. *) - (* Implement builtin as shuffle. The parameter is a function which returns - masks suitable for __builtin_shuffle: arguments are (element size, - number of elements, high/low part selector). *) - | Use_shuffle of (int -> int -> [`lo|`hi] -> int list) - (* A specification as to the shape of instruction expected upon - disassembly, used if it differs from the shape used to build the - intrinsic prototype. Multiple entries in the constructor's argument - indicate that the intrinsic expands to more than one assembly - instruction, each with a corresponding shape specified here. *) - | Disassembles_as of shape_form list - | Builtin_name of string (* Override the name of the builtin. *) - (* Override the name of the instruction. If more than one name - is specified, it means that the instruction can have any of those - names. *) - | Instruction_name of string list - (* Mark that the intrinsic yields no instructions, or expands to yield - behavior that the test generator cannot test. *) - | No_op - (* Mark that the intrinsic has constant arguments that cannot be set - to the defaults (zero for pointers and one otherwise) in the test - cases. The function supplied must return the integer to be written - into the testcase for the argument number (0-based) supplied to it. *) - | Const_valuator of (int -> int) - | Fixed_vector_reg - | Fixed_core_reg - (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined. *) - | Requires_feature of string - (* Mark that the intrinsic requires a particular architecture version. *) - | Requires_arch of int - (* Mark that the intrinsic requires a particular bit in __ARM_FP to - be set. *) - | Requires_FP_bit of int - (* Compiler optimization level for the test. 
*) - | Compiler_optim of string - -exception MixedMode of elts * elts - -let rec elt_width = function - S8 | U8 | P8 | I8 | B8 -> 8 - | S16 | U16 | P16 | I16 | B16 | F16 -> 16 - | S32 | F32 | U32 | I32 | B32 -> 32 - | S64 | U64 | P64 | I64 | B64 -> 64 - | P128 -> 128 - | Conv (a, b) -> - let wa = elt_width a and wb = elt_width b in - if wa = wb then wa else raise (MixedMode (a, b)) - | Cast (a, b) -> raise (MixedMode (a, b)) - | NoElts -> failwith "No elts" - -let rec elt_class = function - S8 | S16 | S32 | S64 -> Signed - | U8 | U16 | U32 | U64 -> Unsigned - | P8 | P16 | P64 | P128 -> Poly - | F16 | F32 -> Float - | I8 | I16 | I32 | I64 -> Int - | B8 | B16 | B32 | B64 -> Bits - | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b) - | NoElts -> NoType - -let elt_of_class_width c w = - match c, w with - Signed, 8 -> S8 - | Signed, 16 -> S16 - | Signed, 32 -> S32 - | Signed, 64 -> S64 - | Float, 16 -> F16 - | Float, 32 -> F32 - | Unsigned, 8 -> U8 - | Unsigned, 16 -> U16 - | Unsigned, 32 -> U32 - | Unsigned, 64 -> U64 - | Poly, 8 -> P8 - | Poly, 16 -> P16 - | Poly, 64 -> P64 - | Poly, 128 -> P128 - | Int, 8 -> I8 - | Int, 16 -> I16 - | Int, 32 -> I32 - | Int, 64 -> I64 - | Bits, 8 -> B8 - | Bits, 16 -> B16 - | Bits, 32 -> B32 - | Bits, 64 -> B64 - | _ -> failwith "Bad element type" - -(* Return unsigned integer element the same width as argument. *) -let unsigned_of_elt elt = - elt_of_class_width Unsigned (elt_width elt) - -let signed_of_elt elt = - elt_of_class_width Signed (elt_width elt) - -(* Return untyped bits element the same width as argument. *) -let bits_of_elt elt = - elt_of_class_width Bits (elt_width elt) - -let non_signed_variant = function - S8 -> I8 - | S16 -> I16 - | S32 -> I32 - | S64 -> I64 - | U8 -> I8 - | U16 -> I16 - | U32 -> I32 - | U64 -> I64 - | x -> x - -let poly_unsigned_variant v = - let elclass = match elt_class v with - Poly -> Unsigned - | x -> x in - elt_of_class_width elclass (elt_width v) - -let widen_elt elt = - let w = elt_width elt - and c = elt_class elt in - elt_of_class_width c (w * 2) - -let narrow_elt elt = - let w = elt_width elt - and c = elt_class elt in - elt_of_class_width c (w / 2) - -(* If we're trying to find a mode from a "Use_operands" instruction, use the - last vector operand as the dominant mode used to invoke the correct builtin. - We must stick to this rule in neon.md. *) -let find_key_operand operands = - let rec scan opno = - match operands.(opno) with - Qreg -> Qreg - | Dreg -> Dreg - | VecArray (_, Qreg) -> Qreg - | VecArray (_, Dreg) -> Dreg - | _ -> scan (opno-1) - in - scan ((Array.length operands) - 1) - -(* Find a vecmode from a shape_elt ELT for an instruction with shape_form - SHAPE. For a Use_operands shape, if ARGPOS is passed then return the mode - for the given argument position, else determine which argument to return a - mode for automatically. 
*) - -let rec mode_of_elt ?argpos elt shape = - let flt = match elt_class elt with - Float | ConvClass(_, Float) -> true | _ -> false in - let idx = - match elt_width elt with - 8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3 | 128 -> 4 - | _ -> failwith "Bad element width" - in match shape with - All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg - | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg -> - if flt then - [| V8QI; V4HF; V2SF; DI |].(idx) - else - [| V8QI; V4HI; V2SI; DI |].(idx) - | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg - | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg -> - [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI; TI|].(idx) - | All (_, (Corereg | PtrTo _ | CstPtrTo _)) -> - [| QI; HI; if flt then SF else SI; DI |].(idx) - | Long | Wide | Wide_lane | Wide_scalar - | Long_imm -> - [| V8QI; V4HI; V2SI; DI |].(idx) - | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx) - | Use_operands ops -> - begin match argpos with - None -> mode_of_elt ?argpos elt (All (0, (find_key_operand ops))) - | Some pos -> mode_of_elt ?argpos elt (All (0, ops.(pos))) - end - | _ -> failwith "invalid shape" - -(* Modify an element type dependent on the shape of the instruction and the - operand number. *) - -let shapemap shape no = - let ident = fun x -> x in - match shape with - All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _ - | Binary_imm _ -> ident - | Long | Long_noreg _ | Wide_scalar | Long_imm -> - [| widen_elt; ident; ident |].(no) - | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no) - | Wide_lane -> [| widen_elt; ident; ident; ident |].(no) - | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no) - -(* Register type (D/Q) of an operand, based on shape and operand number. 
*) - -let regmap shape no = - match shape with - All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg - | Long -> [| Qreg; Dreg; Dreg |].(no) - | Wide -> [| Qreg; Qreg; Dreg |].(no) - | Narrow -> [| Dreg; Qreg; Qreg |].(no) - | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no) - | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no) - | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no) - | Unary_scalar reg -> [| reg; Dreg; Immed |].(no) - | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no) - | Binary_imm reg -> [| reg; reg; Immed |].(no) - | Long_imm -> [| Qreg; Dreg; Immed |].(no) - | Narrow_imm -> [| Dreg; Qreg; Immed |].(no) - | Use_operands these -> these.(no) - -let type_for_elt shape elt no = - let elt = (shapemap shape no) elt in - let reg = regmap shape no in - let rec type_for_reg_elt reg elt = - match reg with - Dreg -> - begin match elt with - S8 -> T_int8x8 - | S16 -> T_int16x4 - | S32 -> T_int32x2 - | S64 -> T_int64x1 - | U8 -> T_uint8x8 - | U16 -> T_uint16x4 - | U32 -> T_uint32x2 - | U64 -> T_uint64x1 - | P64 -> T_poly64x1 - | P128 -> T_poly128 - | F16 -> T_float16x4 - | F32 -> T_float32x2 - | P8 -> T_poly8x8 - | P16 -> T_poly16x4 - | _ -> failwith "Bad elt type for Dreg" - end - | Qreg -> - begin match elt with - S8 -> T_int8x16 - | S16 -> T_int16x8 - | S32 -> T_int32x4 - | S64 -> T_int64x2 - | U8 -> T_uint8x16 - | U16 -> T_uint16x8 - | U32 -> T_uint32x4 - | U64 -> T_uint64x2 - | F32 -> T_float32x4 - | P8 -> T_poly8x16 - | P16 -> T_poly16x8 - | P64 -> T_poly64x2 - | P128 -> T_poly128 - | _ -> failwith "Bad elt type for Qreg" - end - | Corereg -> - begin match elt with - S8 -> T_int8 - | S16 -> T_int16 - | S32 -> T_int32 - | S64 -> T_int64 - | U8 -> T_uint8 - | U16 -> T_uint16 - | U32 -> T_uint32 - | U64 -> T_uint64 - | P8 -> T_poly8 - | P16 -> T_poly16 - | P64 -> T_poly64 - | P128 -> T_poly128 - | F32 -> T_float32 - | _ -> failwith "Bad elt type for Corereg" - end - | Immed -> - T_immediate (0, 0) - | VecArray (num, sub) -> - T_arrayof (num, type_for_reg_elt sub elt) - | PtrTo x -> - T_ptrto (type_for_reg_elt x elt) - | CstPtrTo x -> - T_ptrto (T_const (type_for_reg_elt x elt)) - (* Anything else is solely for the use of the test generator. *) - | _ -> assert false - in - type_for_reg_elt reg elt - -(* Return size of a vector type, in bits. *) -let vectype_size = function - T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1 - | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1 - | T_float32x2 | T_poly8x8 | T_poly64x1 | T_poly16x4 | T_float16x4 -> 64 - | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2 - | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2 - | T_float32x4 | T_poly8x16 | T_poly64x2 | T_poly16x8 -> 128 - | _ -> raise Not_found - -let inttype_for_array num elttype = - let eltsize = vectype_size elttype in - let numwords = (num * eltsize) / 32 in - match numwords with - 4 -> B_TImode - | 6 -> B_EImode - | 8 -> B_OImode - | 12 -> B_CImode - | 16 -> B_XImode - | _ -> failwith ("no int type for size " ^ string_of_int numwords) - -(* These functions return pairs of (internal, external) types, where "internal" - types are those seen by GCC, and "external" are those seen by the assembler. - These types aren't necessarily the same, since the intrinsics can munge more - than one C type into each assembler opcode. *) - -let make_sign_invariant func shape elt = - let arity, elt' = func shape elt in - arity, non_signed_variant elt' - -(* Don't restrict any types. 
*) - -let elts_same make_arity shape elt = - let vtype = type_for_elt shape elt in - make_arity vtype, elt - -(* As sign_invar_*, but when sign matters. *) -let elts_same_io_lane = - elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3)) - -let elts_same_io = - elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2)) - -let elts_same_2_lane = - elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3)) - -let elts_same_3 = elts_same_2_lane - -let elts_same_2 = - elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2)) - -let elts_same_1 = - elts_same (fun vtype -> Arity1 (vtype 0, vtype 1)) - -(* Use for signed/unsigned invariant operations (i.e. where the operation - doesn't depend on the sign of the data. *) - -let sign_invar_io_lane = make_sign_invariant elts_same_io_lane -let sign_invar_io = make_sign_invariant elts_same_io -let sign_invar_2_lane = make_sign_invariant elts_same_2_lane -let sign_invar_2 = make_sign_invariant elts_same_2 -let sign_invar_1 = make_sign_invariant elts_same_1 - -(* Sign-sensitive comparison. *) - -let cmp_sign_matters shape elt = - let vtype = type_for_elt shape elt - and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in - Arity2 (rtype, vtype 1, vtype 2), elt - -(* Signed/unsigned invariant comparison. *) - -let cmp_sign_invar shape elt = - let shape', elt' = cmp_sign_matters shape elt in - let elt'' = - match non_signed_variant elt' with - P8 -> I8 - | x -> x - in - shape', elt'' - -(* Comparison (VTST) where only the element width matters. *) - -let cmp_bits shape elt = - let vtype = type_for_elt shape elt - and rtype = type_for_elt shape (unsigned_of_elt elt) 0 - and bits_only = bits_of_elt elt in - Arity2 (rtype, vtype 1, vtype 2), bits_only - -let reg_shift shape elt = - let vtype = type_for_elt shape elt - and op2type = type_for_elt shape (signed_of_elt elt) 2 in - Arity2 (vtype 0, vtype 1, op2type), elt - -(* Genericised constant-shift type-generating function. *) - -let const_shift mkimm ?arity ?result shape elt = - let op2type = (shapemap shape 2) elt in - let op2width = elt_width op2type in - let op2 = mkimm op2width - and op1 = type_for_elt shape elt 1 - and r_elt = - match result with - None -> elt - | Some restriction -> restriction elt in - let rtype = type_for_elt shape r_elt 0 in - match arity with - None -> Arity2 (rtype, op1, op2), elt - | Some mkarity -> mkarity rtype op1 op2, elt - -(* Use for immediate right-shifts. *) - -let shift_right shape elt = - const_shift (fun imm -> T_immediate (1, imm)) shape elt - -let shift_right_acc shape elt = - const_shift (fun imm -> T_immediate (1, imm)) - ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt - -(* Use for immediate right-shifts when the operation doesn't care about - signedness. *) - -let shift_right_sign_invar = - make_sign_invariant shift_right - -(* Immediate right-shift; result is unsigned even when operand is signed. *) - -let shift_right_to_uns shape elt = - const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt - shape elt - -(* Immediate left-shift. *) - -let shift_left shape elt = - const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt - -(* Immediate left-shift, unsigned result. *) - -let shift_left_to_uns shape elt = - const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt - shape elt - -(* Immediate left-shift, don't care about signs. *) - -let shift_left_sign_invar = - make_sign_invariant shift_left - -(* Shift left/right and insert: only element size matters. 
*) - -let shift_insert shape elt = - let arity, elt = - const_shift (fun imm -> T_immediate (1, imm)) - ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in - arity, bits_of_elt elt - -(* Get/set lane. *) - -let get_lane shape elt = - let vtype = type_for_elt shape elt in - Arity2 (vtype 0, vtype 1, vtype 2), - (match elt with P8 -> U8 | P16 -> U16 | S32 | U32 | F32 -> B32 | x -> x) - -let set_lane shape elt = - let vtype = type_for_elt shape elt in - Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt - -let set_lane_notype shape elt = - let vtype = type_for_elt shape elt in - Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts - -let create_vector shape elt = - let vtype = type_for_elt shape U64 1 - and rtype = type_for_elt shape elt 0 in - Arity1 (rtype, vtype), elt - -let conv make_arity shape elt = - let edest, esrc = match elt with - Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc - | _ -> failwith "Non-conversion element in conversion" in - let vtype = type_for_elt shape esrc - and rtype = type_for_elt shape edest 0 in - make_arity rtype vtype, elt - -let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1)) -let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2)) - -(* Operation has an unsigned result even if operands are signed. *) - -let dst_unsign make_arity shape elt = - let vtype = type_for_elt shape elt - and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in - make_arity rtype vtype, elt - -let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1)) - -let make_bits_only func shape elt = - let arity, elt' = func shape elt in - arity, bits_of_elt elt' - -(* Extend operation. *) - -let extend shape elt = - let vtype = type_for_elt shape elt in - Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt - -(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned - integer ops respectively, or unsigned for polynomial ops. *) - -let table mkarity shape elt = - let vtype = type_for_elt shape elt in - let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in - mkarity vtype op2, bits_of_elt elt - -let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2)) -let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2)) - -(* Operations where only bits matter. *) - -let bits_1 = make_bits_only elts_same_1 -let bits_2 = make_bits_only elts_same_2 -let bits_3 = make_bits_only elts_same_3 - -(* Store insns. *) -let store_1 shape elt = - let vtype = type_for_elt shape elt in - Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt - -let store_3 shape elt = - let vtype = type_for_elt shape elt in - Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt - -let make_notype func shape elt = - let arity, _ = func shape elt in - arity, NoElts - -let notype_1 = make_notype elts_same_1 -let notype_2 = make_notype elts_same_2 -let notype_3 = make_notype elts_same_3 - -(* Bit-select operations (first operand is unsigned int). *) - -let bit_select shape elt = - let vtype = type_for_elt shape elt - and itype = type_for_elt shape (unsigned_of_elt elt) in - Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts - -(* Common lists of supported element types. 
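
The element rewrite performed by get_lane above, restated as a self-contained sketch (variant type re-declared locally, not imported from neon.ml): polynomial lanes are read back as unsigned integers, and every 32-bit lane collapses to a plain "bits" element.

(* Standalone sketch of get_lane's result-element normalisation.  *)
type elt = S8 | U8 | P8 | S16 | U16 | P16 | S32 | U32 | F32 | B32

let get_lane_elt = function
    P8 -> U8
  | P16 -> U16
  | S32 | U32 | F32 -> B32
  | x -> x

let () = assert (get_lane_elt F32 = B32)
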
*) - -let s_8_32 = [S8; S16; S32] -let u_8_32 = [U8; U16; U32] -let su_8_32 = [S8; S16; S32; U8; U16; U32] -let su_8_64 = S64 :: U64 :: su_8_32 -let su_16_64 = [S16; S32; S64; U16; U32; U64] -let pf_su_8_16 = [P8; P16; S8; S16; U8; U16] -let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32 -let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64 -let suf_32 = [S32; U32; F32] - -let ops = - [ - (* Addition. *) - Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32; - Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64]; - Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64; - Vadd, [], Long, "vaddl", elts_same_2, su_8_32; - Vadd, [], Wide, "vaddw", elts_same_2, su_8_32; - Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32; - Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32; - Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving], - All (3, Dreg), "vRhadd", elts_same_2, su_8_32; - Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving], - All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32; - Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64; - Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64; - Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64; - Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half], - Narrow, "vRaddhn", sign_invar_2, su_16_64; - - (* Multiplication. *) - Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32; - Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32; - Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh", - elts_same_2, [S16; S32]; - Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ", - elts_same_2, [S16; S32]; - Vmul, - [Saturating; Rounding; Doubling; High_half; - Instruction_name ["vqrdmulh"]], - All (3, Dreg), "vqRdmulh", - elts_same_2, [S16; S32]; - Vmul, - [Saturating; Rounding; Doubling; High_half; - Instruction_name ["vqrdmulh"]], - All (3, Qreg), "vqRdmulhQ", - elts_same_2, [S16; S32]; - Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32; - Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32]; - - (* Multiply-accumulate. *) - Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32; - Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32; - Vmla, [], Long, "vmlal", elts_same_io, su_8_32; - Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32]; - - (* Multiply-subtract. *) - Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32; - Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32; - Vmls, [], Long, "vmlsl", elts_same_io, su_8_32; - Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32]; - - (* Fused-multiply-accumulate. *) - Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32]; - Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32]; - Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32]; - Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32]; - - (* Round to integral. 
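
Each row of the ops table reads: operation constructor, feature flags, shape, intrinsic base name, typing function, supported elements. As a sketch (helper names are illustrative, not from neon.ml), the first Vadd row fans out into one intrinsic per listed element:

(* Standalone sketch: F32 :: su_8_32 expands the first Vadd row into
   seven D-register intrinsics, each typed by sign_invar_2.  *)
let vadd_d_elts = ["f32"; "s8"; "s16"; "s32"; "u8"; "u16"; "u32"]
let vadd_intrinsics = List.map (fun e -> "vadd_" ^ e) vadd_d_elts
(* = ["vadd_f32"; "vadd_s8"; ...; "vadd_u32"]  *)
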
*) - Vrintn, [Builtin_name "vrintn"; Requires_arch 8], Use_operands [| Dreg; Dreg |], - "vrndn", elts_same_1, [F32]; - Vrintn, [Builtin_name "vrintn"; Requires_arch 8], Use_operands [| Qreg; Qreg |], - "vrndqn", elts_same_1, [F32]; - Vrinta, [Builtin_name "vrinta"; Requires_arch 8], Use_operands [| Dreg; Dreg |], - "vrnda", elts_same_1, [F32]; - Vrinta, [Builtin_name "vrinta"; Requires_arch 8], Use_operands [| Qreg; Qreg |], - "vrndqa", elts_same_1, [F32]; - Vrintp, [Builtin_name "vrintp"; Requires_arch 8], Use_operands [| Dreg; Dreg |], - "vrndp", elts_same_1, [F32]; - Vrintp, [Builtin_name "vrintp"; Requires_arch 8], Use_operands [| Qreg; Qreg |], - "vrndqp", elts_same_1, [F32]; - Vrintm, [Builtin_name "vrintm"; Requires_arch 8], Use_operands [| Dreg; Dreg |], - "vrndm", elts_same_1, [F32]; - Vrintm, [Builtin_name "vrintm"; Requires_arch 8], Use_operands [| Qreg; Qreg |], - "vrndqm", elts_same_1, [F32]; - Vrintz, [Builtin_name "vrintz"; Requires_arch 8], Use_operands [| Dreg; Dreg |], - "vrnd", elts_same_1, [F32]; - Vrintz, [Builtin_name "vrintz"; Requires_arch 8], Use_operands [| Qreg; Qreg |], - "vrndq", elts_same_1, [F32]; - (* Subtraction. *) - Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32; - Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64]; - Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64; - Vsub, [], Long, "vsubl", elts_same_2, su_8_32; - Vsub, [], Wide, "vsubw", elts_same_2, su_8_32; - Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32; - Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32; - Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64; - Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64; - Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64; - Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half], - Narrow, "vRsubhn", sign_invar_2, su_16_64; - - (* Comparison, equal. *) - Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32; - Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32; - - (* Comparison, greater-than or equal. *) - Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: s_8_32; - Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"], - All (3, Dreg), "vcge", cmp_sign_matters, - u_8_32; - Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: s_8_32; - Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"], - All (3, Qreg), "vcgeQ", cmp_sign_matters, - u_8_32; - - (* Comparison, less-than or equal. *) - Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters, - F32 :: s_8_32; - Vcle, [Instruction_name ["vcge"]; Flipped "vcgeu"], - All (3, Dreg), "vcle", cmp_sign_matters, - u_8_32; - Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"], - All (3, Qreg), "vcleQ", cmp_sign_matters, - F32 :: s_8_32; - Vcle, [Instruction_name ["vcge"]; Flipped "vcgeuQ"], - All (3, Qreg), "vcleQ", cmp_sign_matters, - u_8_32; - - (* Comparison, greater-than. *) - Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: s_8_32; - Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"], - All (3, Dreg), "vcgt", cmp_sign_matters, - u_8_32; - Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: s_8_32; - Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"], - All (3, Qreg), "vcgtQ", cmp_sign_matters, - u_8_32; - - (* Comparison, less-than. 
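
A sketch of what the Flipped feature on the vcle rows above means (helper name illustrative): vcle has no instruction of its own, so the generator emits the vcge builtin with its two operands exchanged, and Flipped also supplies the builtin name to call.

(* Standalone sketch: Flipped "vcge" swaps the operands of the call.  *)
let flipped_call builtin a b = Printf.sprintf "%s (%s, %s)" builtin b a

let () = assert (flipped_call "vcge" "__a" "__b" = "vcge (__b, __a)")
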
*) - Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters, - F32 :: s_8_32; - Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtu"], - All (3, Dreg), "vclt", cmp_sign_matters, - u_8_32; - Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"], - All (3, Qreg), "vcltQ", cmp_sign_matters, - F32 :: s_8_32; - Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtuQ"], - All (3, Qreg), "vcltQ", cmp_sign_matters, - u_8_32; - - (* Compare absolute greater-than or equal. *) - Vcage, [Instruction_name ["vacge"]], - All (3, Dreg), "vcage", cmp_sign_matters, [F32]; - Vcage, [Instruction_name ["vacge"]], - All (3, Qreg), "vcageQ", cmp_sign_matters, [F32]; - - (* Compare absolute less-than or equal. *) - Vcale, [Instruction_name ["vacge"]; Flipped "vcage"], - All (3, Dreg), "vcale", cmp_sign_matters, [F32]; - Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"], - All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32]; - - (* Compare absolute greater-than or equal. *) - Vcagt, [Instruction_name ["vacgt"]], - All (3, Dreg), "vcagt", cmp_sign_matters, [F32]; - Vcagt, [Instruction_name ["vacgt"]], - All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32]; - - (* Compare absolute less-than or equal. *) - Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"], - All (3, Dreg), "vcalt", cmp_sign_matters, [F32]; - Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"], - All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32]; - - (* Test bits. *) - Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32; - Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32; - - (* Absolute difference. *) - Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32; - Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32; - Vabd, [], Long, "vabdl", elts_same_2, su_8_32; - - (* Absolute difference and accumulate. *) - Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32; - Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32; - Vaba, [], Long, "vabal", elts_same_io, su_8_32; - - (* Max. *) - Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32; - Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32; - - (* Min. *) - Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32; - Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32; - - (* Pairwise add. *) - Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32; - Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32; - Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32; - - (* Pairwise add, widen and accumulate. *) - Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32; - Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32; - - (* Folding maximum, minimum. *) - Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32; - Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32; - - (* Reciprocal step. *) - Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32]; - Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32]; - Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32]; - Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32]; - - (* Vector shift left. 
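
The vshl rows below are typed by reg_shift (defined earlier), whose key property is that operand 2 always takes the signed variant of the element, even for unsigned data, since negative shift counts shift right. A standalone sketch of that element rewrite, with the resulting C signature shown for vshl_u8:

(* Standalone sketch: the shift-count vector is always signed, e.g.
       uint8x8_t vshl_u8 (uint8x8_t __a, int8x8_t __b);  *)
let shift_count_elt = function
    "u8" -> "s8" | "u16" -> "s16" | "u32" -> "s32" | "u64" -> "s64"
  | signed -> signed

let () = assert (shift_count_elt "u8" = "s8")
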
*) - Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64; - Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64; - Vshl, [Instruction_name ["vrshl"]; Rounding], - All (3, Dreg), "vRshl", reg_shift, su_8_64; - Vshl, [Instruction_name ["vrshl"]; Rounding], - All (3, Qreg), "vRshlQ", reg_shift, su_8_64; - Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64; - Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64; - Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding], - All (3, Dreg), "vqRshl", reg_shift, su_8_64; - Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding], - All (3, Qreg), "vqRshlQ", reg_shift, su_8_64; - - (* Vector shift right by constant. *) - Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64; - Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64; - Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg, - "vRshr_n", shift_right, su_8_64; - Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg, - "vRshrQ_n", shift_right, su_8_64; - Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64; - Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n", - shift_right_sign_invar, su_16_64; - Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64; - Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm, - "vqRshrn_n", shift_right, su_16_64; - Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n", - shift_right_to_uns, [S16; S32; S64]; - Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding], - Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64]; - - (* Vector shift left by constant. *) - Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64; - Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64; - Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64; - Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64; - Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n", - shift_left_to_uns, [S8; S16; S32; S64]; - Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n", - shift_left_to_uns, [S8; S16; S32; S64]; - Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32; - - (* Vector shift right by constant and accumulate. *) - Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64; - Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64; - Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg, - "vRsra_n", shift_right_acc, su_8_64; - Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg, - "vRsraQ_n", shift_right_acc, su_8_64; - - (* Vector shift right and insert. *) - Vsri, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert, - [P64]; - Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert, - P8 :: P16 :: su_8_64; - Vsri, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert, - [P64]; - Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert, - P8 :: P16 :: su_8_64; - - (* Vector shift left and insert. 
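
The vsri rows above go through shift_insert, which throws away signedness via bits_of_elt, so the s8, u8 and p8 variants all key the same 8-bit builtin pattern. A standalone sketch of that width-only view (string encoding illustrative):

(* Standalone sketch: shift-insert ops are distinguished by width alone.  *)
let bits_of = function
    "s8" | "u8" | "p8" -> 8
  | "s16" | "u16" | "p16" -> 16
  | "s32" | "u32" -> 32
  | _ -> 64

let () = assert (bits_of "p8" = bits_of "u8")  (* vsri_n_p8/u8 share a pattern *)
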
*) - Vsli, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert, - [P64]; - Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert, - P8 :: P16 :: su_8_64; - Vsli, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert, - [P64]; - Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert, - P8 :: P16 :: su_8_64; - - (* Absolute value. *) - Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32]; - Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32]; - Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32]; - Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32]; - - (* Negate. *) - Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32]; - Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32]; - Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32]; - Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32]; - - (* Bitwise not. *) - Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32; - Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32; - - (* Count leading sign bits. *) - Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32]; - Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32]; - - (* Count leading zeros. *) - Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32; - Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32; - - (* Count number of set bits. *) - Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8]; - Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8]; - - (* Reciprocal estimate. *) - Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32]; - Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32]; - - (* Reciprocal square-root estimate. *) - Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32]; - Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32]; - - (* Get lanes from a vector. *) - Vget_lane, - [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]]; - Instruction_name ["vmov"]], - Use_operands [| Corereg; Dreg; Immed |], - "vget_lane", get_lane, pf_su_8_32; - Vget_lane, - [No_op; - InfoWord; - Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]]; - Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], - Use_operands [| Corereg; Dreg; Immed |], - "vget_lane", notype_2, [S64; U64]; - Vget_lane, - [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]]; - Instruction_name ["vmov"]], - Use_operands [| Corereg; Qreg; Immed |], - "vgetQ_lane", get_lane, pf_su_8_32; - Vget_lane, - [InfoWord; - Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]]; - Instruction_name ["vmov"; "fmrrd"]; Const_valuator (fun _ -> 0); - Fixed_core_reg], - Use_operands [| Corereg; Qreg; Immed |], - "vgetQ_lane", notype_2, [S64; U64]; - - (* Set lanes in a vector. 
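
The Const_valuator (fun _ -> 0) entries on the 64-bit get/set-lane rows above exist because a 64x1 vector only has lane 0, so whatever lane index the user passes is pinned to the constant 0. Trivially, as a sketch:

(* Standalone sketch: the valuator ignores its argument.  *)
let const_valuator = fun _ -> 0
let () = assert (const_valuator 5 = 0)   (* vget_lane_s64 only has lane 0 *)
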
*) - Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]]; - Instruction_name ["vmov"]], - Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane", - set_lane, pf_su_8_32; - Vset_lane, [No_op; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]; - Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], - Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane", - set_lane_notype, [S64; U64]; - Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]]; - Instruction_name ["vmov"]], - Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane", - set_lane, pf_su_8_32; - Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]; - Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], - Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane", - set_lane_notype, [S64; U64]; - - (* Create vector from literal bit pattern. *) - Vcreate, - [Requires_feature "CRYPTO"; No_op], (* Not really, but it can yield various things that are too - hard for the test generator at this time. *) - Use_operands [| Dreg; Corereg |], "vcreate", create_vector, - [P64]; - Vcreate, - [No_op], (* Not really, but it can yield various things that are too - hard for the test generator at this time. *) - Use_operands [| Dreg; Corereg |], "vcreate", create_vector, - pf_su_8_64; - - (* Set all lanes to the same value. *) - Vdup_n, - [Disassembles_as [Use_operands [| Dreg; - Alternatives [ Corereg; - Element_of_dreg ] |]]], - Use_operands [| Dreg; Corereg |], "vdup_n", bits_1, - pf_su_8_32; - Vdup_n, - [No_op; Requires_feature "CRYPTO"; - Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]], - Use_operands [| Dreg; Corereg |], "vdup_n", notype_1, - [P64]; - Vdup_n, - [No_op; - Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]], - Use_operands [| Dreg; Corereg |], "vdup_n", notype_1, - [S64; U64]; - Vdup_n, - [No_op; Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| Qreg; - Alternatives [ Corereg; - Element_of_dreg ] |]]], - Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1, - [P64]; - Vdup_n, - [Disassembles_as [Use_operands [| Qreg; - Alternatives [ Corereg; - Element_of_dreg ] |]]], - Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1, - pf_su_8_32; - Vdup_n, - [No_op; - Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]; - Use_operands [| Dreg; Corereg; Corereg |]]], - Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1, - [S64; U64]; - - (* These are just aliases for the above. *) - Vmov_n, - [Builtin_name "vdup_n"; - Disassembles_as [Use_operands [| Dreg; - Alternatives [ Corereg; - Element_of_dreg ] |]]], - Use_operands [| Dreg; Corereg |], - "vmov_n", bits_1, pf_su_8_32; - Vmov_n, - [No_op; - Builtin_name "vdup_n"; - Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]], - Use_operands [| Dreg; Corereg |], - "vmov_n", notype_1, [S64; U64]; - Vmov_n, - [Builtin_name "vdupQ_n"; - Disassembles_as [Use_operands [| Qreg; - Alternatives [ Corereg; - Element_of_dreg ] |]]], - Use_operands [| Qreg; Corereg |], - "vmovQ_n", bits_1, pf_su_8_32; - Vmov_n, - [No_op; - Builtin_name "vdupQ_n"; - Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]; - Use_operands [| Dreg; Corereg; Corereg |]]], - Use_operands [| Qreg; Corereg |], - "vmovQ_n", notype_1, [S64; U64]; - - (* Duplicate, lane version. 
We can't use Use_operands here because the - rightmost register (always Dreg) would be picked up by find_key_operand, - when we want the leftmost register to be used in this case (otherwise - the modes are indistinguishable in neon.md, etc. *) - Vdup_lane, - [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]], - Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32; - Vdup_lane, - [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)], - Unary_scalar Dreg, "vdup_lane", bits_2, [P64]; - Vdup_lane, - [No_op; Const_valuator (fun _ -> 0)], - Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64]; - Vdup_lane, - [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]], - Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32; - Vdup_lane, - [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)], - Unary_scalar Qreg, "vdupQ_lane", bits_2, [P64]; - Vdup_lane, - [No_op; Const_valuator (fun _ -> 0)], - Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64]; - - (* Combining vectors. *) - Vcombine, [Requires_feature "CRYPTO"; No_op], - Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2, - [P64]; - Vcombine, [No_op], - Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2, - pf_su_8_64; - - (* Splitting vectors. *) - Vget_high, [Requires_feature "CRYPTO"; No_op], - Use_operands [| Dreg; Qreg |], "vget_high", - notype_1, [P64]; - Vget_high, [No_op], - Use_operands [| Dreg; Qreg |], "vget_high", - notype_1, pf_su_8_64; - Vget_low, [Instruction_name ["vmov"]; - Disassembles_as [Use_operands [| Dreg; Dreg |]]; - Fixed_vector_reg], - Use_operands [| Dreg; Qreg |], "vget_low", - notype_1, pf_su_8_32; - Vget_low, [Requires_feature "CRYPTO"; No_op], - Use_operands [| Dreg; Qreg |], "vget_low", - notype_1, [P64]; - Vget_low, [No_op], - Use_operands [| Dreg; Qreg |], "vget_low", - notype_1, [S64; U64]; - - (* Conversions. *) - Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1, - [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; - Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1, - [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; - Vcvt, [Builtin_name "vcvt" ; Requires_FP_bit 1], - Use_operands [| Dreg; Qreg; |], "vcvt", conv_1, [Conv (F16, F32)]; - Vcvt, [Builtin_name "vcvt" ; Requires_FP_bit 1], - Use_operands [| Qreg; Dreg; |], "vcvt", conv_1, [Conv (F32, F16)]; - Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2, - [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; - Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2, - [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; - - (* Move, narrowing. *) - Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]], - Narrow, "vmovn", sign_invar_1, su_16_64; - Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating], - Narrow, "vqmovn", elts_same_1, su_16_64; - Vmovn, - [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign], - Narrow, "vqmovun", dst_unsign_1, - [S16; S32; S64]; - - (* Move, long. *) - Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]], - Long, "vmovl", elts_same_1, su_8_32; - - (* Table lookup. 
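
For the conversion rows above: a Conv element carries the destination element first and the source second, and both feed type_for_elt, so each pair determines the full C prototype. A self-contained sketch of the naming (types re-declared locally):

(* Standalone sketch: Conv (S32, F32) names the intrinsic vcvt_s32_f32,
   i.e. int32x2_t vcvt_s32_f32 (float32x2_t).  Vcvt_n adds a fraction-bits
   immediate on top of the same pairing.  *)
type cvt_elt = S32 | U32 | F32
type conv = Conv of cvt_elt * cvt_elt

let name_of = function S32 -> "s32" | U32 -> "u32" | F32 -> "f32"
let intrinsic_of (Conv (dst, src)) =
  Printf.sprintf "vcvt_%s_%s" (name_of dst) (name_of src)

let () = assert (intrinsic_of (Conv (S32, F32)) = "vcvt_s32_f32")
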
*) - Vtbl 1, - [Instruction_name ["vtbl"]; - Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]], - Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8]; - Vtbl 2, [Instruction_name ["vtbl"]], - Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2, - [U8; S8; P8]; - Vtbl 3, [Instruction_name ["vtbl"]], - Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2, - [U8; S8; P8]; - Vtbl 4, [Instruction_name ["vtbl"]], - Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2, - [U8; S8; P8]; - - (* Extended table lookup. *) - Vtbx 1, - [Instruction_name ["vtbx"]; - Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]], - Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8]; - Vtbx 2, [Instruction_name ["vtbx"]], - Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io, - [U8; S8; P8]; - Vtbx 3, [Instruction_name ["vtbx"]], - Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io, - [U8; S8; P8]; - Vtbx 4, [Instruction_name ["vtbx"]], - Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io, - [U8; S8; P8]; - - (* Multiply, lane. (note: these were undocumented at the time of - writing). *) - Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane, - [S16; S32; U16; U32; F32]; - Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane, - [S16; S32; U16; U32; F32]; - - (* Multiply-accumulate, lane. *) - Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane, - [S16; S32; U16; U32; F32]; - Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane, - [S16; S32; U16; U32; F32]; - Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane, - [S16; S32; U16; U32]; - Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane", - elts_same_io_lane, [S16; S32]; - - (* Multiply-subtract, lane. *) - Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane, - [S16; S32; U16; U32; F32]; - Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane, - [S16; S32; U16; U32; F32]; - Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane, - [S16; S32; U16; U32]; - Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane", - elts_same_io_lane, [S16; S32]; - - (* Long multiply, lane. *) - Vmull_lane, [], - Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32]; - - (* Saturating doubling long multiply, lane. *) - Vqdmull_lane, [Saturating; Doubling], - Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32]; - - (* Saturating doubling long multiply high, lane. *) - Vqdmulh_lane, [Saturating; Halving], - By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32]; - Vqdmulh_lane, [Saturating; Halving], - By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32]; - Vqdmulh_lane, [Saturating; Halving; Rounding; - Instruction_name ["vqrdmulh"]], - By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32]; - Vqdmulh_lane, [Saturating; Halving; Rounding; - Instruction_name ["vqrdmulh"]], - By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32]; - - (* Vector multiply by scalar. *) - Vmul_n, [InfoWord; - Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], - Use_operands [| Dreg; Dreg; Corereg |], "vmul_n", - sign_invar_2, [S16; S32; U16; U32; F32]; - Vmul_n, [InfoWord; - Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], - Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n", - sign_invar_2, [S16; S32; U16; U32; F32]; - - (* Vector long multiply by scalar. 
*) - Vmull_n, [Instruction_name ["vmull"]; - Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]], - Wide_scalar, "vmull_n", - elts_same_2, [S16; S32; U16; U32]; - - (* Vector saturating doubling long multiply by scalar. *) - Vqdmull_n, [Saturating; Doubling; - Disassembles_as [Use_operands [| Qreg; Dreg; - Element_of_dreg |]]], - Wide_scalar, "vqdmull_n", - elts_same_2, [S16; S32]; - - (* Vector saturating doubling long multiply high by scalar. *) - Vqdmulh_n, - [Saturating; Halving; InfoWord; - Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], - Use_operands [| Qreg; Qreg; Corereg |], - "vqdmulhQ_n", elts_same_2, [S16; S32]; - Vqdmulh_n, - [Saturating; Halving; InfoWord; - Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], - Use_operands [| Dreg; Dreg; Corereg |], - "vqdmulh_n", elts_same_2, [S16; S32]; - Vqdmulh_n, - [Saturating; Halving; Rounding; InfoWord; - Instruction_name ["vqrdmulh"]; - Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], - Use_operands [| Qreg; Qreg; Corereg |], - "vqRdmulhQ_n", elts_same_2, [S16; S32]; - Vqdmulh_n, - [Saturating; Halving; Rounding; InfoWord; - Instruction_name ["vqrdmulh"]; - Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], - Use_operands [| Dreg; Dreg; Corereg |], - "vqRdmulh_n", elts_same_2, [S16; S32]; - - (* Vector multiply-accumulate by scalar. *) - Vmla_n, [InfoWord; - Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], - Use_operands [| Dreg; Dreg; Corereg |], "vmla_n", - sign_invar_io, [S16; S32; U16; U32; F32]; - Vmla_n, [InfoWord; - Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], - Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n", - sign_invar_io, [S16; S32; U16; U32; F32]; - Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32]; - Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io, - [S16; S32]; - - (* Vector multiply subtract by scalar. *) - Vmls_n, [InfoWord; - Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], - Use_operands [| Dreg; Dreg; Corereg |], "vmls_n", - sign_invar_io, [S16; S32; U16; U32; F32]; - Vmls_n, [InfoWord; - Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], - Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n", - sign_invar_io, [S16; S32; U16; U32; F32]; - Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32]; - Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io, - [S16; S32]; - - (* Vector extract. *) - Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)], - Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend, - [P64]; - Vext, [Const_valuator (fun _ -> 0)], - Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend, - pf_su_8_64; - Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)], - Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend, - [P64]; - Vext, [Const_valuator (fun _ -> 0)], - Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend, - pf_su_8_64; - - (* Reverse elements. 
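
The Wide_scalar shape used by the _n rows above maps (per regmap earlier) to a Q-register result, a D-register vector and a plain core-register scalar, which is how vmull_n_s16 gets the C signature int32x4_t vmull_n_s16 (int16x4_t, int16_t). As a sketch:

(* Standalone sketch: Wide_scalar operand registers from regmap.  *)
let wide_scalar_regs = [| "Qreg"; "Dreg"; "Corereg" |]
let () = assert (wide_scalar_regs.(2) = "Corereg")  (* scalar in a core reg *)
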
*) - Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1, - P8 :: P16 :: F32 :: su_8_32; - Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1, - P8 :: P16 :: F32 :: su_8_32; - Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1, - [P8; P16; S8; U8; S16; U16]; - Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1, - [P8; P16; S8; U8; S16; U16]; - Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1, - [P8; S8; U8]; - Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1, - [P8; S8; U8]; - - (* Bit selection. *) - Vbsl, - [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"]; - Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]], - Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select, - [P64]; - Vbsl, - [Instruction_name ["vbsl"; "vbit"; "vbif"]; - Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]], - Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select, - pf_su_8_64; - Vbsl, - [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"]; - Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]], - Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select, - [P64]; - Vbsl, - [Instruction_name ["vbsl"; "vbit"; "vbif"]; - Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]], - Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select, - pf_su_8_64; - - Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_16; - Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vtrn", bits_2, suf_32; - Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32; - (* Zip elements. *) - Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_16; - Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vzip", bits_2, suf_32; - Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; - - (* Unzip elements. *) - Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2, - pf_su_8_32; - Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2, - pf_su_8_32; - - (* Element/structure loads. VLD1 variants. 
*) - Vldx 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1, - [P64]; - Vldx 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1, - pf_su_8_64; - Vldx 1, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (2, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1, - [P64]; - Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1, - pf_su_8_64; - - Vldx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |], - "vld1_lane", bits_3, pf_su_8_32; - Vldx_lane 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]; - Const_valuator (fun _ -> 0)], - Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |], - "vld1_lane", bits_3, [P64]; - Vldx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]; - Const_valuator (fun _ -> 0)], - Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |], - "vld1_lane", bits_3, [S64; U64]; - Vldx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |], - "vld1Q_lane", bits_3, pf_su_8_32; - Vldx_lane 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |], - "vld1Q_lane", bits_3, [P64]; - Vldx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |], - "vld1Q_lane", bits_3, [S64; U64]; - - Vldx_dup 1, - [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup", - bits_1, pf_su_8_32; - Vldx_dup 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup", - bits_1, [P64]; - Vldx_dup 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup", - bits_1, [S64; U64]; - Vldx_dup 1, - [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup", - bits_1, pf_su_8_32; - (* Treated identically to vld1_dup above as we now - do a single load followed by a duplicate. *) - Vldx_dup 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup", - bits_1, [P64]; - Vldx_dup 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup", - bits_1, [S64; U64]; - - (* VST1 variants. 
*) - Vstx 1, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Dreg |], "vst1", - store_1, [P64]; - Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Dreg |], "vst1", - store_1, pf_su_8_64; - Vstx 1, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Qreg |], "vst1Q", - store_1, [P64]; - Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Qreg |], "vst1Q", - store_1, pf_su_8_64; - - Vstx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Dreg; Immed |], - "vst1_lane", store_3, pf_su_8_32; - Vstx_lane 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]; - Const_valuator (fun _ -> 0)], - Use_operands [| PtrTo Corereg; Dreg; Immed |], - "vst1_lane", store_3, [P64]; - Vstx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]; - Const_valuator (fun _ -> 0)], - Use_operands [| PtrTo Corereg; Dreg; Immed |], - "vst1_lane", store_3, [U64; S64]; - Vstx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Qreg; Immed |], - "vst1Q_lane", store_3, pf_su_8_32; - Vstx_lane 1, - [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Qreg; Immed |], - "vst1Q_lane", store_3, [P64]; - Vstx_lane 1, - [Disassembles_as [Use_operands [| VecArray (1, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; Qreg; Immed |], - "vst1Q_lane", store_3, [U64; S64]; - - (* VLD2 variants. 
*) - Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2", bits_1, pf_su_8_32; - Vldx 2, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2", bits_1, [P64]; - Vldx 2, [Instruction_name ["vld1"]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2", bits_1, [S64; U64]; - Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - CstPtrTo Corereg |]; - Use_operands [| VecArray (2, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |], - "vld2Q", bits_1, pf_su_8_32; - - Vldx_lane 2, - [Disassembles_as [Use_operands - [| VecArray (2, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg; - VecArray (2, Dreg); Immed |], - "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; - Vldx_lane 2, - [Disassembles_as [Use_operands - [| VecArray (2, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg; - VecArray (2, Qreg); Immed |], - "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; - - Vldx_dup 2, - [Disassembles_as [Use_operands - [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2_dup", bits_1, pf_su_8_32; - Vldx_dup 2, - [Requires_feature "CRYPTO"; - Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (2, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2_dup", bits_1, [P64]; - Vldx_dup 2, - [Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (2, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], - "vld2_dup", bits_1, [S64; U64]; - - (* VST2 variants. *) - Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2", - store_1, pf_su_8_32; - Vstx 2, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2", - store_1, [P64]; - Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2", - store_1, [S64; U64]; - Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]; - Use_operands [| VecArray (2, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q", - store_1, pf_su_8_32; - - Vstx_lane 2, - [Disassembles_as [Use_operands - [| VecArray (2, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane", - store_3, P8 :: P16 :: F32 :: su_8_32; - Vstx_lane 2, - [Disassembles_as [Use_operands - [| VecArray (2, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane", - store_3, [P16; F32; U16; U32; S16; S32]; - - (* VLD3 variants. 
*) - Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3", bits_1, pf_su_8_32; - Vldx 3, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3", bits_1, [P64]; - Vldx 3, [Instruction_name ["vld1"]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3", bits_1, [S64; U64]; - Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg); - CstPtrTo Corereg |]; - Use_operands [| VecArray (3, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |], - "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32; - - Vldx_lane 3, - [Disassembles_as [Use_operands - [| VecArray (3, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg; - VecArray (3, Dreg); Immed |], - "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; - Vldx_lane 3, - [Disassembles_as [Use_operands - [| VecArray (3, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg; - VecArray (3, Qreg); Immed |], - "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; - - Vldx_dup 3, - [Disassembles_as [Use_operands - [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3_dup", bits_1, pf_su_8_32; - Vldx_dup 3, - [Requires_feature "CRYPTO"; - Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (3, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3_dup", bits_1, [P64]; - Vldx_dup 3, - [Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (3, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], - "vld3_dup", bits_1, [S64; U64]; - - (* VST3 variants. *) - Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3", - store_1, pf_su_8_32; - Vstx 3, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3", - store_1, [P64]; - Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3", - store_1, [S64; U64]; - Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg); - PtrTo Corereg |]; - Use_operands [| VecArray (3, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q", - store_1, pf_su_8_32; - - Vstx_lane 3, - [Disassembles_as [Use_operands - [| VecArray (3, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane", - store_3, P8 :: P16 :: F32 :: su_8_32; - Vstx_lane 3, - [Disassembles_as [Use_operands - [| VecArray (3, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane", - store_3, [P16; F32; U16; U32; S16; S32]; - - (* VLD4/VST4 variants. 
*) - Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4", bits_1, pf_su_8_32; - Vldx 4, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4", bits_1, [P64]; - Vldx 4, [Instruction_name ["vld1"]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4", bits_1, [S64; U64]; - Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - CstPtrTo Corereg |]; - Use_operands [| VecArray (4, Dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |], - "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32; - - Vldx_lane 4, - [Disassembles_as [Use_operands - [| VecArray (4, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg; - VecArray (4, Dreg); Immed |], - "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; - Vldx_lane 4, - [Disassembles_as [Use_operands - [| VecArray (4, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg; - VecArray (4, Qreg); Immed |], - "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; - - Vldx_dup 4, - [Disassembles_as [Use_operands - [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4_dup", bits_1, pf_su_8_32; - Vldx_dup 4, - [Requires_feature "CRYPTO"; - Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (4, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4_dup", bits_1, [P64]; - Vldx_dup 4, - [Instruction_name ["vld1"]; Disassembles_as [Use_operands - [| VecArray (4, Dreg); CstPtrTo Corereg |]]], - Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], - "vld4_dup", bits_1, [S64; U64]; - - Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4", - store_1, pf_su_8_32; - Vstx 4, [Requires_feature "CRYPTO"; - Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4", - store_1, [P64]; - Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]; - Instruction_name ["vst1"]], - Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4", - store_1, [S64; U64]; - Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]; - Use_operands [| VecArray (4, Dreg); - PtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q", - store_1, pf_su_8_32; - - Vstx_lane 4, - [Disassembles_as [Use_operands - [| VecArray (4, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane", - store_3, P8 :: P16 :: F32 :: su_8_32; - Vstx_lane 4, - [Disassembles_as [Use_operands - [| VecArray (4, Element_of_dreg); - CstPtrTo Corereg |]]], - Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane", - store_3, [P16; F32; U16; U32; S16; S32]; - - (* Logical operations. And. *) - Vand, [], All (3, Dreg), "vand", notype_2, su_8_32; - Vand, [No_op], All (3, Dreg), "vand", notype_2, [S64; U64]; - Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64; - - (* Or. *) - Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_32; - Vorr, [No_op], All (3, Dreg), "vorr", notype_2, [S64; U64]; - Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64; - - (* Eor. 
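
The structure loads and stores above carry their register tuples in the opaque integer modes chosen by inttype_for_array earlier; the arithmetic is just total bits divided by 32. A standalone sketch of the two vld4 cases:

(* Standalone sketch of inttype_for_array's word count.  *)
let words num_regs reg_bits = num_regs * reg_bits / 32

let () =
  assert (words 4 64 = 8);    (* vld4 of D registers  -> B_OImode *)
  assert (words 4 128 = 16)   (* vld4Q of Q registers -> B_XImode *)
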
*) - Veor, [], All (3, Dreg), "veor", notype_2, su_8_32; - Veor, [No_op], All (3, Dreg), "veor", notype_2, [S64; U64]; - Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64; - - (* Bic (And-not). *) - Vbic, [Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, su_8_32; - Vbic, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, [S64; U64]; - Vbic, [Compiler_optim "-O2"], All (3, Qreg), "vbicQ", notype_2, su_8_64; - - (* Or-not. *) - Vorn, [Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, su_8_32; - Vorn, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, [S64; U64]; - Vorn, [Compiler_optim "-O2"], All (3, Qreg), "vornQ", notype_2, su_8_64; - ] - -let type_in_crypto_only t - = (t == P64) || (t == P128) - -let cross_product s1 s2 - = List.filter (fun (e, e') -> e <> e') - (List.concat (List.map (fun e1 -> List.map (fun e2 -> (e1,e2)) s1) s2)) - -let reinterp = - let elems = P8 :: P16 :: F32 :: P64 :: su_8_64 in - let casts = cross_product elems elems in - List.map - (fun (convto, convfrom) -> - Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom) - then [Requires_feature "CRYPTO"] else []) @ [No_op], Use_operands [| Dreg; Dreg |], - "vreinterpret", conv_1, [Cast (convto, convfrom)]) - casts - -let reinterpq = - let elems = P8 :: P16 :: F32 :: P64 :: P128 :: su_8_64 in - let casts = cross_product elems elems in - List.map - (fun (convto, convfrom) -> - Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom) - then [Requires_feature "CRYPTO"] else []) @ [No_op], Use_operands [| Qreg; Qreg |], - "vreinterpretQ", conv_1, [Cast (convto, convfrom)]) - casts - -(* Output routines. *) - -let rec string_of_elt = function - S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64" - | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64" - | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64" - | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64" - | F16 -> "f16" | F32 -> "f32" | P8 -> "p8" | P16 -> "p16" - | P64 -> "p64" | P128 -> "p128" - | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b - | NoElts -> failwith "No elts" - -let string_of_elt_dots elt = - match elt with - Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." 
^ string_of_elt b - | _ -> string_of_elt elt - -let string_of_vectype vt = - let rec name affix = function - T_int8x8 -> affix "int8x8" - | T_int8x16 -> affix "int8x16" - | T_int16x4 -> affix "int16x4" - | T_int16x8 -> affix "int16x8" - | T_int32x2 -> affix "int32x2" - | T_int32x4 -> affix "int32x4" - | T_int64x1 -> affix "int64x1" - | T_int64x2 -> affix "int64x2" - | T_uint8x8 -> affix "uint8x8" - | T_uint8x16 -> affix "uint8x16" - | T_uint16x4 -> affix "uint16x4" - | T_uint16x8 -> affix "uint16x8" - | T_uint32x2 -> affix "uint32x2" - | T_uint32x4 -> affix "uint32x4" - | T_uint64x1 -> affix "uint64x1" - | T_uint64x2 -> affix "uint64x2" - | T_float16x4 -> affix "float16x4" - | T_float32x2 -> affix "float32x2" - | T_float32x4 -> affix "float32x4" - | T_poly8x8 -> affix "poly8x8" - | T_poly8x16 -> affix "poly8x16" - | T_poly16x4 -> affix "poly16x4" - | T_poly16x8 -> affix "poly16x8" - | T_int8 -> affix "int8" - | T_int16 -> affix "int16" - | T_int32 -> affix "int32" - | T_int64 -> affix "int64" - | T_uint8 -> affix "uint8" - | T_uint16 -> affix "uint16" - | T_uint32 -> affix "uint32" - | T_uint64 -> affix "uint64" - | T_poly8 -> affix "poly8" - | T_poly16 -> affix "poly16" - | T_poly64 -> affix "poly64" - | T_poly64x1 -> affix "poly64x1" - | T_poly64x2 -> affix "poly64x2" - | T_poly128 -> affix "poly128" - | T_float16 -> affix "float16" - | T_float32 -> affix "float32" - | T_immediate _ -> "const int" - | T_void -> "void" - | T_intQI -> "__builtin_neon_qi" - | T_intHI -> "__builtin_neon_hi" - | T_intSI -> "__builtin_neon_si" - | T_intDI -> "__builtin_neon_di" - | T_intTI -> "__builtin_neon_ti" - | T_floatHF -> "__builtin_neon_hf" - | T_floatSF -> "__builtin_neon_sf" - | T_arrayof (num, base) -> - let basename = name (fun x -> x) base in - affix (Printf.sprintf "%sx%d" basename num) - | T_ptrto x -> - let basename = name affix x in - Printf.sprintf "%s *" basename - | T_const x -> - let basename = name affix x in - Printf.sprintf "const %s" basename - in - name (fun x -> x ^ "_t") vt - -let string_of_inttype = function - B_TImode -> "__builtin_neon_ti" - | B_EImode -> "__builtin_neon_ei" - | B_OImode -> "__builtin_neon_oi" - | B_CImode -> "__builtin_neon_ci" - | B_XImode -> "__builtin_neon_xi" - -let string_of_mode = function - V8QI -> "v8qi" | V4HI -> "v4hi" | V4HF -> "v4hf" | V2SI -> "v2si" - | V2SF -> "v2sf" | DI -> "di" | V16QI -> "v16qi" | V8HI -> "v8hi" - | V4SI -> "v4si" | V4SF -> "v4sf" | V2DI -> "v2di" | QI -> "qi" - | HI -> "hi" | SI -> "si" | SF -> "sf" | TI -> "ti" - -(* Use uppercase chars for letters which form part of the intrinsic name, but - should be omitted from the builtin name (the info is passed in an extra - argument, instead). *) -let intrinsic_name name = String.lowercase name - -(* Allow the name of the builtin to be overridden by things (e.g. Flipped) - found in the features list. *) -let builtin_name features name = - let name = List.fold_right - (fun el name -> - match el with - Flipped x | Builtin_name x -> x - | _ -> name) - features name in - let islower x = let str = String.make 1 x in (String.lowercase str) = str - and buf = Buffer.create (String.length name) in - String.iter (fun c -> if islower c then Buffer.add_char buf c) name; - Buffer.contents buf - -(* Transform an arity into a list of strings. 
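
The uppercase convention above in action, as a standalone sketch (keep_lower is an illustrative re-statement of builtin_name's filter, using the modern String/Char ascii functions): a table name like "vRshl" lowercases to the intrinsic vrshl but drops the R for the builtin vshl.

(* Standalone sketch: intrinsic vs. builtin naming for "vRshl".  *)
let keep_lower s =
  let buf = Buffer.create (String.length s) in
  String.iter
    (fun c -> if Char.lowercase_ascii c = c then Buffer.add_char buf c) s;
  Buffer.contents buf

let () =
  assert (String.lowercase_ascii "vRshl" = "vrshl");  (* intrinsic name *)
  assert (keep_lower "vRshl" = "vshl")                (* builtin name   *)
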
*) -let strings_of_arity a = - match a with - | Arity0 vt -> [string_of_vectype vt] - | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2] - | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1; - string_of_vectype vt2; - string_of_vectype vt3] - | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1; - string_of_vectype vt2; - string_of_vectype vt3; - string_of_vectype vt4] - | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1; - string_of_vectype vt2; - string_of_vectype vt3; - string_of_vectype vt4; - string_of_vectype vt5] - -(* Suffixes on the end of builtin names that are to be stripped in order - to obtain the name used as an instruction. They are only stripped if - preceded immediately by an underscore. *) -let suffixes_to_strip = [ "n"; "lane"; "dup" ] - -(* Get the possible names of an instruction corresponding to a "name" from the - ops table. This is done by getting the equivalent builtin name and - stripping any suffixes from the list at the top of this file, unless - the features list presents with an Instruction_name entry, in which - case that is used; or unless the features list presents with a Flipped - entry, in which case that is used. If both such entries are present, - the first in the list will be chosen. *) -let get_insn_names features name = - let names = try - begin - match List.find (fun feature -> match feature with - Instruction_name _ -> true - | Flipped _ -> true - | _ -> false) features - with - Instruction_name names -> names - | Flipped name -> [name] - | _ -> assert false - end - with Not_found -> [builtin_name features name] - in - begin - List.map (fun name' -> - try - let underscore = String.rindex name' '_' in - let our_suffix = String.sub name' (underscore + 1) - ((String.length name') - underscore - 1) - in - let rec strip remaining_suffixes = - match remaining_suffixes with - [] -> name' - | s::ss when our_suffix = s -> String.sub name' 0 underscore - | _::ss -> strip ss - in - strip suffixes_to_strip - with (Not_found | Invalid_argument _) -> name') names - end - -(* Apply a function to each element of a list and then comma-separate - the resulting strings. *) -let rec commas f elts acc = - match elts with - [] -> acc - | [elt] -> acc ^ (f elt) - | elt::elts -> - commas f elts (acc ^ (f elt) ^ ", ") - -(* Given a list of features and the shape specified in the "ops" table, apply - a function to each possible shape that the instruction may have. - By default, this is the "shape" entry in "ops". If the features list - contains a Disassembles_as entry, the shapes contained in that entry are - mapped to corresponding outputs and returned in a list. If there is more - than one Disassembles_as entry, only the first is used. *) -let analyze_all_shapes features shape f = - try - match List.find (fun feature -> - match feature with Disassembles_as _ -> true - | _ -> false) - features with - Disassembles_as shapes -> List.map f shapes - | _ -> assert false - with Not_found -> [f shape] - -(* The crypto intrinsics have unconventional shapes and are not that - numerous to be worth the trouble of encoding here. We implement them - explicitly here. 
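
A self-contained sketch of the suffix stripping done by get_insn_names above (function name illustrative; the original walks the suffix list recursively, this version tests membership directly): only a suffix immediately after the last underscore is removed.

(* Standalone sketch of instruction-name suffix stripping.  *)
let strip_suffix suffixes name =
  match String.rindex_opt name '_' with
    None -> name
  | Some us ->
      let suffix = String.sub name (us + 1) (String.length name - us - 1) in
      if List.mem suffix suffixes then String.sub name 0 us else name

let () =
  assert (strip_suffix ["n"; "lane"; "dup"] "vshl_n" = "vshl");
  assert (strip_suffix ["n"; "lane"; "dup"] "vadd" = "vadd")
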
*)
-let crypto_intrinsics =
-"
-#ifdef __ARM_FEATURE_CRYPTO
-
-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-vldrq_p128 (poly128_t const * __ptr)
-{
-#ifdef __ARM_BIG_ENDIAN
-  poly64_t* __ptmp = (poly64_t*) __ptr;
-  poly64_t __d0 = vld1_p64 (__ptmp);
-  poly64_t __d1 = vld1_p64 (__ptmp + 1);
-  return vreinterpretq_p128_p64 (vcombine_p64 (__d1, __d0));
-#else
-  return vreinterpretq_p128_p64 (vld1q_p64 ((poly64_t*) __ptr));
-#endif
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vstrq_p128 (poly128_t * __ptr, poly128_t __val)
-{
-#ifdef __ARM_BIG_ENDIAN
-  poly64x2_t __tmp = vreinterpretq_p64_p128 (__val);
-  poly64_t __d0 = vget_high_p64 (__tmp);
-  poly64_t __d1 = vget_low_p64 (__tmp);
-  vst1q_p64 ((poly64_t*) __ptr, vcombine_p64 (__d0, __d1));
-#else
-  vst1q_p64 ((poly64_t*) __ptr, vreinterpretq_p64_p128 (__val));
-#endif
-}
-
-/* The vceq_p64 intrinsic does not map to a single instruction.
-   Instead we emulate it by performing a 32-bit variant of the vceq
-   and applying a pairwise min reduction to the result.
-   vceq_u32 will produce two 32-bit halves, each of which will contain either
-   all ones or all zeros depending on whether the corresponding 32-bit
-   halves of the poly64_t were equal.  The whole poly64_t values are equal
-   if and only if both halves are equal, i.e. vceq_u32 returns all ones.
-   If the result is all zeroes for any half then the whole result is zeroes.
-   This is what the pairwise min reduction achieves.  */
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_p64 (poly64x1_t __a, poly64x1_t __b)
-{
-  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
-  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
-  uint32x2_t __c = vceq_u32 (__t_a, __t_b);
-  uint32x2_t __m = vpmin_u32 (__c, __c);
-  return vreinterpret_u64_u32 (__m);
-}
-
-/* The vtst_p64 intrinsic does not map to a single instruction.
-   We emulate it in a way similar to vceq_p64 above but here we do
-   a reduction with max since if any two corresponding bits
-   in the two poly64_t's match, then the whole result must be all ones.
*/ - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtst_p64 (poly64x1_t __a, poly64x1_t __b) -{ - uint32x2_t __t_a = vreinterpret_u32_p64 (__a); - uint32x2_t __t_b = vreinterpret_u32_p64 (__b); - uint32x2_t __c = vtst_u32 (__t_a, __t_b); - uint32x2_t __m = vpmax_u32 (__c, __c); - return vreinterpret_u64_u32 (__m); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) -{ - return __builtin_arm_crypto_aese (__data, __key); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vaesdq_u8 (uint8x16_t __data, uint8x16_t __key) -{ - return __builtin_arm_crypto_aesd (__data, __key); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vaesmcq_u8 (uint8x16_t __data) -{ - return __builtin_arm_crypto_aesmc (__data); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vaesimcq_u8 (uint8x16_t __data) -{ - return __builtin_arm_crypto_aesimc (__data); -} - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vsha1h_u32 (uint32_t __hash_e) -{ - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - __t = __builtin_arm_crypto_sha1h (__t); - return vgetq_lane_u32 (__t, 0); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) -{ - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) -{ - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) -{ - uint32x4_t __t = vdupq_n_u32 (0); - __t = vsetq_lane_u32 (__hash_e, __t, 0); - return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11) -{ - return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15) -{ - return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) -{ - return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) -{ - return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7) -{ - return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) -{ - return __builtin_arm_crypto_sha256su1 (__tw0_3, 
__w8_11, __w12_15); -} - -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) -vmull_p64 (poly64_t __a, poly64_t __b) -{ - return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b); -} - -__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) -vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) -{ - poly64_t __t1 = vget_high_p64 (__a); - poly64_t __t2 = vget_high_p64 (__b); - - return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __t1, (uint64_t) __t2); -} - -#endif -" --- a/src/gcc/config/arm/predicates.md +++ b/src/gcc/config/arm/predicates.md @@ -141,8 +141,7 @@ (match_test "const_ok_for_arm (~INTVAL (op))"))) (define_predicate "const0_operand" - (and (match_code "const_int") - (match_test "INTVAL (op) == 0"))) + (match_test "op == CONST0_RTX (mode)")) ;; Something valid on the RHS of an ARM data-processing instruction (define_predicate "arm_rhs_operand" @@ -170,8 +169,7 @@ (define_predicate "const_neon_scalar_shift_amount_operand" (and (match_code "const_int") - (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) <= GET_MODE_BITSIZE (mode) - && ((unsigned HOST_WIDE_INT) INTVAL (op)) > 0"))) + (match_test "IN_RANGE (UINTVAL (op), 1, GET_MODE_BITSIZE (mode))"))) (define_predicate "ldrd_strd_offset_operand" (and (match_operand 0 "const_int_operand") @@ -243,11 +241,6 @@ (and (match_code "const_double") (match_test "arm_const_double_rtx (op)")))) -(define_predicate "arm_float_compare_operand" - (if_then_else (match_test "TARGET_VFP") - (match_operand 0 "vfp_compare_operand") - (match_operand 0 "s_register_operand"))) - ;; True for valid index operands. (define_predicate "index_operand" (ior (match_operand 0 "s_register_operand") @@ -285,19 +278,19 @@ (match_test "power_of_two_operand (XEXP (op, 1), mode)")) (and (match_code "rotate") (match_test "CONST_INT_P (XEXP (op, 1)) - && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32"))) + && (UINTVAL (XEXP (op, 1))) < 32"))) (and (match_code "ashift,ashiftrt,lshiftrt,rotatert") (match_test "!CONST_INT_P (XEXP (op, 1)) - || ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32"))) + || (UINTVAL (XEXP (op, 1))) < 32"))) (match_test "mode == GET_MODE (op)"))) (define_special_predicate "shift_nomul_operator" (and (ior (and (match_code "rotate") (match_test "CONST_INT_P (XEXP (op, 1)) - && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")) + && (UINTVAL (XEXP (op, 1))) < 32")) (and (match_code "ashift,ashiftrt,lshiftrt,rotatert") (match_test "!CONST_INT_P (XEXP (op, 1)) - || ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32"))) + || (UINTVAL (XEXP (op, 1))) < 32"))) (match_test "mode == GET_MODE (op)"))) ;; True for shift operators which can be used with saturation instructions. @@ -306,7 +299,7 @@ (match_test "power_of_two_operand (XEXP (op, 1), mode)")) (and (match_code "ashift,ashiftrt") (match_test "CONST_INT_P (XEXP (op, 1)) - && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1)) < 32)"))) + && (UINTVAL (XEXP (op, 1)) < 32)"))) (match_test "mode == GET_MODE (op)"))) ;; True for MULT, to identify which variant of shift_operator is in use. @@ -398,6 +391,12 @@ || mode == CC_DGTUmode)); }) +;; Any register, including CC +(define_predicate "cc_register_operand" + (and (match_code "reg") + (ior (match_operand 0 "s_register_operand") + (match_operand 0 "cc_register")))) + (define_special_predicate "arm_extendqisi_mem_op" (and (match_operand 0 "memory_operand") (match_test "TARGET_ARM ? 
arm_legitimate_address_outer_p (mode, @@ -532,7 +531,7 @@ (ior (and (match_code "reg,subreg") (match_operand 0 "s_register_operand")) (and (match_code "const_int") - (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 256")))) + (match_test "(UINTVAL (op)) < 256")))) (define_predicate "thumb1_cmpneg_operand" (and (match_code "const_int") @@ -612,69 +611,23 @@ (define_special_predicate "vect_par_constant_high" (match_code "parallel") { - HOST_WIDE_INT count = XVECLEN (op, 0); - int i; - int base = GET_MODE_NUNITS (mode); - - if ((count < 1) - || (count != base/2)) - return false; - - if (!VECTOR_MODE_P (mode)) - return false; - - for (i = 0; i < count; i++) - { - rtx elt = XVECEXP (op, 0, i); - int val; - - if (!CONST_INT_P (elt)) - return false; - - val = INTVAL (elt); - if (val != (base/2) + i) - return false; - } - return true; + return arm_simd_check_vect_par_cnst_half_p (op, mode, true); }) (define_special_predicate "vect_par_constant_low" (match_code "parallel") { - HOST_WIDE_INT count = XVECLEN (op, 0); - int i; - int base = GET_MODE_NUNITS (mode); - - if ((count < 1) - || (count != base/2)) - return false; - - if (!VECTOR_MODE_P (mode)) - return false; - - for (i = 0; i < count; i++) - { - rtx elt = XVECEXP (op, 0, i); - int val; - - if (!CONST_INT_P (elt)) - return false; - - val = INTVAL (elt); - if (val != i) - return false; - } - return true; + return arm_simd_check_vect_par_cnst_half_p (op, mode, false); }) (define_predicate "const_double_vcvt_power_of_two_reciprocal" (and (match_code "const_double") - (match_test "TARGET_32BIT && TARGET_VFP - && vfp3_const_double_for_fract_bits (op)"))) + (match_test "TARGET_32BIT + && vfp3_const_double_for_fract_bits (op)"))) (define_predicate "const_double_vcvt_power_of_two" (and (match_code "const_double") - (match_test "TARGET_32BIT && TARGET_VFP + (match_test "TARGET_32BIT && vfp3_const_double_for_bits (op) > 0"))) (define_predicate "neon_struct_operand" --- a/src/gcc/config/arm/sync.md +++ b/src/gcc/config/arm/sync.md @@ -63,37 +63,59 @@ (set_attr "predicable" "no")]) (define_insn "atomic_load" - [(set (match_operand:QHSI 0 "register_operand" "=r") + [(set (match_operand:QHSI 0 "register_operand" "=r,r,l") (unspec_volatile:QHSI - [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q") - (match_operand:SI 2 "const_int_operand")] ;; model + [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q") + (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model VUNSPEC_LDA))] "TARGET_HAVE_LDACQ" { enum memmodel model = memmodel_from_int (INTVAL (operands[2])); if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model)) - return \"ldr%?\\t%0, %1\"; + { + if (TARGET_THUMB1) + return \"ldr\\t%0, %1\"; + else + return \"ldr%?\\t%0, %1\"; + } else - return \"lda%?\\t%0, %1\"; + { + if (TARGET_THUMB1) + return \"lda\\t%0, %1\"; + else + return \"lda%?\\t%0, %1\"; + } } - [(set_attr "predicable" "yes") + [(set_attr "arch" "32,v8mb,any") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "atomic_store" - [(set (match_operand:QHSI 0 "memory_operand" "=Q") + [(set (match_operand:QHSI 0 "memory_operand" "=Q,Q,Q") (unspec_volatile:QHSI - [(match_operand:QHSI 1 "general_operand" "r") - (match_operand:SI 2 "const_int_operand")] ;; model + [(match_operand:QHSI 1 "general_operand" "r,r,l") + (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model VUNSPEC_STL))] "TARGET_HAVE_LDACQ" { enum memmodel model = memmodel_from_int (INTVAL (operands[2])); if (is_mm_relaxed (model) || is_mm_consume (model) || 
is_mm_acquire (model))
-    return \"str%?\t%1, %0\";
+    {
+      if (TARGET_THUMB1)
+	return \"str\t%1, %0\";
+      else
+	return \"str%?\t%1, %0\";
+    }
   else
-    return \"stl%?\t%1, %0\";
+    {
+      if (TARGET_THUMB1)
+	return \"stl\t%1, %0\";
+      else
+	return \"stl%?\t%1, %0\";
+    }
 }
-  [(set_attr "predicable" "yes")
+  [(set_attr "arch" "32,v8mb,any")
+   (set_attr "predicable" "yes")
    (set_attr "predicable_short_it" "no")])

;; An LDRD instruction usable by the atomic_loaddi expander on LPAE targets
@@ -117,7 +139,7 @@
   [(match_operand:DI 0 "s_register_operand")	;; val out
    (match_operand:DI 1 "mem_noofs_operand")	;; memory
    (match_operand:SI 2 "const_int_operand")]	;; model
-  "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQ)
+  "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQEXD)
   && ARM_DOUBLEWORD_ALIGN"
{
  memmodel model = memmodel_from_int (INTVAL (operands[2]));
@@ -125,7 +147,7 @@
  /* For ARMv8-A we can use an LDAEXD to atomically load two 32-bit registers
     when acquire or stronger semantics are needed.  When the relaxed model is
     used this can be relaxed to a normal LDRD.  */
-  if (TARGET_HAVE_LDACQ)
+  if (TARGET_HAVE_LDACQEXD)
    {
      if (is_mm_relaxed (model))
	emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1]));
@@ -167,21 +189,23 @@
  DONE;
})

+;; Constraints of this pattern must be at least as strict as those of the
+;; cbranchsi operations in thumb1.md and aim to be as permissive.
(define_insn_and_split "atomic_compare_and_swap<mode>_1"
-  [(set (reg:CC_Z CC_REGNUM)					;; bool out
+  [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l")	;; bool out
	(unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS))
-   (set (match_operand:SI 0 "s_register_operand" "=&r")	;; val out
+   (set (match_operand:SI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out
	(zero_extend:SI
-	  (match_operand:NARROW 1 "mem_noofs_operand" "+Ua")))	;; memory
-   (set (match_dup 1)
+	  (match_operand:NARROW 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))) ;; memory
+   (set (match_dup 2)
	(unspec_volatile:NARROW
-	  [(match_operand:SI 2 "arm_add_operand" "rIL")		;; expected
-	   (match_operand:NARROW 3 "s_register_operand" "r")	;; desired
-	   (match_operand:SI 4 "const_int_operand")		;; is_weak
-	   (match_operand:SI 5 "const_int_operand")		;; mod_s
-	   (match_operand:SI 6 "const_int_operand")]		;; mod_f
+	  [(match_operand:SI 3 "arm_add_operand" "rIL,lIL*h,J,*r") ;; expected
+	   (match_operand:NARROW 4 "s_register_operand" "r,r,r,r") ;; desired
+	   (match_operand:SI 5 "const_int_operand")		;; is_weak
+	   (match_operand:SI 6 "const_int_operand")		;; mod_s
+	   (match_operand:SI 7 "const_int_operand")]		;; mod_f
	  VUNSPEC_ATOMIC_CAS))
-   (clobber (match_scratch:SI 7 "=&r"))]
+   (clobber (match_scratch:SI 8 "=&r,X,X,X"))]
  ""
  "#"
  "&& reload_completed"
@@ -189,27 +213,30 @@
  {
    arm_split_compare_and_swap (operands);
    DONE;
-  })
+  }
+  [(set_attr "arch" "32,v8mb,v8mb,v8mb")])

(define_mode_attr cas_cmp_operand [(SI "arm_add_operand") (DI "cmpdi_operand")])
(define_mode_attr cas_cmp_str [(SI "rIL") (DI "rDi")])

+;; Constraints of this pattern must be at least as strict as those of the
+;; cbranchsi operations in thumb1.md and aim to be as permissive.
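Illustration (not part of the patch): these patterns are what GCC selects when expanding the C11 atomics, so the Thumb-1 alternatives added above are what make the following buildable on ARMv8-M Baseline. A minimal sketch, assuming a target with LDA/STL (e.g. -march=armv8-a, or ARMv8-M Baseline for the new alternatives):

#include <stdatomic.h>

int load_acquire (atomic_int *p)
{
  /* Acquire and seq_cst loads need LDA; relaxed and consume loads may
     fall back to a plain LDR, mirroring the memory-model test in the
     atomic_load pattern above.  */
  return atomic_load_explicit (p, memory_order_acquire);
}

void store_release (atomic_int *p, int v)
{
  /* Release and seq_cst stores need STL; weaker orders may use STR.  */
  atomic_store_explicit (p, v, memory_order_release);
}

_Bool cas (atomic_int *p, int expected, int desired)
{
  /* Expands through the atomic_compare_and_swap patterns here.  */
  return atomic_compare_exchange_strong (p, &expected, desired);
}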
(define_insn_and_split "atomic_compare_and_swap_1" - [(set (reg:CC_Z CC_REGNUM) ;; bool out + [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out (unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS)) - (set (match_operand:SIDI 0 "s_register_operand" "=&r") ;; val out - (match_operand:SIDI 1 "mem_noofs_operand" "+Ua")) ;; memory - (set (match_dup 1) + (set (match_operand:SIDI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out + (match_operand:SIDI 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua")) ;; memory + (set (match_dup 2) (unspec_volatile:SIDI - [(match_operand:SIDI 2 "" "") ;; expect - (match_operand:SIDI 3 "s_register_operand" "r") ;; desired - (match_operand:SI 4 "const_int_operand") ;; is_weak - (match_operand:SI 5 "const_int_operand") ;; mod_s - (match_operand:SI 6 "const_int_operand")] ;; mod_f + [(match_operand:SIDI 3 "" ",lIL*h,J,*r") ;; expect + (match_operand:SIDI 4 "s_register_operand" "r,r,r,r") ;; desired + (match_operand:SI 5 "const_int_operand") ;; is_weak + (match_operand:SI 6 "const_int_operand") ;; mod_s + (match_operand:SI 7 "const_int_operand")] ;; mod_f VUNSPEC_ATOMIC_CAS)) - (clobber (match_scratch:SI 7 "=&r"))] + (clobber (match_scratch:SI 8 "=&r,X,X,X"))] "" "#" "&& reload_completed" @@ -217,18 +244,19 @@ { arm_split_compare_and_swap (operands); DONE; - }) + } + [(set_attr "arch" "32,v8mb,v8mb,v8mb")]) (define_insn_and_split "atomic_exchange" - [(set (match_operand:QHSD 0 "s_register_operand" "=&r") ;; output - (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")) ;; memory + [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r") ;; output + (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")) ;; memory (set (match_dup 1) (unspec_volatile:QHSD - [(match_operand:QHSD 2 "s_register_operand" "r") ;; input + [(match_operand:QHSD 2 "s_register_operand" "r,r") ;; input (match_operand:SI 3 "const_int_operand" "")] ;; model VUNSPEC_ATOMIC_XCHG)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:SI 4 "=&r"))] + (clobber (match_scratch:SI 4 "=&r,&l"))] "" "#" "&& reload_completed" @@ -237,7 +265,11 @@ arm_split_atomic_op (SET, operands[0], NULL, operands[1], operands[2], operands[3], operands[4]); DONE; - }) + } + [(set_attr "arch" "32,v8mb")]) + +;; The following mode and code attribute are defined here because they are +;; specific to atomics and are not needed anywhere else. (define_mode_attr atomic_op_operand [(QI "reg_or_int_operand") @@ -248,16 +280,24 @@ (define_mode_attr atomic_op_str [(QI "rn") (HI "rn") (SI "rn") (DI "r")]) +(define_code_attr thumb1_atomic_op_str + [(ior "l,l") (xor "l,l") (and "l,l") (plus "lIJL,r") (minus "lPd,lPd")]) + +(define_code_attr thumb1_atomic_newop_str + [(ior "&l,&l") (xor "&l,&l") (and "&l,&l") (plus "&l,&r") (minus "&l,&l")]) + +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic operations in thumb1.md and aim to be as permissive. 
(define_insn_and_split "atomic_" - [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua") + [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua,Ua") (unspec_volatile:QHSD [(syncop:QHSD (match_dup 0) - (match_operand:QHSD 1 "" "")) + (match_operand:QHSD 1 "" ",")) (match_operand:SI 2 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:QHSD 3 "=&r")) - (clobber (match_scratch:SI 4 "=&r"))] + (clobber (match_scratch:QHSD 3 "=&r,")) + (clobber (match_scratch:SI 4 "=&r,&l,&l"))] "" "#" "&& reload_completed" @@ -266,19 +306,22 @@ arm_split_atomic_op (, NULL, operands[3], operands[0], operands[1], operands[2], operands[4]); DONE; - }) + } + [(set_attr "arch" "32,v8mb,v8mb")]) +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic NANDs in thumb1.md and aim to be as permissive. (define_insn_and_split "atomic_nand" - [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua") + [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua") (unspec_volatile:QHSD [(not:QHSD (and:QHSD (match_dup 0) - (match_operand:QHSD 1 "" ""))) + (match_operand:QHSD 1 "" ",l"))) (match_operand:SI 2 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:QHSD 3 "=&r")) - (clobber (match_scratch:SI 4 "=&r"))] + (clobber (match_scratch:QHSD 3 "=&r,&l")) + (clobber (match_scratch:SI 4 "=&r,&l"))] "" "#" "&& reload_completed" @@ -287,20 +330,38 @@ arm_split_atomic_op (NOT, NULL, operands[3], operands[0], operands[1], operands[2], operands[4]); DONE; - }) + } + [(set_attr "arch" "32,v8mb")]) + +;; 3 alternatives are needed to represent constraints after split from +;; thumb1_addsi3: (i) case where operand1 and destination can be in different +;; registers, (ii) case where they are in the same low register and (iii) case +;; when they are in the same register without restriction on the register. We +;; disparage slightly alternatives that require copying the old value into the +;; register for the new value (see bind_old_new in arm_split_atomic_op). +(define_code_attr thumb1_atomic_fetch_op_str + [(ior "l,l,l") (xor "l,l,l") (and "l,l,l") (plus "lL,?IJ,?r") (minus "lPd,lPd,lPd")]) + +(define_code_attr thumb1_atomic_fetch_newop_str + [(ior "&l,&l,&l") (xor "&l,&l,&l") (and "&l,&l,&l") (plus "&l,&l,&r") (minus "&l,&l,&l")]) +(define_code_attr thumb1_atomic_fetch_oldop_str + [(ior "&r,&r,&r") (xor "&r,&r,&r") (and "&r,&r,&r") (plus "&l,&r,&r") (minus "&l,&l,&l")]) + +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic operations in thumb1.md and aim to be as permissive. 
(define_insn_and_split "atomic_fetch_" - [(set (match_operand:QHSD 0 "s_register_operand" "=&r") - (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")) + [(set (match_operand:QHSD 0 "s_register_operand" "=&r,") + (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua,Ua")) (set (match_dup 1) (unspec_volatile:QHSD [(syncop:QHSD (match_dup 1) - (match_operand:QHSD 2 "" "")) + (match_operand:QHSD 2 "" ",")) (match_operand:SI 3 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:QHSD 4 "=&r")) - (clobber (match_scratch:SI 5 "=&r"))] + (clobber (match_scratch:QHSD 4 "=&r,")) + (clobber (match_scratch:SI 5 "=&r,&l,&l,&l"))] "" "#" "&& reload_completed" @@ -309,21 +370,24 @@ arm_split_atomic_op (, operands[0], operands[4], operands[1], operands[2], operands[3], operands[5]); DONE; - }) + } + [(set_attr "arch" "32,v8mb,v8mb,v8mb")]) +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic NANDs in thumb1.md and aim to be as permissive. (define_insn_and_split "atomic_fetch_nand" - [(set (match_operand:QHSD 0 "s_register_operand" "=&r") - (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")) + [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r") + (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")) (set (match_dup 1) (unspec_volatile:QHSD [(not:QHSD (and:QHSD (match_dup 1) - (match_operand:QHSD 2 "" ""))) + (match_operand:QHSD 2 "" ",l"))) (match_operand:SI 3 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:QHSD 4 "=&r")) - (clobber (match_scratch:SI 5 "=&r"))] + (clobber (match_scratch:QHSD 4 "=&r,&l")) + (clobber (match_scratch:SI 5 "=&r,&l"))] "" "#" "&& reload_completed" @@ -332,20 +396,23 @@ arm_split_atomic_op (NOT, operands[0], operands[4], operands[1], operands[2], operands[3], operands[5]); DONE; - }) + } + [(set_attr "arch" "32,v8mb")]) +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic operations in thumb1.md and aim to be as permissive. (define_insn_and_split "atomic__fetch" - [(set (match_operand:QHSD 0 "s_register_operand" "=&r") + [(set (match_operand:QHSD 0 "s_register_operand" "=&r,") (syncop:QHSD - (match_operand:QHSD 1 "mem_noofs_operand" "+Ua") - (match_operand:QHSD 2 "" ""))) + (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua") + (match_operand:QHSD 2 "" ","))) (set (match_dup 1) (unspec_volatile:QHSD [(match_dup 1) (match_dup 2) (match_operand:SI 3 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:SI 4 "=&r"))] + (clobber (match_scratch:SI 4 "=&r,&l,&l"))] "" "#" "&& reload_completed" @@ -354,21 +421,24 @@ arm_split_atomic_op (, NULL, operands[0], operands[1], operands[2], operands[3], operands[4]); DONE; - }) + } + [(set_attr "arch" "32,v8mb,v8mb")]) +;; Constraints of this pattern must be at least as strict as those of the non +;; atomic NANDs in thumb1.md and aim to be as permissive. 
(define_insn_and_split "atomic_nand_fetch" - [(set (match_operand:QHSD 0 "s_register_operand" "=&r") + [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&l") (not:QHSD (and:QHSD - (match_operand:QHSD 1 "mem_noofs_operand" "+Ua") - (match_operand:QHSD 2 "" "")))) + (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua") + (match_operand:QHSD 2 "" ",l")))) (set (match_dup 1) (unspec_volatile:QHSD [(match_dup 1) (match_dup 2) (match_operand:SI 3 "const_int_operand")] ;; model VUNSPEC_ATOMIC_OP)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:SI 4 "=&r"))] + (clobber (match_scratch:SI 4 "=&r,&l"))] "" "#" "&& reload_completed" @@ -377,48 +447,61 @@ arm_split_atomic_op (NOT, NULL, operands[0], operands[1], operands[2], operands[3], operands[4]); DONE; - }) + } + [(set_attr "arch" "32,v8mb")]) (define_insn "arm_load_exclusive" - [(set (match_operand:SI 0 "s_register_operand" "=r") + [(set (match_operand:SI 0 "s_register_operand" "=r,r") (zero_extend:SI (unspec_volatile:NARROW - [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")] + [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")] VUNSPEC_LL)))] "TARGET_HAVE_LDREXBH" - "ldrex%?\t%0, %C1" - [(set_attr "predicable" "yes") + "@ + ldrex%?\t%0, %C1 + ldrex\t%0, %C1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "arm_load_acquire_exclusive" - [(set (match_operand:SI 0 "s_register_operand" "=r") + [(set (match_operand:SI 0 "s_register_operand" "=r,r") (zero_extend:SI (unspec_volatile:NARROW - [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")] + [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")] VUNSPEC_LAX)))] "TARGET_HAVE_LDACQ" - "ldaex%?\\t%0, %C1" - [(set_attr "predicable" "yes") + "@ + ldaex%?\\t%0, %C1 + ldaex\\t%0, %C1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "arm_load_exclusivesi" - [(set (match_operand:SI 0 "s_register_operand" "=r") + [(set (match_operand:SI 0 "s_register_operand" "=r,r") (unspec_volatile:SI - [(match_operand:SI 1 "mem_noofs_operand" "Ua")] + [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")] VUNSPEC_LL))] "TARGET_HAVE_LDREX" - "ldrex%?\t%0, %C1" - [(set_attr "predicable" "yes") + "@ + ldrex%?\t%0, %C1 + ldrex\t%0, %C1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "arm_load_acquire_exclusivesi" - [(set (match_operand:SI 0 "s_register_operand" "=r") + [(set (match_operand:SI 0 "s_register_operand" "=r,r") (unspec_volatile:SI - [(match_operand:SI 1 "mem_noofs_operand" "Ua")] + [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")] VUNSPEC_LAX))] "TARGET_HAVE_LDACQ" - "ldaex%?\t%0, %C1" - [(set_attr "predicable" "yes") + "@ + ldaex%?\t%0, %C1 + ldaex\t%0, %C1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "arm_load_exclusivedi" @@ -436,7 +519,7 @@ (unspec_volatile:DI [(match_operand:DI 1 "mem_noofs_operand" "Ua")] VUNSPEC_LAX))] - "TARGET_HAVE_LDACQ && ARM_DOUBLEWORD_ALIGN" + "TARGET_HAVE_LDACQEXD && ARM_DOUBLEWORD_ALIGN" "ldaexd%?\t%0, %H0, %C1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) @@ -452,16 +535,18 @@ { if (mode == DImode) { - rtx value = operands[2]; /* The restrictions on target registers in ARM mode are that the two registers are consecutive and the first one is even; Thumb is actually more flexible, but DI should give us this anyway. 
- Note that the 1st register always gets the lowest word in memory. */ - gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2); - operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1); - return "strexd%?\t%0, %2, %3, %C1"; + Note that the 1st register always gets the + lowest word in memory. */ + gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2); + return "strexd%?\t%0, %2, %H2, %C1"; } - return "strex%?\t%0, %2, %C1"; + if (TARGET_THUMB1) + return "strex\t%0, %2, %C1"; + else + return "strex%?\t%0, %2, %C1"; } [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) @@ -473,25 +558,26 @@ (unspec_volatile:DI [(match_operand:DI 2 "s_register_operand" "r")] VUNSPEC_SLX))] - "TARGET_HAVE_LDACQ && ARM_DOUBLEWORD_ALIGN" + "TARGET_HAVE_LDACQEXD && ARM_DOUBLEWORD_ALIGN" { - rtx value = operands[2]; /* See comment in arm_store_exclusive above. */ - gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2); - operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1); - return "stlexd%?\t%0, %2, %3, %C1"; + gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2); + return "stlexd%?\t%0, %2, %H2, %C1"; } [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) (define_insn "arm_store_release_exclusive" - [(set (match_operand:SI 0 "s_register_operand" "=&r") + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r") (unspec_volatile:SI [(const_int 0)] VUNSPEC_SLX)) - (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua") + (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua,Ua") (unspec_volatile:QHSI - [(match_operand:QHSI 2 "s_register_operand" "r")] + [(match_operand:QHSI 2 "s_register_operand" "r,r")] VUNSPEC_SLX))] "TARGET_HAVE_LDACQ" - "stlex%?\t%0, %2, %C1" - [(set_attr "predicable" "yes") + "@ + stlex%?\t%0, %2, %C1 + stlex\t%0, %2, %C1" + [(set_attr "arch" "32,v8mb") + (set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) --- a/src/gcc/config/arm/t-aprofile +++ b/src/gcc/config/arm/t-aprofile @@ -49,38 +49,33 @@ MULTILIB_DIRNAMES += fpv3 simdv1 fpv4 simdvfpv4 simdv8 MULTILIB_OPTIONS += mfloat-abi=softfp/mfloat-abi=hard MULTILIB_DIRNAMES += softfp hard -# We don't build no-float libraries with an FPU. -MULTILIB_EXCEPTIONS += *mfpu=vfpv3-d16 -MULTILIB_EXCEPTIONS += *mfpu=neon -MULTILIB_EXCEPTIONS += *mfpu=vfpv4-d16 -MULTILIB_EXCEPTIONS += *mfpu=neon-vfpv4 -MULTILIB_EXCEPTIONS += *mfpu=neon-fp-armv8 - -# We don't build libraries requiring an FPU at the CPU/Arch/ISA level. -MULTILIB_EXCEPTIONS += mfloat-abi=* -MULTILIB_EXCEPTIONS += mfpu=* -MULTILIB_EXCEPTIONS += mthumb/mfloat-abi=* -MULTILIB_EXCEPTIONS += mthumb/mfpu=* -MULTILIB_EXCEPTIONS += *march=armv7-a/mfloat-abi=* -MULTILIB_EXCEPTIONS += *march=armv7ve/mfloat-abi=* -MULTILIB_EXCEPTIONS += *march=armv8-a/mfloat-abi=* - -# Ensure the correct FPU variants apply to the correct base architectures. 
-MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=vfpv3-d16* -MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=neon/* -MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=vfpv3-d16* -MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=neon/* -MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=vfpv4-d16* -MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=neon-vfpv4* -MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=vfpv4-d16* -MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=neon-vfpv4* -MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=neon-fp-armv8* -MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=neon-fp-armv8* + +# Option combinations to build library with + +# Default CPU/Arch (ARM is implicitly included because it uses the default +# multilib) +MULTILIB_REQUIRED += mthumb + +# ARMv7-A +MULTILIB_REQUIRED += *march=armv7-a +MULTILIB_REQUIRED += *march=armv7-a/mfpu=vfpv3-d16/mfloat-abi=* +MULTILIB_REQUIRED += *march=armv7-a/mfpu=neon/mfloat-abi=* + +# ARMv7VE +MULTILIB_REQUIRED += *march=armv7ve +MULTILIB_REQUIRED += *march=armv7ve/mfpu=vfpv4-d16/mfloat-abi=* +MULTILIB_REQUIRED += *march=armv7ve/mfpu=neon-vfpv4/mfloat-abi=* + +# ARMv8-A +MULTILIB_REQUIRED += *march=armv8-a +MULTILIB_REQUIRED += *march=armv8-a/mfpu=neon-fp-armv8/mfloat-abi=* + # CPU Matches MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a8 MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a9 MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a5 +MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a7 MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a15 MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a12 MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a17 @@ -93,6 +88,9 @@ MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a57 MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a57.cortex-a53 MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a72 MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a72.cortex-a53 +MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73 +MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73.cortex-a35 +MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73.cortex-a53 MULTILIB_MATCHES += march?armv8-a=mcpu?exynos-m1 MULTILIB_MATCHES += march?armv8-a=mcpu?qdf24xx MULTILIB_MATCHES += march?armv8-a=mcpu?xgene1 @@ -101,13 +99,20 @@ MULTILIB_MATCHES += march?armv8-a=mcpu?xgene1 MULTILIB_MATCHES += march?armv8-a=march?armv8-a+crc MULTILIB_MATCHES += march?armv8-a=march?armv8.1-a MULTILIB_MATCHES += march?armv8-a=march?armv8.1-a+crc +MULTILIB_MATCHES += march?armv8-a=march?armv8.2-a +MULTILIB_MATCHES += march?armv8-a=march?armv8.2-a+fp16 # FPU matches MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3 MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16 -MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16-d16 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-d16-fp16 +MULTILIB_MATCHES += mfpu?neon=mfpu?neon-fp16 MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?vfpv4 +MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?fpv5-d16 +MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?fp-armv8 MULTILIB_MATCHES += mfpu?neon-fp-armv8=mfpu?crypto-neon-fp-armv8 +MULTILIB_MATCHES += mfpu?vfp=mfpu?vfpv2 +MULTILIB_MATCHES += mfpu?neon=mfpu?neon-vfpv3 # Map all requests for vfpv3 with a later CPU to vfpv3-d16 v7-a. 
@@ -124,10 +129,6 @@ MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv8 MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv3-d16/mfloat-abi.softfp MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.hard MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.softfp -MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.fp-armv8/mfloat-abi.hard -MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.vfpv4/mfloat-abi.hard -MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.vfpv4/mfloat-abi.softfp MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.hard=march.armv7ve/mfpu.neon/mfloat-abi.hard @@ -140,10 +141,6 @@ MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.hard=march.armv7-a/mf MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.softfp=march.armv7-a/mfpu.neon-fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv7ve/mfpu.fp-armv8/mfloat-abi.hard -MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv7ve/mfpu.fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv8-a/mfpu.vfpv4/mfloat-abi.hard -MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv4/mfloat-abi.softfp MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.hard MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.softfp @@ -163,10 +160,6 @@ MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthu MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv3-d16/mfloat-abi.softfp MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.hard MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.softfp -MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.fp-armv8/mfloat-abi.hard -MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.vfpv4/mfloat-abi.hard -MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.vfpv4/mfloat-abi.softfp MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.hard=mthumb/march.armv7ve/mfpu.neon/mfloat-abi.hard @@ -179,10 +172,6 @@ MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.hard=mthumb/ma MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.neon-fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv7ve/mfpu.fp-armv8/mfloat-abi.hard -MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv7ve/mfpu.fp-armv8/mfloat-abi.softfp -MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv8-a/mfpu.vfpv4/mfloat-abi.hard -MULTILIB_REUSE += 
mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv4/mfloat-abi.softfp
MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.hard
MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.softfp
--- a/src/gcc/config/arm/t-arm
+++ b/src/gcc/config/arm/t-arm
@@ -95,7 +95,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
  $(srcdir)/config/arm/arm-cores.def \
  $(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
  $(srcdir)/config/arm/arm-protos.h \
-  $(srcdir)/config/arm/arm_neon_builtins.def
+  $(srcdir)/config/arm/arm_neon_builtins.def \
+  $(srcdir)/config/arm/arm_vfp_builtins.def

arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
  $(SYSTEM_H) coretypes.h $(TM_H) \
@@ -103,6 +104,7 @@ arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
  $(DIAGNOSTIC_CORE_H) $(OPTABS_H) \
  $(srcdir)/config/arm/arm-protos.h \
  $(srcdir)/config/arm/arm_neon_builtins.def \
+  $(srcdir)/config/arm/arm_vfp_builtins.def \
  $(srcdir)/config/arm/arm-simd-builtin-types.def
	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
		$(srcdir)/config/arm/arm-builtins.c
--- /dev/null
+++ b/src/gcc/config/arm/t-rmprofile
@@ -0,0 +1,176 @@
+# Copyright (C) 2016 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# This is a target makefile fragment that attempts to get
+# multilibs built for the range of CPU's, FPU's and ABI's that
+# are relevant for the ARM architecture.  It should not be used in
+# conjunction with another make file fragment and assumes --with-arch,
+# --with-cpu, --with-fpu, --with-float, --with-mode have their default
+# values during the configure step.  We enforce this during the
+# top-level configury.
+
+MULTILIB_OPTIONS    =
+MULTILIB_DIRNAMES   =
+MULTILIB_EXCEPTIONS =
+MULTILIB_MATCHES    =
+MULTILIB_REUSE      =
+
+# We have the following hierarchy:
+#   ISA: A32 (.) or T16/T32 (thumb).
+#   Architecture: ARMv6S-M (v6-m), ARMv7-M (v7-m), ARMv7E-M (v7e-m),
+#                 ARMv8-M Baseline (v8-m.base) or ARMv8-M Mainline (v8-m.main).
+#   FPU: VFPv3-D16 (fpv3), FPV4-SP-D16 (fpv4-sp), FPV5-SP-D16 (fpv5-sp),
+#        FPV5-D16 (fpv5), or None (.).
+#   Float-abi: Soft (.), softfp (softfp), or hard (hard).
+ +# Options to build libraries with + +MULTILIB_OPTIONS += mthumb +MULTILIB_DIRNAMES += thumb + +MULTILIB_OPTIONS += march=armv6s-m/march=armv7-m/march=armv7e-m/march=armv7/march=armv8-m.base/march=armv8-m.main +MULTILIB_DIRNAMES += v6-m v7-m v7e-m v7-ar v8-m.base v8-m.main + +MULTILIB_OPTIONS += mfpu=vfpv3-d16/mfpu=fpv4-sp-d16/mfpu=fpv5-sp-d16/mfpu=fpv5-d16 +MULTILIB_DIRNAMES += fpv3 fpv4-sp fpv5-sp fpv5 + +MULTILIB_OPTIONS += mfloat-abi=softfp/mfloat-abi=hard +MULTILIB_DIRNAMES += softfp hard + + +# Option combinations to build library with + +# Default CPU/Arch +MULTILIB_REQUIRED += mthumb +MULTILIB_REQUIRED += mfloat-abi=hard + +# ARMv6-M +MULTILIB_REQUIRED += mthumb/march=armv6s-m + +# ARMv8-M Baseline +MULTILIB_REQUIRED += mthumb/march=armv8-m.base + +# ARMv7-M +MULTILIB_REQUIRED += mthumb/march=armv7-m + +# ARMv7E-M +MULTILIB_REQUIRED += mthumb/march=armv7e-m +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=hard +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=hard +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=hard + +# ARMv8-M Mainline +MULTILIB_REQUIRED += mthumb/march=armv8-m.main +MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=hard +MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=hard + +# ARMv7-R as well as ARMv7-A and ARMv8-A if aprofile was not specified +MULTILIB_REQUIRED += mthumb/march=armv7 +MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=softfp +MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=hard + + +# Matches + +# CPU Matches +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0 +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0.small-multiply +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus.small-multiply +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1 +MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1.small-multiply +MULTILIB_MATCHES += march?armv7-m=mcpu?cortex-m3 +MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m4 +MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m7 +MULTILIB_MATCHES += march?armv8-m.base=mcpu?cortex-m23 +MULTILIB_MATCHES += march?armv8-m.main=mcpu?cortex-m33 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4f +MULTILIB_MATCHES += march?armv7=mcpu?cortex-r5 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-r7 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-r8 +MULTILIB_MATCHES += march?armv7=mcpu?marvell-pj4 +MULTILIB_MATCHES += march?armv7=mcpu?generic-armv7-a +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a8 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a9 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a5 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a7 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a12 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15.cortex-a7 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17.cortex-a7 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a32 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a35 
+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a53 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57.cortex-a53 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72.cortex-a53 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a35 +MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a53 +MULTILIB_MATCHES += march?armv7=mcpu?exynos-m1 +MULTILIB_MATCHES += march?armv7=mcpu?qdf24xx +MULTILIB_MATCHES += march?armv7=mcpu?xgene1 + +# Arch Matches +MULTILIB_MATCHES += march?armv6s-m=march?armv6-m +MULTILIB_MATCHES += march?armv8-m.main=march?armv8-m.main+dsp +MULTILIB_MATCHES += march?armv7=march?armv7-r +ifeq (,$(HAS_APROFILE)) +MULTILIB_MATCHES += march?armv7=march?armv7-a +MULTILIB_MATCHES += march?armv7=march?armv7ve +MULTILIB_MATCHES += march?armv7=march?armv8-a +MULTILIB_MATCHES += march?armv7=march?armv8-a+crc +MULTILIB_MATCHES += march?armv7=march?armv8.1-a +MULTILIB_MATCHES += march?armv7=march?armv8.1-a+crc +MULTILIB_MATCHES += march?armv7=march?armv8.2-a +MULTILIB_MATCHES += march?armv7=march?armv8.2-a+fp16 +endif + +# FPU matches +ifeq (,$(HAS_APROFILE)) +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-d16-fp16 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-fp16 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4-d16 +MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-vfpv4 +MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?fp-armv8 +MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?neon-fp-armv8 +MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?crypto-neon-fp-armv8 +endif + + +# We map all requests for ARMv7-R or ARMv7-A in ARM mode to Thumb mode and +# any FPU to VFPv3-d16 if possible. +MULTILIB_REUSE += mthumb/march.armv7=march.armv7 +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.fpv5-d16/mfloat-abi.hard +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp +MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.hard --- a/src/gcc/config/arm/thumb1.md +++ b/src/gcc/config/arm/thumb1.md @@ -55,6 +55,10 @@ (set_attr "type" "multiple")] ) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic additions in sync.md and to the logic for bind_old_new in +;; arm_split_atomic_op in arm.c. These must be at least as strict as the +;; constraints here and aim to be as permissive. 
(define_insn_and_split "*thumb1_addsi3" [(set (match_operand:SI 0 "register_operand" "=l,l,l,*rk,*hk,l,k,l,l,l") (plus:SI (match_operand:SI 1 "register_operand" "%0,0,l,*0,*0,k,k,0,l,k") @@ -114,8 +118,8 @@ (set (match_dup 0) (plus:SI (match_dup 0) (reg:SI SP_REGNUM)))] "TARGET_THUMB1 - && (unsigned HOST_WIDE_INT) (INTVAL (operands[1])) < 1024 - && (INTVAL (operands[1]) & 3) == 0" + && UINTVAL (operands[1]) < 1024 + && (UINTVAL (operands[1]) & 3) == 0" [(set (match_dup 0) (plus:SI (reg:SI SP_REGNUM) (match_dup 1)))] "" ) @@ -131,6 +135,10 @@ (set_attr "type" "multiple")] ) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic subtractions in sync.md and to the logic for bind_old_new in +;; arm_split_atomic_op in arm.c. These must be at least as strict as the +;; constraints here and aim to be as permissive. (define_insn "thumb1_subsi3_insn" [(set (match_operand:SI 0 "register_operand" "=l") (minus:SI (match_operand:SI 1 "register_operand" "l") @@ -142,11 +150,11 @@ (set_attr "type" "alus_sreg")] ) -; Unfortunately with the Thumb the '&'/'0' trick can fails when operands -; 1 and 2; are the same, because reload will make operand 0 match -; operand 1 without realizing that this conflicts with operand 2. We fix -; this by adding another alternative to match this case, and then `reload' -; it ourselves. This alternative must come first. +;; Unfortunately on Thumb the '&'/'0' trick can fail when operands +;; 1 and 2 are the same, because reload will make operand 0 match +;; operand 1 without realizing that this conflicts with operand 2. We fix +;; this by adding another alternative to match this case, and then `reload' +;; it ourselves. This alternative must come first. (define_insn "*thumb_mulsi3" [(set (match_operand:SI 0 "register_operand" "=&l,&l,&l") (mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0") @@ -173,6 +181,10 @@ (set_attr "type" "muls")] ) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic bitwise ANDs and NANDs in sync.md and to the logic for bind_old_new +;; in arm_split_atomic_op in arm.c. These must be at least as strict as the +;; constraints here and aim to be as permissive. (define_insn "*thumb1_andsi3_insn" [(set (match_operand:SI 0 "register_operand" "=l") (and:SI (match_operand:SI 1 "register_operand" "%0") @@ -227,6 +239,10 @@ (set_attr "type" "logics_reg")] ) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic inclusive ORs in sync.md and to the logic for bind_old_new in +;; arm_split_atomic_op in arm.c. These must be at least as strict as the +;; constraints here and aim to be as permissive. (define_insn "*thumb1_iorsi3_insn" [(set (match_operand:SI 0 "register_operand" "=l") (ior:SI (match_operand:SI 1 "register_operand" "%0") @@ -237,6 +253,10 @@ (set_attr "conds" "set") (set_attr "type" "logics_reg")]) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic exclusive ORs in sync.md and to the logic for bind_old_new in +;; arm_split_atomic_op in arm.c. These must be at least as strict as the +;; constraints here and aim to be as permissive. (define_insn "*thumb1_xorsi3_insn" [(set (match_operand:SI 0 "register_operand" "=l") (xor:SI (match_operand:SI 1 "register_operand" "%0") @@ -590,8 +610,8 @@ ;;; ??? The 'i' constraint looks funny, but it should always be replaced by ;;; thumb_reorg with a memory reference. 
(define_insn "*thumb1_movdi_insn" - [(set (match_operand:DI 0 "nonimmediate_operand" "=l,l,l,l,>,l, m,*r") - (match_operand:DI 1 "general_operand" "l, I,J,>,l,mi,l,*r"))] + [(set (match_operand:DI 0 "nonimmediate_operand" "=l,l,l,r,l,>,l, m,*r") + (match_operand:DI 1 "general_operand" "l, I,J,j,>,l,mi,l,*r"))] "TARGET_THUMB1 && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode))" @@ -610,36 +630,41 @@ operands[1] = GEN_INT (- INTVAL (operands[1])); return \"movs\\t%Q0, %1\;rsbs\\t%Q0, %Q0, #0\;asrs\\t%R0, %Q0, #31\"; case 3: - return \"ldmia\\t%1, {%0, %H0}\"; + gcc_assert (TARGET_HAVE_MOVT); + return \"movw\\t%Q0, %L1\;movs\\tR0, #0\"; case 4: - return \"stmia\\t%0, {%1, %H1}\"; + return \"ldmia\\t%1, {%0, %H0}\"; case 5: - return thumb_load_double_from_address (operands); + return \"stmia\\t%0, {%1, %H1}\"; case 6: + return thumb_load_double_from_address (operands); + case 7: operands[2] = gen_rtx_MEM (SImode, plus_constant (Pmode, XEXP (operands[0], 0), 4)); output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands); return \"\"; - case 7: + case 8: if (REGNO (operands[1]) == REGNO (operands[0]) + 1) return \"mov\\t%0, %1\;mov\\t%H0, %H1\"; return \"mov\\t%H0, %H1\;mov\\t%0, %1\"; } }" - [(set_attr "length" "4,4,6,2,2,6,4,4") - (set_attr "type" "multiple,multiple,multiple,load2,store2,load2,store2,multiple") - (set_attr "pool_range" "*,*,*,*,*,1018,*,*")] + [(set_attr "length" "4,4,6,6,2,2,6,4,4") + (set_attr "type" "multiple,multiple,multiple,multiple,load2,store2,load2,store2,multiple") + (set_attr "arch" "t1,t1,t1,v8mb,t1,t1,t1,t1,t1") + (set_attr "pool_range" "*,*,*,*,*,*,1018,*,*")] ) (define_insn "*thumb1_movsi_insn" - [(set (match_operand:SI 0 "nonimmediate_operand" "=l,l,l,l,l,>,l, m,*l*h*k") - (match_operand:SI 1 "general_operand" "l, I,J,K,>,l,mi,l,*l*h*k"))] + [(set (match_operand:SI 0 "nonimmediate_operand" "=l,l,r,l,l,l,>,l, m,*l*h*k") + (match_operand:SI 1 "general_operand" "l, I,j,J,K,>,l,mi,l,*l*h*k"))] "TARGET_THUMB1 && ( register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" "@ movs %0, %1 movs %0, %1 + movw %0, %1 # # ldmia\\t%1, {%0} @@ -647,10 +672,11 @@ ldr\\t%0, %1 str\\t%1, %0 mov\\t%0, %1" - [(set_attr "length" "2,2,4,4,2,2,2,2,2") - (set_attr "type" "mov_reg,mov_imm,multiple,multiple,load1,store1,load1,store1,mov_reg") - (set_attr "pool_range" "*,*,*,*,*,*,1018,*,*") - (set_attr "conds" "set,clob,*,*,nocond,nocond,nocond,nocond,nocond")]) + [(set_attr "length" "2,2,4,4,4,2,2,2,2,2") + (set_attr "type" "mov_reg,mov_imm,mov_imm,multiple,multiple,load1,store1,load1,store1,mov_reg") + (set_attr "pool_range" "*,*,*,*,*,*,*,1018,*,*") + (set_attr "arch" "t1,t1,v8mb,t1,t1,t1,t1,t1,t1,t1") + (set_attr "conds" "set,clob,nocond,*,*,nocond,nocond,nocond,nocond,nocond")]) ; Split the load of 64-bit constant into two loads for high and low 32-bit parts respectively ; to see if we can load them in fewer instructions or fewer cycles. 
@@ -687,7 +713,8 @@ (define_split [(set (match_operand:SI 0 "register_operand" "") (match_operand:SI 1 "const_int_operand" ""))] - "TARGET_THUMB1 && satisfies_constraint_K (operands[1])" + "TARGET_THUMB1 && satisfies_constraint_K (operands[1]) + && !(TARGET_HAVE_MOVT && satisfies_constraint_j (operands[1]))" [(set (match_dup 2) (match_dup 1)) (set (match_dup 0) (ashift:SI (match_dup 2) (match_dup 3)))] " @@ -714,7 +741,8 @@ (define_split [(set (match_operand:SI 0 "register_operand" "") (match_operand:SI 1 "const_int_operand" ""))] - "TARGET_THUMB1 && satisfies_constraint_Pe (operands[1])" + "TARGET_THUMB1 && satisfies_constraint_Pe (operands[1]) + && !(TARGET_HAVE_MOVT && satisfies_constraint_j (operands[1]))" [(set (match_dup 2) (match_dup 1)) (set (match_dup 0) (plus:SI (match_dup 2) (match_dup 3)))] " @@ -726,8 +754,8 @@ ) (define_insn "*thumb1_movhi_insn" - [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,l*r,*h,l") - (match_operand:HI 1 "general_operand" "l,m,l,k*h,*r,I"))] + [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,l*r,*h,l,r") + (match_operand:HI 1 "general_operand" "l,m,l,k*h,*r,I,n"))] "TARGET_THUMB1 && ( register_operand (operands[0], HImode) || register_operand (operands[1], HImode))" @@ -739,6 +767,8 @@ case 3: return \"mov %0, %1\"; case 4: return \"mov %0, %1\"; case 5: return \"movs %0, %1\"; + case 6: gcc_assert (TARGET_HAVE_MOVT); + return \"movw %0, %L1\"; default: gcc_unreachable (); case 1: /* The stack pointer can end up being taken as an index register. @@ -758,9 +788,10 @@ } return \"ldrh %0, %1\"; }" - [(set_attr "length" "2,4,2,2,2,2") - (set_attr "type" "alus_imm,load1,store1,mov_reg,mov_reg,mov_imm") - (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob")]) + [(set_attr "length" "2,4,2,2,2,2,4") + (set_attr "type" "alus_imm,load1,store1,mov_reg,mov_reg,mov_imm,mov_imm") + (set_attr "arch" "t1,t1,t1,t1,t1,t1,v8mb") + (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob,nocond")]) (define_expand "thumb_movhi_clobber" [(set (match_operand:HI 0 "memory_operand" "") @@ -963,6 +994,94 @@ DONE; }) +;; A pattern for the CB(N)Z instruction added in ARMv8-M Baseline profile, +;; adapted from cbranchsi4_insn. Modifying cbranchsi4_insn instead leads to +;; code generation difference for ARMv6-M because the minimum length of the +;; instruction becomes 2 even for ARMv6-M due to a limitation in genattrtab's +;; handling of PC in the length condition. 
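To make the comment above concrete (an illustration, not part of the patch), the typical source shape this pattern matches is a compare against zero feeding a short forward branch:

extern void handle_null (void);

void check (int *p)
{
  /* On ARMv8-M Baseline the comparison and branch can fold into a
     single "cbz r0, .Ln" (or "cbnz" for the inverted test).  */
  if (p == 0)
    handle_null ();
}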
+(define_insn "thumb1_cbz" + [(set (pc) (if_then_else + (match_operator 0 "equality_operator" + [(match_operand:SI 1 "s_register_operand" "l") + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc)))] + "TARGET_THUMB1 && TARGET_HAVE_CBZ" +{ + if (get_attr_length (insn) == 2) + { + if (GET_CODE (operands[0]) == EQ) + return "cbz\t%1, %l2"; + else + return "cbnz\t%1, %l2"; + } + else + { + rtx t = cfun->machine->thumb1_cc_insn; + if (t != NULL_RTX) + { + if (!rtx_equal_p (cfun->machine->thumb1_cc_op0, operands[1]) + || !rtx_equal_p (cfun->machine->thumb1_cc_op1, operands[2])) + t = NULL_RTX; + if (cfun->machine->thumb1_cc_mode == CC_NOOVmode) + { + if (!noov_comparison_operator (operands[0], VOIDmode)) + t = NULL_RTX; + } + else if (cfun->machine->thumb1_cc_mode != CCmode) + t = NULL_RTX; + } + if (t == NULL_RTX) + { + output_asm_insn ("cmp\t%1, #0", operands); + cfun->machine->thumb1_cc_insn = insn; + cfun->machine->thumb1_cc_op0 = operands[1]; + cfun->machine->thumb1_cc_op1 = operands[2]; + cfun->machine->thumb1_cc_mode = CCmode; + } + else + /* Ensure we emit the right type of condition code on the jump. */ + XEXP (operands[0], 0) = gen_rtx_REG (cfun->machine->thumb1_cc_mode, + CC_REGNUM); + + switch (get_attr_length (insn)) + { + case 4: return "b%d0\t%l2"; + case 6: return "b%D0\t.LCB%=;b\t%l2\t%@long jump\n.LCB%=:"; + case 8: return "b%D0\t.LCB%=;bl\t%l2\t%@far jump\n.LCB%=:"; + default: gcc_unreachable (); + } + } +} + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 2) (pc)) (const_int 2)) + (le (minus (match_dup 2) (pc)) (const_int 128))) + (const_int 2) + (if_then_else + (and (ge (minus (match_dup 2) (pc)) (const_int -250)) + (le (minus (match_dup 2) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 2) (pc)) (const_int -2040)) + (le (minus (match_dup 2) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))) + (set (attr "type") + (if_then_else + (eq_attr "length" "2") + (const_string "branch") + (const_string "multiple")))] +) + +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic compare_and_swap splitters in sync.md. These must be at least as +;; strict as the constraints here and aim to be as permissive. (define_insn "cbranchsi4_insn" [(set (pc) (if_then_else (match_operator 0 "arm_comparison_operator" @@ -1024,6 +1143,9 @@ (set_attr "type" "multiple")] ) +;; Changes to the constraints of this pattern must be propagated to those of +;; atomic compare_and_swap splitters in sync.md. These must be at least as +;; strict as the constraints here and aim to be as permissive. 
(define_insn "cbranchsi4_scratch" [(set (pc) (if_then_else (match_operator 4 "arm_comparison_operator" @@ -1609,6 +1731,19 @@ (set_attr "type" "call")] ) +(define_insn "*nonsecure_call_reg_thumb1_v5" + [(call (unspec:SI [(mem:SI (match_operand:SI 0 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 0))] + "TARGET_THUMB1 && use_cmse && !SIBLING_CALL_P (insn)" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_reg_thumb1" [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r")) (match_operand 1 "" "")) @@ -1641,6 +1776,21 @@ (set_attr "type" "call")] ) +(define_insn "*nonsecure_call_value_reg_thumb1_v5" + [(set (match_operand 0 "" "") + (call (unspec:SI + [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 1))] + "TARGET_THUMB1 && use_cmse" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_value_reg_thumb1" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) @@ -1747,8 +1897,13 @@ "* return thumb1_unexpanded_epilogue (); " - ; Length is absolute worst case - [(set_attr "length" "44") + ; Length is absolute worst case, when using CMSE and if this is an entry + ; function an extra 4 (MSR) bytes will be added. + [(set (attr "length") + (if_then_else + (match_test "IS_CMSE_ENTRY (arm_current_func_type ())") + (const_int 48) + (const_int 44))) (set_attr "type" "block") ;; We don't clobber the conditions, but the potential length of this ;; operation is sufficient to make conditionalizing the sequence --- a/src/gcc/config/arm/thumb2.md +++ b/src/gcc/config/arm/thumb2.md @@ -125,32 +125,6 @@ (set_attr "type" "multiple")] ) -;; Thumb-2 does not have rsc, so use a clever trick with shifter operands. -(define_insn_and_split "*thumb2_negdi2" - [(set (match_operand:DI 0 "s_register_operand" "=&r,r") - (neg:DI (match_operand:DI 1 "s_register_operand" "?r,0"))) - (clobber (reg:CC CC_REGNUM))] - "TARGET_THUMB2" - "#" ; negs\\t%Q0, %Q1\;sbc\\t%R0, %R1, %R1, lsl #1 - "&& reload_completed" - [(parallel [(set (reg:CC CC_REGNUM) - (compare:CC (const_int 0) (match_dup 1))) - (set (match_dup 0) (minus:SI (const_int 0) (match_dup 1)))]) - (set (match_dup 2) (minus:SI (minus:SI (match_dup 3) - (ashift:SI (match_dup 3) - (const_int 1))) - (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] - { - operands[2] = gen_highpart (SImode, operands[0]); - operands[0] = gen_lowpart (SImode, operands[0]); - operands[3] = gen_highpart (SImode, operands[1]); - operands[1] = gen_lowpart (SImode, operands[1]); - } - [(set_attr "conds" "clob") - (set_attr "length" "8") - (set_attr "type" "multiple")] -) - (define_insn_and_split "*thumb2_abssi2" [(set (match_operand:SI 0 "s_register_operand" "=&r,l,r") (abs:SI (match_operand:SI 1 "s_register_operand" "r,0,0"))) @@ -278,8 +252,7 @@ (define_insn "*thumb2_movsi_insn" [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m") (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))] - "TARGET_THUMB2 && ! 
TARGET_IWMMXT - && !(TARGET_HARD_FLOAT && TARGET_VFP) + "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT && ( register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" "@ @@ -581,6 +554,19 @@ [(set_attr "type" "call")] ) +(define_insn "*nonsecure_call_reg_thumb2" + [(call (unspec:SI [(mem:SI (match_operand:SI 0 "s_register_operand" "r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 0))] + "TARGET_THUMB2 && use_cmse" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_value_reg_thumb2" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) @@ -592,6 +578,21 @@ [(set_attr "type" "call")] ) +(define_insn "*nonsecure_call_value_reg_thumb2" + [(set (match_operand 0 "" "") + (call + (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 1))] + "TARGET_THUMB2 && use_cmse" + "bl\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*thumb2_indirect_jump" [(set (pc) (match_operand:SI 0 "register_operand" "l*r"))] @@ -1115,12 +1116,31 @@ (define_insn "*thumb2_return" [(simple_return)] - "TARGET_THUMB2" + "TARGET_THUMB2 && !IS_CMSE_ENTRY (arm_current_func_type ())" "* return output_return_instruction (const_true_rtx, true, false, true);" [(set_attr "type" "branch") (set_attr "length" "4")] ) +(define_insn "*thumb2_cmse_entry_return" + [(simple_return)] + "TARGET_THUMB2 && IS_CMSE_ENTRY (arm_current_func_type ())" + "* return output_return_instruction (const_true_rtx, true, false, true);" + [(set_attr "type" "branch") + ; This is a return from a cmse_nonsecure_entry function so code will be + ; added to clear the APSR and potentially the FPSCR if VFP is available, so + ; we adapt the length accordingly. + (set (attr "length") + (if_then_else (match_test "TARGET_HARD_FLOAT") + (const_int 12) + (const_int 8))) + ; We do not support predicate execution of returns from cmse_nonsecure_entry + ; functions because we need to clear the APSR. Since predicable has to be + ; a constant, we had to duplicate the thumb2_return pattern for CMSE entry + ; functions. + (set_attr "predicable" "no")] +) + (define_insn_and_split "thumb2_eh_return" [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")] VUNSPEC_EH_RETURN) --- a/src/gcc/config/arm/types.md +++ b/src/gcc/config/arm/types.md @@ -51,6 +51,7 @@ ; alus_shift_imm as alu_shift_imm, setting condition flags. ; alus_shift_reg as alu_shift_reg, setting condition flags. ; bfm bitfield move operation. +; bfx bitfield extract operation. ; block blockage insn, this blocks all functional units. ; branch branch. ; call subroutine call. @@ -557,6 +558,7 @@ alus_shift_imm,\ alus_shift_reg,\ bfm,\ + bfx,\ block,\ branch,\ call,\ --- a/src/gcc/config/arm/unspecs.md +++ b/src/gcc/config/arm/unspecs.md @@ -84,6 +84,8 @@ UNSPEC_VRINTA ; Represent a float to integral float rounding ; towards nearest, ties away from zero. 
UNSPEC_PROBE_STACK ; Probe stack memory reference + UNSPEC_NONSECURE_MEM ; Represent non-secure memory in ARMv8-M with + ; security extension ]) (define_c_enum "unspec" [ @@ -191,6 +193,8 @@ UNSPEC_VBSL UNSPEC_VCAGE UNSPEC_VCAGT + UNSPEC_VCALE + UNSPEC_VCALT UNSPEC_VCEQ UNSPEC_VCGE UNSPEC_VCGEU @@ -203,6 +207,20 @@ UNSPEC_VCVT_U UNSPEC_VCVT_S_N UNSPEC_VCVT_U_N + UNSPEC_VCVT_HF_S_N + UNSPEC_VCVT_HF_U_N + UNSPEC_VCVT_SI_S_N + UNSPEC_VCVT_SI_U_N + UNSPEC_VCVTH_S + UNSPEC_VCVTH_U + UNSPEC_VCVTA_S + UNSPEC_VCVTA_U + UNSPEC_VCVTM_S + UNSPEC_VCVTM_U + UNSPEC_VCVTN_S + UNSPEC_VCVTN_U + UNSPEC_VCVTP_S + UNSPEC_VCVTP_U UNSPEC_VEXT UNSPEC_VHADD_S UNSPEC_VHADD_U @@ -244,6 +262,8 @@ UNSPEC_VMLSL_S_LANE UNSPEC_VMLSL_U_LANE UNSPEC_VMLSL_LANE + UNSPEC_VFMA_LANE + UNSPEC_VFMS_LANE UNSPEC_VMOVL_S UNSPEC_VMOVL_U UNSPEC_VMOVN @@ -365,5 +385,11 @@ UNSPEC_NVRINTN UNSPEC_VQRDMLAH UNSPEC_VQRDMLSH + UNSPEC_VRND + UNSPEC_VRNDA + UNSPEC_VRNDI + UNSPEC_VRNDM + UNSPEC_VRNDN + UNSPEC_VRNDP + UNSPEC_VRNDX ]) - --- a/src/gcc/config/arm/vec-common.md +++ b/src/gcc/config/arm/vec-common.md @@ -124,6 +124,20 @@ FAIL; }) +(define_expand "vec_perm_const<mode>" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand") + (match_operand:<V_cmp_result> 3)] + "TARGET_NEON" +{ + if (arm_expand_vec_perm_const (operands[0], operands[1], + operands[2], operands[3])) + DONE; + else + FAIL; +}) + (define_expand "vec_perm<mode>" [(match_operand:VE 0 "s_register_operand" "") (match_operand:VE 1 "s_register_operand" "") --- a/src/gcc/config/arm/vfp.md +++ b/src/gcc/config/arm/vfp.md @@ -18,13 +18,206 @@ ;; along with GCC; see the file COPYING3. If not see ;; <http://www.gnu.org/licenses/>. */ +;; Patterns for HI moves which provide more data transfer instructions when VFP +;; support is enabled.
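Before the HImode patterns the comment above introduces, an illustrative C fragment (not from the patch) of the transfers they cover; the new alternatives let a 16-bit constant be built with one movw and let halfwords move between core and VFP registers with vmov. The asm in the comments is approximate:

unsigned short
put_and_get (unsigned short *p, unsigned short v)
{
  p[0] = 0x1234;	/* movw rT, #4660 ; strh rT, [r0]  */
  p[1] = v;		/* strh r1, [r0, #2]  */
  return p[2];		/* ldrh r0, [r0, #4]  */
}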
+(define_insn "*arm_movhi_vfp" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=rk, r, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rIk, K, n, r, mi, r, *t, *t"))] + "TARGET_ARM && TARGET_HARD_FLOAT + && !TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + return "mov%?\t%0, %1\t%@ movhi"; + case 1: + return "mvn%?\t%0, #%B1\t%@ movhi"; + case 2: + return "movw%?\t%0, %L1\t%@ movhi"; + case 3: + return "strh%?\t%1, %0\t%@ movhi"; + case 4: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 5: + case 6: + return "vmov%?\t%0, %1\t%@ int"; + case 7: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" "yes") + (set_attr_alternative "type" + [(if_then_else + (match_operand 1 "const_int_operand" "") + (const_string "mov_imm") + (const_string "mov_reg")) + (const_string "mvn_imm") + (const_string "mov_imm") + (const_string "store1") + (const_string "load1") + (const_string "f_mcr") + (const_string "f_mrc") + (const_string "fmov")]) + (set_attr "arch" "*, *, v6t2, *, *, *, *, *") + (set_attr "pool_range" "*, *, *, *, 256, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, 244, *, *, *") + (set_attr "length" "4")] +) + +(define_insn "*thumb2_movhi_vfp" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=rk, r, l, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rk, I, Py, n, r, m, r, *t, *t"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT + && !TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + case 1: + case 2: + return "mov%?\t%0, %1\t%@ movhi"; + case 3: + return "movw%?\t%0, %L1\t%@ movhi"; + case 4: + return "strh%?\t%1, %0\t%@ movhi"; + case 5: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 6: + case 7: + return "vmov%?\t%0, %1\t%@ int"; + case 8: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" "yes") + (set_attr "predicable_short_it" + "yes, no, yes, no, no, no, no, no, no") + (set_attr "type" + "mov_reg, mov_imm, mov_imm, mov_imm, store1, load1,\ + f_mcr, f_mrc, fmov") + (set_attr "arch" "*, *, *, v6t2, *, *, *, *, *") + (set_attr "pool_range" "*, *, *, *, *, 4094, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, *, 250, *, *, *") + (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")] +) + +;; Patterns for HI moves which provide more data transfer instructions when FP16 +;; instructions are available. 
+(define_insn "*arm_movhi_fp16" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=r, r, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rIk, K, n, r, mi, r, *t, *t"))] + "TARGET_ARM && TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + return "mov%?\t%0, %1\t%@ movhi"; + case 1: + return "mvn%?\t%0, #%B1\t%@ movhi"; + case 2: + return "movw%?\t%0, %L1\t%@ movhi"; + case 3: + return "strh%?\t%1, %0\t%@ movhi"; + case 4: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 5: + case 6: + return "vmov.f16\t%0, %1\t%@ int"; + case 7: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" "yes, yes, yes, yes, yes, no, no, yes") + (set_attr_alternative "type" + [(if_then_else + (match_operand 1 "const_int_operand" "") + (const_string "mov_imm") + (const_string "mov_reg")) + (const_string "mvn_imm") + (const_string "mov_imm") + (const_string "store1") + (const_string "load1") + (const_string "f_mcr") + (const_string "f_mrc") + (const_string "fmov")]) + (set_attr "arch" "*, *, v6t2, *, *, *, *, *") + (set_attr "pool_range" "*, *, *, *, 256, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, 244, *, *, *") + (set_attr "length" "4")] +) + +(define_insn "*thumb2_movhi_fp16" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=rk, r, l, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rk, I, Py, n, r, m, r, *t, *t"))] + "TARGET_THUMB2 && TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + case 1: + case 2: + return "mov%?\t%0, %1\t%@ movhi"; + case 3: + return "movw%?\t%0, %L1\t%@ movhi"; + case 4: + return "strh%?\t%1, %0\t%@ movhi"; + case 5: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 6: + case 7: + return "vmov.f16\t%0, %1\t%@ int"; + case 8: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" + "yes, yes, yes, yes, yes, yes, no, no, yes") + (set_attr "predicable_short_it" + "yes, no, yes, no, no, no, no, no, no") + (set_attr "type" + "mov_reg, mov_imm, mov_imm, mov_imm, store1, load1,\ + f_mcr, f_mrc, fmov") + (set_attr "arch" "*, *, *, v6t2, *, *, *, *, *") + (set_attr "pool_range" "*, *, *, *, *, 4094, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, *, 250, *, *, *") + (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")] +) + ;; SImode moves ;; ??? For now do not allow loading constants into vfp regs. This causes ;; problems because small constants get converted into adds. 
(define_insn "*arm_movsi_vfp" [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv") (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))] - "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT + "TARGET_ARM && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SImode) || s_register_operand (operands[1], SImode))" "* @@ -53,7 +246,8 @@ } " [(set_attr "predicable" "yes") - (set_attr "type" "mov_reg,mov_reg,mvn_imm,mov_imm,load1,store1,f_mcr,f_mrc,fmov,f_loads,f_stores") + (set_attr "type" "mov_reg,mov_reg,mvn_imm,mov_imm,load1,store1, + f_mcr,f_mrc,fmov,f_loads,f_stores") (set_attr "pool_range" "*,*,*,*,4096,*,*,*,*,1020,*") (set_attr "neg_pool_range" "*,*,*,*,4084,*,*,*,*,1008,*")] ) @@ -66,7 +260,7 @@ (define_insn "*thumb2_movsi_vfp" [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv") (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))] - "TARGET_THUMB2 && TARGET_VFP && TARGET_HARD_FLOAT + "TARGET_THUMB2 && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SImode) || s_register_operand (operands[1], SImode))" "* @@ -112,7 +306,7 @@ (define_insn "*movdi_vfp" [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv") (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune != cortexa8 + "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != cortexa8 && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode)) && !(TARGET_NEON && CONST_INT_P (operands[1]) @@ -163,7 +357,7 @@ (define_insn "*movdi_vfp_cortexa8" [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,r,r,m,w,!r,w,w, Uv") (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,r,r,w,w,Uvi,w"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune == cortexa8 + "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == cortexa8 && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode)) && !(TARGET_NEON && CONST_INT_P (operands[1]) @@ -211,10 +405,87 @@ ) ;; HFmode moves + +(define_insn "*movhf_vfp_fp16" + [(set (match_operand:HF 0 "nonimmediate_operand" + "= r,m,t,r,t,r,t,t,Um,r") + (match_operand:HF 1 "general_operand" + " m,r,t,r,r,t,Dv,Um,t,F"))] + "TARGET_32BIT + && TARGET_VFP_FP16INST + && (s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + { + switch (which_alternative) + { + case 0: /* ARM register from memory. */ + return \"ldrh%?\\t%0, %1\\t%@ __fp16\"; + case 1: /* Memory from ARM register. */ + return \"strh%?\\t%1, %0\\t%@ __fp16\"; + case 2: /* S register from S register. */ + return \"vmov\\t%0, %1\t%@ __fp16\"; + case 3: /* ARM register from ARM register. */ + return \"mov%?\\t%0, %1\\t%@ __fp16\"; + case 4: /* S register from ARM register. */ + case 5: /* ARM register from S register. */ + case 6: /* S register from immediate. */ + return \"vmov.f16\\t%0, %1\t%@ __fp16\"; + case 7: /* S register from memory. */ + return \"vld1.16\\t{%z0}, %A1\"; + case 8: /* Memory from S register. */ + return \"vst1.16\\t{%z1}, %A0\"; + case 9: /* ARM register from constant. 
*/ + { + long bits; + rtx ops[4]; + + bits = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (operands[1]), + HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + ops[2] = GEN_INT (bits & 0xff00); + ops[3] = GEN_INT (bits & 0x00ff); + + if (arm_arch_thumb2) + output_asm_insn (\"movw\\t%0, %1\", ops); + else + output_asm_insn (\"mov\\t%0, %2\;orr\\t%0, %0, %3\", ops); + return \"\"; + } + default: + gcc_unreachable (); + } + } + [(set_attr "predicable" "yes, yes, no, yes, no, no, no, no, no, no") + (set_attr "predicable_short_it" "no, no, no, yes,\ + no, no, no, no,\ + no, no") + (set_attr_alternative "type" + [(const_string "load1") (const_string "store1") + (const_string "fmov") (const_string "mov_reg") + (const_string "f_mcr") (const_string "f_mrc") + (const_string "fconsts") (const_string "neon_load1_1reg") + (const_string "neon_store1_1reg") + (if_then_else (match_test "arm_arch_thumb2") + (const_string "mov_imm") + (const_string "multiple"))]) + (set_attr_alternative "length" + [(const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) + (if_then_else (match_test "arm_arch_thumb2") + (const_int 4) + (const_int 8))])] +) + (define_insn "*movhf_vfp_neon" [(set (match_operand:HF 0 "nonimmediate_operand" "= t,Um,r,m,t,r,t,r,r") (match_operand:HF 1 "general_operand" " Um, t,m,r,t,r,r,t,F"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON_FP16 + "TARGET_32BIT + && TARGET_HARD_FLOAT && TARGET_NEON_FP16 + && !TARGET_VFP_FP16INST && ( s_register_operand (operands[0], HFmode) || s_register_operand (operands[1], HFmode))" "* @@ -268,7 +539,10 @@ (define_insn "*movhf_vfp" [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,t,r,t,r,r") (match_operand:HF 1 "general_operand" " m,r,t,r,r,t,F"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16 && !TARGET_NEON_FP16 + "TARGET_32BIT + && TARGET_HARD_FLOAT + && !TARGET_NEON_FP16 + && !TARGET_VFP_FP16INST && ( s_register_operand (operands[0], HFmode) || s_register_operand (operands[1], HFmode))" "* @@ -321,7 +595,7 @@ (define_insn "*movsf_vfp" [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t ,t ,Uv,r ,m,t,r") (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))] - "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + "TARGET_ARM && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SFmode) || s_register_operand (operands[1], SFmode))" "* @@ -357,7 +631,7 @@ (define_insn "*thumb2_movsf_vfp" [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t, t ,Uv,r ,m,t,r") (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))] - "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP + "TARGET_THUMB2 && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SFmode) || s_register_operand (operands[1], SFmode))" "* @@ -394,9 +668,9 @@ ;; DFmode moves (define_insn "*movdf_vfp" - [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r, m,w,r") - (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w ,mF,r,w,r"))] - "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r, m,w,r") + (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w ,mF,r,w,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && ( register_operand (operands[0], DFmode) || register_operand (operands[1], DFmode))" "* @@ -410,40 +684,44 @@ case 2: gcc_assert (TARGET_VFP_DOUBLE); return \"vmov%?.f64\\t%P0, %1\"; - case 3: case 4: + case 3: + gcc_assert (TARGET_VFP_DOUBLE); + return \"vmov.i64\\t%P0, 
#0\\t%@ float\"; + case 4: case 5: return output_move_vfp (operands); - case 5: case 6: + case 6: case 7: return output_move_double (operands, true, NULL); - case 7: + case 8: if (TARGET_VFP_SINGLE) return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\"; else return \"vmov%?.f64\\t%P0, %P1\"; - case 8: + case 9: return \"#\"; default: gcc_unreachable (); } } " - [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,f_stored,\ + [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,f_stored,\ load2,store2,ffarithd,multiple") - (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8) - (eq_attr "alternative" "7") + (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8) + (eq_attr "alternative" "8") (if_then_else (match_test "TARGET_VFP_SINGLE") (const_int 8) (const_int 4))] (const_int 4))) - (set_attr "predicable" "yes") - (set_attr "pool_range" "*,*,*,1020,*,1020,*,*,*") - (set_attr "neg_pool_range" "*,*,*,1004,*,1004,*,*,*")] + (set_attr "predicable" "yes,yes,yes,no,yes,yes,yes,yes,yes,yes") + (set_attr "pool_range" "*,*,*,*,1020,*,1020,*,*,*") + (set_attr "neg_pool_range" "*,*,*,*,1004,*,1004,*,*,*") + (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")] ) (define_insn "*thumb2_movdf_vfp" - [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r ,m,w,r") - (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w, mF,r, w,r"))] - "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r ,m,w,r") + (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w, mF,r, w,r"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && ( register_operand (operands[0], DFmode) || register_operand (operands[1], DFmode))" "* @@ -457,11 +735,14 @@ case 2: gcc_assert (TARGET_VFP_DOUBLE); return \"vmov%?.f64\\t%P0, %1\"; - case 3: case 4: + case 3: + gcc_assert (TARGET_VFP_DOUBLE); + return \"vmov.i64\\t%P0, #0\\t%@ float\"; + case 4: case 5: return output_move_vfp (operands); - case 5: case 6: case 8: + case 6: case 7: case 9: return output_move_double (operands, true, NULL); - case 7: + case 8: if (TARGET_VFP_SINGLE) return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\"; else @@ -471,17 +752,18 @@ } } " - [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,\ + [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,\ f_stored,load2,store2,ffarithd,multiple") - (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8) - (eq_attr "alternative" "7") + (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8) + (eq_attr "alternative" "8") (if_then_else (match_test "TARGET_VFP_SINGLE") (const_int 8) (const_int 4))] (const_int 4))) - (set_attr "pool_range" "*,*,*,1018,*,4094,*,*,*") - (set_attr "neg_pool_range" "*,*,*,1008,*,0,*,*,*")] + (set_attr "pool_range" "*,*,*,*,1018,*,4094,*,*,*") + (set_attr "neg_pool_range" "*,*,*,*,1008,*,0,*,*,*") + (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")] ) @@ -494,7 +776,7 @@ [(match_operand 4 "cc_register" "") (const_int 0)]) (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t") (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))] - "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_ARM && TARGET_HARD_FLOAT" "@ vmov%D3.f32\\t%0, %2 vmov%d3.f32\\t%0, %1 @@ -517,7 +799,7 @@ [(match_operand 4 "cc_register" "") (const_int 0)]) (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t") (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))] - "TARGET_THUMB2 && 
TARGET_HARD_FLOAT && TARGET_VFP && !arm_restrict_it" + "TARGET_THUMB2 && TARGET_HARD_FLOAT && !arm_restrict_it" "@ it\\t%D3\;vmov%D3.f32\\t%0, %2 it\\t%d3\;vmov%d3.f32\\t%0, %1 @@ -585,7 +867,7 @@ (define_insn "*abssf2_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (abs:SF (match_operand:SF 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vabs%?.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -605,7 +887,7 @@ (define_insn "*negsf2_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t,?r") (neg:SF (match_operand:SF 1 "s_register_operand" "t,r")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "@ vneg%?.f32\\t%0, %1 eor%?\\t%0, %1, #-2147483648" @@ -661,14 +943,68 @@ (set_attr "type" "ffarithd")] ) +;; ABS and NEG for FP16. +(define_insn "<absneg_str>hf2" + [(set (match_operand:HF 0 "s_register_operand" "=w") + (ABSNEG:HF (match_operand:HF 1 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "v<absneg_str>.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "ffariths")] +) + +(define_expand "neon_vabshf" + [(set + (match_operand:HF 0 "s_register_operand") + (abs:HF (match_operand:HF 1 "s_register_operand")))] + "TARGET_VFP_FP16INST" +{ + emit_insn (gen_abshf2 (operands[0], operands[1])); + DONE; +}) + +;; VRND for FP16. +(define_insn "neon_v<fp16_rnd_str>hf" + [(set (match_operand:HF 0 "s_register_operand" "=w") + (unspec:HF + [(match_operand:HF 1 "s_register_operand" "w")] + FP16_RND))] + "TARGET_VFP_FP16INST" + "<fp16_rnd_insn>.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "neon_fp_round_s")] +) + +(define_insn "neon_vrndihf" + [(set (match_operand:HF 0 "s_register_operand" "=w") + (unspec:HF + [(match_operand:HF 1 "s_register_operand" "w")] + UNSPEC_VRNDI))] + "TARGET_VFP_FP16INST" + "vrintr.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "neon_fp_round_s")] +) ;; Arithmetic insns +(define_insn "addhf3" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (plus:HF + (match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "vadd.f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fadds")] +) + (define_insn "*addsf3_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (plus:SF (match_operand:SF 1 "s_register_operand" "t") (match_operand:SF 2 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vadd%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -686,12 +1022,23 @@ (set_attr "type" "faddd")] ) +(define_insn "subhf3" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (minus:HF + (match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "vsub.f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fadds")] +) (define_insn "*subsf3_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (minus:SF (match_operand:SF 1 "s_register_operand" "t") (match_operand:SF 2 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vsub%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -712,6 +1059,19 @@ ;; Division insns +;; FP16 Division.
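The divhf3 pattern that follows, like the other scalar FP16 arithmetic patterns in this file, backs the arm_fp16.h intrinsics added elsewhere in this series. A hedged usage sketch, assuming a toolchain configured for the ARMv8.2-A FP16 extension (e.g. -march=armv8.2-a+fp16):

#include <arm_fp16.h>

/* Illustrative example: vdivh_f16 should expand through divhf3 to a
   single "vdiv.f16 s0, s0, s1".  */
float16_t
half_ratio (float16_t num, float16_t den)
{
  return vdivh_f16 (num, den);
}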
+(define_insn "divhf3" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (div:HF + (match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "vdiv.f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fdivs")] +) + ; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input ; operands, so mark the output as early clobber for VFPv2 on ARMv5 or ; earlier. @@ -719,7 +1079,7 @@ [(set (match_operand:SF 0 "s_register_operand" "=&t,t") (div:SF (match_operand:SF 1 "s_register_operand" "t,t") (match_operand:SF 2 "s_register_operand" "t,t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vdiv%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -742,11 +1102,22 @@ ;; Multiplication insns +(define_insn "mulhf3" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (mult:HF (match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "vmul.f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmuls")] +) + (define_insn "*mulsf3_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (mult:SF (match_operand:SF 1 "s_register_operand" "t") (match_operand:SF 2 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vmul%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -764,11 +1135,31 @@ (set_attr "type" "fmuld")] ) +(define_insn "*mulsf3neghf_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (mult:HF (neg:HF (match_operand:HF 1 "s_register_operand" "t")) + (match_operand:HF 2 "s_register_operand" "t")))] + "TARGET_VFP_FP16INST && !flag_rounding_math" + "vnmul.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmuls")] +) + +(define_insn "*negmulhf3_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (neg:HF (mult:HF (match_operand:HF 1 "s_register_operand" "t") + (match_operand:HF 2 "s_register_operand" "t"))))] + "TARGET_VFP_FP16INST" + "vnmul.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmuls")] +) + (define_insn "*mulsf3negsf_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "t")) (match_operand:SF 2 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && !flag_rounding_math" + "TARGET_32BIT && TARGET_HARD_FLOAT && !flag_rounding_math" "vnmul%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -779,7 +1170,7 @@ [(set (match_operand:SF 0 "s_register_operand" "=t") (neg:SF (mult:SF (match_operand:SF 1 "s_register_operand" "t") (match_operand:SF 2 "s_register_operand" "t"))))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vnmul%?.f32\\t%0, %1, %2" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -813,12 +1204,24 @@ ;; Multiply-accumulate insns ;; 0 = 1 * 2 + 0 +(define_insn "*mulsf3addhf_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (plus:HF + (mult:HF (match_operand:HF 2 "s_register_operand" "t") + (match_operand:HF 3 "s_register_operand" "t")) + (match_operand:HF 1 "s_register_operand" "0")))] + "TARGET_VFP_FP16INST" + "vmla.f16\\t%0, %2, %3" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmacs")] +) + (define_insn 
"*mulsf3addsf_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t") (match_operand:SF 3 "s_register_operand" "t")) (match_operand:SF 1 "s_register_operand" "0")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vmla%?.f32\\t%0, %2, %3" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -838,12 +1241,23 @@ ) ;; 0 = 1 * 2 - 0 +(define_insn "*mulhf3subhf_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (minus:HF (mult:HF (match_operand:HF 2 "s_register_operand" "t") + (match_operand:HF 3 "s_register_operand" "t")) + (match_operand:HF 1 "s_register_operand" "0")))] + "TARGET_VFP_FP16INST" + "vnmls.f16\\t%0, %2, %3" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmacs")] +) + (define_insn "*mulsf3subsf_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t") (match_operand:SF 3 "s_register_operand" "t")) (match_operand:SF 1 "s_register_operand" "0")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vnmls%?.f32\\t%0, %2, %3" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -863,12 +1277,23 @@ ) ;; 0 = -(1 * 2) + 0 +(define_insn "*mulhf3neghfaddhf_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (minus:HF (match_operand:HF 1 "s_register_operand" "0") + (mult:HF (match_operand:HF 2 "s_register_operand" "t") + (match_operand:HF 3 "s_register_operand" "t"))))] + "TARGET_VFP_FP16INST" + "vmls.f16\\t%0, %2, %3" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmacs")] +) + (define_insn "*mulsf3negsfaddsf_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (minus:SF (match_operand:SF 1 "s_register_operand" "0") (mult:SF (match_operand:SF 2 "s_register_operand" "t") (match_operand:SF 3 "s_register_operand" "t"))))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vmls%?.f32\\t%0, %2, %3" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -889,13 +1314,25 @@ ;; 0 = -(1 * 2) - 0 +(define_insn "*mulhf3neghfsubhf_vfp" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (minus:HF (mult:HF + (neg:HF (match_operand:HF 2 "s_register_operand" "t")) + (match_operand:HF 3 "s_register_operand" "t")) + (match_operand:HF 1 "s_register_operand" "0")))] + "TARGET_VFP_FP16INST" + "vnmla.f16\\t%0, %2, %3" + [(set_attr "conds" "unconditional") + (set_attr "type" "fmacs")] +) + (define_insn "*mulsf3negsfsubsf_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (minus:SF (mult:SF (neg:SF (match_operand:SF 2 "s_register_operand" "t")) (match_operand:SF 3 "s_register_operand" "t")) (match_operand:SF 1 "s_register_operand" "0")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vnmla%?.f32\\t%0, %2, %3" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -917,6 +1354,30 @@ ;; Fused-multiply-accumulate +(define_insn "fmahf4" + [(set (match_operand:HF 0 "register_operand" "=w") + (fma:HF + (match_operand:HF 1 "register_operand" "w") + (match_operand:HF 2 "register_operand" "w") + (match_operand:HF 3 "register_operand" "0")))] + "TARGET_VFP_FP16INST" + "vfma.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "ffmas")] +) + +(define_expand "neon_vfmahf" + [(match_operand:HF 0 "s_register_operand") + (match_operand:HF 1 "s_register_operand") + (match_operand:HF 
2 "s_register_operand") + (match_operand:HF 3 "s_register_operand")] + "TARGET_VFP_FP16INST" +{ + emit_insn (gen_fmahf4 (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + (define_insn "fma4" [(set (match_operand:SDF 0 "register_operand" "=") (fma:SDF (match_operand:SDF 1 "register_operand" "") @@ -929,6 +1390,30 @@ (set_attr "type" "ffma")] ) +(define_insn "fmsubhf4_fp16" + [(set (match_operand:HF 0 "register_operand" "=w") + (fma:HF + (neg:HF (match_operand:HF 1 "register_operand" "w")) + (match_operand:HF 2 "register_operand" "w") + (match_operand:HF 3 "register_operand" "0")))] + "TARGET_VFP_FP16INST" + "vfms.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "ffmas")] +) + +(define_expand "neon_vfmshf" + [(match_operand:HF 0 "s_register_operand") + (match_operand:HF 1 "s_register_operand") + (match_operand:HF 2 "s_register_operand") + (match_operand:HF 3 "s_register_operand")] + "TARGET_VFP_FP16INST" +{ + emit_insn (gen_fmsubhf4_fp16 (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + (define_insn "*fmsub4" [(set (match_operand:SDF 0 "register_operand" "=") (fma:SDF (neg:SDF (match_operand:SDF 1 "register_operand" @@ -942,6 +1427,17 @@ (set_attr "type" "ffma")] ) +(define_insn "*fnmsubhf4" + [(set (match_operand:HF 0 "register_operand" "=w") + (fma:HF (match_operand:HF 1 "register_operand" "w") + (match_operand:HF 2 "register_operand" "w") + (neg:HF (match_operand:HF 3 "register_operand" "0"))))] + "TARGET_VFP_FP16INST" + "vfnms.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "ffmas")] +) + (define_insn "*fnmsub4" [(set (match_operand:SDF 0 "register_operand" "=") (fma:SDF (match_operand:SDF 1 "register_operand" "") @@ -954,6 +1450,17 @@ (set_attr "type" "ffma")] ) +(define_insn "*fnmaddhf4" + [(set (match_operand:HF 0 "register_operand" "=w") + (fma:HF (neg:HF (match_operand:HF 1 "register_operand" "w")) + (match_operand:HF 2 "register_operand" "w") + (neg:HF (match_operand:HF 3 "register_operand" "0"))))] + "TARGET_VFP_FP16INST" + "vfnma.f16\\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "ffmas")] +) + (define_insn "*fnmadd4" [(set (match_operand:SDF 0 "register_operand" "=") (fma:SDF (neg:SDF (match_operand:SDF 1 "register_operand" @@ -993,7 +1500,7 @@ (define_insn "extendhfsf2" [(set (match_operand:SF 0 "s_register_operand" "=t") (float_extend:SF (match_operand:HF 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)" "vcvtb%?.f32.f16\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1003,7 +1510,7 @@ (define_insn "truncsfhf2" [(set (match_operand:HF 0 "s_register_operand" "=t") (float_truncate:HF (match_operand:SF 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)" "vcvtb%?.f16.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1013,7 +1520,7 @@ (define_insn "*truncsisf2_vfp" [(set (match_operand:SI 0 "s_register_operand" "=t") (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vcvt%?.s32.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1034,7 +1541,7 @@ (define_insn "fixuns_truncsfsi2" [(set (match_operand:SI 0 "s_register_operand" "=t") 
(unsigned_fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vcvt%?.u32.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1055,7 +1562,7 @@ (define_insn "*floatsisf2_vfp" [(set (match_operand:SF 0 "s_register_operand" "=t") (float:SF (match_operand:SI 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vcvt%?.f32.s32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1076,7 +1583,7 @@ (define_insn "floatunssisf2" [(set (match_operand:SF 0 "s_register_operand" "=t") (unsigned_float:SF (match_operand:SI 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vcvt%?.f32.u32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1096,13 +1603,34 @@ ;; Sqrt insns. +(define_insn "neon_vsqrthf" + [(set (match_operand:HF 0 "s_register_operand" "=w") + (sqrt:HF (match_operand:HF 1 "s_register_operand" "w")))] + "TARGET_VFP_FP16INST" + "vsqrt.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "fsqrts")] +) + +(define_insn "neon_vrsqrtshf" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (unspec:HF [(match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")] + UNSPEC_VRSQRTS))] + "TARGET_VFP_FP16INST" + "vrsqrts.f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "fsqrts")] +) + ; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input ; operands, so mark the output as early clobber for VFPv2 on ARMv5 or ; earlier. (define_insn "*sqrtsf2_vfp" [(set (match_operand:SF 0 "s_register_operand" "=&t,t") (sqrt:SF (match_operand:SF 1 "s_register_operand" "t,t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vsqrt%?.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1127,7 +1655,7 @@ (define_insn "*movcc_vfp" [(set (reg CC_REGNUM) (reg VFPCC_REGNUM))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "vmrs%?\\tAPSR_nzcv, FPSCR" [(set_attr "conds" "set") (set_attr "type" "f_flag")] @@ -1137,9 +1665,9 @@ [(set (reg:CCFP CC_REGNUM) (compare:CCFP (match_operand:SF 0 "s_register_operand" "t") (match_operand:SF 1 "vfp_compare_operand" "tG")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "#" - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" [(set (reg:CCFP VFPCC_REGNUM) (compare:CCFP (match_dup 0) (match_dup 1))) @@ -1152,9 +1680,9 @@ [(set (reg:CCFPE CC_REGNUM) (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t") (match_operand:SF 1 "vfp_compare_operand" "tG")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "#" - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" [(set (reg:CCFPE VFPCC_REGNUM) (compare:CCFPE (match_dup 0) (match_dup 1))) @@ -1203,7 +1731,7 @@ [(set (reg:CCFP VFPCC_REGNUM) (compare:CCFP (match_operand:SF 0 "s_register_operand" "t,t") (match_operand:SF 1 "vfp_compare_operand" "t,G")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "@ vcmp%?.f32\\t%0, %1 vcmp%?.f32\\t%0, #0" @@ -1216,7 +1744,7 @@ [(set (reg:CCFPE VFPCC_REGNUM) 
(compare:CCFPE (match_operand:SF 0 "s_register_operand" "t,t") (match_operand:SF 1 "vfp_compare_operand" "t,G")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "@ vcmpe%?.f32\\t%0, %1 vcmpe%?.f32\\t%0, #0" @@ -1252,9 +1780,6 @@ ) ;; Fixed point to floating point conversions. -(define_code_iterator FCVT [unsigned_float float]) -(define_code_attr FCVTI32typename [(unsigned_float "u32") (float "s32")]) - (define_insn "*combine_vcvt_f32_<FCVTI32typename>" [(set (match_operand:SF 0 "s_register_operand" "=t") (mult:SF (FCVT:SF (match_operand:SI 1 "s_register_operand" "0")) @@ -1299,13 +1824,132 @@ (set_attr "type" "f_cvtf2i")] ) +;; FP16 conversions. +(define_insn "neon_vcvth<sup>hf" + [(set (match_operand:HF 0 "s_register_operand" "=w") + (unspec:HF + [(match_operand:SI 1 "s_register_operand" "w")] + VCVTH_US))] + "TARGET_VFP_FP16INST" + "vcvt.f16.<sup>%#32\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "f_cvti2f")] +) + +(define_insn "neon_vcvth<sup>si" + [(set (match_operand:SI 0 "s_register_operand" "=w") + (unspec:SI + [(match_operand:HF 1 "s_register_operand" "w")] + VCVTH_US))] + "TARGET_VFP_FP16INST" + "vcvt.<sup>%#32.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "f_cvtf2i")] +) + +;; The neon_vcvth<sup>_nhf patterns are used to generate the instruction for the +;; vcvth_n_f16_<sup>32 arm_fp16 intrinsics. They are complicated by the +;; hardware requirement that the source and destination registers are the same +;; despite having different machine modes. The approach is to use a temporary +;; register for the conversion and move that to the correct destination. + +;; Generate an unspec pattern for the intrinsic. +(define_insn "neon_vcvth<sup>_nhf_unspec" + [(set + (match_operand:SI 0 "s_register_operand" "=w") + (unspec:SI + [(match_operand:SI 1 "s_register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_HF_US_N)) + (set + (match_operand:HF 3 "s_register_operand" "=w") + (float_truncate:HF (float:SF (match_dup 0))))] + "TARGET_VFP_FP16INST" +{ + neon_const_bounds (operands[2], 1, 33); + return "vcvt.f16.<sup>32\t%0, %0, %2\;vmov.f32\t%3, %0"; +} + [(set_attr "conds" "unconditional") + (set_attr "type" "f_cvti2f")] +) + +;; Generate the instruction patterns needed for vcvth_n_f16_s32 neon intrinsics. +(define_expand "neon_vcvth<sup>_nhf" + [(match_operand:HF 0 "s_register_operand") + (unspec:HF [(match_operand:SI 1 "s_register_operand") + (match_operand:SI 2 "immediate_operand")] + VCVT_HF_US_N)] +"TARGET_VFP_FP16INST" +{ + rtx op1 = gen_reg_rtx (SImode); + + neon_const_bounds (operands[2], 1, 33); + + emit_move_insn (op1, operands[1]); + emit_insn (gen_neon_vcvth<sup>_nhf_unspec (op1, op1, operands[2], + operands[0])); + DONE; +}) + +;; The neon_vcvth<sup>_nsi patterns are used to generate the instruction for the +;; vcvth_n_<sup>32_f16 arm_fp16 intrinsics. They have the same restrictions and +;; are implemented in the same way as the neon_vcvth<sup>_nhf patterns. + +;; Generate an unspec pattern, constraining the registers.
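A usage sketch for these fixed-point conversion patterns, assuming the arm_fp16.h intrinsics this series provides: the _n variants take an immediate bit count in [1, 32], matching the neon_const_bounds (operands[2], 1, 33) checks, and the emitted vmov/vcvt pair implements the same-register requirement described above.

#include <stdint.h>
#include <arm_fp16.h>

/* Illustrative example: convert an FP16 value to Q16 fixed point.  */
int32_t
to_q16 (float16_t x)
{
  return vcvth_n_s32_f16 (x, 16);
}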
+(define_insn "neon_vcvth_nsi_unspec" + [(set (match_operand:SI 0 "s_register_operand" "=w") + (unspec:SI + [(fix:SI + (fix:SF + (float_extend:SF + (match_operand:HF 1 "s_register_operand" "w")))) + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_SI_US_N))] + "TARGET_VFP_FP16INST" +{ + neon_const_bounds (operands[2], 1, 33); + return "vmov.f32\t%0, %1\;vcvt.%#32.f16\t%0, %0, %2"; +} + [(set_attr "conds" "unconditional") + (set_attr "type" "f_cvtf2i")] +) + +;; Generate the instruction patterns needed for vcvth_n_f16_s32 neon intrinsics. +(define_expand "neon_vcvth_nsi" + [(match_operand:SI 0 "s_register_operand") + (unspec:SI + [(match_operand:HF 1 "s_register_operand") + (match_operand:SI 2 "immediate_operand")] + VCVT_SI_US_N)] + "TARGET_VFP_FP16INST" +{ + rtx op1 = gen_reg_rtx (SImode); + + neon_const_bounds (operands[2], 1, 33); + emit_insn (gen_neon_vcvth_nsi_unspec (op1, operands[1], operands[2])); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "neon_vcvthsi" + [(set + (match_operand:SI 0 "s_register_operand" "=w") + (unspec:SI + [(match_operand:HF 1 "s_register_operand" "w")] + VCVT_HF_US))] + "TARGET_VFP_FP16INST" + "vcvt.%#32.f16\t%0, %1" + [(set_attr "conds" "unconditional") + (set_attr "type" "f_cvtf2i")] +) + ;; Store multiple insn used in function prologue. (define_insn "*push_multi_vfp" [(match_parallel 2 "multi_register_push" [(set (match_operand:BLK 0 "memory_operand" "=m") (unspec:BLK [(match_operand:DF 1 "vfp_register_operand" "")] UNSPEC_PUSH_MULT))])] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "TARGET_32BIT && TARGET_HARD_FLOAT" "* return vfp_output_vstmd (operands);" [(set_attr "type" "f_stored")] ) @@ -1368,6 +2012,20 @@ ) ;; Scalar forms for the IEEE-754 fmax()/fmin() functions + +(define_insn "neon_hf" + [(set + (match_operand:HF 0 "s_register_operand" "=w") + (unspec:HF + [(match_operand:HF 1 "s_register_operand" "w") + (match_operand:HF 2 "s_register_operand" "w")] + VMAXMINFNM))] + "TARGET_VFP_FP16INST" + ".f16\t%0, %1, %2" + [(set_attr "conds" "unconditional") + (set_attr "type" "f_minmaxs")] +) + (define_insn "3" [(set (match_operand:SDF 0 "s_register_operand" "=") (unspec:SDF [(match_operand:SDF 1 "s_register_operand" "") @@ -1382,7 +2040,7 @@ ;; Write Floating-point Status and Control Register. (define_insn "set_fpscr" [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")] VUNSPEC_SET_FPSCR)] - "TARGET_VFP && TARGET_HARD_FLOAT" + "TARGET_HARD_FLOAT" "mcr\\tp10, 7, %0, cr1, cr0, 0\\t @SET_FPSCR" [(set_attr "type" "mrs")]) @@ -1390,7 +2048,7 @@ (define_insn "get_fpscr" [(set (match_operand:SI 0 "register_operand" "=r") (unspec_volatile:SI [(const_int 0)] VUNSPEC_GET_FPSCR))] - "TARGET_VFP && TARGET_HARD_FLOAT" + "TARGET_HARD_FLOAT" "mrc\\tp10, 7, %0, cr1, cr0, 0\\t @GET_FPSCR" [(set_attr "type" "mrs")]) --- a/src/gcc/config/arm/xgene1.md +++ b/src/gcc/config/arm/xgene1.md @@ -164,7 +164,7 @@ (define_insn_reservation "xgene1_bfm" 2 (and (eq_attr "tune" "xgene1") - (eq_attr "type" "bfm")) + (eq_attr "type" "bfm,bfx")) "xgene1_decode1op,xgene1_fsu") (define_insn_reservation "xgene1_f_rint" 5 --- a/src/gcc/config/i386/i386.c +++ b/src/gcc/config/i386/i386.c @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see #include "backend.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "cfghooks.h" #include "cfgloop.h" --- a/src/gcc/config/ia64/ia64.c +++ b/src/gcc/config/ia64/ia64.c @@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. 
If not see #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "cfghooks.h" #include "df.h" #include "tm_p.h" --- a/src/gcc/config/linux.c +++ b/src/gcc/config/linux.c @@ -26,7 +26,7 @@ along with GCC; see the file COPYING3. If not see bool linux_libc_has_function (enum function_class fn_class) { - if (OPTION_GLIBC) + if (OPTION_GLIBC || OPTION_MUSL) return true; if (OPTION_BIONIC) if (fn_class == function_c94 --- a/src/gcc/config/mips/mips.c +++ b/src/gcc/config/mips/mips.c @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "cfghooks.h" #include "df.h" --- a/src/gcc/config/rs6000/rs6000.c +++ b/src/gcc/config/rs6000/rs6000.c @@ -24,6 +24,7 @@ #include "backend.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "cfghooks.h" #include "cfgloop.h" --- a/src/gcc/config/sparc/sparc.c +++ b/src/gcc/config/sparc/sparc.c @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "gimple.h" #include "df.h" #include "tm_p.h" --- a/src/gcc/configure +++ b/src/gcc/configure @@ -1711,7 +1711,8 @@ Optional Packages: --with-stabs arrange to use stabs instead of host debug format --with-dwarf2 force the default debug format to be DWARF 2 --with-specs=SPECS add SPECS to driver command-line processing - --with-pkgversion=PKG Use PKG in the version string in place of "GCC" + --with-pkgversion=PKG Use PKG in the version string in place of "Linaro + GCC `cat $srcdir/LINARO-VERSION`" --with-bugurl=URL Direct users to URL to report a bug --with-multilib-list select multilibs (AArch64, SH and x86-64 only) --with-gnu-ld assume the C compiler uses GNU ld default=no @@ -7658,7 +7659,7 @@ if test "${with_pkgversion+set}" = set; then : *) PKGVERSION="($withval) " ;; esac else - PKGVERSION="(GCC) " + PKGVERSION="(Linaro GCC `cat $srcdir/LINARO-VERSION`) " fi @@ -18460,7 +18461,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 18463 "configure" +#line 18464 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -18566,7 +18567,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 18569 "configure" +#line 18570 "configure" #include "confdefs.h" #if HAVE_DLFCN_H --- a/src/gcc/configure.ac +++ b/src/gcc/configure.ac @@ -910,7 +910,7 @@ AC_ARG_WITH(specs, ) AC_SUBST(CONFIGURE_SPECS) -ACX_PKGVERSION([GCC]) +ACX_PKGVERSION([Linaro GCC `cat $srcdir/LINARO-VERSION`]) ACX_BUGURL([http://gcc.gnu.org/bugs.html]) # Sanity check enable_languages in case someone does not run the toplevel --- a/src/gcc/cppbuiltin.c +++ b/src/gcc/cppbuiltin.c @@ -52,18 +52,41 @@ parse_basever (int *major, int *minor, int *patchlevel) *patchlevel = s_patchlevel; } +/* Parse a LINAROVER version string of the format "M.m-year.month[-spin][~dev]" + to create Linaro release number YYYYMM and spin version. 
*/ +static void +parse_linarover (int *release, int *spin) +{ + static int s_year = -1, s_month, s_spin; + + if (s_year == -1) + if (sscanf (LINAROVER, "%*[^-]-%d.%d-%d", &s_year, &s_month, &s_spin) != 3) + { + sscanf (LINAROVER, "%*[^-]-%d.%d", &s_year, &s_month); + s_spin = 0; + } + + if (release) + *release = s_year * 100 + s_month; + + if (spin) + *spin = s_spin; +} /* Define __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ and __VERSION__. */ static void define__GNUC__ (cpp_reader *pfile) { - int major, minor, patchlevel; + int major, minor, patchlevel, linaro_release, linaro_spin; parse_basever (&major, &minor, &patchlevel); + parse_linarover (&linaro_release, &linaro_spin); cpp_define_formatted (pfile, "__GNUC__=%d", major); cpp_define_formatted (pfile, "__GNUC_MINOR__=%d", minor); cpp_define_formatted (pfile, "__GNUC_PATCHLEVEL__=%d", patchlevel); cpp_define_formatted (pfile, "__VERSION__=\"%s\"", version_string); + cpp_define_formatted (pfile, "__LINARO_RELEASE__=%d", linaro_release); + cpp_define_formatted (pfile, "__LINARO_SPIN__=%d", linaro_spin); cpp_define_formatted (pfile, "__ATOMIC_RELAXED=%d", MEMMODEL_RELAXED); cpp_define_formatted (pfile, "__ATOMIC_SEQ_CST=%d", MEMMODEL_SEQ_CST); cpp_define_formatted (pfile, "__ATOMIC_ACQUIRE=%d", MEMMODEL_ACQUIRE); --- a/src/gcc/defaults.h +++ b/src/gcc/defaults.h @@ -971,11 +971,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define REG_WORDS_BIG_ENDIAN WORDS_BIG_ENDIAN #endif -#ifdef TARGET_FLT_EVAL_METHOD -#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 1 -#else +#ifndef TARGET_FLT_EVAL_METHOD #define TARGET_FLT_EVAL_METHOD 0 -#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 0 #endif #ifndef TARGET_DEC_EVAL_METHOD --- a/src/gcc/expmed.c +++ b/src/gcc/expmed.c @@ -2522,16 +2522,8 @@ expand_variable_shift (enum tree_code code, machine_mode mode, rtx shifted, } -/* Indicates the type of fixup needed after a constant multiplication. - BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that - the result should be negated, and ADD_VARIANT means that the - multiplicand should be added to the result. */ -enum mult_variant {basic_variant, negate_variant, add_variant}; - static void synth_mult (struct algorithm *, unsigned HOST_WIDE_INT, const struct mult_cost *, machine_mode mode); -static bool choose_mult_variant (machine_mode, HOST_WIDE_INT, - struct algorithm *, enum mult_variant *, int); static rtx expand_mult_const (machine_mode, rtx, HOST_WIDE_INT, rtx, const struct algorithm *, enum mult_variant); static unsigned HOST_WIDE_INT invert_mod2n (unsigned HOST_WIDE_INT, int); @@ -3021,7 +3013,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, Return true if the cheapest of these cost less than MULT_COST, describing the algorithm in *ALG and final fixup in *VARIANT. */ -static bool +bool choose_mult_variant (machine_mode mode, HOST_WIDE_INT val, struct algorithm *alg, enum mult_variant *variant, int mult_cost) --- a/src/gcc/expmed.h +++ b/src/gcc/expmed.h @@ -35,6 +35,15 @@ enum alg_code { alg_impossible }; +/* Indicates the type of fixup needed after a constant multiplication. + BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that + the result should be negated, and ADD_VARIANT means that the + multiplicand should be added to the result. */ +enum mult_variant {basic_variant, negate_variant, add_variant}; + +bool choose_mult_variant (machine_mode, HOST_WIDE_INT, + struct algorithm *, enum mult_variant *, int); + /* This structure holds the "cost" of a multiply sequence. 
The "cost" field holds the total rtx_cost of every operator in the synthetic multiplication sequence, hence cost(a op b) is defined --- a/src/gcc/fold-const.c +++ b/src/gcc/fold-const.c @@ -7230,7 +7230,16 @@ native_encode_real (const_tree expr, unsigned char *ptr, int len, int off) offset += byte % UNITS_PER_WORD; } else - offset = BYTES_BIG_ENDIAN ? 3 - byte : byte; + { + offset = byte; + if (BYTES_BIG_ENDIAN) + { + /* Reverse bytes within each long, or within the entire float + if it's smaller than a long (for HFmode). */ + offset = MIN (3, total_bytes - 1) - offset; + gcc_assert (offset >= 0); + } + } offset = offset + ((bitpos / BITS_PER_UNIT) & ~3); if (offset >= off && offset - off < len) --- a/src/gcc/fortran/options.c +++ b/src/gcc/fortran/options.c @@ -208,8 +208,7 @@ gfc_post_options (const char **pfilename) /* Excess precision other than "fast" requires front-end support. */ - if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD - && TARGET_FLT_EVAL_METHOD_NON_DEFAULT) + if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD) sorry ("-fexcess-precision=standard for Fortran"); flag_excess_precision_cmdline = EXCESS_PRECISION_FAST; --- a/src/gcc/genconditions.c +++ b/src/gcc/genconditions.c @@ -94,6 +94,7 @@ write_header (void) #include \"resource.h\"\n\ #include \"diagnostic-core.h\"\n\ #include \"reload.h\"\n\ +#include \"memmodel.h\"\n\ #include \"tm-constrs.h\"\n"); if (saw_eh_return) --- a/src/gcc/genemit.c +++ b/src/gcc/genemit.c @@ -792,6 +792,7 @@ from the machine description file `md'. */\n\n"); printf ("#include \"reload.h\"\n"); printf ("#include \"diagnostic-core.h\"\n"); printf ("#include \"regs.h\"\n"); + printf ("#include \"memmodel.h\"\n"); printf ("#include \"tm-constrs.h\"\n"); printf ("#include \"ggc.h\"\n"); printf ("#include \"dumpfile.h\"\n"); --- a/src/gcc/genmultilib +++ b/src/gcc/genmultilib @@ -186,7 +186,8 @@ fi EOF chmod +x tmpmultilib -combinations=`initial=/ ./tmpmultilib ${options}` +combination_space=`initial=/ ./tmpmultilib ${options}` +combinations="$combination_space" # If there exceptions, weed them out now if [ -n "${exceptions}" ]; then @@ -472,14 +473,19 @@ for rrule in ${multilib_reuse}; do # in this variable, it means no multilib will be built for current reuse # rule. Thus the reuse purpose specified by current rule is meaningless. if expr "${combinations} " : ".*/${combo}/.*" > /dev/null; then - combo="/${combo}/" - dirout=`./tmpmultilib3 "${combo}" "${todirnames}" "${toosdirnames}" "${enable_multilib}"` - copts="/${copts}/" - optout=`./tmpmultilib4 "${copts}" "${options}"` - # Output the line with all appropriate matches. - dirout="${dirout}" optout="${optout}" ./tmpmultilib2 + if expr "${combination_space} " : ".*/${copts}/.*" > /dev/null; then + combo="/${combo}/" + dirout=`./tmpmultilib3 "${combo}" "${todirnames}" "${toosdirnames}" "${enable_multilib}"` + copts="/${copts}/" + optout=`./tmpmultilib4 "${copts}" "${options}"` + # Output the line with all appropriate matches. + dirout="${dirout}" optout="${optout}" ./tmpmultilib2 + else + echo "The rule ${rrule} contains an option absent from MULTILIB_OPTIONS." >&2 + exit 1 + fi else - echo "The rule ${rrule} is trying to reuse nonexistent multilib." + echo "The rule ${rrule} is trying to reuse nonexistent multilib." 
>&2 exit 1 fi done --- a/src/gcc/genoutput.c +++ b/src/gcc/genoutput.c @@ -231,6 +231,7 @@ output_prologue (void) printf ("#include \"diagnostic-core.h\"\n"); printf ("#include \"output.h\"\n"); printf ("#include \"target.h\"\n"); + printf ("#include \"memmodel.h\"\n"); printf ("#include \"tm-constrs.h\"\n"); } --- a/src/gcc/genpeep.c +++ b/src/gcc/genpeep.c @@ -373,6 +373,7 @@ from the machine description file `md'. */\n\n"); printf ("#include \"except.h\"\n"); printf ("#include \"diagnostic-core.h\"\n"); printf ("#include \"flags.h\"\n"); + printf ("#include \"memmodel.h\"\n"); printf ("#include \"tm-constrs.h\"\n\n"); printf ("extern rtx peep_operand[];\n\n"); --- a/src/gcc/genpreds.c +++ b/src/gcc/genpreds.c @@ -1577,6 +1577,7 @@ write_insn_preds_c (void) #include \"reload.h\"\n\ #include \"regs.h\"\n\ #include \"emit-rtl.h\"\n\ +#include \"memmodel.h\"\n\ #include \"tm-constrs.h\"\n"); FOR_ALL_PREDICATES (p) --- a/src/gcc/genrecog.c +++ b/src/gcc/genrecog.c @@ -4172,6 +4172,7 @@ write_header (void) #include \"diagnostic-core.h\"\n\ #include \"reload.h\"\n\ #include \"regs.h\"\n\ +#include \"memmodel.h\"\n\ #include \"tm-constrs.h\"\n\ \n"); --- a/src/gcc/gimple-fold.c +++ b/src/gcc/gimple-fold.c @@ -1379,6 +1379,55 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi, return true; } +/* Simplify strchr (str, 0) into str + strlen (str). + In general strlen is significantly faster than strchr + due to being a simpler operation. */ +static bool +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree str = gimple_call_arg (stmt, 0); + tree c = gimple_call_arg (stmt, 1); + location_t loc = gimple_location (stmt); + + if (optimize_function_for_size_p (cfun)) + return false; + + if (!integer_zerop (c) || !gimple_call_lhs (stmt)) + return false; + + tree len; + tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN); + + if (!strlen_fn) + return false; + + /* Create newstr = strlen (str). */ + gimple_seq stmts = NULL; + gimple *new_stmt = gimple_build_call (strlen_fn, 1, str); + gimple_set_location (new_stmt, loc); + if (gimple_in_ssa_p (cfun)) + len = make_ssa_name (size_type_node); + else + len = create_tmp_reg (size_type_node); + gimple_call_set_lhs (new_stmt, len); + gimple_seq_add_stmt_without_update (&stmts, new_stmt); + + /* Create (str p+ strlen (str)). */ + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), + POINTER_PLUS_EXPR, str, len); + gimple_seq_add_stmt_without_update (&stmts, new_stmt); + gsi_replace_with_seq_vops (gsi, stmts); + /* gsi now points at the assignment to the lhs, get a + stmt iterator to the strlen. + ??? We can't use gsi_for_stmt as that doesn't work when the + CFG isn't built yet. */ + gimple_stmt_iterator gsi2 = *gsi; + gsi_prev (&gsi2); + fold_stmt (&gsi2); + return true; +} + /* Simplify a call to the strcat builtin. DST and SRC are the arguments to the call. @@ -2820,6 +2869,11 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi) gimple_call_arg (stmt, 1)); case BUILT_IN_STRNCAT: return gimple_fold_builtin_strncat (gsi); + case BUILT_IN_STRCHR: + if (gimple_fold_builtin_strchr (gsi)) + return true; + /* Perform additional folding in builtin.c. */ + break; case BUILT_IN_FPUTS: return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0), gimple_call_arg (stmt, 1), false); --- a/src/gcc/ifcvt.c +++ b/src/gcc/ifcvt.c @@ -813,10 +813,15 @@ struct noce_if_info /* Estimated cost of the particular branch instruction. 
--- a/src/gcc/ifcvt.c +++ b/src/gcc/ifcvt.c @@ -813,10 +813,15 @@ struct noce_if_info /* Estimated cost of the particular branch instruction. */ unsigned int branch_cost; + + /* The name of the noce transform that succeeded in if-converting + this structure. Used for debugging. */ + const char *transform_name; }; static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int); static int noce_try_move (struct noce_if_info *); +static int noce_try_ifelse_collapse (struct noce_if_info *); static int noce_try_store_flag (struct noce_if_info *); static int noce_try_addcc (struct noce_if_info *); static int noce_try_store_flag_constants (struct noce_if_info *); @@ -1115,11 +1120,45 @@ noce_try_move (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); } + if_info->transform_name = "noce_try_move"; return TRUE; } return FALSE; } +/* Try forming an IF_THEN_ELSE (cond, b, a) and collapsing that + through simplify_rtx. Sometimes that can eliminate the IF_THEN_ELSE. + If that is the case, emit the result into x. */ + +static int +noce_try_ifelse_collapse (struct noce_if_info * if_info) +{ + if (!noce_simple_bbs (if_info)) + return FALSE; + + machine_mode mode = GET_MODE (if_info->x); + rtx if_then_else = simplify_gen_ternary (IF_THEN_ELSE, mode, mode, + if_info->cond, if_info->b, + if_info->a); + + if (GET_CODE (if_then_else) == IF_THEN_ELSE) + return FALSE; + + rtx_insn *seq; + start_sequence (); + noce_emit_move_insn (if_info->x, if_then_else); + seq = end_ifcvt_sequence (if_info); + if (!seq) + return FALSE; + + emit_insn_before_setloc (seq, if_info->jump, + INSN_LOCATION (if_info->insn_a)); + + if_info->transform_name = "noce_try_ifelse_collapse"; + return TRUE; +} + + /* Convert "if (test) x = 1; else x = 0". Only try 0 and STORE_FLAG_VALUE here. Other combinations will be @@ -1163,6 +1202,7 @@ noce_try_store_flag (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_store_flag"; return TRUE; } else @@ -1241,6 +1281,7 @@ noce_try_inverse_constants (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_inverse_constants"; return true; } @@ -1461,6 +1502,8 @@ noce_try_store_flag_constants (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_store_flag_constants"; + return TRUE; } @@ -1513,6 +1556,8 @@ noce_try_addcc (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_addcc"; + return TRUE; } end_sequence (); @@ -1553,6 +1598,7 @@ noce_try_addcc (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_addcc"; return TRUE; } end_sequence (); @@ -1617,6 +1663,8 @@ noce_try_store_flag_mask (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_store_flag_mask"; + return TRUE; } @@ -1767,6 +1815,8 @@ noce_try_cmove (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_cmove"; + return TRUE; } /* If both a and b are constants try a last-ditch transformation: @@ -1820,6 +1870,7 @@ noce_try_cmove (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_cmove";
"noce_try_cmove"; return TRUE; } else @@ -2273,6 +2324,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) emit_insn_before_setloc (ifcvt_seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_cmove_arith"; return TRUE; end_seq_and_fail: @@ -2364,28 +2416,32 @@ noce_get_alt_condition (struct noce_if_info *if_info, rtx target, switch (code) { case LT: - if (actual_val == desired_val + 1) + if (desired_val != HOST_WIDE_INT_MAX + && actual_val == desired_val + 1) { code = LE; op_b = GEN_INT (desired_val); } break; case LE: - if (actual_val == desired_val - 1) + if (desired_val != HOST_WIDE_INT_MIN + && actual_val == desired_val - 1) { code = LT; op_b = GEN_INT (desired_val); } break; case GT: - if (actual_val == desired_val - 1) + if (desired_val != HOST_WIDE_INT_MIN + && actual_val == desired_val - 1) { code = GE; op_b = GEN_INT (desired_val); } break; case GE: - if (actual_val == desired_val + 1) + if (desired_val != HOST_WIDE_INT_MAX + && actual_val == desired_val + 1) { code = GT; op_b = GEN_INT (desired_val); @@ -2525,6 +2581,7 @@ noce_try_minmax (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); if_info->cond = cond; if_info->cond_earliest = earliest; + if_info->transform_name = "noce_try_minmax"; return TRUE; } @@ -2691,6 +2748,7 @@ noce_try_abs (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); if_info->cond = cond; if_info->cond_earliest = earliest; + if_info->transform_name = "noce_try_abs"; return TRUE; } @@ -2772,6 +2830,8 @@ noce_try_sign_mask (struct noce_if_info *if_info) return FALSE; emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_sign_mask"; + return TRUE; } @@ -2877,6 +2937,7 @@ noce_try_bitop (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); } + if_info->transform_name = "noce_try_bitop"; return TRUE; } @@ -3167,6 +3228,41 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) if (if_info->then_else_reversed) std::swap (old_val, new_val); + + /* We allow simple lowpart register subreg SET sources in + bb_ok_for_noce_convert_multiple_sets. Be careful when processing + sequences like: + (set (reg:SI r1) (reg:SI r2)) + (set (reg:HI r3) (subreg:HI (r1))) + For the second insn new_val or old_val (r1 in this example) will be + taken from the temporaries and have the wider mode which will not + match with the mode of the other source of the conditional move, so + we'll end up trying to emit r4:HI = cond ? (r1:SI) : (r3:HI). + Wrap the two cmove operands into subregs if appropriate to prevent + that. */ + if (GET_MODE (new_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (new_val); + machine_mode dst_mode = GET_MODE (temp); + if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode)) + { + end_sequence (); + return FALSE; + } + new_val = lowpart_subreg (dst_mode, new_val, src_mode); + } + if (GET_MODE (old_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (old_val); + machine_mode dst_mode = GET_MODE (temp); + if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode)) + { + end_sequence (); + return FALSE; + } + old_val = lowpart_subreg (dst_mode, old_val, src_mode); + } + /* Actually emit the conditional move. 
@@ -2525,6 +2581,7 @@ noce_try_minmax (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); if_info->cond = cond; if_info->cond_earliest = earliest; + if_info->transform_name = "noce_try_minmax"; return TRUE; } @@ -2691,6 +2748,7 @@ noce_try_abs (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); if_info->cond = cond; if_info->cond_earliest = earliest; + if_info->transform_name = "noce_try_abs"; return TRUE; } @@ -2772,6 +2830,8 @@ noce_try_sign_mask (struct noce_if_info *if_info) return FALSE; emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); + if_info->transform_name = "noce_try_sign_mask"; + return TRUE; } @@ -2877,6 +2937,7 @@ noce_try_bitop (struct noce_if_info *if_info) emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a)); } + if_info->transform_name = "noce_try_bitop"; return TRUE; } @@ -3167,6 +3228,41 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) if (if_info->then_else_reversed) std::swap (old_val, new_val); + + /* We allow simple lowpart register subreg SET sources in + bb_ok_for_noce_convert_multiple_sets. Be careful when processing + sequences like: + (set (reg:SI r1) (reg:SI r2)) + (set (reg:HI r3) (subreg:HI (r1))) + For the second insn new_val or old_val (r1 in this example) will be + taken from the temporaries and have the wider mode which will not + match with the mode of the other source of the conditional move, so + we'll end up trying to emit r4:HI = cond ? (r1:SI) : (r3:HI). + Wrap the two cmove operands into subregs if appropriate to prevent + that. */ + if (GET_MODE (new_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (new_val); + machine_mode dst_mode = GET_MODE (temp); + if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode)) + { + end_sequence (); + return FALSE; + } + new_val = lowpart_subreg (dst_mode, new_val, src_mode); + } + if (GET_MODE (old_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (old_val); + machine_mode dst_mode = GET_MODE (temp); + if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode)) + { + end_sequence (); + return FALSE; + } + old_val = lowpart_subreg (dst_mode, old_val, src_mode); + } + /* Actually emit the conditional move. */ rtx temp_dest = noce_emit_cmove (if_info, temp, cond_code, x, y, new_val, old_val); @@ -3240,6 +3336,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) } num_updated_if_blocks++; + if_info->transform_name = "noce_convert_multiple_sets"; return TRUE; } @@ -3277,9 +3374,15 @@ bb_ok_for_noce_convert_multiple_sets (basic_block test_bb, rtx src = SET_SRC (set); /* We can possibly relax this, but for now only handle REG to REG - moves. This avoids any issues that might come from introducing - loads/stores that might violate data-race-freedom guarantees. */ + (including subreg) moves. This avoids any issues that might come + from introducing loads/stores that might violate data-race-freedom + guarantees. */ + if (!REG_P (dest)) + return false; + + if (!(REG_P (src) + || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) + && subreg_lowpart_p (src)))) return false; /* Destination must be appropriate for a conditional write. */ @@ -3336,7 +3439,12 @@ noce_process_if_block (struct noce_if_info *if_info) && bb_ok_for_noce_convert_multiple_sets (then_bb, if_info)) { if (noce_convert_multiple_sets (if_info)) - return TRUE; + { + if (dump_file && if_info->transform_name) + fprintf (dump_file, "if-conversion succeeded through %s\n", + if_info->transform_name); + return TRUE; + } } if (! bb_valid_for_noce_process_p (then_bb, cond, &if_info->then_cost, @@ -3493,6 +3601,8 @@ noce_process_if_block (struct noce_if_info *if_info) if (noce_try_move (if_info)) goto success; + if (noce_try_ifelse_collapse (if_info)) + goto success; if (noce_try_store_flag (if_info)) goto success; if (noce_try_bitop (if_info)) @@ -3533,6 +3643,9 @@ noce_process_if_block (struct noce_if_info *if_info) return FALSE; success: + if (dump_file && if_info->transform_name) + fprintf (dump_file, "if-conversion succeeded through %s\n", + if_info->transform_name); /* If we used a temporary, fix it up now. */ if (orig_x != x)
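With the transform_name bookkeeping added throughout ifcvt.c above, a successful noce if-conversion now names itself in the pass dump. Assuming a dump requested with -fdump-rtl-ce1 (the exact dump file depends on which if-conversion pass instance runs), the new line looks like:

  if-conversion succeeded through noce_try_ifelse_collapse

The format string is taken verbatim from the hunk; the flag and the transform shown are only illustrative.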
--- a/src/gcc/internal-fn.c +++ b/src/gcc/internal-fn.c @@ -1812,11 +1812,7 @@ expand_arith_overflow (enum tree_code code, gimple *stmt) /* For sub-word operations, retry with a wider type first. */ if (orig_precres == precres && precop <= BITS_PER_WORD) { -#if WORD_REGISTER_OPERATIONS - int p = BITS_PER_WORD; -#else - int p = precop; -#endif + int p = WORD_REGISTER_OPERATIONS ? BITS_PER_WORD : precop; enum machine_mode m = smallest_mode_for_size (p, MODE_INT); tree optype = build_nonstandard_integer_type (GET_MODE_PRECISION (m), uns0_p && uns1_p --- a/src/gcc/java/lang.c +++ b/src/gcc/java/lang.c @@ -569,8 +569,7 @@ java_post_options (const char **pfilename) /* Excess precision other than "fast" requires front-end support. */ - if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD - && TARGET_FLT_EVAL_METHOD_NON_DEFAULT) + if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD) sorry ("-fexcess-precision=standard for Java"); flag_excess_precision_cmdline = EXCESS_PRECISION_FAST; --- a/src/gcc/lra-constraints.c +++ b/src/gcc/lra-constraints.c @@ -1326,7 +1326,22 @@ process_addr_reg (rtx *loc, bool check_only_p, rtx_insn **before, rtx_insn **aft subreg_p = GET_CODE (*loc) == SUBREG; if (subreg_p) - loc = &SUBREG_REG (*loc); + { + reg = SUBREG_REG (*loc); + mode = GET_MODE (reg); + + /* For a mode whose size is bigger than ptr_mode, there is unlikely to + be a "mov" between two registers with different classes, but there + normally will be a "mov" which transfers an element of a vector + register into a general register, and this normally will be a subreg + which should be reloaded as a whole. This is particularly likely to + be triggered when -fno-split-wide-types is specified. */ + if (!REG_P (reg) + || in_class_p (reg, cl, &new_class) + || GET_MODE_SIZE (mode) <= GET_MODE_SIZE (ptr_mode)) + loc = &SUBREG_REG (*loc); + } + reg = *loc; mode = GET_MODE (reg); if (! REG_P (reg)) @@ -2475,14 +2490,29 @@ process_alt_operands (int only_alternative) /* We are trying to spill pseudo into memory. It is usually more costly than moving to a hard register although it might takes the same number of - reloads. */ - if (no_regs_p && REG_P (op) && hard_regno[nop] >= 0) + reloads. + + A non-pseudo spill may also happen. Suppose a target allows both + register and memory in the operand constraint alternatives, + then it's typical that an eliminable register has a substitution + of "base + offset" which can either be reloaded by a simple + "new_reg <= base + offset" which will match the register + constraint, or a similar reg addition followed by further spill + to and reload from memory which will match the memory + constraint, but this memory spill will usually be much more + costly. + + Code below increases the reject for both pseudo and non-pseudo + spill. */ + if (no_regs_p + && !(MEM_P (op) && offmemok) + && !(REG_P (op) && hard_regno[nop] < 0)) { if (lra_dump_file != NULL) fprintf (lra_dump_file, - " %d Spill pseudo into memory: reject+=3\n", - nop); + " %d Spill %spseudo into memory: reject+=3\n", + nop, REG_P (op) ? "" : "Non-"); reject += 3; if (VECTOR_MODE_P (mode)) { --- a/src/gcc/lto/lto-partition.c +++ b/src/gcc/lto/lto-partition.c @@ -447,7 +447,7 @@ add_sorted_nodes (vec &next_nodes, ltrans_partition partition) and in-partition calls was reached. */ void -lto_balanced_map (int n_lto_partitions) +lto_balanced_map (int n_lto_partitions, int max_partition_size) { int n_nodes = 0; int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0; @@ -511,6 +511,9 @@ lto_balanced_map (int n_lto_partitions) varpool_order.qsort (varpool_node_cmp); /* Compute partition size and create the first partition. */ + if (PARAM_VALUE (MIN_PARTITION_SIZE) > max_partition_size) + fatal_error (input_location, "min partition size cannot be greater than max partition size"); + partition_size = total_size / n_lto_partitions; if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE)) partition_size = PARAM_VALUE (MIN_PARTITION_SIZE); @@ -719,7 +722,8 @@ lto_balanced_map (int n_lto_partitions) best_cost, best_internal, best_i); /* Partition is too large, unwind into step when best cost was reached and start new partition. */ - if (partition->insns > 2 * partition_size) + if (partition->insns > 2 * partition_size + || partition->insns > max_partition_size) { if (best_i != i) { --- a/src/gcc/lto/lto-partition.h +++ b/src/gcc/lto/lto-partition.h @@ -35,7 +35,7 @@ extern vec ltrans_partitions; void lto_1_to_1_map (void); void lto_max_map (void); -void lto_balanced_map (int); +void lto_balanced_map (int, int); void lto_promote_cross_file_statics (void); void free_ltrans_partitions (void); void lto_promote_statics_nonwpa (void); --- a/src/gcc/lto/lto.c +++ b/src/gcc/lto/lto.c @@ -3123,9 +3123,10 @@ do_whole_program_analysis (void) else if (flag_lto_partition == LTO_PARTITION_MAX) lto_max_map (); else if (flag_lto_partition == LTO_PARTITION_ONE) - lto_balanced_map (1); + lto_balanced_map (1, INT_MAX); else if (flag_lto_partition == LTO_PARTITION_BALANCED) - lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS)); + lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS), + PARAM_VALUE (MAX_PARTITION_SIZE)); else gcc_unreachable ();
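The LTO hunks above thread a new upper bound through lto_balanced_map. A hedged usage sketch with the defaults that the params.def hunk further down installs (minimum raised to 10000, maximum 1000000, both in estimated instructions; the source file names are placeholders):

  gcc -flto -flto-partition=balanced \
      --param lto-min-partition=10000 \
      --param lto-max-partition=1000000 foo.c bar.c

Per the new check in lto_balanced_map, configuring a minimum larger than the maximum now fails up front with "min partition size cannot be greater than max partition size".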
*/ - if (partition->insns > 2 * partition_size) + if (partition->insns > 2 * partition_size + || partition->insns > max_partition_size) { if (best_i != i) { --- a/src/gcc/lto/lto-partition.h +++ b/src/gcc/lto/lto-partition.h @@ -35,7 +35,7 @@ extern vec ltrans_partitions; void lto_1_to_1_map (void); void lto_max_map (void); -void lto_balanced_map (int); +void lto_balanced_map (int, int); void lto_promote_cross_file_statics (void); void free_ltrans_partitions (void); void lto_promote_statics_nonwpa (void); --- a/src/gcc/lto/lto.c +++ b/src/gcc/lto/lto.c @@ -3123,9 +3123,10 @@ do_whole_program_analysis (void) else if (flag_lto_partition == LTO_PARTITION_MAX) lto_max_map (); else if (flag_lto_partition == LTO_PARTITION_ONE) - lto_balanced_map (1); + lto_balanced_map (1, INT_MAX); else if (flag_lto_partition == LTO_PARTITION_BALANCED) - lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS)); + lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS), + PARAM_VALUE (MAX_PARTITION_SIZE)); else gcc_unreachable (); --- a/src/gcc/match.pd +++ b/src/gcc/match.pd @@ -468,6 +468,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (bit_and:c (convert? @0) (convert? (bit_not @0))) { build_zero_cst (type); }) +/* PR71636: Transform x & ((1U << b) - 1) -> x & ~(~0U << b); */ +(simplify + (bit_and:c @0 (plus:s (lshift:s integer_onep @1) integer_minus_onep)) + (if (TYPE_UNSIGNED (type)) + (bit_and @0 (bit_not (lshift { build_all_ones_cst (type); } @1))))) + /* Fold (A & ~B) - (A & B) into (A ^ B) - B. */ (simplify (minus (bit_and:cs @0 (bit_not @1)) (bit_and:cs @0 @1)) --- /dev/null +++ b/src/gcc/memmodel.h @@ -0,0 +1,86 @@ +/* Prototypes of memory model helper functions. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_MEMMODEL_H +#define GCC_MEMMODEL_H + +/* Return the memory model from a host integer. */ +static inline enum memmodel +memmodel_from_int (unsigned HOST_WIDE_INT val) +{ + return (enum memmodel) (val & MEMMODEL_MASK); +} + +/* Return the base memory model from a host integer. */ +static inline enum memmodel +memmodel_base (unsigned HOST_WIDE_INT val) +{ + return (enum memmodel) (val & MEMMODEL_BASE_MASK); +} + +/* Return TRUE if the memory model is RELAXED. */ +static inline bool +is_mm_relaxed (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED; +} + +/* Return TRUE if the memory model is CONSUME. */ +static inline bool +is_mm_consume (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME; +} + +/* Return TRUE if the memory model is ACQUIRE. */ +static inline bool +is_mm_acquire (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE; +} + +/* Return TRUE if the memory model is RELEASE. */ +static inline bool +is_mm_release (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE; +} + +/* Return TRUE if the memory model is ACQ_REL. 
--- /dev/null +++ b/src/gcc/memmodel.h @@ -0,0 +1,86 @@ +/* Prototypes of memory model helper functions. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef GCC_MEMMODEL_H +#define GCC_MEMMODEL_H + +/* Return the memory model from a host integer. */ +static inline enum memmodel +memmodel_from_int (unsigned HOST_WIDE_INT val) +{ + return (enum memmodel) (val & MEMMODEL_MASK); +} + +/* Return the base memory model from a host integer. */ +static inline enum memmodel +memmodel_base (unsigned HOST_WIDE_INT val) +{ + return (enum memmodel) (val & MEMMODEL_BASE_MASK); +} + +/* Return TRUE if the memory model is RELAXED. */ +static inline bool +is_mm_relaxed (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED; +} + +/* Return TRUE if the memory model is CONSUME. */ +static inline bool +is_mm_consume (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME; +} + +/* Return TRUE if the memory model is ACQUIRE. */ +static inline bool +is_mm_acquire (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE; +} + +/* Return TRUE if the memory model is RELEASE. */ +static inline bool +is_mm_release (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE; +} + +/* Return TRUE if the memory model is ACQ_REL. */ +static inline bool +is_mm_acq_rel (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL; +} + +/* Return TRUE if the memory model is SEQ_CST. */ +static inline bool +is_mm_seq_cst (enum memmodel model) +{ + return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST; +} + +/* Return TRUE if the memory model is a SYNC variant. */ +static inline bool +is_mm_sync (enum memmodel model) +{ + return (model & MEMMODEL_SYNC); +} + +#endif /* GCC_MEMMODEL_H */
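memmodel.h is a new header collecting the enum memmodel helpers; the gen*.c hunks earlier in this diff make the generated insn-* sources include it, and optabs.c below picks it up directly. A hedged usage sketch (the operand index and the barrier pattern are hypothetical, target-specific details):

  #include "memmodel.h"

  /* E.g. inside a target expander: decide whether a barrier is needed.  */
  enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
  if (!is_mm_relaxed (model) && !is_mm_consume (model))
    emit_insn (gen_memory_barrier ());   /* hypothetical target pattern  */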
--- a/src/gcc/optabs.c +++ b/src/gcc/optabs.c @@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "tree.h" +#include "memmodel.h" #include "predict.h" #include "tm_p.h" #include "expmed.h" --- a/src/gcc/params.def +++ b/src/gcc/params.def @@ -1027,7 +1027,12 @@ DEFPARAM (PARAM_LTO_PARTITIONS, DEFPARAM (MIN_PARTITION_SIZE, "lto-min-partition", "Minimal size of a partition for LTO (in estimated instructions).", - 1000, 0, 0) + 10000, 0, 0) + +DEFPARAM (MAX_PARTITION_SIZE, + "lto-max-partition", + "Maximal size of a partition for LTO (in estimated instructions).", + 1000000, 0, INT_MAX) /* Diagnostic parameters. */ --- a/src/gcc/rtlanal.c +++ b/src/gcc/rtlanal.c @@ -3663,6 +3663,16 @@ subreg_get_info (unsigned int xregno, machine_mode xmode, info->offset = offset / regsize_xmode; return; } + /* It's not valid to extract a subreg of mode YMODE at OFFSET that + would go outside of XMODE. */ + if (!rknown + && GET_MODE_SIZE (ymode) + offset > GET_MODE_SIZE (xmode)) + { + info->representable_p = false; + info->nregs = nregs_ymode; + info->offset = offset / regsize_xmode; + return; + } /* Quick exit for the simple and common case of extracting whole subregisters from a multiregister value. */ /* ??? It would be better to integrate this into the code below, @@ -4590,13 +4600,14 @@ nonzero_bits1 (const_rtx x, machine_mode mode, const_rtx known_x, nonzero &= cached_nonzero_bits (SUBREG_REG (x), mode, known_x, known_mode, known_ret); -#if WORD_REGISTER_OPERATIONS && defined (LOAD_EXTEND_OP) +#ifdef LOAD_EXTEND_OP /* If this is a typical RISC machine, we only have to worry about the way loads are extended. */ - if ((LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND - ? val_signbit_known_set_p (inner_mode, nonzero) - : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND) - || !MEM_P (SUBREG_REG (x))) + if (WORD_REGISTER_OPERATIONS + && ((LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND + ? val_signbit_known_set_p (inner_mode, nonzero) + : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND) + || !MEM_P (SUBREG_REG (x)))) #endif { /* On many CISC machines, accessing an object in a wider mode --- a/src/gcc/simplify-rtx.c +++ b/src/gcc/simplify-rtx.c @@ -5274,6 +5274,50 @@ simplify_const_relational_operation (enum rtx_code code, return 0; } + +/* Recognize expressions of the form (X CMP 0) ? VAL : OP (X), + where OP is CLZ or CTZ and VAL is the value from CLZ_DEFINED_VALUE_AT_ZERO + or CTZ_DEFINED_VALUE_AT_ZERO respectively, and return OP (X) if the expression + can be simplified to that, or NULL_RTX if not. + Assume X is compared against zero with CMP_CODE and the true + arm is TRUE_VAL and the false arm is FALSE_VAL. */ + +static rtx +simplify_cond_clz_ctz (rtx x, rtx_code cmp_code, rtx true_val, rtx false_val) +{ + if (cmp_code != EQ && cmp_code != NE) + return NULL_RTX; + + /* Result on X == 0 and X != 0 respectively. */ + rtx on_zero, on_nonzero; + if (cmp_code == EQ) + { + on_zero = true_val; + on_nonzero = false_val; + } + else + { + on_zero = false_val; + on_nonzero = true_val; + } + + rtx_code op_code = GET_CODE (on_nonzero); + if ((op_code != CLZ && op_code != CTZ) + || !rtx_equal_p (XEXP (on_nonzero, 0), x) + || !CONST_INT_P (on_zero)) + return NULL_RTX; + + HOST_WIDE_INT op_val; + if (((op_code == CLZ + && CLZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val)) + || (op_code == CTZ + && CTZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val))) + && op_val == INTVAL (on_zero)) + return on_nonzero; + + return NULL_RTX; +} + /* Simplify CODE, an operation with result mode MODE and three operands, OP0, OP1, and OP2. OP0_MODE was the mode of OP0 before it became @@ -5407,6 +5451,19 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode, } } + /* Convert x == 0 ? N : clz (x) into clz (x) when + CLZ_DEFINED_VALUE_AT_ZERO is defined to N for the mode of x. + Similarly for ctz (x). */ + if (COMPARISON_P (op0) && !side_effects_p (op0) + && XEXP (op0, 1) == const0_rtx) + { + rtx simplified + = simplify_cond_clz_ctz (XEXP (op0, 0), GET_CODE (op0), + op1, op2); + if (simplified) + return simplified; + } + if (COMPARISON_P (op0) && ! side_effects_p (op0)) { machine_mode cmp_mode = (GET_MODE (XEXP (op0, 0)) == VOIDmode
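The simplify-rtx.c change above is paired with the new execute test pr37780.c further down. In C, and assuming a target whose CLZ_DEFINED_VALUE_AT_ZERO yields 32 for 32-bit int (an assumption; the hook is target-specific):

  int count_leading (unsigned x)
  {
    /* The ternary now collapses to a single CLZ at the RTL level,
       because the value chosen for x == 0 matches what the hardware
       already returns for a zero input.  */
    return x == 0 ? 32 : __builtin_clz (x);
  }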
--- a/src/gcc/system.h +++ b/src/gcc/system.h @@ -971,7 +971,8 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN; EXTRA_ADDRESS_CONSTRAINT CONST_DOUBLE_OK_FOR_CONSTRAINT_P \ CALLER_SAVE_PROFITABLE LARGEST_EXPONENT_IS_NORMAL \ ROUND_TOWARDS_ZERO SF_SIZE DF_SIZE XF_SIZE TF_SIZE LIBGCC2_TF_CEXT \ - LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE EH_FRAME_IN_DATA_SECTION + LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE \ + EH_FRAME_IN_DATA_SECTION TARGET_FLT_EVAL_METHOD_NON_DEFAULT /* Hooks that are no longer used. */ #pragma GCC poison LANG_HOOKS_FUNCTION_MARK LANG_HOOKS_FUNCTION_FREE \ --- a/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c +++ b/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c @@ -29,6 +29,10 @@ int main(int argc, char **argv) { char *sp = child_stack + kStackSize; /* Stack grows down. */ printf("Parent: %p\n", sp); pid_t clone_pid = clone(Child, sp, CLONE_FILES | CLONE_VM, NULL, 0, 0, 0); + if (clone_pid == -1) { + perror("clone"); + return 1; + } int status; pid_t wait_result = waitpid(clone_pid, &status, __WCLONE); if (wait_result < 0) { --- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-3.C +++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-3.C @@ -1,5 +1,6 @@ /* Test various operators on __fp16 and mixed __fp16/float operands. */ /* { dg-do run { target arm*-*-* } } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ #include "arm-fp16-ops.h" --- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-4.C +++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-4.C @@ -1,5 +1,6 @@ /* Test various operators on __fp16 and mixed __fp16/float operands. */ /* { dg-do run { target arm*-*-* } } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative -ffast-math" } */ #include "arm-fp16-ops.h" --- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-param-1.C +++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-param-1.C @@ -1,10 +1,14 @@ /* { dg-do compile { target arm*-*-* } } */ /* { dg-options "-mfp16-format=ieee" } */ -/* Functions cannot have parameters of type __fp16. */ -extern void f (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */ -extern void (*pf) (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */ +/* Test that the ACLE macro is defined. */ +#if __ARM_FP16_ARGS != 1 +#error Unexpected value for __ARM_FP16_ARGS +#endif + +/* Test that __fp16 is supported as a parameter type. */ +extern void f (__fp16); +extern void (*pf) (__fp16); -/* These should be OK. */ extern void g (__fp16 *); extern void (*pg) (__fp16 *); --- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-return-1.C +++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-return-1.C @@ -1,10 +1,9 @@ /* { dg-do compile { target arm*-*-* } } */ /* { dg-options "-mfp16-format=ieee" } */ -/* Functions cannot return type __fp16. */ -extern __fp16 f (void); /* { dg-error "cannot return __fp16" } */ -extern __fp16 (*pf) (void); /* { dg-error "cannot return __fp16" } */ +/* Test that __fp16 is supported as a return type. */ +extern __fp16 f (void); +extern __fp16 (*pf) (void); -/* These should be OK. */ extern __fp16 *g (void); extern __fp16 *(*pg) (void); --- a/src/gcc/testsuite/g++.dg/inherit/thunk1.C +++ b/src/gcc/testsuite/g++.dg/inherit/thunk1.C @@ -1,4 +1,5 @@ -// { dg-do run { target i?86-*-* x86_64-*-* s390*-*-* alpha*-*-* ia64-*-* sparc*-*-* } } +// { dg-do run { target arm*-*-* aarch64*-*-* i?86-*-* x86_64-*-* s390*-*-* alpha*-*-* ia64-*-* sparc*-*-* } } +// { dg-skip-if "" { arm_thumb1_ok } } #include --- a/src/gcc/testsuite/g++.dg/lto/pr69589_0.C +++ b/src/gcc/testsuite/g++.dg/lto/pr69589_0.C @@ -1,6 +1,8 @@ // { dg-lto-do link } -// { dg-lto-options "-O2 -rdynamic" } +// { dg-lto-options "-O2 -rdynamic" } // { dg-extra-ld-options "-r -nostdlib" } +// { dg-skip-if "Skip targets without -rdynamic support" { arm*-none-eabi aarch64*-*-elf } { "*" } { "" } } + #pragma GCC visibility push(hidden) struct A { int &operator[] (long); }; template struct B; --- /dev/null +++ b/src/gcc/testsuite/g++.dg/opt/pr78201.C @@ -0,0 +1,13 @@ +// PR middle-end/78201 +// { dg-do compile } +// { dg-options "-O2" } + +struct B { long d (); } *c; +long e; + +void +foo () +{ + char a[e] = ""; + c && c->d(); +} --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71112.c @@ -0,0 +1,10 @@ +/* PR target/71112. */ +/* { dg-additional-options "-fpie" { target pie } } */ + +extern int dbs[100]; +void f (int *); +int nscd_init (void) +{ + f (dbs); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71295.c @@ -0,0 +1,12 @@ +extern void fn2 (long long); +int a; + +void +fn1 () +{ + long long b[3]; + a = 0; + for (; a < 3; a++) + b[a] = 1; + fn2 (b[1]); +} --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78362.c @@ -0,0 +1,11 @@ +/* PR target/78362. */ + +long a; + +void +foo (void) +{ + for (;; a--) + if ((int) a) + break; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78694.c @@ -0,0 +1,118 @@ +/* PR target/78694.
*/ + +enum +{ + MEMMODEL_RELAXED, + MEMMODEL_ACQUIRE, + PRIORITY_INSERT_END +}; +enum +{ + PQ_CHILDREN, + PQ_TASKGROUP +}; +struct gomp_team_state +{ + struct gomp_team *team; +}; +enum gomp_task_kind +{ + GOMP_TASK_UNDEFERRED, + GOMP_TASK_WAITING +}; +struct gomp_taskwait +{ + _Bool in_taskwait; +}; +struct gomp_task +{ + struct gomp_task *parent; + int children_queue; + struct gomp_taskgroup *taskgroup; + int dependers; + struct gomp_taskwait taskwait; + enum gomp_task_kind kind; + _Bool in_tied_task; +} j, q, *n; +struct gomp_taskgroup +{ + _Bool in_taskgroup_wait; + int num_children; +} l; +struct gomp_team +{ + int task_queue; + int task_running_count; +}; +struct gomp_thread +{ + struct gomp_team_state ts; + struct gomp_task task; +} extern __thread a; + +int b, c, d, e, f, g, h, i, k, m, o, p, r; + +void priority_queue_next_task (struct gomp_task *, int, int); +int gomp_task_run_pre (struct gomp_task *, struct gomp_task, struct gomp_team); +void priority_queue_insert (int, struct gomp_task); +void priority_queue_insert2 (int, struct gomp_task, int, int, int); +void priority_queue_insert3 (int, struct gomp_task, int, int, int); +void gomp_sem_post (int); +void free (void *); + +_Bool s; +int +GOMP_taskgroup_end () +{ + struct gomp_thread *t = &a; + struct gomp_team u = *t->ts.team; + struct gomp_task *v = &t->task, *w; + if (__atomic_load_n (&l.num_children, MEMMODEL_ACQUIRE)) + while (1) + { + if (l.num_children) + priority_queue_next_task (v, u.task_queue, r); + else if (w) + free (w); + if (n->kind == GOMP_TASK_WAITING) + { + s = gomp_task_run_pre (n, q, u); + if (__builtin_expect (s, 0)) + { + if (w) + free (w); + goto finish_cancelled; + } + n = 0; + l.in_taskgroup_wait = 1; + } + if (w) + { + t->task = *n; + if (__builtin_expect (p, 0)) + if (o) + t->task = *v; + } + if (n) + { + struct gomp_task x = x; + for (; i; b++) + { + struct gomp_task y = j; + if (g) + continue; + priority_queue_insert (PQ_CHILDREN, x); + if (x.taskwait.in_taskwait) + priority_queue_insert2 (PQ_TASKGROUP, y, e, 0, d); + if (h) + gomp_sem_post (f); + priority_queue_insert3 (k, y, PRIORITY_INSERT_END, 0, d); + ++c; + } + } + finish_cancelled: + w = (struct gomp_task *) (n - u.task_running_count - v); + } + v->taskgroup = (struct gomp_taskgroup *) m; + return 1; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/execute/pr37780.c @@ -0,0 +1,49 @@ +/* PR middle-end/37780. */ + +#define VAL (8 * sizeof (int)) + +int __attribute__ ((noinline, noclone)) +fooctz (int i) +{ + return (i == 0) ? VAL : __builtin_ctz (i); +} + +int __attribute__ ((noinline, noclone)) +fooctz2 (int i) +{ + return (i != 0) ? __builtin_ctz (i) : VAL; +} + +unsigned int __attribute__ ((noinline, noclone)) +fooctz3 (unsigned int i) +{ + return (i > 0) ? __builtin_ctz (i) : VAL; +} + +int __attribute__ ((noinline, noclone)) +fooclz (int i) +{ + return (i == 0) ? VAL : __builtin_clz (i); +} + +int __attribute__ ((noinline, noclone)) +fooclz2 (int i) +{ + return (i != 0) ? __builtin_clz (i) : VAL; +} + +unsigned int __attribute__ ((noinline, noclone)) +fooclz3 (unsigned int i) +{ + return (i > 0) ? 
__builtin_clz (i) : VAL; +} + +int +main (void) +{ + if (fooctz (0) != VAL || fooctz2 (0) != VAL || fooctz3 (0) != VAL + || fooclz (0) != VAL || fooclz2 (0) != VAL || fooclz3 (0) != VAL) + __builtin_abort (); + + return 0; +} \ No newline at end of file --- /dev/null +++ b/src/gcc/testsuite/gcc.c-torture/execute/pr66940.c @@ -0,0 +1,20 @@ +long long __attribute__ ((noinline, noclone)) +foo (long long ival) +{ + if (ival <= 0) + return -0x7fffffffffffffffL - 1; + + return 0x7fffffffffffffffL; +} + +int +main (void) +{ + if (foo (-1) != (-0x7fffffffffffffffL - 1)) + __builtin_abort (); + + if (foo (1) != 0x7fffffffffffffffL) + __builtin_abort (); + + return 0; +} --- a/src/gcc/testsuite/gcc.dg/asr_div1.c +++ b/src/gcc/testsuite/gcc.dg/asr_div1.c @@ -1,6 +1,7 @@ /* Test division by const int generates only one shift. */ /* { dg-do run } */ /* { dg-options "-O2 -fdump-rtl-combine-all" } */ +/* { dg-options "-O2 -fdump-rtl-combine-all -mtune=cortex-a53" { target aarch64*-*-* } } */ extern void abort (void); --- a/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c +++ b/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c @@ -24,7 +24,7 @@ | FE_OVERFLOW \ | FE_UNDERFLOW) -#if defined __alpha__ +#if defined __alpha__ || defined __aarch64__ #define ITER_COUNT 100 #else #define ITER_COUNT 10000 --- a/src/gcc/testsuite/gcc.dg/cpp/trad/include.c +++ b/src/gcc/testsuite/gcc.dg/cpp/trad/include.c @@ -2,11 +2,5 @@ /* Test that macros are not expanded in the <> quotes of #inlcude. */ -/* vxWorksCommon.h uses the "#" operator to construct the name of an - include file, thus making the file incompatible with -traditional-cpp. - Newlib uses ## when including stdlib.h as of 2007-09-07. */ -/* { dg-do preprocess { target { { ! vxworks_kernel } && { ! newlib } } } } */ - -#define __STDC__ 1 /* Stop complaints about non-ISO compilers. */ -#define stdlib 1 -#include <stdlib.h> /* { dg-bogus "o such file or directory" } */ +#define builtins 1 +#include <builtins.h> /* { dg-bogus "o such file or directory" } */ --- a/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp +++ b/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp @@ -29,7 +29,7 @@ load_lib gcc-dg.exp # If a testcase doesn't have special options, use these. global DEFAULT_TRADCPPFLAGS if ![info exists DEFAULT_TRADCPPFLAGS] then { - set DEFAULT_TRADCPPFLAGS " -traditional-cpp" + set DEFAULT_TRADCPPFLAGS " -traditional-cpp -I$srcdir/$subdir/" } # Initialize `dg'. --- a/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c +++ b/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c @@ -1,5 +1,5 @@ // { dg-do preprocess } // { dg-options "-std=gnu99 -fdiagnostics-show-option -Werror=undef" } /* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ -#if x // { dg-error "\"x\" is not defined .-Werror=undef." } +#if x // { dg-error "\"x\" is not defined, evaluates to 0 .-Werror=undef." } #endif --- a/src/gcc/testsuite/gcc.dg/cpp/warn-undef.c +++ b/src/gcc/testsuite/gcc.dg/cpp/warn-undef.c @@ -1,5 +1,5 @@ // { dg-do preprocess } // { dg-options "-std=gnu99 -fdiagnostics-show-option -Wundef" } -#if x // { dg-warning "\"x\" is not defined .-Wundef." } +#if x // { dg-warning "\"x\" is not defined, evaluates to 0 .-Wundef."
} #endif --- a/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c +++ b/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c @@ -1,6 +1,7 @@ /* { dg-lto-do link } */ /* { dg-require-visibility "hidden" } */ /* { dg-require-effective-target fpic } */ +/* { dg-require-effective-target shared } */ /* { dg-extra-ld-options { -shared } } */ /* { dg-lto-options { { -fPIC -fvisibility=hidden -flto } } } */ --- a/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c +++ b/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c @@ -1,4 +1,5 @@ /* { dg-require-effective-target fpic } */ +/* { dg-require-effective-target shared } */ /* { dg-lto-do link } */ /* { dg-lto-options { { -fPIC -flto -flto-partition=1to1 } } } */ /* { dg-extra-ld-options { -shared } } */ --- a/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c +++ b/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c @@ -1,5 +1,6 @@ /* { dg-lto-do link } */ /* { dg-require-effective-target fpic } */ +/* { dg-require-effective-target shared } */ /* { dg-lto-options { { -O -flto -fpic } } } */ /* { dg-extra-ld-options { -shared } } */ /* { dg-extra-ld-options "-Wl,-undefined,dynamic_lookup" { target *-*-darwin* } } */ --- a/src/gcc/testsuite/gcc.dg/plugin/plugin.exp +++ b/src/gcc/testsuite/gcc.dg/plugin/plugin.exp @@ -87,6 +87,12 @@ foreach plugin_test $plugin_test_list { if ![runtest_file_p $runtests $plugin_src] then { continue } + # Skip tail call tests on targets that do not have sibcall_epilogue. + if {[regexp ".*must_tail_call_plugin.c" $plugin_src] + && [istarget arm*-*-*] + && [check_effective_target_arm_thumb1]} then { + continue + } set plugin_input_tests [lreplace $plugin_test 0 0] plugin-test-execute $plugin_src $plugin_input_tests } --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/pr59833.c @@ -0,0 +1,18 @@ +/* { dg-do run { target { *-*-linux* *-*-gnu* } } } */ +/* { dg-options "-O0 -lm" } */ +/* { dg-require-effective-target issignaling } */ + +#define _GNU_SOURCE +#include <math.h> + +int main (void) +{ + float sNaN = __builtin_nansf (""); + double x = (double) sNaN; + if (issignaling(x)) + { + __builtin_abort(); + } + + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/pr68217.c @@ -0,0 +1,14 @@ + +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-vrp1" } */ + +int foo (void) +{ + volatile int a = -1; + long long b = (1LL << (sizeof (b) * 8 - 1)); // LLONG_MIN + long long x = (a & b); // x == 0x8000000000000000 + if (x < 1LL) { ; } else { __builtin_abort(); } + return 0; +} + +/* { dg-final { scan-tree-dump "\\\[-INF, 0\\\]" "vrp1" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/pr71636-1.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-fdump-tree-gimple" } */ + +unsigned f(unsigned x, unsigned b) +{ + return x & ((1U << b) - 1); +} + +/* { dg-final { scan-tree-dump-not "1 <<" "gimple" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/pr71636-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-forwprop-details" } */ + +unsigned f(unsigned x, unsigned b) +{ + unsigned t1 = 1U << b; + unsigned t2 = t1 - 1; + unsigned t3 = x & t2; + return t3; +} + +/* { dg-final { scan-tree-dump "_\[0-9\] = ~_\[0-9\]" "forwprop1" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-20.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-20.c @@ -86,9 +86,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final {
scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-21.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-21.c @@ -57,9 +57,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-22.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-22.c @@ -31,9 +31,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-22g.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-22g.c @@ -5,9 +5,9 @@ #define USE_GNU #include "strlenopt-22.c" -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 1 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-26.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-26.c @@ -21,4 +21,5 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-5.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-5.c @@ -48,9 +48,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-7.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-7.c @@ -40,11 +40,11 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ +/* { dg-final { 
scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */ --- a/src/gcc/testsuite/gcc.dg/strlenopt-9.c +++ b/src/gcc/testsuite/gcc.dg/strlenopt-9.c @@ -98,10 +98,10 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */ --- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c +++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c @@ -1,5 +1,6 @@ /* Test floating-point conversions. Standard types and __fp16. */ /* { dg-do run { target arm*-*-* } } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } /* { dg-options "-mfp16-format=alternative" } */ #include "fp-int-convert.h" --- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-3.c +++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-3.c @@ -1,5 +1,6 @@ /* Test various operators on __fp16 and mixed __fp16/float operands. */ /* { dg-do run { target arm*-*-* } } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } /* { dg-options "-mfp16-format=alternative" } */ #include "arm-fp16-ops.h" --- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-4.c +++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-4.c @@ -1,5 +1,6 @@ /* Test various operators on __fp16 and mixed __fp16/float operands. */ /* { dg-do run { target arm*-*-* } } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } /* { dg-options "-mfp16-format=alternative -ffast-math" } */ #include "arm-fp16-ops.h" --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/torture/pr71594.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "--param max-rtl-if-conversion-insns=2" } */ + +unsigned short a; +int b, c; +int *d; +void fn1() { + *d = 24; + for (; *d <= 65;) { + unsigned short *e = &a; + b = (a &= 0 <= 0) < (c ?: (*e %= *d)); + for (; *d <= 83;) + ; + } +} --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_1.c @@ -0,0 +1,44 @@ +/* PR tree-optimization/61839. */ +/* { dg-do run } */ +/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */ +/* { dg-require-effective-target int32plus } */ + +__attribute__ ((noinline)) +int foo () +{ + int a = -1; + volatile unsigned b = 1U; + int c = 1; + c = (a + 972195718) >> (1LU <= b); + if (c == 486097858) + ; + else + __builtin_abort (); + return 0; +} + +__attribute__ ((noinline)) +int bar () +{ + int a = -1; + volatile unsigned b = 1U; + int c = 1; + c = (a + 972195718) >> (b ? 
2 : 3); + if (c == 243048929) + ; + else + __builtin_abort (); + return 0; +} + +int main () +{ + foo (); + bar (); +} + +/* Scan for c = 972195717) >> [0, 1] in function foo. */ +/* { dg-final { scan-tree-dump-times "486097858 : 972195717" 1 "vrp1" } } */ +/* Scan for c = 972195717) >> [2, 3] in function bar. */ +/* { dg-final { scan-tree-dump-times "243048929 : 121524464" 2 "vrp1" } } */ +/* { dg-final { scan-tree-dump-times "486097858" 0 "optimized" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_2.c @@ -0,0 +1,54 @@ +/* PR tree-optimization/61839. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-vrp1" } */ +/* { dg-require-effective-target int32plus } */ + +__attribute__ ((noinline)) +int foo () +{ + int a = -1; + volatile unsigned b = 1U; + int c = 1; + c = (a + 972195718) / (b ? 1 : 0); + if (c == 972195717) + ; + else + __builtin_abort (); + return 0; +} + +__attribute__ ((noinline)) +int bar () +{ + int a = -1; + volatile unsigned b = 1U; + int c = 1; + c = (a + 972195718) % (b ? 1 : 0); + if (c == 972195717) + ; + else + __builtin_abort (); + return 0; +} + +__attribute__ ((noinline)) +int bar2 () +{ + int a = -1; + volatile unsigned b = 1U; + int c = 1; + c = (a + 972195716) % (b ? 1 : 2); + if (c == 972195715) + ; + else + __builtin_abort (); + return 0; +} + + +/* Dont optimize 972195717 / 0 in function foo. */ +/* { dg-final { scan-tree-dump-times "972195717 / _" 1 "vrp1" } } */ +/* Dont optimize 972195717 % 0 in function bar. */ +/* { dg-final { scan-tree-dump-times "972195717 % _" 1 "vrp1" } } */ +/* Optimize in function bar2. */ +/* { dg-final { scan-tree-dump-times "972195715 % _" 0 "vrp1" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_3.c @@ -0,0 +1,26 @@ +/* PR tree-optimization/61839. */ +/* { dg-do run } */ +/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */ + +__attribute__ ((noinline)) +int foo (int a, unsigned b) +{ + int c = 1; + b = a ? 12 : 13; + c = b << 8; + if (c == 3072) + ; + else + __builtin_abort (); + return 0; +} + +int main () +{ + volatile unsigned b = 1U; + foo (-1, b); +} + +/* Scan for c [12, 13] << 8 in function foo. */ +/* { dg-final { scan-tree-dump-times "3072 : 3328" 2 "vrp1" } } */ +/* { dg-final { scan-tree-dump-times "3072" 0 "optimized" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_4.c @@ -0,0 +1,28 @@ +/* PR tree-optimization/61839. */ +/* { dg-do run } */ +/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */ +/* { dg-require-effective-target int32plus } */ + +__attribute__ ((noinline)) +int foo (int a, unsigned b) +{ + unsigned c = 1; + if (b >= 1 && b <= ((unsigned)(-1) - 1)) + return 0; + c = b >> 4; + if (c == 268435455) + ; + else + __builtin_abort (); + return 0; +} + +int main () +{ + volatile unsigned b = (unsigned)(-1); + foo (-1, b); +} + +/* Scan for ~[1, 4294967294] >> 4 in function foo. */ +/* { dg-final { scan-tree-dump-times "0 : 268435455" 1 "vrp1" } } */ +/* { dg-final { scan-tree-dump-times "268435455" 0 "optimized" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-11.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-ivopts-details" } */ + +int a[128]; +extern int b[]; + +int bar (int *); + +int +foo (int n) +{ + int i; + + for (i = 0; i < n; i++) + { + unsigned char uc = (unsigned char)i; + a[i] = i; + b[uc] = 0; + } + + bar (a); + return 0; +} + +/* Address of array reference to b is scev. 
*/ +/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 2 "ivopts" } } */ + + --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-12.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-ivopts-details" } */ + +int a[128]; +extern int b[]; + +int bar (int *); + +int +foo (int x, int n) +{ + int i; + + for (i = 0; i < n; i++) + { + unsigned char uc = (unsigned char)i; + if (x) + a[i] = i; + b[uc] = 0; + } + + bar (a); + return 0; +} + +/* Address of array reference to b is not scev. */ +/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */ + + + --- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c @@ -25,6 +25,7 @@ f1 (int i, ...) /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -45,6 +46,7 @@ f2 (int i, ...) /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -60,6 +62,7 @@ f3 (int i, ...) /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! 
{ ia32 || llp64 } } } } } } */
 /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
 /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
@@ -78,6 +81,7 @@ f4 (int i, ...)
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -96,6 +100,7 @@ f5 (int i, ...)
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -116,6 +121,7 @@ f6 (int i, ...)
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -133,6 +139,7 @@ f7 (int i, ...)
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -152,6 +159,7 @@ f8 (int i, ...)
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -169,6 +177,7 @@ f9 (int i, ...)
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -188,6 +197,7 @@ f10 (int i, ...)
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -208,6 +218,7 @@ f11 (int i, ...)
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -228,6 +239,7 @@ f12 (int i, ...)
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -248,6 +260,7 @@ f13 (int i, ...)
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -268,6 +281,7 @@ f14 (int i, ...)
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 24 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
 /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
@@ -291,6 +305,7 @@ f15 (int i, ...)
 /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
 /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
 /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
+/* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
 
 /* We may be able to improve upon this after fixing PR66010/PR66013.  */
 /* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
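The aarch64 "GPR units" and "FPR units" in the new scan patterns above are
byte counts of the AAPCS64 va_list register save areas: general registers
occupy 8-byte slots and FP/SIMD registers 16-byte slots. A hedged sketch
(editorial illustration only, not part of the patch; the function name is
made up) of the shape of code these f3-style checks scan:

    /* Reading one double va_arg reserves one 16-byte FP register slot
       on AArch64, hence "16 FPR units" in the stdarg dump (and 48 for
       the three-double cases such as f12/f13).  */
    #include <stdarg.h>
    double read_one_double (int i, ...)
    {
      va_list ap;
      double d;
      va_start (ap, i);
      d = va_arg (ap, double);   /* one V register -> 16 FPR units */
      va_end (ap);
      return d;
    }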
/* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -73,6 +76,7 @@ f4 (int i, ...) /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -89,6 +93,7 @@ f5 (int i, ...) /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -107,6 +112,7 @@ f6 (int i, ...) 
/* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -123,6 +129,7 @@ f7 (int i, ...) /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -139,6 +146,7 @@ f8 (int i, ...) /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -155,6 +163,7 @@ f10 (int i, ...) 
/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -171,6 +180,7 @@ f11 (int i, ...) /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -187,6 +197,7 @@ f12 (int i, ...) /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ --- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c @@ -27,6 +27,7 @@ f1 (int i, ...) 
/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -44,6 +45,7 @@ f2 (int i, ...) /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -67,6 +69,7 @@ f3 (int i, ...) /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ @@ -88,6 +91,7 @@ f4 (int i, ...) 
/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ --- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c @@ -25,6 +25,7 @@ f1 (int i, ...) /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ void f2 (int i, ...) @@ -38,6 +39,7 @@ f2 (int i, ...) /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and all FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ /* Here va_arg can be executed at most as many times as va_start. */ void @@ -56,6 +58,7 @@ f3 (int i, ...) /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ void f4 (int i, ...) @@ -74,6 +77,7 @@ f4 (int i, ...) /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 16 GPR units and 16 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! 
{ ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ void f5 (int i, ...) @@ -88,6 +92,7 @@ f5 (int i, ...) /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save (4|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */ void f6 (int i, ...) @@ -102,6 +107,7 @@ f6 (int i, ...) /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 32 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */ void f7 (int i, ...) @@ -116,3 +122,4 @@ f7 (int i, ...) /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 32 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target aarch64*-*-* } } } */ --- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c +++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c @@ -30,6 +30,7 @@ bar (int x, char const *y, ...) 
/* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */ +/* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */ --- a/src/gcc/testsuite/gcc.dg/uninit-pred-8_a.c +++ b/src/gcc/testsuite/gcc.dg/uninit-pred-8_a.c @@ -1,6 +1,8 @@ /* { dg-do compile } */ /* { dg-options "-Wuninitialized -O2" } */ +/* Pick a particular tuning to pin down BRANCH_COST. */ +/* { dg-additional-options "-mtune=cortex-a15" { target arm*-*-* } } */ int g; void bar(); --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-70.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target section_anchors } */ +/* { dg-require-effective-target vect_int } */ + +#define N 32 + +/* Increase alignment of struct if an array's offset is multiple of alignment of + vector type corresponding to it's scalar type. + For the below test-case: + offsetof(e) == 8 bytes. + i) For arm: let x = alignment of vector type corresponding to int, + x == 8 bytes. + Since offsetof(e) % x == 0, set DECL_ALIGN(a, b, c) to x. + ii) For aarch64, ppc: x == 16 bytes. + Since offsetof(e) % x != 0, don't increase alignment of a, b, c. +*/ + +static struct A { + int p1, p2; + int e[N]; +} a, b, c; + +int foo(void) +{ + for (int i = 0; i < N; i++) + a.e[i] = b.e[i] + c.e[i]; + + return a.e[0]; +} + +/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */ +/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */ +/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 3 "increase_alignment" { target arm*-*-* } } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-71.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target section_anchors } */ +/* { dg-require-effective-target vect_int } */ + +/* Should not increase alignment of the struct because + sizeof (A.e) < sizeof(corresponding vector type). 
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-71.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target section_anchors } */
+/* { dg-require-effective-target vect_int } */
+
+/* Should not increase alignment of the struct because
+   sizeof (A.e) < sizeof(corresponding vector type).  */
+
+#define N 3
+
+static struct A {
+  int p1, p2;
+  int e[N];
+} a, b, c;
+
+int foo(void)
+{
+  for (int i = 0; i < N; i++)
+    a.e[i] = b.e[i] + c.e[i];
+
+  return a.e[0];
+}
+
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target arm*-*-* } } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-72.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target section_anchors } */
+/* { dg-require-effective-target vect_int } */
+
+#define N 32
+
+/* Clone of section-anchors-vect-70.c having nested struct.  */
+
+struct S
+{
+  int e[N];
+};
+
+static struct A {
+  int p1, p2;
+  struct S s;
+} a, b, c;
+
+int foo(void)
+{
+  for (int i = 0; i < N; i++)
+    a.s.e[i] = b.s.e[i] + c.s.e[i];
+
+  return a.s.e[0];
+}
+
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 3 "increase_alignment" { target arm*-*-* } } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/pr57206.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+void bad0(float * d, unsigned int n)
+{
+  unsigned int i;
+  for (i=n; i>0; --i)
+    d[n-i] = 0.0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/pr65951.c
@@ -0,0 +1,63 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 512
+
+/* These multiplications should be vectorizable with additions when
+   no vector shift is available.  */
+
+__attribute__ ((noinline)) void
+foo (int *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= 2;
+}
+
+__attribute__ ((noinline)) void
+foo2 (int *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= 4;
+}
+
+int
+main (void)
+{
+  check_vect ();
+  int data[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / 2 != i)
+        __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo2 (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / 4 != i)
+        __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/pr71818.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+
+char a;
+short b;
+int c, d;
+void fn1() {
+  char e = 75, g;
+  unsigned char *f = &e;
+  a = 21;
+  for (; a <= 48; a++) {
+    for (; e <= 6;)
+      ;
+    g -= e -= b || g <= c;
+  }
+  d = *f;
+}
--- a/src/gcc/testsuite/gcc.dg/vect/vect-iv-9.c
+++ b/src/gcc/testsuite/gcc.dg/vect/vect-iv-9.c
@@ -33,5 +33,4 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_int_mult } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target {! vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-1.c
@@ -0,0 +1,41 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 256
+
+__attribute__ ((noinline)) void
+foo (long long *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= 123;
+}
+
+int
+main (void)
+{
+  check_vect ();
+  long long data[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / 123 != i)
+        __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect" { target aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target aarch64*-*-* } } } */
--- /dev/null
+++ b/src/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-2.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 256
+
+__attribute__ ((noinline)) void
+foo (long long *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= -19594LL;
+}
+
+int
+main (void)
+{
+  check_vect ();
+  long long data[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / -19594LL != i)
+        __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect" { target aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target aarch64*-*-* } } } */
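pr65951.c and the two vect-mult-const-pattern tests above exercise the same
idea: vect_recog_mult_pattern rewrites a multiplication by a constant into
shifts and adds or subs where that is cheaper than a vector multiply. One
decomposition it could pick for the constant 123 used above (editorial
illustration only; the actual choice is target- and cost-model-dependent):

    long long mult_by_123 (long long x)
    {
      /* 128*x - 4*x - x == 123*x */
      return (x << 7) - (x << 2) - x;
    }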
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
@@ -53,7 +53,10 @@ torture-init
 set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS
 
 # Make sure Neon flags are provided, if necessary.  Use fp16 if we can.
-if {[check_effective_target_arm_neon_fp16_ok]} then {
+# Use fp16 arithmetic operations if the hardware supports it.
+if {[check_effective_target_arm_v8_2a_fp16_neon_hw]} then {
+  set additional_flags [add_options_for_arm_v8_2a_fp16_neon ""]
+} elseif {[check_effective_target_arm_neon_fp16_ok]} then {
   set additional_flags [add_options_for_arm_neon_fp16 ""]
 } else {
   set additional_flags [add_options_for_arm_neon ""]
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -16,6 +16,14 @@
 extern void *memset(void *, int, size_t);
 extern void *memcpy(void *, const void *, size_t);
 extern size_t strlen(const char *);
 
+/* Helper macro to select FP16 tests.  */
+#if (defined (__ARM_FP16_FORMAT_IEEE) \
+     || defined (__ARM_FP16_FORMAT_ALTERNATIVE))
+#define FP16_SUPPORTED (1)
+#else
+#undef FP16_SUPPORTED
+#endif
+
 /* Various string construction helpers.  */
 
 /*
@@ -24,6 +32,13 @@ extern size_t strlen(const char *);
    VECT_VAR(expected, int, 16, 4) -> expected_int16x4
    VECT_VAR_DECL(expected, int, 16, 4) -> int16x4_t expected_int16x4
 */
+/* Some instructions don't exist on ARM.
+   Use this macro to guard against them.  */
+#ifdef __aarch64__
+#define AARCH64_ONLY(X) X
+#else
+#define AARCH64_ONLY(X)
+#endif
 
 #define xSTR(X) #X
 #define STR(X) xSTR(X)
@@ -81,7 +96,7 @@ extern size_t strlen(const char *);
          abort(); \
        } \
       } \
-  fprintf(stderr, "CHECKED %s\n", MSG); \
+  fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
   }
 
 /* Floating-point variant.  */
@@ -110,7 +125,36 @@ extern size_t strlen(const char *);
          abort(); \
        } \
       } \
-  fprintf(stderr, "CHECKED %s\n", MSG); \
+  fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
+  }
+
+/* poly variant.  */
+#define CHECK_POLY(MSG,T,W,N,FMT,EXPECTED,COMMENT) \
+  { \
+    int i; \
+    for(i=0; i<N ; i++) \
+      { \
+       union poly_operand { \
+         uint##W##_t i; \
+         poly##W##_t p; \
+       } tmp_res, tmp_exp; \
+       tmp_res.p = VECT_VAR(result, T, W, N)[i]; \
+       tmp_exp.i = VECT_VAR(EXPECTED, T, W, N)[i]; \
+       if (tmp_res.i != tmp_exp.i) { \
+         fprintf(stderr, \
+                 "ERROR in %s (%s line %d) at type %s " \
+                 "index %d: got 0x%" FMT " != 0x%" FMT " %s\n", \
+                 MSG, __FILE__, __LINE__, \
+                 STR(VECT_NAME(T, W, N)), \
+                 i, tmp_res.i, tmp_exp.i, \
+                 strlen(COMMENT) > 0 ? COMMENT : ""); \
+         abort(); \
+       } \
+      } \
+  fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
   }
 
 /* Clean buffer with a non-zero pattern to help diagnose buffer
@@ -133,10 +177,16 @@ static ARRAY(result, uint, 32, 2);
 static ARRAY(result, uint, 64, 1);
 static ARRAY(result, poly, 8, 8);
 static ARRAY(result, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+static ARRAY(result, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 static ARRAY(result, float, 16, 4);
 #endif
 static ARRAY(result, float, 32, 2);
+#ifdef __aarch64__
+static ARRAY(result, float, 64, 1);
+#endif
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
 static ARRAY(result, int, 32, 4);
@@ -147,6 +197,9 @@ static ARRAY(result, uint, 32, 4);
 static ARRAY(result, uint, 64, 2);
 static ARRAY(result, poly, 8, 16);
 static ARRAY(result, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+static ARRAY(result, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 static ARRAY(result, float, 16, 8);
 #endif
@@ -169,6 +222,7 @@ extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
 extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
+extern ARRAY(expected, hfloat, 64, 1);
 extern ARRAY(expected, int, 8, 16);
 extern ARRAY(expected, int, 16, 8);
 extern ARRAY(expected, int, 32, 4);
@@ -193,8 +247,8 @@ extern ARRAY(expected, hfloat, 64, 2);
   CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
   CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
   CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
-  CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-  CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+  CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+  CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
   CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
 \
   CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
@@ -205,8 +259,8 @@ extern ARRAY(expected, hfloat, 64, 2);
   CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
   CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
   CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment); \
-  CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-  CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
+  CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
+  CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
   CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \
   } \
@@ -335,7 +389,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
                  strlen(COMMENT) > 0 ? " " COMMENT : ""); \
          abort(); \
        } \
-    fprintf(stderr, "CHECKED CUMULATIVE SAT %s\n", MSG); \
+    fprintf(stderr, "CHECKED CUMULATIVE SAT %s %s\n", \
+           STR(VECT_TYPE(T, W, N)), MSG); \
   }
 
 #define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment) \
@@ -379,6 +434,9 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 1);
   CLEAN(result, poly, 8, 8);
   CLEAN(result, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  CLEAN(result, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CLEAN(result, float, 16, 4);
 #endif
@@ -394,6 +452,9 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 2);
   CLEAN(result, poly, 8, 16);
   CLEAN(result, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  CLEAN(result, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CLEAN(result, float, 16, 8);
 #endif
@@ -419,6 +480,13 @@ static void clean_results (void)
 #define DECL_VARIABLE(VAR, T1, W, N) \
   VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)
 
+#if defined (__ARM_FEATURE_CRYPTO)
+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N) \
+  DECL_VARIABLE(VAR, T1, W, N)
+#else
+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N)
+#endif
+
 /* Declare only 64 bits signed variants.  */
 #define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR) \
   DECL_VARIABLE(VAR, int, 8, 8); \
@@ -454,6 +522,7 @@ static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
   DECL_VARIABLE(VAR, poly, 8, 8); \
   DECL_VARIABLE(VAR, poly, 16, 4); \
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
  DECL_VARIABLE(VAR, float, 16, 4); \
  DECL_VARIABLE(VAR, float, 32, 2)
 #else
@@ -462,6 +531,7 @@ static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
   DECL_VARIABLE(VAR, poly, 8, 8); \
   DECL_VARIABLE(VAR, poly, 16, 4); \
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
   DECL_VARIABLE(VAR, float, 32, 2)
 #endif
 
@@ -472,6 +542,7 @@ static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
   DECL_VARIABLE(VAR, poly, 8, 16); \
   DECL_VARIABLE(VAR, poly, 16, 8); \
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
   DECL_VARIABLE(VAR, float, 16, 8); \
   DECL_VARIABLE(VAR, float, 32, 4)
 #else
@@ -480,6 +551,7 @@ static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
   DECL_VARIABLE(VAR, poly, 8, 16); \
   DECL_VARIABLE(VAR, poly, 16, 8); \
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
   DECL_VARIABLE(VAR, float, 32, 4)
 #endif
 /* Declare all variants.  */
@@ -500,15 +572,6 @@ static void clean_results (void)
 /* Helpers to initialize vectors.  */
 #define VDUP(VAR, Q, T1, T2, W, N, V) \
   VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
-#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-/* Work around that there is no vdup_n_f16 intrinsic.  */
-#define vdup_n_f16(VAL) \
-  __extension__ \
-    ({ \
-      float16_t f = VAL; \
-      vld1_dup_f16(&f); \
-    })
-#endif
 
 #define VSET_LANE(VAR, Q, T1, T2, W, N, L, V) \
   VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V, \
@@ -521,6 +584,13 @@ static void clean_results (void)
 
 /* Helpers to call macros with 1 constant and 5 variable arguments.  */
+#if defined (__ARM_FEATURE_CRYPTO)
+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N) \
+  MACRO(VAR1, VAR2, T1, T2, T3, W, N)
+#else
+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N)
+#endif
+
 #define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
   MACRO(VAR, , int, s, 8, 8); \
   MACRO(VAR, , int, s, 16, 4); \
@@ -591,13 +661,15 @@ static void clean_results (void)
   TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
   TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
   MACRO(VAR1, VAR2, , poly, p, 8, 8); \
-  MACRO(VAR1, VAR2, , poly, p, 16, 4)
+  MACRO(VAR1, VAR2, , poly, p, 16, 4); \
+  MACRO_CRYPTO(MACRO, VAR1, VAR2, , poly, p, 64, 1)
 
 #define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2) \
   TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
   TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
   MACRO(VAR1, VAR2, q, poly, p, 8, 16); \
-  MACRO(VAR1, VAR2, q, poly, p, 16, 8)
+  MACRO(VAR1, VAR2, q, poly, p, 16, 8); \
+  MACRO_CRYPTO(MACRO, VAR1, VAR2, q, poly, p, 64, 2)
 
 #define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2) \
   TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2); \
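All of the new guards in arm-neon-ref.h follow one pattern: a variant macro
expands to real declarations or calls only when the matching feature macro
is defined, so one shared test body covers every configuration. A usage
sketch (editorial illustration only; the CHECK_FP call shown is a
hypothetical caller, not taken from the patch):

    /* With the guard in place, a float64x1 check simply disappears
       from 32-bit ARM builds instead of failing to compile.  */
    #ifdef __aarch64__
    #define AARCH64_ONLY(X) X
    #else
    #define AARCH64_ONLY(X)
    #endif
    /* AARCH64_ONLY(CHECK_FP(TEST_MSG, float, 64, 1, PRIx64,
                             expected, "")); */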
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_float.inc
@@ -0,0 +1,170 @@
+/* Floating-point only version of binary_op_no64.inc template.  Currently only
+   float16_t is used.  */
+
+#include <math.h>
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+  int i;
+
+  /* Basic test: z = INSN (x, y), then store the result.  */
+#define TEST_BINARY_OP1(INSN, Q, T1, T2, W, N) \
+  VECT_VAR(vector_res, T1, W, N) = \
+    INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
+                      VECT_VAR(vector2, T1, W, N)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define TEST_BINARY_OP(INSN, Q, T1, T2, W, N) \
+  TEST_BINARY_OP1(INSN, Q, T1, T2, W, N) \
+
+#ifdef HAS_FLOAT16_VARIANT
+  DECL_VARIABLE(vector, float, 16, 4);
+  DECL_VARIABLE(vector2, float, 16, 4);
+  DECL_VARIABLE(vector_res, float, 16, 4);
+
+  DECL_VARIABLE(vector, float, 16, 8);
+  DECL_VARIABLE(vector2, float, 16, 8);
+  DECL_VARIABLE(vector_res, float, 16, 8);
+#endif
+
+#ifdef HAS_FLOAT_VARIANT
+  DECL_VARIABLE(vector, float, 32, 2);
+  DECL_VARIABLE(vector2, float, 32, 2);
+  DECL_VARIABLE(vector_res, float, 32, 2);
+
+  DECL_VARIABLE(vector, float, 32, 4);
+  DECL_VARIABLE(vector2, float, 32, 4);
+  DECL_VARIABLE(vector_res, float, 32, 4);
+#endif
+
+  clean_results ();
+
+  /* Initialize input "vector" from "buffer".  */
+#ifdef HAS_FLOAT16_VARIANT
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
+#ifdef HAS_FLOAT_VARIANT
+  VLOAD(vector, buffer, , float, f, 32, 2);
+  VLOAD(vector, buffer, q, float, f, 32, 4);
+#endif
+
+  /* Choose init value arbitrarily, will be used as comparison value.  */
+#ifdef HAS_FLOAT16_VARIANT
+  VDUP(vector2, , float, f, 16, 4, -15.5f);
+  VDUP(vector2, q, float, f, 16, 8, -14.5f);
+#endif
+#ifdef HAS_FLOAT_VARIANT
+  VDUP(vector2, , float, f, 32, 2, -15.5f);
+  VDUP(vector2, q, float, f, 32, 4, -14.5f);
+#endif
+
+#ifdef HAS_FLOAT16_VARIANT
+#define FLOAT16_VARIANT(MACRO, VAR) \
+  MACRO(VAR, , float, f, 16, 4); \
+  MACRO(VAR, q, float, f, 16, 8);
+#else
+#define FLOAT16_VARIANT(MACRO, VAR)
+#endif
+
+#ifdef HAS_FLOAT_VARIANT
+#define FLOAT_VARIANT(MACRO, VAR) \
+  MACRO(VAR, , float, f, 32, 2); \
+  MACRO(VAR, q, float, f, 32, 4);
+#else
+#define FLOAT_VARIANT(MACRO, VAR)
+#endif
+
+#define TEST_MACRO_NO64BIT_VARIANT_1_5(MACRO, VAR) \
+
+  /* Apply a binary operator named INSN_NAME.  */
+  FLOAT16_VARIANT(TEST_BINARY_OP, INSN_NAME);
+  FLOAT_VARIANT(TEST_BINARY_OP, INSN_NAME);
+
+#ifdef HAS_FLOAT16_VARIANT
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
+
+  /* Extra FP tests with special values (NaN, ....) */
+  VDUP(vector, q, float, f, 16, 8, 1.0f);
+  VDUP(vector2, q, float, f, 16, 8, NAN);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan,
+           " FP special (NaN)");
+
+  VDUP(vector, q, float, f, 16, 8, -NAN);
+  VDUP(vector2, q, float, f, 16, 8, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_mnan,
+           " FP special (-NaN)");
+
+  VDUP(vector, q, float, f, 16, 8, 1.0f);
+  VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_inf,
+           " FP special (inf)");
+
+  VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
+  VDUP(vector2, q, float, f, 16, 8, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_minf,
+           " FP special (-inf)");
+
+  VDUP(vector, q, float, f, 16, 8, 0.0f);
+  VDUP(vector2, q, float, f, 16, 8, -0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero1,
+           " FP special (-0.0)");
+
+  VDUP(vector, q, float, f, 16, 8, -0.0f);
+  VDUP(vector2, q, float, f, 16, 8, 0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero2,
+           " FP special (-0.0)");
+#endif
+
+#ifdef HAS_FLOAT_VARIANT
+  CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
+
+  /* Extra FP tests with special values (NaN, ....) */
+  VDUP(vector, q, float, f, 32, 4, 1.0f);
+  VDUP(vector2, q, float, f, 32, 4, NAN);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_nan, " FP special (NaN)");
+
+  VDUP(vector, q, float, f, 32, 4, -NAN);
+  VDUP(vector2, q, float, f, 32, 4, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_mnan, " FP special (-NaN)");
+
+  VDUP(vector, q, float, f, 32, 4, 1.0f);
+  VDUP(vector2, q, float, f, 32, 4, HUGE_VALF);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_inf, " FP special (inf)");
+
+  VDUP(vector, q, float, f, 32, 4, -HUGE_VALF);
+  VDUP(vector2, q, float, f, 32, 4, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_minf, " FP special (-inf)");
+
+  VDUP(vector, q, float, f, 32, 4, 0.0f);
+  VDUP(vector2, q, float, f, 32, 4, -0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_zero1, " FP special (-0.0)");
+
+  VDUP(vector, q, float, f, 32, 4, -0.0f);
+  VDUP(vector2, q, float, f, 32, 4, 0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_zero2, " FP special (-0.0)");
+#endif
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+  return 0;
+}
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_no64.inc
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_no64.inc
@@ -28,6 +28,10 @@ void FNNAME (INSN_NAME) (void)
 
   /* Initialize input "vector" from "buffer".  */
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#ifdef HAS_FLOAT16_VARIANT
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
 #ifdef HAS_FLOAT_VARIANT
   VLOAD(vector, buffer, , float, f, 32, 2);
   VLOAD(vector, buffer, q, float, f, 32, 4);
@@ -46,15 +50,27 @@ void FNNAME (INSN_NAME) (void)
   VDUP(vector2, q, uint, u, 8, 16, 0xf9);
   VDUP(vector2, q, uint, u, 16, 8, 0xfff2);
   VDUP(vector2, q, uint, u, 32, 4, 0xfffffff1);
+#ifdef HAS_FLOAT16_VARIANT
+  VDUP(vector2, , float, f, 16, 4, -15.5f);
+  VDUP(vector2, q, float, f, 16, 8, -14.5f);
+#endif
 #ifdef HAS_FLOAT_VARIANT
   VDUP(vector2, , float, f, 32, 2, -15.5f);
   VDUP(vector2, q, float, f, 32, 4, -14.5f);
 #endif
 
+#ifdef HAS_FLOAT16_VARIANT
+#define FLOAT16_VARIANT(MACRO, VAR) \
+  MACRO(VAR, , float, f, 16, 4); \
+  MACRO(VAR, q, float, f, 16, 8);
+#else
+#define FLOAT16_VARIANT(MACRO, VAR)
+#endif
+
 #ifdef HAS_FLOAT_VARIANT
 #define FLOAT_VARIANT(MACRO, VAR) \
   MACRO(VAR, , float, f, 32, 2); \
-  MACRO(VAR, q, float, f, 32, 4)
+  MACRO(VAR, q, float, f, 32, 4);
 #else
 #define FLOAT_VARIANT(MACRO, VAR)
 #endif
@@ -72,7 +88,8 @@ void FNNAME (INSN_NAME) (void)
   MACRO(VAR, q, uint, u, 8, 16); \
   MACRO(VAR, q, uint, u, 16, 8); \
   MACRO(VAR, q, uint, u, 32, 4); \
-  FLOAT_VARIANT(MACRO, VAR)
+  FLOAT_VARIANT(MACRO, VAR); \
+  FLOAT16_VARIANT(MACRO, VAR);
 
   /* Apply a binary operator named INSN_NAME.  */
   TEST_MACRO_NO64BIT_VARIANT_1_5(TEST_BINARY_OP, INSN_NAME);
@@ -90,6 +107,42 @@ void FNNAME (INSN_NAME) (void)
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
 
+#ifdef HAS_FLOAT16_VARIANT
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
+
+  /* Extra FP tests with special values (NaN, ....) */
+  VDUP(vector, q, float, f, 16, 8, 1.0f);
+  VDUP(vector2, q, float, f, 16, 8, NAN);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan, " FP special (NaN)");
+
+  VDUP(vector, q, float, f, 16, 8, -NAN);
+  VDUP(vector2, q, float, f, 16, 8, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_mnan, " FP special (-NaN)");
+
+  VDUP(vector, q, float, f, 16, 8, 1.0f);
+  VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_inf, " FP special (inf)");
+
+  VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
+  VDUP(vector2, q, float, f, 16, 8, 1.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_minf, " FP special (-inf)");
+
+  VDUP(vector, q, float, f, 16, 8, 0.0f);
+  VDUP(vector2, q, float, f, 16, 8, -0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero1, " FP special (-0.0)");
+
+  VDUP(vector, q, float, f, 16, 8, -0.0f);
+  VDUP(vector2, q, float, f, 16, 8, 0.0f);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero2, " FP special (-0.0)");
+#endif
+
 #ifdef HAS_FLOAT_VARIANT
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_scalar_op.inc
@@ -0,0 +1,160 @@
+/* Template file for binary scalar operator validation.
+
+   This file is meant to be included by test files for binary scalar
+   operations.  */
+
+/* Check for required settings.  */
+
+#ifndef INSN_NAME
+#error INSN_NAME (the intrinsic to test) must be defined.
+#endif
+
+#ifndef INPUT_TYPE
+#error INPUT_TYPE (basic type of an input value) must be defined.
+#endif
+
+#ifndef OUTPUT_TYPE
+#error OUTPUT_TYPE (basic type of an output value) must be defined.
+#endif
+
+#ifndef OUTPUT_TYPE_SIZE
+#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined.
+#endif
+
+/* Optional settings:
+
+   INPUT_1: Input values for the first parameter.  Must be of type INPUT_TYPE.
+   INPUT_2: Input values for the second parameter.  Must be of type
+   INPUT_TYPE.  */
+
+#ifndef TEST_MSG
+#define TEST_MSG "unnamed test"
+#endif
+
+/* The test framework.  */
+
+#include <stdio.h>
+
+extern void abort ();
+
+#define INFF __builtin_inf ()
+
+/* Stringify a macro.  */
+#define STR0(A) #A
+#define STR(A) STR0 (A)
+
+/* Macro concatenation.  */
+#define CAT0(A, B) A##B
+#define CAT(A, B) CAT0 (A, B)
+
+/* Format strings for error reporting.  */
+#define FMT16 "0x%04x"
+#define FMT32 "0x%08x"
+#define FMT CAT (FMT,OUTPUT_TYPE_SIZE)
+
+/* Type construction: forms TS_t, where T is the base type and S the size in
+   bits.  */
+#define MK_TYPE0(T, S) T##S##_t
+#define MK_TYPE(T, S) MK_TYPE0 (T, S)
+
+/* Convenience types for input and output data.  */
+typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type;
+
+/* Conversion between typed values and their hexadecimal representation.  */
+typedef union
+{
+  OUTPUT_TYPE value;
+  output_hex_type hex;
+} output_conv_type;
+
+/* Default input values.  */
+
+float16_t input_1_float16_t[] =
+{
+  0.0, -0.0,
+  2.0, 3.1,
+  20.0, 0.40,
+  -2.3, 1.33,
+  -7.6, 0.31,
+  0.3353, 0.5,
+  1.0, 13.13,
+  -6.3, 20.0,
+  (float16_t)INFF, (float16_t)-INFF,
+};
+
+float16_t input_2_float16_t[] =
+{
+  1.0, 1.0,
+  -4.33, 100.0,
+  30.0, -0.02,
+  0.5, -7.231,
+  -6.3, 20.0,
+  -7.231, 2.3,
+  -7.6, 5.1,
+  0.31, 0.33353,
+  (float16_t)-INFF, (float16_t)INFF,
+};
+
+#ifndef INPUT_1
+#define INPUT_1 CAT (input_1_,INPUT_TYPE)
+#endif
+
+#ifndef INPUT_2
+#define INPUT_2 CAT (input_2_,INPUT_TYPE)
+#endif
+
+/* Support macros and routines for the test function.  */
+
+#define CHECK() \
+  { \
+    output_conv_type actual; \
+    output_conv_type expect; \
+ \
+    expect.hex = ((output_hex_type*)EXPECTED)[index]; \
+    actual.value = INSN_NAME ((INPUT_1)[index], \
+                              (INPUT_2)[index]); \
+ \
+    if (actual.hex != expect.hex) \
+      { \
+       fprintf (stderr, \
+                "ERROR in %s (%s line %d), buffer %s, " \
+                "index %d: got " \
+                FMT " != " FMT "\n", \
+                TEST_MSG, __FILE__, __LINE__, \
+                STR (EXPECTED), index, \
+                actual.hex, expect.hex); \
+       abort (); \
+      } \
+    fprintf (stderr, "CHECKED %s %s\n", \
+             STR (EXPECTED), TEST_MSG); \
+  }
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1 (NAME)
+
+/* The test function.  */
+
+void
+FNNAME (INSN_NAME) (void)
+{
+  /* Basic test: y[i] = OP (x[i]), for each INPUT[i], then compare the result
+     against EXPECTED[i].  */
+
+  const int num_tests = sizeof (INPUT_1) / sizeof (INPUT_1[0]);
+  int index;
+
+  for (index = 0; index < num_tests; index++)
+    CHECK ();
+
+#ifdef EXTRA_TESTS
+  EXTRA_TESTS ();
+#endif
+}
+
+int
+main (void)
+{
+  FNNAME (INSN_NAME) ();
+
+  return 0;
+}
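binary_scalar_op.inc validates results bit-for-bit through a union rather
than with ==, so signed zeros and NaN payloads are compared exactly. A
self-contained sketch of the idiom (editorial illustration only, using
plain float instead of the template's macro-built types):

    #include <stdint.h>
    typedef union { float value; uint32_t hex; } conv32;
    int bits_equal (float a, float b)
    {
      conv32 x, y;
      x.value = a;
      y.value = b;
      return x.hex == y.hex;   /* distinguishes 0.0f from -0.0f */
    }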
*/ TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2); @@ -56,15 +76,36 @@ void FNNAME (INSN_NAME) (void) TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VCOMP(INSN_NAME, , float, f, uint, 16, 4); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); + + TEST_VCOMP(INSN_NAME, q, float, f, uint, 16, 8); + CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); +#endif + /* Test again, with different input values. */ VDUP(vector2, , float, f, 32, 2, -10.0f); VDUP(vector2, q, float, f, 32, 4, 10.0f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, -10.0f); + VDUP(vector2, q, float, f, 16, 8, 10.0f); +#endif + TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected2, ""); TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected2,""); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VCOMP(INSN_NAME, , float, f, uint, 16, 4); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected2, ""); + + TEST_VCOMP(INSN_NAME, q, float, f, uint, 16, 8); + CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected2,""); +#endif } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_op.inc +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_op.inc @@ -11,6 +11,17 @@ extern ARRAY(expected_uint, uint, 32, 2); extern ARRAY(expected_q_uint, uint, 8, 16); extern ARRAY(expected_q_uint, uint, 16, 8); extern ARRAY(expected_q_uint, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +extern ARRAY(expected_float, uint, 16, 4); +extern ARRAY(expected_q_float, uint, 16, 8); +extern ARRAY(expected_nan, uint, 16, 4); +extern ARRAY(expected_mnan, uint, 16, 4); +extern ARRAY(expected_nan2, uint, 16, 4); +extern ARRAY(expected_inf, uint, 16, 4); +extern ARRAY(expected_minf, uint, 16, 4); +extern ARRAY(expected_inf2, uint, 16, 4); +extern ARRAY(expected_mzero, uint, 16, 4); +#endif extern ARRAY(expected_float, uint, 32, 2); extern ARRAY(expected_q_float, uint, 32, 4); extern ARRAY(expected_uint2, uint, 32, 2); @@ -48,6 +59,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector, uint, 8, 8); DECL_VARIABLE(vector, uint, 16, 4); DECL_VARIABLE(vector, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE (vector, float, 16, 4); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, int, 8, 16); DECL_VARIABLE(vector, int, 16, 8); @@ -55,6 +69,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector, uint, 8, 16); DECL_VARIABLE(vector, uint, 16, 8); DECL_VARIABLE(vector, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE (vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 4); DECL_VARIABLE(vector2, int, 8, 8); @@ -63,6 +80,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector2, uint, 8, 8); DECL_VARIABLE(vector2, uint, 16, 4); DECL_VARIABLE(vector2, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE (vector2, float, 16, 4); +#endif DECL_VARIABLE(vector2, float, 32, 2); DECL_VARIABLE(vector2, int, 8, 16); DECL_VARIABLE(vector2, int, 16, 8); @@ -70,6 +90,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector2, uint, 8, 16); DECL_VARIABLE(vector2, uint, 16, 8); DECL_VARIABLE(vector2, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE (vector2, float, 16, 8); +#endif DECL_VARIABLE(vector2, float, 32, 4); 
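/* For reference: DECL_VARIABLE from arm-neon-ref.h mangles its
   arguments into both the vector type and the variable name, e.g.
   DECL_VARIABLE (vector2, float, 16, 8) declares

     float16x8_t vector2_float16x8;

   which is what lets VLOAD, VDUP and TEST_VCOMP below compose
   identically for every variant.  */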
DECL_VARIABLE(vector_res, uint, 8, 8); @@ -88,6 +111,9 @@ void FNNAME (INSN_NAME) (void) VLOAD(vector, buffer, , uint, u, 8, 8); VLOAD(vector, buffer, , uint, u, 16, 4); VLOAD(vector, buffer, , uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD (vector, buffer, , float, f, 16, 4); +#endif VLOAD(vector, buffer, , float, f, 32, 2); VLOAD(vector, buffer, q, int, s, 8, 16); @@ -96,6 +122,9 @@ void FNNAME (INSN_NAME) (void) VLOAD(vector, buffer, q, uint, u, 8, 16); VLOAD(vector, buffer, q, uint, u, 16, 8); VLOAD(vector, buffer, q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD (vector, buffer, q, float, f, 16, 8); +#endif VLOAD(vector, buffer, q, float, f, 32, 4); /* Choose init value arbitrarily, will be used for vector @@ -106,6 +135,9 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, , uint, u, 8, 8, 0xF3); VDUP(vector2, , uint, u, 16, 4, 0xFFF2); VDUP(vector2, , uint, u, 32, 2, 0xFFFFFFF1); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP (vector2, , float, f, 16, 4, -15.0f); +#endif VDUP(vector2, , float, f, 32, 2, -15.0f); VDUP(vector2, q, int, s, 8, 16, -4); @@ -114,6 +146,9 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, q, uint, u, 8, 16, 0xF4); VDUP(vector2, q, uint, u, 16, 8, 0xFFF6); VDUP(vector2, q, uint, u, 32, 4, 0xFFFFFFF2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP (vector2, q, float, f, 16, 8, -14.0f); +#endif VDUP(vector2, q, float, f, 32, 4, -14.0f); /* The comparison operators produce only unsigned results, which @@ -154,9 +189,17 @@ void FNNAME (INSN_NAME) (void) CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_q_uint, ""); /* The float variants. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_float, ""); +#endif TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_float, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VCOMP (INSN_NAME, q, float, f, uint, 16, 8); + CHECK (TEST_MSG, uint, 16, 8, PRIx16, expected_q_float, ""); +#endif TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_q_float, ""); @@ -176,6 +219,43 @@ void FNNAME (INSN_NAME) (void) /* Extra FP tests with special values (NaN, ....). 
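   IEEE 754 ordered comparisons are false whenever either operand is a
   NaN, which is why the expected_nan, expected_mnan and expected_nan2
   masks below are all-zero, while comparisons against +/-Inf behave
   like ordinary ordered compares.  A scalar model of one lane, with
   vclt chosen as an arbitrary example operator:

     uint16_t lane_clt (float16_t a, float16_t b)
     {
       return (a < b) ? 0xffff : 0x0000;   /* False for NaN operands.  */
     }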
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP (vector, , float, f, 16, 4, 1.0); + VDUP (vector2, , float, f, 16, 4, NAN); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan, "FP special (NaN)"); + + VDUP (vector, , float, f, 16, 4, 1.0); + VDUP (vector2, , float, f, 16, 4, -NAN); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mnan, " FP special (-NaN)"); + + VDUP (vector, , float, f, 16, 4, NAN); + VDUP (vector2, , float, f, 16, 4, 1.0); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan2, " FP special (NaN)"); + + VDUP (vector, , float, f, 16, 4, 1.0); + VDUP (vector2, , float, f, 16, 4, HUGE_VALF); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf, " FP special (inf)"); + + VDUP (vector, , float, f, 16, 4, 1.0); + VDUP (vector2, , float, f, 16, 4, -HUGE_VALF); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_minf, " FP special (-inf)"); + + VDUP (vector, , float, f, 16, 4, HUGE_VALF); + VDUP (vector2, , float, f, 16, 4, 1.0); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf2, " FP special (inf)"); + + VDUP (vector, , float, f, 16, 4, -0.0); + VDUP (vector2, , float, f, 16, 4, 0.0); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mzero, " FP special (-0.0)"); +#endif + VDUP(vector, , float, f, 32, 2, 1.0); VDUP(vector2, , float, f, 32, 2, NAN); TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2); --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_zero_op.inc @@ -0,0 +1,111 @@ +/* Template file for the validation of compare against zero operators. + + This file is based on cmp_op.inc. It is meant to be included by the relevant + test files, which have to define the intrinsic family to test. If a given + intrinsic supports variants which are not supported by all the other + operators, these can be tested by providing a definition for EXTRA_TESTS. */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" +#include <math.h> + +/* Additional expected results declaration, they are initialized in + each test file. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +extern ARRAY(expected_float, uint, 16, 4); +extern ARRAY(expected_q_float, uint, 16, 8); +extern ARRAY(expected_uint2, uint, 16, 4); +extern ARRAY(expected_uint3, uint, 16, 4); +extern ARRAY(expected_uint4, uint, 16, 4); +extern ARRAY(expected_nan, uint, 16, 4); +extern ARRAY(expected_mnan, uint, 16, 4); +extern ARRAY(expected_inf, uint, 16, 4); +extern ARRAY(expected_minf, uint, 16, 4); +extern ARRAY(expected_zero, uint, 16, 4); +extern ARRAY(expected_mzero, uint, 16, 4); +#endif + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1(NAME) + +void FNNAME (INSN_NAME) (void) +{ + /* Basic test: y=vcomp(x), then store the result. */ +#define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) \ + VECT_VAR(vector_res, T3, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N)); \ + vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vector_res, T3, W, N)) + +#define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N) \ + TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) + + /* No need for 64 bits elements.
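   (the compare-against-zero intrinsics take a single input vector).
   One lane behaves like this sketch, with vceqz as an arbitrary
   example operator:

     uint16_t lane_ceqz (float16_t a)
     {
       return (a == 0.0f) ? 0xffff : 0x0000;   /* -0.0 compares equal.  */
     }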
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE (vector, float, 16, 4); + DECL_VARIABLE (vector, float, 16, 8); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, uint, 16, 4); + DECL_VARIABLE(vector_res, uint, 16, 8); +#endif + + clean_results (); + + /* Choose init value arbitrarily, will be used for vector + comparison. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP (vector, , float, f, 16, 4, -15.0f); + VDUP (vector, q, float, f, 16, 8, 14.0f); +#endif + + /* Float variants. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + TEST_VCOMP (INSN_NAME, q, float, f, uint, 16, 8); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_float, ""); + CHECK (TEST_MSG, uint, 16, 8, PRIx16, expected_q_float, ""); +#endif + + /* Extra FP tests with special values (NaN, ....). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP (vector, , float, f, 16, 4, NAN); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan, "FP special (NaN)"); + + VDUP (vector, , float, f, 16, 4, -NAN); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mnan, " FP special (-NaN)"); + + VDUP (vector, , float, f, 16, 4, HUGE_VALF); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf, " FP special (inf)"); + + VDUP (vector, , float, f, 16, 4, -HUGE_VALF); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_minf, " FP special (-inf)"); + + VDUP (vector, , float, f, 16, 4, 0.0); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_zero, " FP special (0.0)"); + + VDUP (vector, , float, f, 16, 4, -0.0); + TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4); + CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mzero, " FP special (-0.0)"); +#endif + +#ifdef EXTRA_TESTS + EXTRA_TESTS(); +#endif +} + +int main (void) +{ + FNNAME (INSN_NAME) (); + + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h @@ -118,6 +118,10 @@ VECT_VAR_DECL_INIT(buffer, uint, 32, 2); PAD(buffer_pad, uint, 32, 2); VECT_VAR_DECL_INIT(buffer, uint, 64, 1); PAD(buffer_pad, uint, 64, 1); +#if defined (__ARM_FEATURE_CRYPTO) +VECT_VAR_DECL_INIT(buffer, poly, 64, 1); +PAD(buffer_pad, poly, 64, 1); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer, float, 16, 4); PAD(buffer_pad, float, 16, 4); @@ -144,6 +148,10 @@ VECT_VAR_DECL_INIT(buffer, poly, 8, 16); PAD(buffer_pad, poly, 8, 16); VECT_VAR_DECL_INIT(buffer, poly, 16, 8); PAD(buffer_pad, poly, 16, 8); +#if defined (__ARM_FEATURE_CRYPTO) +VECT_VAR_DECL_INIT(buffer, poly, 64, 2); +PAD(buffer_pad, poly, 64, 2); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer, float, 16, 8); PAD(buffer_pad, float, 16, 8); @@ -178,6 +186,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8); VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8); VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4); VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4); +#if defined (__ARM_FEATURE_CRYPTO) +VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 1); +VECT_VAR_DECL(buffer_dup_pad, poly, 64, 1); +#endif #if
defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4); VECT_VAR_DECL(buffer_dup_pad, float, 16, 4); @@ -205,6 +217,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16); VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16); VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8); VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8); +#if defined (__ARM_FEATURE_CRYPTO) +VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 2); +VECT_VAR_DECL(buffer_dup_pad, poly, 64, 2); +#endif #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8); VECT_VAR_DECL(buffer_dup_pad, float, 16, 8); --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c @@ -0,0 +1,1024 @@ +/* This file contains tests for all the *p64 intrinsics, except for + vreinterpret which have their own testcase. */ + +/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results: vbsl. */ +VECT_VAR_DECL(vbsl_expected,poly,64,1) [] = { 0xfffffff1 }; +VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1, + 0xfffffff1 }; + +/* Expected results: vceq. */ +VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 }; + +/* Expected results: vcombine. */ +VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 }; + +/* Expected results: vcreate. */ +VECT_VAR_DECL(vcreate_expected,poly,64,1) [] = { 0x123456789abcdef0 }; + +/* Expected results: vdup_lane. */ +VECT_VAR_DECL(vdup_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vdup_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; + +/* Expected results: vdup_n. */ +VECT_VAR_DECL(vdup_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vdup_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(vdup_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vdup_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff2 }; + +/* Expected results: vmov_n. */ +VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff2 }; + +/* Expected results: vext. */ +VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 }; + +/* Expected results: vget_low. */ +VECT_VAR_DECL(vget_low_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; + +/* Expected results: vget_high. */ +VECT_VAR_DECL(vget_high_expected,poly,64,1) [] = { 0xfffffffffffffff1 }; + +/* Expected results: vld1. */ +VECT_VAR_DECL(vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; + +/* Expected results: vld1_dup.
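   vld1_dup loads one element and replicates it into every lane, so
   iteration i of the corresponding loop below expects buffer_dup[i]
   in all lanes; roughly (name mangling as in arm-neon-ref.h):

     poly64x2_t v = vld1q_dup_p64 (&buffer_dup_poly64x1[i]);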
*/ +VECT_VAR_DECL(vld1_dup_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld1_dup_expected0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld1_dup_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld1_dup_expected1,poly,64,2) [] = { 0xfffffffffffffff1, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld1_dup_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vld1_dup_expected2,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff2 }; + +/* Expected results: vld1_lane. */ +VECT_VAR_DECL(vld1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, + 0xaaaaaaaaaaaaaaaa }; + +/* Expected results: vldX. */ +VECT_VAR_DECL(vld2_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld2_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld3_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld3_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld3_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vld4_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld4_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld4_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vld4_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 }; + +/* Expected results: vldX_dup. */ +VECT_VAR_DECL(vld2_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld2_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld3_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld3_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld3_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vld4_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vld4_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vld4_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vld4_dup_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 }; + +/* Expected results: vsli. */ +VECT_VAR_DECL(vsli_expected,poly,64,1) [] = { 0x10 }; +VECT_VAR_DECL(vsli_expected,poly,64,2) [] = { 0x7ffffffffffff0, + 0x7ffffffffffff1 }; +VECT_VAR_DECL(vsli_expected_max_shift,poly,64,1) [] = { 0x7ffffffffffffff0 }; +VECT_VAR_DECL(vsli_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; + +/* Expected results: vsri. */ +VECT_VAR_DECL(vsri_expected,poly,64,1) [] = { 0xe000000000000000 }; +VECT_VAR_DECL(vsri_expected,poly,64,2) [] = { 0xfffffffffffff800, + 0xfffffffffffff800 }; +VECT_VAR_DECL(vsri_expected_max_shift,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vsri_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; + +/* Expected results: vst1_lane. */ +VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, + 0x3333333333333333 }; + +/* Expected results: vldX_lane. 
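   Lanes that the lane-wise loads do not touch keep the 0xaa fill
   pattern that the tests memset into the source vectors, which is why
   these tables mix 0xfffffffffffffffN entries (lanes actually loaded)
   with 0xaaaaaaaaaaaaaaaa (untouched lanes).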
*/ +VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st2_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st2_1,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa }; +VECT_VAR_DECL(expected_vld_st3_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected_vld_st3_0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st3_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st3_1,poly,64,2) [] = { 0xfffffffffffffff2, + 0xaaaaaaaaaaaaaaaa }; +VECT_VAR_DECL(expected_vld_st3_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(expected_vld_st3_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa }; +VECT_VAR_DECL(expected_vld_st4_0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected_vld_st4_0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st4_1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected_vld_st4_1,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff3 }; +VECT_VAR_DECL(expected_vld_st4_2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(expected_vld_st4_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa }; +VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 }; +VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa }; + +/* Expected results: vget_lane. */ +VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0; +VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0; + +int main (void) +{ + int i; + + /* vbsl_p64 tests. */ +#define TEST_MSG "VBSL/VBSLQ" + +#define TEST_VBSL(T3, Q, T1, T2, W, N) \ + VECT_VAR(vbsl_vector_res, T1, W, N) = \ + vbsl##Q##_##T2##W(VECT_VAR(vbsl_vector_first, T3, W, N), \ + VECT_VAR(vbsl_vector, T1, W, N), \ + VECT_VAR(vbsl_vector2, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vbsl_vector_res, T1, W, N)) + + DECL_VARIABLE(vbsl_vector, poly, 64, 1); + DECL_VARIABLE(vbsl_vector, poly, 64, 2); + DECL_VARIABLE(vbsl_vector2, poly, 64, 1); + DECL_VARIABLE(vbsl_vector2, poly, 64, 2); + DECL_VARIABLE(vbsl_vector_res, poly, 64, 1); + DECL_VARIABLE(vbsl_vector_res, poly, 64, 2); + + DECL_VARIABLE(vbsl_vector_first, uint, 64, 1); + DECL_VARIABLE(vbsl_vector_first, uint, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vbsl_vector, buffer, , poly, p, 64, 1); + VLOAD(vbsl_vector, buffer, q, poly, p, 64, 2); + + VDUP(vbsl_vector2, , poly, p, 64, 1, 0xFFFFFFF3); + VDUP(vbsl_vector2, q, poly, p, 64, 2, 0xFFFFFFF3); + + VDUP(vbsl_vector_first, , uint, u, 64, 1, 0xFFFFFFF2); + VDUP(vbsl_vector_first, q, uint, u, 64, 2, 0xFFFFFFF2); + + TEST_VBSL(uint, , poly, p, 64, 1); + TEST_VBSL(uint, q, poly, p, 64, 2); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, ""); + + /* vceq_p64 tests. 
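   vceq_p64 returns an all-ones lane mask on bit-for-bit equality and
   all-zeroes otherwise; here the loaded lane 0xfffffffffffffff0 is
   compared against a vector vdup'ed to 0x88, hence vceq_expected
   above is 0.  In intrinsic terms:

     uint64x1_t mask = vceq_p64 (vceq_vector_poly64x1,
                                 vceq_vector2_poly64x1);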
*/ +#undef TEST_MSG +#define TEST_MSG "VCEQ" + +#define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) \ + VECT_VAR(vceq_vector_res, T3, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vceq_vector, T1, W, N), \ + VECT_VAR(vceq_vector2, T1, W, N)); \ + vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceq_vector_res, T3, W, N)) + +#define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N) \ + TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) + + DECL_VARIABLE(vceq_vector, poly, 64, 1); + DECL_VARIABLE(vceq_vector2, poly, 64, 1); + DECL_VARIABLE(vceq_vector_res, uint, 64, 1); + + CLEAN(result, uint, 64, 1); + + VLOAD(vceq_vector, buffer, , poly, p, 64, 1); + + VDUP(vceq_vector2, , poly, p, 64, 1, 0x88); + + TEST_VCOMP(vceq, , poly, p, uint, 64, 1); + + CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, ""); + + /* vcombine_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VCOMBINE" + +#define TEST_VCOMBINE(T1, T2, W, N, N2) \ + VECT_VAR(vcombine_vector128, T1, W, N2) = \ + vcombine_##T2##W(VECT_VAR(vcombine_vector64_a, T1, W, N), \ + VECT_VAR(vcombine_vector64_b, T1, W, N)); \ + vst1q_##T2##W(VECT_VAR(result, T1, W, N2), VECT_VAR(vcombine_vector128, T1, W, N2)) + + DECL_VARIABLE(vcombine_vector64_a, poly, 64, 1); + DECL_VARIABLE(vcombine_vector64_b, poly, 64, 1); + DECL_VARIABLE(vcombine_vector128, poly, 64, 2); + + CLEAN(result, poly, 64, 2); + + VLOAD(vcombine_vector64_a, buffer, , poly, p, 64, 1); + + VDUP(vcombine_vector64_b, , poly, p, 64, 1, 0x88); + + TEST_VCOMBINE(poly, p, 64, 1, 2); + + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vcombine_expected, ""); + + /* vcreate_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VCREATE" + +#define TEST_VCREATE(T1, T2, W, N) \ + VECT_VAR(vcreate_vector_res, T1, W, N) = \ + vcreate_##T2##W(VECT_VAR(vcreate_val, T1, W, N)); \ + vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vcreate_vector_res, T1, W, N)) + +#define DECL_VAL(VAR, T1, W, N) \ + uint64_t VECT_VAR(VAR, T1, W, N) + + DECL_VAL(vcreate_val, poly, 64, 1); + DECL_VARIABLE(vcreate_vector_res, poly, 64, 1); + + CLEAN(result, poly, 64, 2); + + VECT_VAR(vcreate_val, poly, 64, 1) = 0x123456789abcdef0ULL; + + TEST_VCREATE(poly, p, 64, 1); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, ""); + + /* vdup_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VDUP_LANE/VDUP_LANEQ" + +#define TEST_VDUP_LANE(Q, T1, T2, W, N, N2, L) \ + VECT_VAR(vdup_lane_vector_res, T1, W, N) = \ + vdup##Q##_lane_##T2##W(VECT_VAR(vdup_lane_vector, T1, W, N2), L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_lane_vector_res, T1, W, N)) + + DECL_VARIABLE(vdup_lane_vector, poly, 64, 1); + DECL_VARIABLE(vdup_lane_vector, poly, 64, 2); + DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 1); + DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vdup_lane_vector, buffer, , poly, p, 64, 1); + + TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0); + TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, ""); + + /* vdup_n_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VDUP/VDUPQ" + +#define TEST_VDUP(Q, T1, T2, W, N) \ + VECT_VAR(vdup_n_vector, T1, W, N) = \ + vdup##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_n_vector, T1, W, N)) + + DECL_VARIABLE(vdup_n_vector, poly, 64, 1); + DECL_VARIABLE(vdup_n_vector, poly, 64, 2); + + /* Try to read different places from the input buffer. 
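   (i = 0, 1, 2): each pass duplicates buffer_dup[i] into every lane
   and checks the result against the matching vdup_n_expected<i>
   table above.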
*/ + for (i=0; i< 3; i++) { + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VDUP(, poly, p, 64, 1); + TEST_VDUP(q, poly, p, 64, 2); + + switch (i) { + case 0: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, ""); + break; + case 1: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, ""); + break; + case 2: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, ""); + break; + default: + abort(); + } + } + + /* vext_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VEXT/VEXTQ" + +#define TEST_VEXT(Q, T1, T2, W, N, V) \ + VECT_VAR(vext_vector_res, T1, W, N) = \ + vext##Q##_##T2##W(VECT_VAR(vext_vector1, T1, W, N), \ + VECT_VAR(vext_vector2, T1, W, N), \ + V); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vext_vector_res, T1, W, N)) + + DECL_VARIABLE(vext_vector1, poly, 64, 1); + DECL_VARIABLE(vext_vector1, poly, 64, 2); + DECL_VARIABLE(vext_vector2, poly, 64, 1); + DECL_VARIABLE(vext_vector2, poly, 64, 2); + DECL_VARIABLE(vext_vector_res, poly, 64, 1); + DECL_VARIABLE(vext_vector_res, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vext_vector1, buffer, , poly, p, 64, 1); + VLOAD(vext_vector1, buffer, q, poly, p, 64, 2); + + VDUP(vext_vector2, , poly, p, 64, 1, 0x88); + VDUP(vext_vector2, q, poly, p, 64, 2, 0x88); + + TEST_VEXT(, poly, p, 64, 1, 0); + TEST_VEXT(q, poly, p, 64, 2, 1); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, ""); + + /* vget_low_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VGET_LOW" + +#define TEST_VGET_LOW(T1, T2, W, N, N2) \ + VECT_VAR(vget_low_vector64, T1, W, N) = \ + vget_low_##T2##W(VECT_VAR(vget_low_vector128, T1, W, N2)); \ + vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_low_vector64, T1, W, N)) + + DECL_VARIABLE(vget_low_vector64, poly, 64, 1); + DECL_VARIABLE(vget_low_vector128, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + + VLOAD(vget_low_vector128, buffer, q, poly, p, 64, 2); + + TEST_VGET_LOW(poly, p, 64, 1, 2); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, ""); + + /* vget_high_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VGET_HIGH" + +#define TEST_VGET_HIGH(T1, T2, W, N, N2) \ + VECT_VAR(vget_high_vector64, T1, W, N) = \ + vget_high_##T2##W(VECT_VAR(vget_high_vector128, T1, W, N2)); \ + vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_high_vector64, T1, W, N)) + + DECL_VARIABLE(vget_high_vector64, poly, 64, 1); + DECL_VARIABLE(vget_high_vector128, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + + VLOAD(vget_high_vector128, buffer, q, poly, p, 64, 2); + + TEST_VGET_HIGH(poly, p, 64, 1, 2); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, ""); + + /* vld1_p64 tests.
*/ +#undef TEST_MSG +#define TEST_MSG "VLD1/VLD1Q" + +#define TEST_VLD1(VAR, BUF, Q, T1, T2, W, N) \ + VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N)) + + DECL_VARIABLE(vld1_vector, poly, 64, 1); + DECL_VARIABLE(vld1_vector, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vld1_vector, buffer, , poly, p, 64, 1); + VLOAD(vld1_vector, buffer, q, poly, p, 64, 2); + + TEST_VLD1(vld1_vector, buffer, , poly, p, 64, 1); + TEST_VLD1(vld1_vector, buffer, q, poly, p, 64, 2); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, ""); + + /* vld1_dup_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VLD1_DUP/VLD1_DUPQ" + +#define TEST_VLD1_DUP(VAR, BUF, Q, T1, T2, W, N) \ + VECT_VAR(VAR, T1, W, N) = \ + vld1##Q##_dup_##T2##W(&VECT_VAR(BUF, T1, W, N)[i]); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N)) + + DECL_VARIABLE(vld1_dup_vector, poly, 64, 1); + DECL_VARIABLE(vld1_dup_vector, poly, 64, 2); + + /* Try to read different places from the input buffer. */ + for (i=0; i<3; i++) { + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, , poly, p, 64, 1); + TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, q, poly, p, 64, 2); + + switch (i) { + case 0: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, ""); + break; + case 1: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, ""); + break; + case 2: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, ""); + break; + default: + abort(); + } + } + + /* vld1_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VLD1_LANE/VLD1_LANEQ" + +#define TEST_VLD1_LANE(Q, T1, T2, W, N, L) \ + memset (VECT_VAR(vld1_lane_buffer_src, T1, W, N), 0xAA, W/8*N); \ + VECT_VAR(vld1_lane_vector_src, T1, W, N) = \ + vld1##Q##_##T2##W(VECT_VAR(vld1_lane_buffer_src, T1, W, N)); \ + VECT_VAR(vld1_lane_vector, T1, W, N) = \ + vld1##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N), \ + VECT_VAR(vld1_lane_vector_src, T1, W, N), L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vld1_lane_vector, T1, W, N)) + + DECL_VARIABLE(vld1_lane_vector, poly, 64, 1); + DECL_VARIABLE(vld1_lane_vector, poly, 64, 2); + DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 1); + DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 2); + + ARRAY(vld1_lane_buffer_src, poly, 64, 1); + ARRAY(vld1_lane_buffer_src, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VLD1_LANE(, poly, p, 64, 1, 0); + TEST_VLD1_LANE(q, poly, p, 64, 2, 0); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, ""); + + /* vldX_p64 tests. 
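   vldX returns X de-interleaved vectors while "result" only holds one
   vector's worth of data, so TEST_VLDX stores the whole set to
   vldX_result_bis_X and TEST_EXTRA_CHUNK copies chunk Y back into
   "result" so that each of the X vectors can be checked in turn.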
*/ +#define DECL_VLDX(T1, W, N, X) \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_vector, T1, W, N, X); \ + VECT_VAR_DECL(vldX_result_bis_##X, T1, W, N)[X * N] + +#define TEST_VLDX(Q, T1, T2, W, N, X) \ + VECT_ARRAY_VAR(vldX_vector, T1, W, N, X) = \ + /* Use dedicated init buffer, of size X */ \ + vld##X##Q##_##T2##W(VECT_ARRAY_VAR(buffer_vld##X, T1, W, N, X)); \ + vst##X##Q##_##T2##W(VECT_VAR(vldX_result_bis_##X, T1, W, N), \ + VECT_ARRAY_VAR(vldX_vector, T1, W, N, X)); \ + memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_result_bis_##X, T1, W, N), \ + sizeof(VECT_VAR(result, T1, W, N))); + + /* Overwrite "result" with the contents of "result_bis"[Y]. */ +#define TEST_EXTRA_CHUNK(T1, W, N, X,Y) \ + memcpy(VECT_VAR(result, T1, W, N), \ + &(VECT_VAR(vldX_result_bis_##X, T1, W, N)[Y*N]), \ + sizeof(VECT_VAR(result, T1, W, N))); + + DECL_VLDX(poly, 64, 1, 2); + DECL_VLDX(poly, 64, 1, 3); + DECL_VLDX(poly, 64, 1, 4); + + VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1); + PAD(buffer_vld2_pad, poly, 64, 1); + VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1); + PAD(buffer_vld3_pad, poly, 64, 1); + VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1); + PAD(buffer_vld4_pad, poly, 64, 1); + +#undef TEST_MSG +#define TEST_MSG "VLD2/VLD2Q" + CLEAN(result, poly, 64, 1); + TEST_VLDX(, poly, p, 64, 1, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 2, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1"); + +#undef TEST_MSG +#define TEST_MSG "VLD3/VLD3Q" + CLEAN(result, poly, 64, 1); + TEST_VLDX(, poly, p, 64, 1, 3); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 3, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 3, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2"); + +#undef TEST_MSG +#define TEST_MSG "VLD4/VLD4Q" + CLEAN(result, poly, 64, 1); + TEST_VLDX(, poly, p, 64, 1, 4); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 4, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 4, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2"); + CLEAN(result, poly, 64, 1); + TEST_EXTRA_CHUNK(poly, 64, 1, 4, 3); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3"); + + /* vldX_dup_p64 tests. */ +#define DECL_VLDX_DUP(T1, W, N, X) \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X); \ + VECT_VAR_DECL(vldX_dup_result_bis_##X, T1, W, N)[X * N] + +#define TEST_VLDX_DUP(Q, T1, T2, W, N, X) \ + VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X) = \ + vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]); \ + \ + vst##X##Q##_##T2##W(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \ + VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X)); \ + memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \ + sizeof(VECT_VAR(result, T1, W, N))); + + /* Overwrite "result" with the contents of "result_bis"[Y]. 
*/ +#define TEST_VLDX_DUP_EXTRA_CHUNK(T1, W, N, X,Y) \ + memcpy(VECT_VAR(result, T1, W, N), \ + &(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N)[Y*N]), \ + sizeof(VECT_VAR(result, T1, W, N))); + + DECL_VLDX_DUP(poly, 64, 1, 2); + DECL_VLDX_DUP(poly, 64, 1, 3); + DECL_VLDX_DUP(poly, 64, 1, 4); + + +#undef TEST_MSG +#define TEST_MSG "VLD2_DUP/VLD2Q_DUP" + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP(, poly, p, 64, 1, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 2, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1"); + +#undef TEST_MSG +#define TEST_MSG "VLD3_DUP/VLD3Q_DUP" + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP(, poly, p, 64, 1, 3); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2"); + +#undef TEST_MSG +#define TEST_MSG "VLD4_DUP/VLD4Q_DUP" + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP(, poly, p, 64, 1, 4); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 1); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 2); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2"); + CLEAN(result, poly, 64, 1); + TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 3); + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3"); + + /* vsli_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VSLI" + +#define TEST_VSXI1(INSN, Q, T1, T2, W, N, V) \ + VECT_VAR(vsXi_vector_res, T1, W, N) = \ + INSN##Q##_n_##T2##W(VECT_VAR(vsXi_vector, T1, W, N), \ + VECT_VAR(vsXi_vector2, T1, W, N), \ + V); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vsXi_vector_res, T1, W, N)) + +#define TEST_VSXI(INSN, Q, T1, T2, W, N, V) \ + TEST_VSXI1(INSN, Q, T1, T2, W, N, V) + + DECL_VARIABLE(vsXi_vector, poly, 64, 1); + DECL_VARIABLE(vsXi_vector, poly, 64, 2); + DECL_VARIABLE(vsXi_vector2, poly, 64, 1); + DECL_VARIABLE(vsXi_vector2, poly, 64, 2); + DECL_VARIABLE(vsXi_vector_res, poly, 64, 1); + DECL_VARIABLE(vsXi_vector_res, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vsXi_vector, buffer, , poly, p, 64, 1); + VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2); + + VDUP(vsXi_vector2, , poly, p, 64, 1, 2); + VDUP(vsXi_vector2, q, poly, p, 64, 2, 3); + + TEST_VSXI(vsli, , poly, p, 64, 1, 3); + TEST_VSXI(vsli, q, poly, p, 64, 2, 53); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, ""); + + /* Test cases with maximum shift amount. */ + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VSXI(vsli, , poly, p, 64, 1, 63); + TEST_VSXI(vsli, q, poly, p, 64, 2, 63); + +#define COMMENT "(max shift amount)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT); + + /* vsri_p64 tests. 
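   vsri shifts the second operand right by n and inserts it below the
   top n bits of the first operand, which are preserved.  A 64-bit
   scalar model (sketch, valid for n in 1..63; the architectural
   n == 64 case exercised below leaves the first operand unchanged):

     #include <stdint.h>

     uint64_t sri64 (uint64_t a, uint64_t b, unsigned n)
     {
       uint64_t keep = ~(UINT64_MAX >> n);   /* Top n bits of a.  */
       return (a & keep) | (b >> n);
     }

   For instance sri64 (0xfffffffffffffff0, 2, 3) == 0xe000000000000000,
   the first vsri_expected entry above.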
*/ +#undef TEST_MSG +#define TEST_MSG "VSRI" + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + VLOAD(vsXi_vector, buffer, , poly, p, 64, 1); + VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2); + + VDUP(vsXi_vector2, , poly, p, 64, 1, 2); + VDUP(vsXi_vector2, q, poly, p, 64, 2, 3); + + TEST_VSXI(vsri, , poly, p, 64, 1, 3); + TEST_VSXI(vsri, q, poly, p, 64, 2, 53); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, ""); + + /* Test cases with maximum shift amount. */ + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VSXI(vsri, , poly, p, 64, 1, 64); + TEST_VSXI(vsri, q, poly, p, 64, 2, 64); + +#define COMMENT "(max shift amount)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT); + + /* vst1_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VST1_LANE/VST1_LANEQ" + +#define TEST_VST1_LANE(Q, T1, T2, W, N, L) \ + VECT_VAR(vst1_lane_vector, T1, W, N) = \ + vld1##Q##_##T2##W(VECT_VAR(buffer, T1, W, N)); \ + vst1##Q##_lane_##T2##W(VECT_VAR(result, T1, W, N), \ + VECT_VAR(vst1_lane_vector, T1, W, N), L); + + DECL_VARIABLE(vst1_lane_vector, poly, 64, 1); + DECL_VARIABLE(vst1_lane_vector, poly, 64, 2); + + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VST1_LANE(, poly, p, 64, 1, 0); + TEST_VST1_LANE(q, poly, p, 64, 2, 0); + + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, ""); + +#ifdef __aarch64__ + + /* vmov_n_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VMOV/VMOVQ" + +#define TEST_VMOV(Q, T1, T2, W, N) \ + VECT_VAR(vmov_n_vector, T1, W, N) = \ + vmov##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vmov_n_vector, T1, W, N)) + + DECL_VARIABLE(vmov_n_vector, poly, 64, 1); + DECL_VARIABLE(vmov_n_vector, poly, 64, 2); + + /* Try to read different places from the input buffer. */ + for (i=0; i< 3; i++) { + CLEAN(result, poly, 64, 1); + CLEAN(result, poly, 64, 2); + + TEST_VMOV(, poly, p, 64, 1); + TEST_VMOV(q, poly, p, 64, 2); + + switch (i) { + case 0: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, ""); + break; + case 1: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, ""); + break; + case 2: + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, ""); + CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, ""); + break; + default: + abort(); + } + } + + /* vget_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VGET_LANE/VGETQ_LANE" + +#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \ + VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \ + if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \ + fprintf(stderr, \ + "ERROR in %s (%s line %d in result '%s') at type %s " \ + "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR(VECT_VAR(vget_lane_expected, T1, W, N)), \ + STR(VECT_NAME(T1, W, N)), \ + (uint##W##_t)VECT_VAR(vget_lane_vector, T1, W, N), \ + (uint##W##_t)VECT_VAR(vget_lane_expected, T1, W, N)); \ + abort (); \ + } + + /* Initialize input values. 
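   for the vget_lane checks: vget_lane extracts a single scalar lane,
   so TEST_VGET_LANE compares the returned value directly instead of
   storing it through vst1.  In intrinsic terms:

     poly64_t lane0 = vgetq_lane_p64 (vector_poly64x2, 0);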
*/ + DECL_VARIABLE(vector, poly, 64, 1); + DECL_VARIABLE(vector, poly, 64, 2); + + VLOAD(vector, buffer, , poly, p, 64, 1); + VLOAD(vector, buffer, q, poly, p, 64, 2); + + VECT_VAR_DECL(vget_lane_vector, poly, 64, 1); + VECT_VAR_DECL(vget_lane_vector, poly, 64, 2); + + TEST_VGET_LANE( , poly, p, 64, 1, 0); + TEST_VGET_LANE(q, poly, p, 64, 2, 0); + + /* vldx_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VLDX_LANE/VLDXQ_LANE" + +VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2); +VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3); +VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4); + + /* In this case, input variables are arrays of vectors. */ +#define DECL_VLD_STX_LANE(T1, W, N, X) \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \ + VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N] + + /* We need to use a temporary result buffer (result_bis), because + the one used for other tests is not large enough. A subset of the + result data is moved from result_bis to result, and it is this + subset which is used to check the actual behavior. The next + macro enables to move another chunk of data from result_bis to + result. */ + /* We also use another extra input buffer (buffer_src), which we + fill with 0xAA, and which it used to load a vector from which we + read a given lane. */ + +#define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L) \ + memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \ + sizeof(VECT_VAR(buffer_src, T1, W, N))); \ + \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \ + vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \ + \ + VECT_ARRAY_VAR(vector, T1, W, N, X) = \ + /* Use dedicated init buffer, of size. X */ \ + vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X), \ + L); \ + vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ + VECT_ARRAY_VAR(vector, T1, W, N, X)); \ + memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \ + sizeof(VECT_VAR(result, T1, W, N))) + + /* Overwrite "result" with the contents of "result_bis"[Y]. */ +#undef TEST_EXTRA_CHUNK +#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \ + memcpy(VECT_VAR(result, T1, W, N), \ + &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \ + sizeof(VECT_VAR(result, T1, W, N))); + + /* Add some padding to try to catch out of bound accesses. */ +#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42} +#define DUMMY_ARRAY(V, T, W, N, L) \ + VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \ + ARRAY1(V##_pad,T,W,N) + +#define DECL_ALL_VLD_STX_LANE(X) \ + DECL_VLD_STX_LANE(poly, 64, 1, X); \ + DECL_VLD_STX_LANE(poly, 64, 2, X); + +#define TEST_ALL_VLDX_LANE(X) \ + TEST_VLDX_LANE(, poly, p, 64, 1, X, 0); \ + TEST_VLDX_LANE(q, poly, p, 64, 2, X, 0); + +#define TEST_ALL_EXTRA_CHUNKS(X,Y) \ + TEST_EXTRA_CHUNK(poly, 64, 1, X, Y) \ + TEST_EXTRA_CHUNK(poly, 64, 2, X, Y) + +#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment) \ + CHECK_POLY(test_name, poly, 64, 1, PRIx64, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 64, 2, PRIx64, EXPECTED, comment); + + /* Declare the temporary buffers / variables. */ + DECL_ALL_VLD_STX_LANE(2); + DECL_ALL_VLD_STX_LANE(3); + DECL_ALL_VLD_STX_LANE(4); + + DUMMY_ARRAY(buffer_src, poly, 64, 1, 4); + DUMMY_ARRAY(buffer_src, poly, 64, 2, 4); + + /* Check vld2_lane/vld2q_lane. 
*/ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VLD2_LANE/VLD2Q_LANE" + TEST_ALL_VLDX_LANE(2); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_0, " chunk 0"); + + TEST_ALL_EXTRA_CHUNKS(2, 1); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_1, " chunk 1"); + + /* Check vld3_lane/vld3q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VLD3_LANE/VLD3Q_LANE" + TEST_ALL_VLDX_LANE(3); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_0, " chunk 0"); + + TEST_ALL_EXTRA_CHUNKS(3, 1); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_1, " chunk 1"); + + TEST_ALL_EXTRA_CHUNKS(3, 2); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_2, " chunk 2"); + + /* Check vld4_lane/vld4q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VLD4_LANE/VLD4Q_LANE" + TEST_ALL_VLDX_LANE(4); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_0, " chunk 0"); + + TEST_ALL_EXTRA_CHUNKS(4, 1); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_1, " chunk 1"); + + TEST_ALL_EXTRA_CHUNKS(4, 2); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_2, " chunk 2"); + + TEST_ALL_EXTRA_CHUNKS(4, 3); + CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_3, " chunk 3"); + + /* In this case, input variables are arrays of vectors. */ +#define DECL_VSTX_LANE(T1, W, N, X) \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \ + VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N] + + /* We need to use a temporary result buffer (result_bis), because + the one used for other tests is not large enough. A subset of the + result data is moved from result_bis to result, and it is this + subset which is used to check the actual behavior. The next + macro enables to move another chunk of data from result_bis to + result. */ + /* We also use another extra input buffer (buffer_src), which we + fill with 0xAA, and which it used to load a vector from which we + read a given lane. */ +#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \ + memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \ + sizeof(VECT_VAR(buffer_src, T1, W, N))); \ + memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \ + sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \ + \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \ + vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \ + \ + VECT_ARRAY_VAR(vector, T1, W, N, X) = \ + /* Use dedicated init buffer, of size X. */ \ + vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X), \ + L); \ + vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ + VECT_ARRAY_VAR(vector, T1, W, N, X), \ + L); \ + memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \ + sizeof(VECT_VAR(result, T1, W, N))); + +#define TEST_ALL_VSTX_LANE(X) \ + TEST_VSTX_LANE(, poly, p, 64, 1, X, 0); \ + TEST_VSTX_LANE(q, poly, p, 64, 2, X, 0); + + /* Check vst2_lane/vst2q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VST2_LANE/VST2Q_LANE" + TEST_ALL_VSTX_LANE(2); + +#define CMT " (chunk 0)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT); + + TEST_ALL_EXTRA_CHUNKS(2, 1); +#undef CMT +#define CMT " chunk 1" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT); + + /* Check vst3_lane/vst3q_lane. 
*/ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VST3_LANE/VST3Q_LANE" + TEST_ALL_VSTX_LANE(3); + +#undef CMT +#define CMT " (chunk 0)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT); + + TEST_ALL_EXTRA_CHUNKS(3, 1); + +#undef CMT +#define CMT " (chunk 1)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT); + + TEST_ALL_EXTRA_CHUNKS(3, 2); + +#undef CMT +#define CMT " (chunk 2)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT); + + /* Check vst4_lane/vst4q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VST4_LANE/VST4Q_LANE" + TEST_ALL_VSTX_LANE(4); + +#undef CMT +#define CMT " (chunk 0)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT); + + TEST_ALL_EXTRA_CHUNKS(4, 1); + +#undef CMT +#define CMT " (chunk 1)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT); + + TEST_ALL_EXTRA_CHUNKS(4, 2); + +#undef CMT +#define CMT " (chunk 2)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT); + + TEST_ALL_EXTRA_CHUNKS(4, 3); + +#undef CMT +#define CMT " (chunk 3)" + CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT); + +#endif /* __aarch64__. */ + + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ternary_scalar_op.inc @@ -0,0 +1,206 @@ +/* Template file for ternary scalar operator validation. + + This file is meant to be included by test files for ternary scalar + operations. */ + +/* Check for required settings. */ + +#ifndef INSN_NAME +#error INSN_NAME (the intrinsic to test) must be defined. +#endif + +#ifndef INPUT_TYPE +#error INPUT_TYPE (basic type of an input value) must be defined. +#endif + +#ifndef OUTPUT_TYPE +#error OUTPUT_TYPE (basic type of an output value) must be defined. +#endif + +#ifndef OUTPUT_TYPE_SIZE +#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined. +#endif + +/* Optional settings: + + INPUT_1: Input values for the first parameter. Must be of type INPUT_TYPE. + INPUT_2: Input values for the second parameter. Must be of type INPUT_TYPE. + INPUT_3: Input values for the third parameter. Must be of type + INPUT_TYPE. */ + +#ifndef TEST_MSG +#define TEST_MSG "unnamed test" +#endif + +/* The test framework. */ + +#include <stdio.h> + +extern void abort (); + +#define INFF __builtin_inf () + +/* Stringify a macro. */ +#define STR0(A) #A +#define STR(A) STR0 (A) + +/* Macro concatenation. */ +#define CAT0(A, B) A##B +#define CAT(A, B) CAT0 (A, B) + +/* Format strings for error reporting. */ +#define FMT16 "0x%04x" +#define FMT32 "0x%08x" +#define FMT CAT (FMT,OUTPUT_TYPE_SIZE) + +/* Type construction: forms TS_t, where T is the base type and S the size in + bits. */ +#define MK_TYPE0(T, S) T##S##_t +#define MK_TYPE(T, S) MK_TYPE0 (T, S) + +/* Convenience types for input and output data. */ +typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type; + +/* Conversion between typed values and their hexadecimal representation. */ +typedef union +{ + OUTPUT_TYPE value; + output_hex_type hex; +} output_conv_type; + +/* Default input values.
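   Three arrays because the operator under test takes three scalars:
   iteration i feeds INPUT_1[i], INPUT_2[i] and INPUT_3[i] to the
   intrinsic.  Assuming the including test exercises vfmah_f16, each
   check is a fused multiply-add with a single rounding:

     float16_t r = vfmah_f16 (input_1_float16_t[i],
                              input_2_float16_t[i],
                              input_3_float16_t[i]);   /* a + b * c.  */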
*/ + +float16_t input_1_float16_t[] = +{ + 0.0, + -0.0, + 2.0, + 3.1, + 20.0, + 0.40, + -2.3, + 1.33, + -7.6, + 0.31, + 0.3353, + 0.5, + 1.0, + 13.13, + -6.3, + 20.0, + (float16_t)INFF, + (float16_t)-INFF, +}; + +float16_t input_2_float16_t[] = +{ + 1.0, + 1.0, + -4.33, + 100.0, + 30.0, + -0.02, + 0.5, + -7.231, + -6.3, + 20.0, + -7.231, + 2.3, + -7.6, + 5.1, + 0.31, + 0.33353, + (float16_t)-INFF, + (float16_t)INFF, +}; + +float16_t input_3_float16_t[] = +{ + -0.0, + 0.0, + 0.31, + -0.31, + 1.31, + 2.1, + -6.3, + 1.0, + -1.5, + 5.1, + 0.3353, + 9.3, + -9.3, + -7.231, + 0.5, + -0.33, + (float16_t)INFF, + (float16_t)INFF, +}; + +#ifndef INPUT_1 +#define INPUT_1 CAT (input_1_,INPUT_TYPE) +#endif + +#ifndef INPUT_2 +#define INPUT_2 CAT (input_2_,INPUT_TYPE) +#endif + +#ifndef INPUT_3 +#define INPUT_3 CAT (input_3_,INPUT_TYPE) +#endif + +/* Support macros and routines for the test function. */ + +#define CHECK() \ + { \ + output_conv_type actual; \ + output_conv_type expect; \ + \ + expect.hex = ((output_hex_type*)EXPECTED)[index]; \ + actual.value = INSN_NAME ((INPUT_1)[index], \ + (INPUT_2)[index], \ + (INPUT_3)[index]); \ + \ + if (actual.hex != expect.hex) \ + { \ + fprintf (stderr, \ + "ERROR in %s (%s line %d), buffer %s, " \ + "index %d: got " \ + FMT " != " FMT "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR (EXPECTED), index, \ + actual.hex, expect.hex); \ + abort (); \ + } \ + fprintf (stderr, "CHECKED %s %s\n", \ + STR (EXPECTED), TEST_MSG); \ + } + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1 (NAME) + +/* The test function. */ + +void +FNNAME (INSN_NAME) (void) +{ + /* Basic test: y[i] = OP (x[i]), for each INPUT[i], then compare the result + against EXPECTED[i]. */ + + const int num_tests = sizeof (INPUT_1) / sizeof (INPUT_1[0]); + int index; + + for (index = 0; index < num_tests; index++) + CHECK (); + +#ifdef EXTRA_TESTS + EXTRA_TESTS (); +#endif +} + +int +main (void) +{ + FNNAME (INSN_NAME) (); + + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc @@ -61,11 +61,11 @@ void FNNAME (INSN_NAME) (void) TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat, ""); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, ""); - CHECK(TEST_MSG, int, 16, 4, PRIx8, expected, ""); - CHECK(TEST_MSG, int, 32, 2, PRIx8, expected, ""); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); + CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, ""); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, ""); - CHECK(TEST_MSG, int, 16, 8, PRIx8, expected, ""); - CHECK(TEST_MSG, int, 32, 4, PRIx8, expected, ""); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); + CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); #ifdef EXTRA_TESTS EXTRA_TESTS(); --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_scalar_op.inc @@ -0,0 +1,200 @@ +/* Template file for unary scalar operator validation. + + This file is meant to be included by test files for unary scalar + operations. */ + +/* Check for required settings. */ + +#ifndef INSN_NAME +#error INSN_NAME (the intrinsic to test) must be defined. +#endif + +#ifndef INPUT_TYPE +#error INPUT_TYPE (basic type of an input value) must be defined. +#endif + +#ifndef SCALAR_OPERANDS +#ifndef EXPECTED +#error EXPECTED (an array of expected output values) must be defined. +#endif +#endif + +#ifndef OUTPUT_TYPE +#error OUTPUT_TYPE (basic type of an output value) must be defined. 
+#endif + +#ifndef OUTPUT_TYPE_SIZE +#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined. +#endif + +/* Optional settings. */ + +/* SCALAR_OPERANDS: Defined iff the intrinsic has a scalar operand. + + SCALAR_1, SCALAR_2, .., SCALAR_4: If SCALAR_OPERANDS is defined, SCALAR_<n> + is the scalar and EXPECTED_<n> is the array of expected values. + + INPUT: Input values for the first parameter. Must be of type INPUT_TYPE. */ + +/* Additional comments for the error message. */ +#ifndef COMMENT +#define COMMENT "" +#endif + +#ifndef TEST_MSG +#define TEST_MSG "unnamed test" +#endif + +/* The test framework. */ + +#include <stdio.h> + +extern void abort (); + +#define INFF __builtin_inf () + +/* Stringify a macro. */ +#define STR0(A) #A +#define STR(A) STR0 (A) + +/* Macro concatenation. */ +#define CAT0(A, B) A##B +#define CAT(A, B) CAT0 (A, B) + +/* Format strings for error reporting. */ +#define FMT16 "0x%04x" +#define FMT32 "0x%08x" +#define FMT64 "0x%016x" +#define FMT CAT (FMT,OUTPUT_TYPE_SIZE) + +/* Type construction: forms TS_t, where T is the base type and S the size in + bits. */ +#define MK_TYPE0(T, S) T##S##_t +#define MK_TYPE(T, S) MK_TYPE0 (T, S) + +/* Convenience types for input and output data. */ +typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type; + +/* Conversion between typed values and their hexadecimal representation. */ +typedef union +{ + OUTPUT_TYPE value; + output_hex_type hex; +} output_conv_type; + +/* Default input values. */ + +float16_t input_1_float16_t[] = +{ + 0.0, -0.0, + 2.0, 3.1, + 20.0, 0.40, + -2.3, 1.33, + -7.6, 0.31, + 0.3353, 0.5, + 1.0, 13.13, + -6.3, 20.0, + (float16_t)INFF, (float16_t)-INFF, +}; + +#ifndef INPUT +#define INPUT CAT(input_1_,INPUT_TYPE) +#endif + +/* Support macros and routines for the test function. */ + +#define CHECK() \ + { \ + output_conv_type actual; \ + output_conv_type expect; \ + \ + expect.hex = ((output_hex_type*)EXPECTED)[index]; \ + actual.value = INSN_NAME ((INPUT)[index]); \ + \ + if (actual.hex != expect.hex) \ + { \ + fprintf (stderr, \ + "ERROR in %s (%s line %d), buffer %s, " \ + "index %d: got " \ + FMT " != " FMT "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR (EXPECTED), index, \ + actual.hex, expect.hex); \ + abort (); \ + } \ + fprintf (stderr, "CHECKED %s %s\n", \ + STR (EXPECTED), TEST_MSG); \ + } + +#define CHECK_N(SCALAR, EXPECTED) \ + { \ + output_conv_type actual; \ + output_conv_type expect; \ + \ + expect.hex \ + = ((output_hex_type*)EXPECTED)[index]; \ + actual.value = INSN_NAME ((INPUT)[index], (SCALAR)); \ + \ + if (actual.hex != expect.hex) \ + { \ + fprintf (stderr, \ + "ERROR in %s (%s line %d), buffer %s, " \ + "index %d: got " \ + FMT " != " FMT "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR (EXPECTED), index, \ + actual.hex, expect.hex); \ + abort (); \ + } \ + fprintf (stderr, "CHECKED %s %s\n", \ + STR (EXPECTED), TEST_MSG); \ + } + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1 (NAME) + +/* The test function. */ + +void +FNNAME (INSN_NAME) (void) +{ + /* Basic test: y[i] = OP (x[i]), for each INPUT[i], then compare the result + against EXPECTED[i].
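   When SCALAR_OPERANDS is defined, the intrinsic takes a fixed extra
   operand and each input is instead checked with CHECK_N against up
   to four (SCALAR_<n>, EXPECTED_<n>) pairs; a test file would define
   something like (hypothetical names and values)

     #define SCALAR_OPERANDS
     #define SCALAR_1 2
     #define EXPECTED_1 expected_scalar_2

   before including this template.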
*/ + + const int num_tests = sizeof (INPUT) / sizeof (INPUT[0]); + int index; + + for (index = 0; index < num_tests; index++) + { +#if defined (SCALAR_OPERANDS) + +#ifdef SCALAR_1 + CHECK_N (SCALAR_1, EXPECTED_1); +#endif +#ifdef SCALAR_2 + CHECK_N (SCALAR_2, EXPECTED_2); +#endif +#ifdef SCALAR_3 + CHECK_N (SCALAR_3, EXPECTED_3); +#endif +#ifdef SCALAR_4 + CHECK_N (SCALAR_4, EXPECTED_4); +#endif + +#else /* !defined (SCALAR_OPERAND). */ + CHECK (); +#endif + } + +#ifdef EXTRA_TESTS + EXTRA_TESTS (); +#endif +} + +int +main (void) +{ + FNNAME (INSN_NAME) (); + + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c @@ -30,10 +30,20 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffd0, 0xffffffd1, 0xffffffd2, 0xffffffd3 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x42407ae1, 0x423c7ae1, 0x42387ae1, 0x42347ae1 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x4e13, 0x4dd3, + 0x4d93, 0x4d53 }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x5204, 0x51e4, 0x51c4, 0x51a4, + 0x5184, 0x5164, 0x5144, 0x5124 }; +#endif /* Additional expected results for float32 variants with specially chosen input values. */ VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif #define TEST_MSG "VABD/VABDQ" void exec_vabd (void) @@ -65,6 +75,17 @@ void exec_vabd (void) DECL_VABD_VAR(vector2); DECL_VABD_VAR(vector_res); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector1, float, 16, 4); + DECL_VARIABLE(vector1, float, 16, 8); + + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 8); + + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif + clean_results (); /* Initialize input "vector1" from "buffer". */ @@ -82,6 +103,12 @@ void exec_vabd (void) VLOAD(vector1, buffer, q, uint, u, 16, 8); VLOAD(vector1, buffer, q, uint, u, 32, 4); VLOAD(vector1, buffer, q, float, f, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, q, float, f, 16, 8); + VLOAD(vector1, buffer, q, float, f, 16, 8); +#endif /* Choose init value arbitrarily. */ VDUP(vector2, , int, s, 8, 8, 1); @@ -98,6 +125,10 @@ void exec_vabd (void) VDUP(vector2, q, uint, u, 16, 8, 12); VDUP(vector2, q, uint, u, 32, 4, 32); VDUP(vector2, q, float, f, 32, 4, 32.12f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 8.3f); + VDUP(vector2, q, float, f, 16, 8, 32.12f); +#endif /* Execute the tests. 
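The templates above compare results bit-for-bit rather than numerically: the intrinsic's result and the expected value are viewed through a union of OUTPUT_TYPE and the same-width unsigned type, so signed zeros and NaN payloads must match exactly. A minimal standalone sketch of that union technique, using float/uint32_t so it builds anywhere (check_bits is my own name, not part of the testsuite):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Pun the value to its unsigned representation and compare the bits,
   not the value: 0.0f == -0.0f numerically, but the bits differ.  */
typedef union { float value; uint32_t hex; } conv_type;

static void check_bits (float actual, uint32_t expected_hex)
{
  conv_type c = { .value = actual };
  if (c.hex != expected_hex)
    {
      fprintf (stderr, "got 0x%08x != 0x%08x\n", c.hex, expected_hex);
      abort ();
    }
}

int main (void)
{
  check_bits (-0.0f, 0x80000000);  /* exact bit pattern of -0.0f */
  check_bits (0.0f, 0x00000000);   /* distinct from -0.0f */
  return 0;
}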
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
@@ -30,10 +30,20 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffd0, 0xffffffd1,
 					 0xffffffd2, 0xffffffd3 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x42407ae1, 0x423c7ae1,
 					   0x42387ae1, 0x42347ae1 };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x4e13, 0x4dd3,
+					      0x4d93, 0x4d53 };
+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x5204, 0x51e4, 0x51c4, 0x51a4,
+					      0x5184, 0x5164, 0x5144, 0x5124 };
+#endif
 
 /* Additional expected results for float32 variants with specially
    chosen input values.  */
 VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+						      0x0, 0x0, 0x0, 0x0 };
+#endif
 
 #define TEST_MSG "VABD/VABDQ"
 void exec_vabd (void)
@@ -65,6 +75,17 @@ void exec_vabd (void)
   DECL_VABD_VAR(vector2);
   DECL_VABD_VAR(vector_res);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector1, float, 16, 4);
+  DECL_VARIABLE(vector1, float, 16, 8);
+
+  DECL_VARIABLE(vector2, float, 16, 4);
+  DECL_VARIABLE(vector2, float, 16, 8);
+
+  DECL_VARIABLE(vector_res, float, 16, 4);
+  DECL_VARIABLE(vector_res, float, 16, 8);
+#endif
+
   clean_results ();
 
   /* Initialize input "vector1" from "buffer".  */
@@ -82,6 +103,12 @@ void exec_vabd (void)
   VLOAD(vector1, buffer, q, uint, u, 16, 8);
   VLOAD(vector1, buffer, q, uint, u, 32, 4);
   VLOAD(vector1, buffer, q, float, f, 32, 4);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VLOAD(vector1, buffer, , float, f, 16, 4);
+  VLOAD(vector1, buffer, , float, f, 16, 4);
+  VLOAD(vector1, buffer, q, float, f, 16, 8);
+  VLOAD(vector1, buffer, q, float, f, 16, 8);
+#endif
 
   /* Choose init value arbitrarily.  */
   VDUP(vector2, , int, s, 8, 8, 1);
@@ -98,6 +125,10 @@ void exec_vabd (void)
   VDUP(vector2, q, uint, u, 16, 8, 12);
   VDUP(vector2, q, uint, u, 32, 4, 32);
   VDUP(vector2, q, float, f, 32, 4, 32.12f);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VDUP(vector2, , float, f, 16, 4, 8.3f);
+  VDUP(vector2, q, float, f, 16, 8, 32.12f);
+#endif
 
   /* Execute the tests.  */
   TEST_VABD(, int, s, 8, 8);
@@ -115,6 +146,11 @@ void exec_vabd (void)
   TEST_VABD(q, uint, u, 32, 4);
   TEST_VABD(q, float, f, 32, 4);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  TEST_VABD(, float, f, 16, 4);
+  TEST_VABD(q, float, f, 16, 8);
+#endif
+
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
   CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
@@ -129,7 +165,10 @@ void exec_vabd (void)
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
-
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
+#endif
 
   /* Extra FP tests with special values (-0.0, ....) */
   VDUP(vector1, q, float, f, 32, 4, -0.0f);
@@ -137,11 +176,27 @@ void exec_vabd (void)
   TEST_VABD(q, float, f, 32, 4);
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, " FP special (-0.0)");
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VDUP(vector1, q, float, f, 16, 8, -0.0f);
+  VDUP(vector2, q, float, f, 16, 8, 0.0);
+  TEST_VABD(q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16,
+	   " FP special (-0.0)");
+#endif
+
   /* Extra FP tests with special values (-0.0, ....) */
   VDUP(vector1, q, float, f, 32, 4, 0.0f);
   VDUP(vector2, q, float, f, 32, 4, -0.0);
   TEST_VABD(q, float, f, 32, 4);
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, " FP special (-0.0)");
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VDUP(vector1, q, float, f, 16, 8, 0.0f);
+  VDUP(vector2, q, float, f, 16, 8, -0.0);
+  TEST_VABD(q, float, f, 16, 8);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16,
+	   " FP special (-0.0)");
+#endif
 }
 
 int main (void)
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdh_f16_1.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+#define INFF __builtin_inf ()
+
+/* Expected results.
+   Absolute difference between INPUT1 and INPUT2 in binary_scalar_op.inc.  */
+uint16_t expected[] =
+{
+  0x3C00,
+  0x3C00,
+  0x4654,
+  0x560E,
+  0x4900,
+  0x36B8,
+  0x419a,
+  0x4848,
+  0x3d34,
+  0x4cec,
+  0x4791,
+  0x3f34,
+  0x484d,
+  0x4804,
+  0x469c,
+  0x4ceb,
+  0x7c00,
+  0x7c00
+};
+
+#define TEST_MSG "VABDH_F16"
+#define INSN_NAME vabdh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
@@ -21,24 +21,52 @@ VECT_VAR_DECL(expected,int,32,4) [] = { 0x10, 0xf, 0xe, 0xd };
 
 /* Expected results for float32 variants.  Needs to be separated since
    the generic test function does not test floating-point versions.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0x409a, 0x409a,
+						      0x409a, 0x409a };
+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x42cd, 0x42cd,
+						      0x42cd, 0x42cd,
+						      0x42cd, 0x42cd,
+						      0x42cd, 0x42cd };
+#endif
 VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0x40133333, 0x40133333 };
 VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x4059999a, 0x4059999a,
 						   0x4059999a, 0x4059999a };
 
 void exec_vabs_f32(void)
 {
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector, float, 16, 4);
+  DECL_VARIABLE(vector, float, 16, 8);
+#endif
   DECL_VARIABLE(vector, float, 32, 2);
   DECL_VARIABLE(vector, float, 32, 4);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector_res, float, 16, 4);
+  DECL_VARIABLE(vector_res, float, 16, 8);
+#endif
   DECL_VARIABLE(vector_res, float, 32, 2);
   DECL_VARIABLE(vector_res, float, 32, 4);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VDUP(vector, , float, f, 16, 4, -2.3f);
+  VDUP(vector, q, float, f, 16, 8, 3.4f);
+#endif
   VDUP(vector, , float, f, 32, 2, -2.3f);
   VDUP(vector, q, float, f, 32, 4, 3.4f);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  TEST_UNARY_OP(INSN_NAME, , float, f, 16, 4);
+  TEST_UNARY_OP(INSN_NAME, q, float, f, 16, 8);
+#endif
   TEST_UNARY_OP(INSN_NAME, , float, f, 32, 2);
   TEST_UNARY_OP(INSN_NAME, q, float, f, 32, 4);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
+#endif
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
 }
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabsh_f16_1.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_fp16.h>
+
+/* Expected results (16-bit hexadecimal representation).  */
+uint16_t expected[] =
+{
+  0x0000 /* 0.000000 */,
+  0x0000 /* 0.000000 */,
+  0x4000 /* 2.000000 */,
+  0x4233 /* 3.099609 */,
+  0x4d00 /* 20.000000 */,
+  0x3666 /* 0.399902 */,
+  0x409a /* 2.300781 */,
+  0x3d52 /* 1.330078 */,
+  0x479a /* 7.601562 */,
+  0x34f6 /* 0.310059 */,
+  0x355d /* 0.335205 */,
+  0x3800 /* 0.500000 */,
+  0x3c00 /* 1.000000 */,
+  0x4a91 /* 13.132812 */,
+  0x464d /* 6.300781 */,
+  0x4d00 /* 20.000000 */,
+  0x7c00 /* inf */,
+  0x7c00 /* inf */
+};
+
+#define TEST_MSG "VABSH_F16"
+#define INSN_NAME vabsh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
@@ -43,6 +43,14 @@ VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff3,
 VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0x40d9999a, 0x40d9999a };
 VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x41100000, 0x41100000,
 						   0x41100000, 0x41100000 };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0x46cd, 0x46cd,
+						      0x46cd, 0x46cd };
+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x4880, 0x4880,
+						      0x4880, 0x4880,
+						      0x4880, 0x4880,
+						      0x4880, 0x4880 };
+#endif
 
 void exec_vadd_f32(void)
 {
@@ -66,4 +74,27 @@ void exec_vadd_f32(void)
 
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector, float, 16, 4);
+  DECL_VARIABLE(vector, float, 16, 8);
+
+  DECL_VARIABLE(vector2, float, 16, 4);
+  DECL_VARIABLE(vector2, float, 16, 8);
+
+  DECL_VARIABLE(vector_res, float, 16, 4);
+  DECL_VARIABLE(vector_res, float, 16, 8);
+
+  VDUP(vector, , float, f, 16, 4, 2.3f);
+  VDUP(vector, q, float, f, 16, 8, 3.4f);
+
+  VDUP(vector2, , float, f, 16, 4, 4.5f);
+  VDUP(vector2, q, float, f, 16, 8, 5.6f);
+
+  TEST_BINARY_OP(INSN_NAME, , float, f, 16, 4);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
+#endif
 }
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddh_f16_1.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_fp16.h>
+
+/* Expected results (16-bit hexadecimal representation).  */
+uint16_t expected[] =
+{
+  0x3c00 /* 1.000000 */,
+  0x3c00 /* 1.000000 */,
+  0xc0a8 /* -2.328125 */,
+  0x5672 /* 103.125000 */,
+  0x5240 /* 50.000000 */,
+  0x3614 /* 0.379883 */,
+  0xbf34 /* -1.800781 */,
+  0xc5e6 /* -5.898438 */,
+  0xcaf4 /* -13.906250 */,
+  0x4d14 /* 20.312500 */,
+  0xc6e5 /* -6.894531 */,
+  0x419a /* 2.800781 */,
+  0xc69a /* -6.601562 */,
+  0x4c8f /* 18.234375 */,
+  0xc5fe /* -5.992188 */,
+  0x4d15 /* 20.328125 */,
+  0x7e00 /* nan */,
+  0x7e00 /* nan */,
+};
+
+#define TEST_MSG "VADDH_F16"
+#define INSN_NAME vaddh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
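The expected_float16 constants above are IEEE 754 binary16 bit patterns: in vadd.c, for instance, 3.4 + 5.6 = 9.0, and 9.0 encodes as 0x4880 (sign 0, biased exponent 18, mantissa 0x080). A rough standalone encoder sketch for normal positive values only (to_f16 is a hypothetical helper, not testsuite code):

#include <stdio.h>
#include <stdint.h>

/* Encode a normal, exactly-representable positive float as binary16:
   normalize into [1, 2), derive the 10-bit mantissa, bias the exponent
   by 15.  */
static uint16_t to_f16 (float x)
{
  int e = 0;
  while (x >= 2.0f) { x /= 2.0f; e++; }
  while (x < 1.0f)  { x *= 2.0f; e--; }
  uint16_t mant = (uint16_t)((x - 1.0f) * 1024.0f + 0.5f);
  return (uint16_t)(((e + 15) << 10) | mant);
}

int main (void)
{
  /* Prints 0x4880, matching expected_float16 in vadd.c.  */
  printf ("0x%04x\n", to_f16 (9.0f));
  return 0;
}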
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
@@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffff1 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
 					0xf7, 0xf7, 0xf7, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc09, 0xcb89,
+					       0xcb09, 0xca89 };
+#endif
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800004, 0xc1700004 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					0xf6, 0xf6, 0xf6, 0xf6,
@@ -43,6 +47,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
 					 0xf7, 0xf7, 0xf7, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2,
 					 0xfff4, 0xfff4, 0xfff6, 0xfff6 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc09, 0xcb89,
+					       0xcb09, 0xca89,
+					       0xca09, 0xc989,
+					       0xc909, 0xc889 };
+#endif
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800001, 0xc1700001,
 					   0xc1600001, 0xc1500001 };
 
@@ -66,6 +76,10 @@ void exec_vbsl (void)
   clean_results ();
 
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (FP16_SUPPORTED)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
   VLOAD(vector, buffer, , float, f, 32, 2);
   VLOAD(vector, buffer, q, float, f, 32, 4);
 
@@ -80,6 +94,9 @@ void exec_vbsl (void)
   VDUP(vector2, , uint, u, 16, 4, 0xFFF2);
   VDUP(vector2, , uint, u, 32, 2, 0xFFFFFFF0);
   VDUP(vector2, , uint, u, 64, 1, 0xFFFFFFF3);
+#if defined (FP16_SUPPORTED)
+  VDUP(vector2, , float, f, 16, 4, -2.4f);   /* -2.4f is 0xC0CD.  */
+#endif
   VDUP(vector2, , float, f, 32, 2, -30.3f);
   VDUP(vector2, , poly, p, 8, 8, 0xF3);
   VDUP(vector2, , poly, p, 16, 4, 0xFFF2);
@@ -94,6 +111,9 @@ void exec_vbsl (void)
   VDUP(vector2, q, uint, u, 64, 2, 0xFFFFFFF3);
   VDUP(vector2, q, poly, p, 8, 16, 0xF3);
   VDUP(vector2, q, poly, p, 16, 8, 0xFFF2);
+#if defined (FP16_SUPPORTED)
+  VDUP(vector2, q, float, f, 16, 8, -2.4f);
+#endif
   VDUP(vector2, q, float, f, 32, 4, -30.4f);
 
   VDUP(vector_first, , uint, u, 8, 8, 0xF4);
@@ -111,10 +131,18 @@ void exec_vbsl (void)
   TEST_VBSL(uint, , poly, p, 16, 4);
   TEST_VBSL(uint, q, poly, p, 8, 16);
   TEST_VBSL(uint, q, poly, p, 16, 8);
+#if defined (FP16_SUPPORTED)
+  TEST_VBSL(uint, , float, f, 16, 4);
+  TEST_VBSL(uint, q, float, f, 16, 8);
+#endif
   TEST_VBSL(uint, , float, f, 32, 2);
   TEST_VBSL(uint, q, float, f, 32, 4);
 
+#if defined (FP16_SUPPORTED)
+  CHECK_RESULTS (TEST_MSG, "");
+#else
   CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
+#endif
 }
 
 int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcage.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcage.c
@@ -11,3 +11,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 					  0xffffffff, 0xffffffff };
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0xffff, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0x0,
+					     0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+					      0xffff, 0xffff, 0xffff, 0x0 };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcageh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+			0xFFFF};
+
+#define TEST_MSG "VCAGEH_F16"
+#define INSN_NAME vcageh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagt.c
@@ -11,3 +11,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 					  0xffffffff, 0xffffffff };
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0xffff, 0xffff, 0x0, 0x0,
+					     0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
+					      0xffff, 0xffff, 0x0, 0x0 };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagth_f16_1.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0};
+
+#define TEST_MSG "VCAGTH_F16"
+#define INSN_NAME vcagth_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
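The vcage/vcagt families compare absolute values and return an all-ones or all-zeros lane mask, which is what the 0xFFFF/0x0 entries in the expected arrays encode. A plain-C reference of the vcagt semantics under that reading (ref_cagt is my own name; the real intrinsic operates on float16_t, plain float keeps the sketch portable):

#include <stdio.h>
#include <stdint.h>
#include <math.h>

/* |a| > |b| yields an all-ones 16-bit mask, else zero.  */
static uint16_t ref_cagt (float a, float b)
{
  return fabsf (a) > fabsf (b) ? 0xffff : 0x0;
}

int main (void)
{
  /* Entry 5 of the default inputs: |0.40| > |-0.02|, so the mask is
     0xFFFF, matching expected[5] in vcagth_f16_1.c.  */
  printf ("0x%04x\n", ref_cagt (0.40f, -0.02f));
  return 0;
}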
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcale.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcale.c
@@ -9,3 +9,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0,
 					 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected2,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0x0, 0x0, 0xffff, 0xffff,
+					     0xffff, 0xffff, 0xffff, 0xffff };
+
+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+					      0x0, 0x0, 0xffff, 0xffff };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcaleh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
+			0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
+			0x0, 0xFFFF, 0xFFFF};
+
+#define TEST_MSG "VCALEH_F16"
+#define INSN_NAME vcaleh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalt.c
@@ -9,3 +9,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0,
 					 0x0, 0xffffffff };
 VECT_VAR_DECL(expected2,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0x0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0xffff,
+					     0xffff, 0xffff, 0xffff, 0xffff };
+
+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+					      0x0, 0x0, 0x0, 0xffff };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalth_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
+			0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
+			0x0, 0x0, 0x0};
+
+#define TEST_MSG "VCALTH_F16"
+#define INSN_NAME vcalth_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceq.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceq.c
@@ -32,6 +32,12 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
 						0x0, 0x0, 0xffff, 0x0 };
 VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0xffff, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0xffff, 0x0,
+						     0x0, 0x0, 0x0, 0x0, };
+#endif
+
 VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0xffffffff };
 VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0x0 };
 
@@ -39,6 +45,18 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0x0 };
 VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0xffffffff };
 VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
+
 VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqh_f16_1.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+			0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+
+#define TEST_MSG "VCEQH_F16"
+#define INSN_NAME vceqh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqz_1.c
@@ -0,0 +1,27 @@
+/* This file tests an intrinsic which currently has only an f16 variant and
+   that is only available when FP16 arithmetic instructions are supported.  */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#define INSN_NAME vceqz
+#define TEST_MSG "VCEQZ/VCEQZQ"
+
+#include "cmp_zero_op.inc"
+
+/* Expected results.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+						     0x0, 0x0, 0x0, 0x0 };
+#endif
+
+/* Extra FP tests with special values (NaN, ....).  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqzh_f16_1.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+			0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+
+#define TEST_MSG "VCEQZH_F16"
+#define INSN_NAME vceqzh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcge.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcge.c
@@ -28,6 +28,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
 						0, 0x0, 0xffff, 0xffff };
 VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0,
 						0xffffffff, 0xffffffff };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff };
+#endif
+
 VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0xffffffff };
 VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0,
 						 0xffffffff, 0xffffffff };
@@ -35,6 +43,20 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0xffffffff };
 VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
+
 VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
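The expected_nan/expected_mnan arrays in these comparison tests are all zeroes because any ordered IEEE comparison involving a NaN is false, while -inf compares greater-than-or-equal to itself. A small sketch of that reference behaviour (ref_cge is a hypothetical stand-in for the float16_t intrinsic):

#include <stdio.h>
#include <stdint.h>
#include <math.h>

/* vcge-style mask: all-ones when a >= b, else zero.  Any comparison
   with NaN is false, so NaN inputs always produce 0x0000.  */
static uint16_t ref_cge (float a, float b)
{
  return (a >= b) ? 0xffff : 0x0;
}

int main (void)
{
  printf ("0x%04x\n", ref_cge (NAN, 1.0f));            /* 0x0000 */
  printf ("0x%04x\n", ref_cge (-INFINITY, -INFINITY)); /* 0xffff */
  return 0;
}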
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgeh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
+			0xFFFF, 0x0};
+
+#define TEST_MSG "VCGEH_F16"
+#define INSN_NAME vcgeh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgez_1.c
@@ -0,0 +1,30 @@
+/* This file tests an intrinsic which currently has only an f16 variant and
+   that is only available when FP16 arithmetic instructions are supported.  */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#define INSN_NAME vcgez
+#define TEST_MSG "VCGEZ/VCGEZQ"
+
+#include "cmp_zero_op.inc"
+
+/* Expected results.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff };
+#endif
+
+/* Extra FP tests with special values (NaN, ....).  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgezh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0,
+			0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+			0x0, 0xFFFF, 0xFFFF, 0x0};
+
+#define TEST_MSG "VCGEZH_F16"
+#define INSN_NAME vcgezh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgt.c
@@ -28,6 +28,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
 						0x0, 0x0, 0x0, 0xffff };
 VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0, 0x0, 0xffffffff };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0xffff, 0xffff };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0,
+						     0x0, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff };
+#endif
+
 VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0, 0x0, 0xffffffff };
 
@@ -35,6 +43,19 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0x0, 0xffffffff };
 VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0x0, 0xffffffff };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
+
 VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgth_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
+			0xFFFF, 0x0};
+
+#define TEST_MSG "VCGTH_F16"
+#define INSN_NAME vcgth_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgtz_1.c
@@ -0,0 +1,28 @@
+/* This file tests an intrinsic which currently has only an f16 variant and
+   that is only available when FP16 arithmetic instructions are supported.  */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#define INSN_NAME vcgtz
+#define TEST_MSG "VCGTZ/VCGTZQ"
+
+#include "cmp_zero_op.inc"
+
+/* Expected results.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff,
+						     0xffff, 0xffff };
+#endif
+
+/* Extra FP tests with special values (NaN, ....).  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgtzh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0,
+			0xFFFF, 0xFFFF, 0x0};
+
+#define TEST_MSG "VCGTZH_F16"
+#define INSN_NAME vcgtzh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcle.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcle.c
@@ -31,6 +31,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
 VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 						0xffffffff, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
+						     0xffff, 0x0,
+						     0x0, 0x0,
+						     0x0, 0x0 };
+#endif
+
 VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 						 0xffffffff, 0x0 };
@@ -39,6 +47,20 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0x0 };
 VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0xffffffff, 0xffffffff };
 VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
+
 VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcleh_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0,
+			0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0x0,
+			0xFFFF};
+
+#define TEST_MSG "VCLEH_F16"
+#define INSN_NAME vcleh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclez_1.c
@@ -0,0 +1,29 @@
+/* This file tests an intrinsic which currently has only an f16 variant and
+   that is only available when FP16 arithmetic instructions are supported.  */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#define INSN_NAME vclez
+#define TEST_MSG "VCLEZ/VCLEZQ"
+
+#include "cmp_zero_op.inc"
+
+/* Expected results.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
+
+/* Extra FP tests with special values (NaN, ....).  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclezh_f16_1.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF};
+
+#define TEST_MSG "VCLEZH_F16"
+#define INSN_NAME vclezh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclt.c
@@ -30,6 +30,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
 VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 						0x0, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
+						     0x0, 0x0,
+						     0x0, 0x0,
+						     0x0, 0x0 };
+#endif
+
 VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0xffffffff, 0x0 };
 VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0xffffffff, 0xffffffff,
 						 0x0, 0x0 };
@@ -38,6 +46,19 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0xffffffff, 0x0 };
 VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0x0, 0x0 };
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
+
 VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclth_f16_1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0,
+			0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0x0,
+			0xFFFF};
+
+#define TEST_MSG "VCLTH_F16"
+#define INSN_NAME vclth_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcltz_1.c
@@ -0,0 +1,27 @@
+/* This file tests an intrinsic which currently has only an f16 variant and
+   that is only available when FP16 arithmetic instructions are supported.  */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#define INSN_NAME vcltz
+#define TEST_MSG "VCLTZ/VCLTZQ"
+
+#include "cmp_zero_op.inc"
+
+/* Expected results.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff,
+						   0xffff, 0xffff };
+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
+
+/* Extra FP tests with special values (NaN, ....).  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
+						  0xffff, 0xffff };
+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#endif
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcltzh_f16_1.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
+			0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF};
+
+#define TEST_MSG "VCLTZH_F16"
+#define INSN_NAME vcltzh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
@@ -65,10 +65,10 @@ FNNAME (INSN_NAME)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
-  CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
 }
 
 int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
@@ -93,8 +93,8 @@ void exec_vcombine (void)
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
   CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
-  CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-  CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
 #endif
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
@@ -106,8 +106,8 @@ FNNAME (INSN_NAME)
   CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-  CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
 #endif
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
@@ -4,36 +4,99 @@
 #include <math.h>
 
 /* Expected results for vcvt.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_s, hfloat, 16, 4) [] =
+{ 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+VECT_VAR_DECL(expected_u, hfloat, 16, 4) [] =
+{ 0x7c00, 0x7c00, 0x7c00, 0x7c00, };
+VECT_VAR_DECL(expected_s, hfloat, 16, 8) [] =
+{ 0xcc00, 0xcb80, 0xcb00, 0xca80,
+  0xca00, 0xc980, 0xc900, 0xc880 };
+VECT_VAR_DECL(expected_u, hfloat, 16, 8) [] =
+{ 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+  0x7c00, 0x7c00, 0x7c00, 0x7c00, };
+#endif
 VECT_VAR_DECL(expected_s,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_u,hfloat,32,2) [] = { 0x4f800000, 0x4f800000 };
 VECT_VAR_DECL(expected_s,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
-					       0xc1600000, 0xc1500000 };
+					     0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_u,hfloat,32,4) [] = { 0x4f800000, 0x4f800000,
-					       0x4f800000, 0x4f800000 };
+					     0x4f800000, 0x4f800000 };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x5, 0xfff1, 0x5 };
+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff1,
+					   0x0, 0x0, 0xf, 0xfff1 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
+					    0x0, 0x0, 0xf, 0x0 };
+#endif
 VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff1, 0x5 };
 VECT_VAR_DECL(expected,uint,32,2) [] = { 0x0, 0x5 };
 VECT_VAR_DECL(expected,int,32,4) [] = { 0x0, 0x0, 0xf, 0xfffffff1 };
 VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0, 0xf, 0x0 };
 
 /* Expected results for vcvt_n.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_vcvt_n_s, hfloat, 16, 4) [] = { 0xc400, 0xc380,
+						       0xc300, 0xc280 };
+VECT_VAR_DECL(expected_vcvt_n_u, hfloat, 16, 4) [] = { 0x6000, 0x6000,
+						       0x6000, 0x6000 };
+VECT_VAR_DECL(expected_vcvt_n_s, hfloat, 16, 8) [] = { 0xb000, 0xaf80,
+						       0xaf00, 0xae80,
+						       0xae00, 0xad80,
+						       0xad00, 0xac80 };
+VECT_VAR_DECL(expected_vcvt_n_u, hfloat, 16, 8) [] = { 0x4c00, 0x4c00,
+						       0x4c00, 0x4c00,
+						       0x4c00, 0x4c00,
+						       0x4c00, 0x4c00 };
+#endif
 VECT_VAR_DECL(expected_vcvt_n_s,hfloat,32,2) [] = { 0xc0800000, 0xc0700000 };
 VECT_VAR_DECL(expected_vcvt_n_u,hfloat,32,2) [] = { 0x4c000000, 0x4c000000 };
 VECT_VAR_DECL(expected_vcvt_n_s,hfloat,32,4) [] = { 0xb2800000, 0xb2700000,
 						    0xb2600000, 0xb2500000 };
 VECT_VAR_DECL(expected_vcvt_n_u,hfloat,32,4) [] = { 0x49800000, 0x49800000,
 						    0x49800000, 0x49800000 };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_vcvt_n, int, 16, 4) [] = { 0xffc3, 0x15,
+						  0xffc3, 0x15 };
+VECT_VAR_DECL(expected_vcvt_n, uint, 16, 4) [] = { 0x0, 0x2a6, 0x0, 0x2a6 };
+VECT_VAR_DECL(expected_vcvt_n, int, 16, 8) [] = { 0x0, 0x0, 0x78f, 0xf871,
+						  0x0, 0x0, 0x78f, 0xf871 };
+VECT_VAR_DECL(expected_vcvt_n, uint, 16, 8) [] = { 0x0, 0x0, 0xf1e0, 0x0,
+						   0x0, 0x0, 0xf1e0, 0x0 };
+#endif
 VECT_VAR_DECL(expected_vcvt_n,int,32,2) [] = { 0xff0b3333, 0x54cccd };
 VECT_VAR_DECL(expected_vcvt_n,uint,32,2) [] = { 0x0, 0x15 };
 VECT_VAR_DECL(expected_vcvt_n,int,32,4) [] = { 0x0, 0x0, 0x1e3d7, 0xfffe1c29 };
 VECT_VAR_DECL(expected_vcvt_n,uint,32,4) [] = { 0x0, 0x0, 0x1e, 0x0 };
 
 /* Expected results for vcvt with rounding.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
+						    0x7d, 0x7d, 0x7d, 0x7d };
+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
+						     0x7d, 0x7d, 0x7d, 0x7d };
+#endif
 VECT_VAR_DECL(expected_rounding,int,32,2) [] = { 0xa, 0xa };
 VECT_VAR_DECL(expected_rounding,uint,32,2) [] = { 0xa, 0xa };
 VECT_VAR_DECL(expected_rounding,int,32,4) [] = { 0x7d, 0x7d, 0x7d, 0x7d };
 VECT_VAR_DECL(expected_rounding,uint,32,4) [] = { 0x7d, 0x7d, 0x7d, 0x7d };
 
 /* Expected results for vcvt_n with rounding.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_vcvt_n_rounding, int, 16, 4) [] =
+{ 0x533, 0x533, 0x533, 0x533 };
+VECT_VAR_DECL(expected_vcvt_n_rounding, uint, 16, 4) [] =
+{ 0x533, 0x533, 0x533, 0x533 };
+VECT_VAR_DECL(expected_vcvt_n_rounding, int, 16, 8) [] =
+{ 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+  0x7fff, 0x7fff, 0x7fff, 0x7fff };
+VECT_VAR_DECL(expected_vcvt_n_rounding, uint, 16, 8) [] =
+{ 0xffff, 0xffff, 0xffff, 0xffff,
+  0xffff, 0xffff, 0xffff, 0xffff };
+#endif
 VECT_VAR_DECL(expected_vcvt_n_rounding,int,32,2) [] = { 0xa66666, 0xa66666 };
 VECT_VAR_DECL(expected_vcvt_n_rounding,uint,32,2) [] = { 0xa66666, 0xa66666 };
 VECT_VAR_DECL(expected_vcvt_n_rounding,int,32,4) [] = { 0xfbccc, 0xfbccc,
@@ -42,11 +105,17 @@ VECT_VAR_DECL(expected_vcvt_n_rounding,uint,32,4) [] = { 0xfbccc, 0xfbccc,
 							 0xfbccc, 0xfbccc };
 
 /* Expected results for vcvt_n with saturation.  */
-VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,2) [] = { 0x7fffffff,
-							  0x7fffffff };
-VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,4) [] = { 0x7fffffff,
-							  0x7fffffff,
-							  0x7fffffff, 0x7fffffff };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_vcvt_n_saturation, int, 16, 4) [] =
+{ 0x533, 0x533, 0x533, 0x533 };
+VECT_VAR_DECL(expected_vcvt_n_saturation, int, 16, 8) [] =
+{ 0x7fff, 0x7fff, 0x7fff, 0x7fff,
+  0x7fff, 0x7fff, 0x7fff, 0x7fff };
+#endif
+VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,2) [] =
+{ 0x7fffffff, 0x7fffffff };
+VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,4) [] =
+{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
 
 #define TEST_MSG "VCVT/VCVTQ"
 void exec_vcvt (void)
@@ -89,11 +158,26 @@ void exec_vcvt (void)
 
   /* Initialize input "vector" from "buffer".  */
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
   VLOAD(vector, buffer, , float, f, 32, 2);
   VLOAD(vector, buffer, q, float, f, 32, 4);
 
   /* Make sure some elements have a fractional part, to exercise
      integer conversions.  */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VSET_LANE(vector, , float, f, 16, 4, 0, -15.3f);
+  VSET_LANE(vector, , float, f, 16, 4, 1, 5.3f);
+  VSET_LANE(vector, , float, f, 16, 4, 2, -15.3f);
+  VSET_LANE(vector, , float, f, 16, 4, 3, 5.3f);
+  VSET_LANE(vector, q, float, f, 16, 8, 4, -15.3f);
+  VSET_LANE(vector, q, float, f, 16, 8, 5, 5.3f);
+  VSET_LANE(vector, q, float, f, 16, 8, 6, -15.3f);
+  VSET_LANE(vector, q, float, f, 16, 8, 7, 5.3f);
+#endif
+
   VSET_LANE(vector, , float, f, 32, 2, 0, -15.3f);
   VSET_LANE(vector, , float, f, 32, 2, 1, 5.3f);
   VSET_LANE(vector, q, float, f, 32, 4, 2, -15.3f);
@@ -103,23 +187,55 @@
     before overwriting them.  */
 #define TEST_MSG2 ""
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_f16_xx.  */
+  TEST_VCVT_FP(, float, f, 16, 4, int, s, expected_s);
+  TEST_VCVT_FP(, float, f, 16, 4, uint, u, expected_u);
+#endif
   /* vcvt_f32_xx.  */
   TEST_VCVT_FP(, float, f, 32, 2, int, s, expected_s);
   TEST_VCVT_FP(, float, f, 32, 2, uint, u, expected_u);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_f16_xx.  */
+  TEST_VCVT_FP(q, float, f, 16, 8, int, s, expected_s);
+  TEST_VCVT_FP(q, float, f, 16, 8, uint, u, expected_u);
+#endif
   /* vcvtq_f32_xx.  */
   TEST_VCVT_FP(q, float, f, 32, 4, int, s, expected_s);
   TEST_VCVT_FP(q, float, f, 32, 4, uint, u, expected_u);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_xx_f16.  */
+  TEST_VCVT(, int, s, 16, 4, float, f, expected);
+  TEST_VCVT(, uint, u, 16, 4, float, f, expected);
+#endif
   /* vcvt_xx_f32.  */
   TEST_VCVT(, int, s, 32, 2, float, f, expected);
   TEST_VCVT(, uint, u, 32, 2, float, f, expected);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VSET_LANE(vector, q, float, f, 16, 8, 0, 0.0f);
+  VSET_LANE(vector, q, float, f, 16, 8, 1, -0.0f);
+  VSET_LANE(vector, q, float, f, 16, 8, 2, 15.12f);
+  VSET_LANE(vector, q, float, f, 16, 8, 3, -15.12f);
+  VSET_LANE(vector, q, float, f, 16, 8, 4, 0.0f);
+  VSET_LANE(vector, q, float, f, 16, 8, 5, -0.0f);
+  VSET_LANE(vector, q, float, f, 16, 8, 6, 15.12f);
+  VSET_LANE(vector, q, float, f, 16, 8, 7, -15.12f);
+#endif
+
   VSET_LANE(vector, q, float, f, 32, 4, 0, 0.0f);
   VSET_LANE(vector, q, float, f, 32, 4, 1, -0.0f);
   VSET_LANE(vector, q, float, f, 32, 4, 2, 15.12f);
   VSET_LANE(vector, q, float, f, 32, 4, 3, -15.12f);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_xx_f16.  */
+  TEST_VCVT(q, int, s, 16, 8, float, f, expected);
+  TEST_VCVT(q, uint, u, 16, 8, float, f, expected);
+#endif
+
   /* vcvtq_xx_f32.  */
   TEST_VCVT(q, int, s, 32, 4, float, f, expected);
   TEST_VCVT(q, uint, u, 32, 4, float, f, expected);
@@ -129,18 +245,38 @@ void exec_vcvt (void)
 #undef TEST_MSG
 #define TEST_MSG "VCVT_N/VCVTQ_N"
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_n_f16_xx.  */
+  TEST_VCVT_N_FP(, float, f, 16, 4, int, s, 2, expected_vcvt_n_s);
+  TEST_VCVT_N_FP(, float, f, 16, 4, uint, u, 7, expected_vcvt_n_u);
+#endif
   /* vcvt_n_f32_xx.  */
   TEST_VCVT_N_FP(, float, f, 32, 2, int, s, 2, expected_vcvt_n_s);
   TEST_VCVT_N_FP(, float, f, 32, 2, uint, u, 7, expected_vcvt_n_u);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_n_f16_xx.  */
+  TEST_VCVT_N_FP(q, float, f, 16, 8, int, s, 7, expected_vcvt_n_s);
+  TEST_VCVT_N_FP(q, float, f, 16, 8, uint, u, 12, expected_vcvt_n_u);
+#endif
   /* vcvtq_n_f32_xx.  */
   TEST_VCVT_N_FP(q, float, f, 32, 4, int, s, 30, expected_vcvt_n_s);
   TEST_VCVT_N_FP(q, float, f, 32, 4, uint, u, 12, expected_vcvt_n_u);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_n_xx_f16.  */
+  TEST_VCVT_N(, int, s, 16, 4, float, f, 2, expected_vcvt_n);
+  TEST_VCVT_N(, uint, u, 16, 4, float, f, 7, expected_vcvt_n);
+#endif
   /* vcvt_n_xx_f32.  */
   TEST_VCVT_N(, int, s, 32, 2, float, f, 20, expected_vcvt_n);
   TEST_VCVT_N(, uint, u, 32, 2, float, f, 2, expected_vcvt_n);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_n_xx_f16.  */
+  TEST_VCVT_N(q, int, s, 16, 8, float, f, 7, expected_vcvt_n);
+  TEST_VCVT_N(q, uint, u, 16, 8, float, f, 12, expected_vcvt_n);
+#endif
   /* vcvtq_n_xx_f32.  */
   TEST_VCVT_N(q, int, s, 32, 4, float, f, 13, expected_vcvt_n);
   TEST_VCVT_N(q, uint, u, 32, 4, float, f, 1, expected_vcvt_n);
@@ -150,20 +286,49 @@ void exec_vcvt (void)
 #define TEST_MSG "VCVT/VCVTQ"
 #undef TEST_MSG2
 #define TEST_MSG2 "(check rounding)"
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VDUP(vector, , float, f, 16, 4, 10.4f);
+  VDUP(vector, q, float, f, 16, 8, 125.9f);
+#endif
   VDUP(vector, , float, f, 32, 2, 10.4f);
   VDUP(vector, q, float, f, 32, 4, 125.9f);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_xx_f16.  */
+  TEST_VCVT(, int, s, 16, 4, float, f, expected_rounding);
+  TEST_VCVT(, uint, u, 16, 4, float, f, expected_rounding);
+#endif
   /* vcvt_xx_f32.  */
   TEST_VCVT(, int, s, 32, 2, float, f, expected_rounding);
   TEST_VCVT(, uint, u, 32, 2, float, f, expected_rounding);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_xx_f16.  */
+  TEST_VCVT(q, int, s, 16, 8, float, f, expected_rounding);
+  TEST_VCVT(q, uint, u, 16, 8, float, f, expected_rounding);
+#endif
   /* vcvtq_xx_f32.  */
   TEST_VCVT(q, int, s, 32, 4, float, f, expected_rounding);
   TEST_VCVT(q, uint, u, 32, 4, float, f, expected_rounding);
 
 #undef TEST_MSG
 #define TEST_MSG "VCVT_N/VCVTQ_N"
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_n_xx_f16.  */
+  TEST_VCVT_N(, int, s, 16, 4, float, f, 7, expected_vcvt_n_rounding);
+  TEST_VCVT_N(, uint, u, 16, 4, float, f, 7, expected_vcvt_n_rounding);
+#endif
   /* vcvt_n_xx_f32.  */
   TEST_VCVT_N(, int, s, 32, 2, float, f, 20, expected_vcvt_n_rounding);
   TEST_VCVT_N(, uint, u, 32, 2, float, f, 20, expected_vcvt_n_rounding);
 
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_n_xx_f16.  */
+  TEST_VCVT_N(q, int, s, 16, 8, float, f, 13, expected_vcvt_n_rounding);
+  TEST_VCVT_N(q, uint, u, 16, 8, float, f, 13, expected_vcvt_n_rounding);
+#endif
   /* vcvtq_n_xx_f32.  */
   TEST_VCVT_N(q, int, s, 32, 4, float, f, 13, expected_vcvt_n_rounding);
   TEST_VCVT_N(q, uint, u, 32, 4, float, f, 13, expected_vcvt_n_rounding);
@@ -172,8 +337,18 @@ void exec_vcvt (void)
 #define TEST_MSG "VCVT_N/VCVTQ_N"
 #undef TEST_MSG2
 #define TEST_MSG2 "(check saturation)"
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvt_n_xx_f16.  */
+  TEST_VCVT_N(, int, s, 16, 4, float, f, 7, expected_vcvt_n_saturation);
+#endif
   /* vcvt_n_xx_f32.  */
   TEST_VCVT_N(, int, s, 32, 2, float, f, 31, expected_vcvt_n_saturation);
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  /* vcvtq_n_xx_f16.  */
+  TEST_VCVT_N(q, int, s, 16, 8, float, f, 13, expected_vcvt_n_saturation);
+#endif
   /* vcvtq_n_xx_f32.  */
   TEST_VCVT_N(q, int, s, 32, 4, float, f, 31, expected_vcvt_n_saturation);
 }
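The vcvt_n expected values above follow from fixed-point scaling: converting with n fractional bits multiplies by 2^n before the float-to-integer conversion, saturating on overflow. For example 10.4 * 2^7 = 1331.2, which truncates to 1331 = 0x533 (expected_vcvt_n_rounding for the 16-bit case), while 125.9 * 2^13 overflows int16_t and saturates to 0x7fff (expected_vcvt_n_saturation). A simplified reference sketch (cvt_n_s16 is my own helper; only round-toward-zero is modelled):

#include <stdio.h>
#include <stdint.h>

/* Reference for a vcvt_n-style f16-to-s16 conversion with n fractional
   bits: scale by 2^n, saturate, then truncate toward zero.  */
static int16_t cvt_n_s16 (float x, int n)
{
  float scaled = x * (float)(1 << n);
  if (scaled > 32767.0f)  return 0x7fff;   /* saturate high */
  if (scaled < -32768.0f) return -0x8000;  /* saturate low */
  return (int16_t)scaled;                  /* truncation */
}

int main (void)
{
  printf ("0x%x\n", cvt_n_s16 (10.4f, 7));    /* 0x533 */
  printf ("0x%x\n", cvt_n_s16 (125.9f, 13));  /* 0x7fff, saturated */
  return 0;
}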
*/ +#define TEST_VCVT1(INSN, Q, T1, T2, W, N, TS1, TS2, EXP) \ + VECT_VAR(vector_res, T1, W, N) = \ + INSN##Q##_##T2##W##_##TS2##W(VECT_VAR(vector, TS1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ + VECT_VAR(vector_res, T1, W, N)); \ + CHECK(TEST_MSG, T1, W, N, PRIx##W, EXP, TEST_MSG2); + +#define TEST_VCVT(INSN, Q, T1, T2, W, N, TS1, TS2, EXP) \ + TEST_VCVT1 (INSN, Q, T1, T2, W, N, TS1, TS2, EXP) + + DECL_VARIABLE_ALL_VARIANTS(vector); + DECL_VARIABLE_ALL_VARIANTS(vector_res); + + clean_results (); + + /* Initialize input "vector" from "buffer". */ + TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, , float, f, 16, 4); + VLOAD(vector, buffer, q, float, f, 16, 8); +#endif + + /* Make sure some elements have a fractional part, to exercise + integer conversions. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VSET_LANE(vector, , float, f, 16, 4, 0, -15.3f); + VSET_LANE(vector, , float, f, 16, 4, 1, 5.3f); + VSET_LANE(vector, , float, f, 16, 4, 2, -15.3f); + VSET_LANE(vector, , float, f, 16, 4, 3, 5.3f); + VSET_LANE(vector, q, float, f, 16, 8, 4, -15.3f); + VSET_LANE(vector, q, float, f, 16, 8, 5, 5.3f); + VSET_LANE(vector, q, float, f, 16, 8, 6, -15.3f); + VSET_LANE(vector, q, float, f, 16, 8, 7, 5.3f); +#endif + + /* The same result buffers are used multiple times, so we check them + before overwriting them. */ +#define TEST_MSG2 "" + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + /* vcvt?_xx_f16. */ + TEST_VCVT(INSN_NAME, , int, s, 16, 4, float, f, expected); + TEST_VCVT(INSN_NAME, , uint, u, 16, 4, float, f, expected); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VSET_LANE(vector, q, float, f, 16, 8, 0, 0.0f); + VSET_LANE(vector, q, float, f, 16, 8, 1, -0.0f); + VSET_LANE(vector, q, float, f, 16, 8, 2, 15.12f); + VSET_LANE(vector, q, float, f, 16, 8, 3, -15.12f); + VSET_LANE(vector, q, float, f, 16, 8, 4, 0.0f); + VSET_LANE(vector, q, float, f, 16, 8, 5, -0.0f); + VSET_LANE(vector, q, float, f, 16, 8, 6, 15.12f); + VSET_LANE(vector, q, float, f, 16, 8, 7, -15.12f); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + /* vcvt?q_xx_f16. */ + TEST_VCVT(INSN_NAME, q, int, s, 16, 8, float, f, expected); + TEST_VCVT(INSN_NAME, q, uint, u, 16, 8, float, f, expected); +#endif + + /* Check rounding. */ +#undef TEST_MSG2 +#define TEST_MSG2 "(check rounding)" + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 10.4f); + VDUP(vector, q, float, f, 16, 8, 125.9f); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + /* vcvt?_xx_f16. */ + TEST_VCVT(INSN_NAME, , int, s, 16, 4, float, f, expected_rounding); + TEST_VCVT(INSN_NAME, , uint, u, 16, 4, float, f, expected_rounding); +#endif + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + /* vcvt?q_xx_f16. */ + TEST_VCVT(INSN_NAME, q, int, s, 16, 8, float, f, expected_rounding); + TEST_VCVT(INSN_NAME, q, uint, u, 16, 8, float, f, expected_rounding); +#endif + +#ifdef EXTRA_TESTS + EXTRA_TESTS(); +#endif +} + +int +main (void) +{ + FNNAME (INSN_NAME) (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvta_1.c @@ -0,0 +1,33 @@ +/* This file tests an intrinsic which currently has only an f16 variant and that + is only available when FP16 arithmetic instructions are supported. 
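*/

vcvta converts rounding to nearest with ties away from zero, the same rule as
C99 round (). A quick host-side cross-check against the values this file
expects (the -15.3/5.3 lanes are set by vcvtX.inc, 10.4 is the rounding-pass
input):

#include <math.h>
#include <stdio.h>

int main (void)
{
  printf ("%.0f %.0f %.0f\n", round (-15.3), round (5.3), round (10.4));
  /* -15 5 10: matching 0xfff1 and 0x5 below, and 0xa in the rounding pass.  */
  return 0;
}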
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include <math.h>
+
+/* Expected results. */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x5, 0xfff1, 0x5 };
+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff1,
+                                           0x0, 0x0, 0xf, 0xfff1 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
+                                            0x0, 0x0, 0xf, 0x0 };
+#endif
+
+/* Expected results with rounding. */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
+                                                    0x7e, 0x7e, 0x7e, 0x7e };
+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
+                                                     0x7e, 0x7e, 0x7e, 0x7e };
+#endif
+
+#define TEST_MSG "VCVTA/VCVTAQ"
+#define INSN_NAME vcvta
+
+#include "vcvtX.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s16_f16_1.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
+int16_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 };
+
+#define TEST_MSG "VCVTAH_S16_F16"
+#define INSN_NAME vcvtah_s16_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE int16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s32_f16_1.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] =
+{
+  0.0, -0.0,
+  123.4, -567.8,
+  -34.8, 1024,
+  663.1, 169.1,
+  -4.8, 77.0,
+  -144.5, -56.8,
+
+  (float16_t) -16, (float16_t) -15,
+  (float16_t) -14, (float16_t) -13,
+};
+
+/* Expected results (32-bit hexadecimal representation). */
+uint32_t expected[] =
+{
+  0x00000000,
+  0x00000000,
+  0x0000007b,
+  0xfffffdc8,
+  0xffffffdd,
+  0x00000400,
+  0x00000297,
+  0x000000a9,
+  0xfffffffb,
+  0x0000004d,
+  0xffffff6f,
+  0xffffffc7,
+  0xfffffff0,
+  0xfffffff1,
+  0xfffffff2,
+  0xfffffff3
+};
+
+#define TEST_MSG "VCVTAH_S32_F16"
+#define INSN_NAME vcvtah_s32_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE int32_t
+#define OUTPUT_TYPE_SIZE 32
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s64_f16_1.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+/* Input values. */
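
Note that the float16_t inputs are quantized before the intrinsic runs:
binary16 spacing in [64, 128) is 1/16, so 123.9 is stored as 123.875, which
vcvtah then rounds to 124. A two-line demonstration, assuming a compiler that
provides __fp16 (substitute _Float16 elsewhere):

#include <stdio.h>

int main (void)
{
  __fp16 h = 123.9;            /* nearest binary16 value is 123.875 */
  printf ("%f\n", (double) h); /* 123.875000 */
  return 0;
}

/*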
*/ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int64_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 }; + +#define TEST_MSG "VCVTAH_S64_F16" +#define INSN_NAME vcvtah_s64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint16_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 }; + +#define TEST_MSG "VCVTAH_u16_F16" +#define INSN_NAME vcvtah_u16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0x00000000, + 0x00000000, + 0x00000400, + 0x00000297, + 0x000000a9, + 0x00000000, + 0x0000004d, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000 +}; + +#define TEST_MSG "VCVTAH_U32_F16" +#define INSN_NAME vcvtah_u32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint64_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 }; + +#define TEST_MSG "VCVTAH_u64_F16" +#define INSN_NAME vcvtah_u64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s16_1.c @@ -0,0 +1,25 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +int16_t input[] = { 123, -567, 0, 1024, -63, 169, -4, 77 }; +uint16_t expected[] = { 0x57B0 /* 123.0. */, 0xE06E /* -567.0. */, + 0x0000 /* 0.0. */, 0x6400 /* 1024. */, + 0xD3E0 /* -63. */, 0x5948 /* 169. 
*/,
+			0xC400 /* -4. */, 0x54D0 /* 77. */ };
+
+#define TEST_MSG "VCVTH_F16_S16"
+#define INSN_NAME vcvth_f16_s16
+
+#define EXPECTED expected
+
+#define INPUT input
+#define INPUT_TYPE int16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s32_1.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+uint32_t input[] =
+{
+  0, -0,
+  123, -567,
+  -34, 1024,
+  -63, 169,
+  -4, 77,
+  -144, -56,
+  -16, -15,
+  -14, -13,
+};
+
+/* Expected results (16-bit hexadecimal representation). */
+uint16_t expected[] =
+{
+  0x0000 /* 0.000000 */,
+  0x0000 /* 0.000000 */,
+  0x57b0 /* 123.000000 */,
+  0xe06e /* -567.000000 */,
+  0xd040 /* -34.000000 */,
+  0x6400 /* 1024.000000 */,
+  0xd3e0 /* -63.000000 */,
+  0x5948 /* 169.000000 */,
+  0xc400 /* -4.000000 */,
+  0x54d0 /* 77.000000 */,
+  0xd880 /* -144.000000 */,
+  0xd300 /* -56.000000 */,
+  0xcc00 /* -16.000000 */,
+  0xcb80 /* -15.000000 */,
+  0xcb00 /* -14.000000 */,
+  0xca80 /* -13.000000 */
+};
+
+#define TEST_MSG "VCVTH_F16_S32"
+#define INSN_NAME vcvth_f16_s32
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE uint32_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s64_1.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+int64_t input[] = { 123, -567, 0, 1024, -63, 169, -4, 77 };
+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0xE06E /* -567.0. */,
+			0x0000 /* 0.0. */, 0x6400 /* 1024. */,
+			0xD3E0 /* -63. */, 0x5948 /* 169. */,
+			0xC400 /* -4. */, 0x54D0 /* 77. */ };
+
+#define TEST_MSG "VCVTH_F16_S64"
+#define INSN_NAME vcvth_f16_s64
+
+#define EXPECTED expected
+
+#define INPUT input
+#define INPUT_TYPE int64_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u16_1.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+uint16_t input[] = { 123, 567, 0, 1024, 63, 169, 4, 77 };
+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0x606E /* 567.0. */,
+			0x0000 /* 0.0. */, 0x6400 /* 1024.0. */,
+			0x53E0 /* 63.0. */, 0x5948 /* 169.0. */,
+			0x4400 /* 4.0. */, 0x54D0 /* 77.0. */ };
+
+#define TEST_MSG "VCVTH_F16_U16"
+#define INSN_NAME vcvth_f16_u16
+
+#define EXPECTED expected
+
+#define INPUT input
+#define INPUT_TYPE uint16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u32_1.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+/* Input values. */
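
The expected arrays in these files are raw binary16 bit patterns. A small
encoder -- a sketch handling normals and infinity only, with crude nearest
rounding via lroundf -- is enough to reproduce constants such as 0x57b0
(123.0) and the 0x7c00 (infinity) rows produced when negative inputs are
reinterpreted as huge unsigned values:

#include <stdint.h>
#include <stdio.h>
#include <math.h>

static uint16_t to_half_bits (float f)
{
  uint16_t sign = (f < 0.0f) ? 0x8000 : 0;
  f = fabsf (f);
  if (f == 0.0f)
    return sign;
  if (f >= 65520.0f)                  /* beyond the binary16 maximum */
    return sign | 0x7c00;             /* infinity */
  int e;
  float m = frexpf (f, &e);           /* f = m * 2^e with 0.5 <= m < 1 */
  uint32_t mant = (uint32_t) lroundf (m * 2048.0f);   /* 11 bits */
  if (mant == 2048) { mant = 1024; e++; }             /* rounding carried */
  return sign | (uint16_t) ((e - 1 + 15) << 10) | (uint16_t) (mant & 0x3ff);
}

int main (void)
{
  printf ("%#06x\n", to_half_bits (123.0f));        /* 0x57b0 */
  printf ("%#06x\n", to_half_bits (1024.0f));       /* 0x6400 */
  printf ("%#06x\n", to_half_bits (4294966729.0f)); /* 0x7c00: (uint32_t) -567 */
  return 0;
}

/*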
*/ +int32_t input[] = +{ + 0, -0, + 123, -567, + -34, 1024, + -63, 169, + -4, 77, + -144, -56, + -16, -15, + -14, -13, +}; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x57b0 /* 123.000000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x6400 /* 1024.000000 */, + 0x7c00 /* inf */, + 0x5948 /* 169.000000 */, + 0x7c00 /* inf */, + 0x54d0 /* 77.000000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */ +}; + +#define TEST_MSG "VCVTH_F16_U32" +#define INSN_NAME vcvth_f16_u32 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE int32_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u64_1.c @@ -0,0 +1,25 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +uint64_t input[] = { 123, 567, 0, 1024, 63, 169, 4, 77 }; +uint16_t expected[] = { 0x57B0 /* 123.0. */, 0x606E /* 567.0. */, + 0x0000 /* 0.0. */, 0x6400 /* 1024.0. */, + 0x53E0 /* 63.0. */, 0x5948 /* 169.0. */, + 0x4400 /* 4.0. */, 0x54D0 /* 77.0. */ }; + +#define TEST_MSG "VCVTH_F16_U64" +#define INSN_NAME vcvth_f16_u64 + +#define EXPECTED expected + +#define INPUT input +#define INPUT_TYPE uint64_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s16_1.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +int16_t input[] = { 1, 10, 48, 100, -1, -10, 7, -7 }; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected_1[] = { 0x3800 /* 0.5. */, + 0x4500 /* 5. */, + 0x4E00 /* 24. */, + 0x5240 /* 50. */, + 0xB800 /* -0.5. */, + 0xC500 /* -5. */, + 0x4300 /* 3.5. */, + 0xC300 /* -3.5. */ }; + +uint16_t expected_2[] = { 0x3400 /* 0.25. */, + 0x4100 /* 2.5. */, + 0x4A00 /* 12. */, + 0x4E40 /* 25. */, + 0xB400 /* -0.25. */, + 0xC100 /* -2.5. */, + 0x3F00 /* 1.75. */, + 0xBF00 /* -1.75. */ }; + +#define TEST_MSG "VCVTH_N_F16_S16" +#define INSN_NAME vcvth_n_f16_s16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE int16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s32_1.c @@ -0,0 +1,99 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +uint32_t input[] = +{ + 0, -0, + 123, -567, + -34, 1024, + -63, 169, + -4, 77, + -144, -56, + -16, -15, + -14, -13, +}; + +/* Expected results (16-bit hexadecimal representation). 
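*/

expected_1 and expected_2 below are input * 2^-1 and input * 2^-2 encoded as
binary16; with SCALAR_3 == 32 the quotients fall into the binary16 subnormal
range, whose unit is 2^-24, which is why expected_3 holds tiny patterns such
as 0x8002 (two negative ulps) rather than zeros:

#include <stdio.h>
#include <math.h>

int main (void)
{
  /* vcvth_n_f16_s32 (-567, 32) computes -567 * 2^-32 ~= -1.32e-7.  */
  double v = -567.0 * ldexp (1.0, -32);
  printf ("%g = %.3f subnormal ulps\n", v, v / ldexp (1.0, -24));
  /* -2.215 ulps rounds to the 2-ulp subnormal, bit pattern 0x8002.  */
  return 0;
}

/*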
*/ +uint16_t expected_1[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x53b0 /* 61.500000 */, + 0xdc6e /* -283.500000 */, + 0xcc40 /* -17.000000 */, + 0x6000 /* 512.000000 */, + 0xcfe0 /* -31.500000 */, + 0x5548 /* 84.500000 */, + 0xc000 /* -2.000000 */, + 0x50d0 /* 38.500000 */, + 0xd480 /* -72.000000 */, + 0xcf00 /* -28.000000 */, + 0xc800 /* -8.000000 */, + 0xc780 /* -7.500000 */, + 0xc700 /* -7.000000 */, + 0xc680 /* -6.500000 */ +}; + +uint16_t expected_2[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x4fb0 /* 30.750000 */, + 0xd86e /* -141.750000 */, + 0xc840 /* -8.500000 */, + 0x5c00 /* 256.000000 */, + 0xcbe0 /* -15.750000 */, + 0x5148 /* 42.250000 */, + 0xbc00 /* -1.000000 */, + 0x4cd0 /* 19.250000 */, + 0xd080 /* -36.000000 */, + 0xcb00 /* -14.000000 */, + 0xc400 /* -4.000000 */, + 0xc380 /* -3.750000 */, + 0xc300 /* -3.500000 */, + 0xc280 /* -3.250000 */ +}; + +uint16_t expected_3[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x8002 /* -0.000000 */, + 0x8000 /* -0.000000 */, + 0x0004 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x0001 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x0000 /* 0.000000 */, + 0x8001 /* -0.000000 */, + 0x8000 /* -0.000000 */, + 0x8000 /* -0.000000 */, + 0x8000 /* -0.000000 */, + 0x8000 /* -0.000000 */, + 0x8000 /* -0.000000 */ +}; + +#define TEST_MSG "VCVTH_N_F16_S32" +#define INSN_NAME vcvth_n_f16_s32 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 +#define EXPECTED_3 expected_3 + +#define INPUT_TYPE int32_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 +#define SCALAR_3 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s64_1.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +int64_t input[] = { 1, 10, 48, 100, -1, -10, 7, -7 }; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected_1[] = { 0x3800 /* 0.5. */, + 0x4500 /* 5. */, + 0x4E00 /* 24. */, + 0x5240 /* 50. */, + 0xB800 /* -0.5. */, + 0xC500 /* -5. */, + 0x4300 /* 3.5. */, + 0xC300 /* -3.5. */ }; + +uint16_t expected_2[] = { 0x3400 /* 0.25. */, + 0x4100 /* 2.5. */, + 0x4A00 /* 12. */, + 0x4E40 /* 25. */, + 0xB400 /* -0.25. */, + 0xC100 /* -2.5. */, + 0x3F00 /* 1.75. */, + 0xBF00 /* -1.75. */ }; + +#define TEST_MSG "VCVTH_N_F16_S64" +#define INSN_NAME vcvth_n_f16_s64 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE int64_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u16_1.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +uint16_t input[] = { 1, 10, 48, 100, 1000, 0, 500, 9 }; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected_1[] = { 0x3800 /* 0.5. */, + 0x4500 /* 5. 
*/, + 0x4E00 /* 24. */, + 0x5240 /* 50. */, + 0x5FD0 /* 500. */, + 0x0000 /* 0.0. */, + 0x5BD0 /* 250. */, + 0x4480 /* 4.5. */ }; + +uint16_t expected_2[] = { 0x3400 /* 0.25. */, + 0x4100 /* 2.5. */, + 0x4A00 /* 12. */, + 0x4E40 /* 25. */, + 0x5BD0 /* 250. */, + 0x0000 /* 0.0. */, + 0x57D0 /* 125. */, + 0x4080 /* 2.25. */ }; + +#define TEST_MSG "VCVTH_N_F16_U16" +#define INSN_NAME vcvth_n_f16_u16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE uint16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u32_1.c @@ -0,0 +1,99 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +uint32_t input[] = +{ + 0, -0, + 123, -567, + -34, 1024, + -63, 169, + -4, 77, + -144, -56, + -16, -15, + -14, -13, +}; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected_1[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x53b0 /* 61.500000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x6000 /* 512.000000 */, + 0x7c00 /* inf */, + 0x5548 /* 84.500000 */, + 0x7c00 /* inf */, + 0x50d0 /* 38.500000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */ +}; + +uint16_t expected_2[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x4fb0 /* 30.750000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x5c00 /* 256.000000 */, + 0x7c00 /* inf */, + 0x5148 /* 42.250000 */, + 0x7c00 /* inf */, + 0x4cd0 /* 19.250000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */, + 0x7c00 /* inf */ +}; + +uint16_t expected_3[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x0004 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x0001 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */ +}; + +#define TEST_MSG "VCVTH_N_F16_U32" +#define INSN_NAME vcvth_n_f16_u32 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 +#define EXPECTED_3 expected_3 + +#define INPUT_TYPE uint32_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 +#define SCALAR_3 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u64_1.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +uint64_t input[] = { 1, 10, 48, 100, 1000, 0, 500, 9 }; + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected_1[] = { 0x3800 /* 0.5. */, + 0x4500 /* 5. */, + 0x4E00 /* 24. */, + 0x5240 /* 50. */, + 0x5FD0 /* 500. */, + 0x0000 /* 0.0. */, + 0x5BD0 /* 250. */, + 0x4480 /* 4.5. */ }; + +uint16_t expected_2[] = { 0x3400 /* 0.25. 
*/, + 0x4100 /* 2.5. */, + 0x4A00 /* 12. */, + 0x4E40 /* 25. */, + 0x5BD0 /* 250. */, + 0x0000 /* 0.0. */, + 0x57D0 /* 125. */, + 0x4080 /* 2.25. */ }; + +#define TEST_MSG "VCVTH_N_F16_U64" +#define INSN_NAME vcvth_n_f16_u64 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE uint64_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s16_f16_1.c @@ -0,0 +1,29 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 2.5, 100, 7.1, -9.9, -5.0, 9.1, -4.8, 77 }; +int16_t expected_1[] = { 5, 200, 14, -19, -10, 18, -9, 154 }; +int16_t expected_2[] = { 10, 400, 28, -39, -20, 36, -19, 308 }; + +#define TEST_MSG "VCVTH_N_S16_F16" +#define INSN_NAME vcvth_n_s16_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s32_f16_1.c @@ -0,0 +1,100 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected_1[] = +{ + 0x00000000, + 0x00000000, + 0x000000f6, + 0xfffffb90, + 0xffffffbb, + 0x00000800, + 0x0000052e, + 0x00000152, + 0xfffffff7, + 0x0000009a, + 0xfffffedf, + 0xffffff8f, + 0xffffffe0, + 0xffffffe2, + 0xffffffe4, + 0xffffffe6, +}; + +uint32_t expected_2[] = +{ + 0x00000000, + 0x00000000, + 0x000001ed, + 0xfffff720, + 0xffffff75, + 0x00001000, + 0x00000a5c, + 0x000002a4, + 0xffffffed, + 0x00000134, + 0xfffffdbe, + 0xffffff1d, + 0xffffffc0, + 0xffffffc4, + 0xffffffc8, + 0xffffffcc, +}; + +uint32_t expected_3[] = +{ + 0x00000000, + 0x00000000, + 0x7fffffff, + 0x80000000, + 0x80000000, + 0x7fffffff, + 0x7fffffff, + 0x7fffffff, + 0x80000000, + 0x7fffffff, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, +}; + +#define TEST_MSG "VCVTH_N_S32_F16" +#define INSN_NAME vcvth_n_s32_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 +#define EXPECTED_3 expected_3 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 +#define SCALAR_3 32 + +/* Include the template for unary scalar operations. 
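*/

One detail worth noting in expected_1 above: the entry for -567.8 is
0xfffffb90 (-1136), not -1135, because the float16_t input is rounded to
binary16 first (the spacing in [512, 1024) is 0.5, so -567.8 is stored as
-568.0) and only then scaled and truncated:

#include <stdio.h>

int main (void)
{
  printf ("%#x\n", (unsigned) (int) (-568.0 * 2.0)); /* 0xfffffb90 */
  printf ("%#x\n", (unsigned) (int) (-567.8 * 2.0)); /* 0xfffffb91: not it */
  return 0;
}

/*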
*/ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s64_f16_1.c @@ -0,0 +1,29 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 2.5, 100, 7.1, -9.9, -5.0, 9.1, -4.8, 77 }; +int64_t expected_1[] = { 5, 200, 14, -19, -10, 18, -9, 154 }; +int64_t expected_2[] = { 10, 400, 28, -39, -20, 36, -19, 308 }; + +#define TEST_MSG "VCVTH_N_S64_F16" +#define INSN_NAME vcvth_n_s64_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u16_f16_1.c @@ -0,0 +1,29 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 2.5, 100, 7.1, 9.9, 5.0, 9.1, 4.8, 77 }; +uint16_t expected_1[] = {5, 200, 14, 19, 10, 18, 9, 154}; +uint16_t expected_2[] = {10, 400, 28, 39, 20, 36, 19, 308}; + +#define TEST_MSG "VCVTH_N_U16_F16" +#define INSN_NAME vcvth_n_u16_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint16_t +#define OUTPUT_TYPE_SIZE 16 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u32_f16_1.c @@ -0,0 +1,100 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). 
*/ +uint32_t expected_1[] = +{ + 0x00000000, + 0x00000000, + 0x000000f6, + 0x00000000, + 0x00000000, + 0x00000800, + 0x0000052e, + 0x00000152, + 0x00000000, + 0x0000009a, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +uint32_t expected_2[] = +{ + 0x00000000, + 0x00000000, + 0x000001ed, + 0x00000000, + 0x00000000, + 0x00001000, + 0x00000a5c, + 0x000002a4, + 0x00000000, + 0x00000134, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +uint32_t expected_3[] = +{ + 0x00000000, + 0x00000000, + 0xffffffff, + 0x00000000, + 0x00000000, + 0xffffffff, + 0xffffffff, + 0xffffffff, + 0x00000000, + 0xffffffff, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +#define TEST_MSG "VCVTH_N_U32_F16" +#define INSN_NAME vcvth_n_u32_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 +#define EXPECTED_3 expected_3 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 +#define SCALAR_3 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u64_f16_1.c @@ -0,0 +1,29 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 2.5, 100, 7.1, 9.9, 5.0, 9.1, 4.8, 77 }; +uint64_t expected_1[] = { 5, 200, 14, 19, 10, 18, 9, 154 }; +uint64_t expected_2[] = { 10, 400, 28, 39, 20, 36, 19, 308 }; + +#define TEST_MSG "VCVTH_N_U64_F16" +#define INSN_NAME vcvth_n_u64_f16 + +#define INPUT input +#define EXPECTED_1 expected_1 +#define EXPECTED_2 expected_2 + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint64_t +#define OUTPUT_TYPE_SIZE 64 + +#define SCALAR_OPERANDS +#define SCALAR_1 1 +#define SCALAR_2 2 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int16_t expected[] = { 123, -56, 0, 24, -63, 169, -4, 77 }; + +#define TEST_MSG "VCVTH_S16_F16" +#define INSN_NAME vcvth_s16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). 
*/ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0xfffffdc8, + 0xffffffde, + 0x00000400, + 0x00000297, + 0x000000a9, + 0xfffffffc, + 0x0000004d, + 0xffffff70, + 0xffffffc8, + 0xfffffff0, + 0xfffffff1, + 0xfffffff2, + 0xfffffff3, +}; + +#define TEST_MSG "VCVTH_S32_F16" +#define INSN_NAME vcvth_s32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int64_t expected[] = { 123, -56, 0, 24, -63, 169, -4, 77 }; + +#define TEST_MSG "VCVTH_S64_F16" +#define INSN_NAME vcvth_s64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint16_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 }; + +#define TEST_MSG "VCVTH_u16_F16" +#define INSN_NAME vcvth_u16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0x00000000, + 0x00000000, + 0x00000400, + 0x00000297, + 0x000000a9, + 0x00000000, + 0x0000004d, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +#define TEST_MSG "VCVTH_U32_F16" +#define INSN_NAME vcvth_u32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. 
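*/

The plain (no _n) vcvth conversions round toward zero, and the unsigned
variants clamp negative inputs to zero rather than wrapping -- hence the
0x00000000 rows for -567.8, -144.5 and friends in the u32 table above.
A portable model of both rules:

#include <stdint.h>
#include <stdio.h>

static int32_t  fcvtzs32 (float x) { return (int32_t) x; } /* truncate */
static uint32_t fcvtzu32 (float x) { return (x <= 0.0f) ? 0u : (uint32_t) x; }

int main (void)
{
  printf ("%d %u\n", fcvtzs32 (-34.8f), fcvtzu32 (-567.8f));
  /* -34 0: matching 0xffffffde in the s32 table and 0 in the u32 one.  */
  return 0;
}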
+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
+uint64_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 };
+
+#define TEST_MSG "VCVTH_u64_F16"
+#define INSN_NAME vcvth_u64_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint64_t
+#define OUTPUT_TYPE_SIZE 64
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtm_1.c
@@ -0,0 +1,33 @@
+/* This file tests an intrinsic which currently has only an f16 variant and that
+   is only available when FP16 arithmetic instructions are supported. */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include <math.h>
+
+/* Expected results. */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff0, 0x5, 0xfff0, 0x5 };
+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff0, 0x0,
+                                           0x0, 0xf, 0xfff0 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
+                                            0x0, 0x0, 0xf, 0x0 };
+#endif
+
+/* Expected results with rounding. */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
+                                                    0x7d, 0x7d, 0x7d, 0x7d };
+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
+                                                     0x7d, 0x7d, 0x7d, 0x7d };
+#endif
+
+#define TEST_MSG "VCVTM/VCVTMQ"
+#define INSN_NAME vcvtm
+
+#include "vcvtX.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s16_f16_1.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
+int16_t expected[] = { 123, -57, 0, 24, -64, 169, -5, 77 };
+
+#define TEST_MSG "VCVTMH_S16_F16"
+#define INSN_NAME vcvtmh_s16_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE int16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s32_f16_1.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] =
+{
+  0.0, -0.0,
+  123.4, -567.8,
+  -34.8, 1024,
+  663.1, 169.1,
+  -4.8, 77.0,
+  -144.5, -56.8,
+
+  (float16_t) -16, (float16_t) -15,
+  (float16_t) -14, (float16_t) -13,
+};
+
+/* Expected results (32-bit hexadecimal representation).
*/ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0xfffffdc8, + 0xffffffdd, + 0x00000400, + 0x00000297, + 0x000000a9, + 0xfffffffb, + 0x0000004d, + 0xffffff6f, + 0xffffffc7, + 0xfffffff0, + 0xfffffff1, + 0xfffffff2, + 0xfffffff3 +}; + +#define TEST_MSG "VCVTMH_S32_F16" +#define INSN_NAME vcvtmh_s32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int64_t expected[] = { 123, -57, 0, 24, -64, 169, -5, 77 }; + +#define TEST_MSG "VCVTMH_S64_F16" +#define INSN_NAME vcvtmh_s64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint16_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 }; + +#define TEST_MSG "VCVTMH_u16_F16" +#define INSN_NAME vcvtmh_u16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0x00000000, + 0x00000000, + 0x00000400, + 0x00000297, + 0x000000a9, + 0x00000000, + 0x0000004d, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +#define TEST_MSG "VCVTMH_U32_F16" +#define INSN_NAME vcvtmh_u32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. 
*/ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint64_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 }; + +#define TEST_MSG "VCVTMH_u64_F16" +#define INSN_NAME vcvtmh_u64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int16_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 }; + +#define TEST_MSG "VCVTNH_S16_F16" +#define INSN_NAME vcvtnh_s16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007b, + 0xfffffdc8, + 0xffffffdd, + 0x00000400, + 0x00000297, + 0x000000a9, + 0xfffffffb, + 0x0000004d, + 0xffffff70, + 0xffffffc7, + 0xfffffff0, + 0xfffffff1, + 0xfffffff2, + 0xfffffff3 +}; + +#define TEST_MSG "VCVTNH_S32_F16" +#define INSN_NAME vcvtnh_s32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int64_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 }; + +#define TEST_MSG "VCVTNH_S64_F16" +#define INSN_NAME vcvtnh_s64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. 
*/
+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
+uint16_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
+
+#define TEST_MSG "VCVTNH_u16_F16"
+#define INSN_NAME vcvtnh_u16_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u32_f16_1.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] =
+{
+  0.0, -0.0,
+  123.4, -567.8,
+  -34.8, 1024,
+  663.1, 169.1,
+  -4.8, 77.0,
+  -144.5, -56.8,
+
+  (float16_t) -16, (float16_t) -15,
+  (float16_t) -14, (float16_t) -13,
+};
+
+/* Expected results (32-bit hexadecimal representation). */
+uint32_t expected[] =
+{
+  0x00000000,
+  0x00000000,
+  0x0000007b,
+  0x00000000,
+  0x00000000,
+  0x00000400,
+  0x00000297,
+  0x000000a9,
+  0x00000000,
+  0x0000004d,
+  0x00000000,
+  0x00000000,
+  0x00000000,
+  0x00000000,
+  0x00000000,
+  0x00000000,
+};
+
+#define TEST_MSG "VCVTNH_U32_F16"
+#define INSN_NAME vcvtnh_u32_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint32_t
+#define OUTPUT_TYPE_SIZE 32
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u64_f16_1.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+
+/* Input values. */
+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
+uint64_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
+
+#define TEST_MSG "VCVTNH_u64_F16"
+#define INSN_NAME vcvtnh_u64_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE uint64_t
+#define OUTPUT_TYPE_SIZE 64
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtp_1.c
@@ -0,0 +1,33 @@
+/* This file tests an intrinsic which currently has only an f16 variant and that
+   is only available when FP16 arithmetic instructions are supported. */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include <math.h>
+
+/* Expected results. */
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x6, 0xfff1, 0x6 };
+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x6, 0x0, 0x6 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0x10, 0xfff1,
+                                           0x0, 0x0, 0x10, 0xfff1 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0x10, 0x0,
+                                            0x0, 0x0, 0x10, 0x0 };
+#endif
+
+/* Expected results with rounding. */
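
For the "(check rounding)" pass every lane holds 10.4 or 125.9, values chosen
to separate the four directed conversions; the C library exposes the same four
rounding rules, so the expected 0x7d/0x7e patterns can be checked on any host:

#include <stdio.h>
#include <math.h>

int main (void)
{
  float x = 125.9f;                              /* the vcvtq rounding input */
  printf ("floor (vcvtm): %.0f\n", floorf (x));  /* 125 -> 0x7d */
  printf ("ceil  (vcvtp): %.0f\n", ceilf (x));   /* 126 -> 0x7e */
  printf ("away  (vcvta): %.0f\n", roundf (x));  /* 126 -> 0x7e */
  printf ("even  (vcvtn): %.0f\n", rintf (x));   /* 126 -> 0x7e */
  return 0;
}

/*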
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xb, 0xb, 0xb, 0xb }; +VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xb, 0xb, 0xb, 0xb }; +VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e }; +VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e }; +#endif + +#define TEST_MSG "VCVTP/VCVTPQ" +#define INSN_NAME vcvtp + +#include "vcvtX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int16_t expected[] = { 124, -56, 1, 25, -63, 170, -4, 77 }; + +#define TEST_MSG "VCVTPH_S16_F16" +#define INSN_NAME vcvtph_s16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007c, + 0xfffffdc8, + 0xffffffde, + 0x00000400, + 0x00000297, + 0x000000aa, + 0xfffffffc, + 0x0000004d, + 0xffffff70, + 0xffffffc8, + 0xfffffff0, + 0xfffffff1, + 0xfffffff2, + 0xfffffff3 +}; + +#define TEST_MSG "VCVTPH_S32_F16" +#define INSN_NAME vcvtph_s32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 }; +int64_t expected[] = { 124, -56, 1, 25, -63, 170, -4, 77 }; + +#define TEST_MSG "VCVTPH_S64_F16" +#define INSN_NAME vcvtph_s64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE int64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u16_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. 
*/ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint16_t expected[] = { 124, 57, 1, 25, 64, 170, 5, 77 }; + +#define TEST_MSG "VCVTPH_u16_F16" +#define INSN_NAME vcvtph_u16_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u32_f16_1.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include + +/* Input values. */ +float16_t input[] = +{ + 0.0, -0.0, + 123.4, -567.8, + -34.8, 1024, + 663.1, 169.1, + -4.8, 77.0, + -144.5, -56.8, + + (float16_t) -16, (float16_t) -15, + (float16_t) -14, (float16_t) -13, +}; + +/* Expected results (32-bit hexadecimal representation). */ +uint32_t expected[] = +{ + 0x00000000, + 0x00000000, + 0x0000007c, + 0x00000000, + 0x00000000, + 0x00000400, + 0x00000297, + 0x000000aa, + 0x00000000, + 0x0000004d, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, +}; + +#define TEST_MSG "VCVTPH_U32_F16" +#define INSN_NAME vcvtph_u32_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint32_t +#define OUTPUT_TYPE_SIZE 32 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u64_f16_1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 }; +uint64_t expected[] = { 124, 57, 1, 25, 64, 170, 5, 77 }; + +#define TEST_MSG "VCVTPH_u64_F16" +#define INSN_NAME vcvtph_u64_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE uint64_t +#define OUTPUT_TYPE_SIZE 64 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdiv_f16_1.c @@ -0,0 +1,86 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (-56.8) +#define C FP16_C (-34.8) +#define D FP16_C (12) +#define E FP16_C (63.1) +#define F FP16_C (19.1) +#define G FP16_C (-4.8) +#define H FP16_C (77) + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-8) +#define O FP16_C (-1.1) +#define P FP16_C (-9.7) + +/* Expected results for vdiv. */ +VECT_VAR_DECL (expected_div_static, hfloat, 16, 4) [] + = { 0x32CC /* A / E. */, 0xC1F3 /* B / F. */, + 0x4740 /* C / G. */, 0x30FD /* D / H. */ }; + +VECT_VAR_DECL (expected_div_static, hfloat, 16, 8) [] + = { 0x32CC /* A / E. */, 0xC1F3 /* B / F. */, + 0x4740 /* C / G. */, 0x30FD /* D / H. */, + 0x201D /* I / M. */, 0x48E0 /* J / N. */, + 0xC91B /* K / O. */, 0xC90D /* L / P. 
*/ };
+
+void exec_vdiv_f16 (void)
+{
+#undef TEST_MSG
+#define TEST_MSG "VDIV (FP16)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 16, 4);
+  DECL_VARIABLE(vsrc_2, float, 16, 4);
+  VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
+  VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
+  VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
+  VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
+
+  DECL_VARIABLE (vector_res, float, 16, 4)
+    = vdiv_f16 (VECT_VAR (vsrc_1, float, 16, 4),
+		VECT_VAR (vsrc_2, float, 16, 4));
+  vst1_f16 (VECT_VAR (result, float, 16, 4),
+	    VECT_VAR (vector_res, float, 16, 4));
+
+  CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_div_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VDIVQ (FP16)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 16, 8);
+  DECL_VARIABLE(vsrc_2, float, 16, 8);
+  VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
+  VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
+  VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
+  VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
+
+  DECL_VARIABLE (vector_res, float, 16, 8)
+    = vdivq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
+		 VECT_VAR (vsrc_2, float, 16, 8));
+  vst1q_f16 (VECT_VAR (result, float, 16, 8),
+	     VECT_VAR (vector_res, float, 16, 8));
+
+  CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_div_static, "");
+}
+
+int
+main (void)
+{
+  exec_vdiv_f16 ();
+  return 0;
+}
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdivh_f16_1.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include <arm_neon.h>
+
+#define INFF __builtin_inf ()
+
+/* Expected results (16-bit hexadecimal representation). */
+uint16_t expected[] =
+{
+  0x0000 /* 0.000000 */,
+  0x8000 /* -0.000000 */,
+  0xb765 /* -0.462158 */,
+  0x27ef /* 0.030991 */,
+  0x3955 /* 0.666504 */,
+  0xccff /* -19.984375 */,
+  0xc49a /* -4.601562 */,
+  0xb1e3 /* -0.183960 */,
+  0x3cd3 /* 1.206055 */,
+  0x23f0 /* 0.015503 */,
+  0xa9ef /* -0.046356 */,
+  0x32f4 /* 0.217285 */,
+  0xb036 /* -0.131592 */,
+  0x4126 /* 2.574219 */,
+  0xcd15 /* -20.328125 */,
+  0x537f /* 59.968750 */,
+  0x7e00 /* nan */,
+  0x7e00 /* nan */
+};
+
+#define TEST_MSG "VDIVH_F16"
+#define INSN_NAME vdivh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations. */
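
The expected vdiv/vdivh bit patterns can be checked with a binary16 decoder:
0x32cc, the A / E entry in vdiv_f16_1.c above, decodes to 0.212402, i.e.
13.4 / 63.1 after both operands have themselves been rounded to binary16.
A normals-only sketch:

#include <stdint.h>
#include <stdio.h>
#include <math.h>

static float from_half_bits (uint16_t h)   /* normals only */
{
  int mant = h & 0x3ff;
  int exp  = (h >> 10) & 0x1f;
  float v  = ldexpf (1.0f + mant / 1024.0f, exp - 15);
  return (h & 0x8000) ? -v : v;
}

int main (void)
{
  printf ("%f vs %f\n", from_half_bits (0x32cc), 13.4 / 63.1);
  /* 0.212402 vs 0.212361: equal to within one half-precision ulp.  */
  return 0;
}

/*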
*/ +#include "binary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c @@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcc00, + 0xcc00, 0xcc00 }; +#endif VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 }; VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, @@ -46,6 +50,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 }; VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcc00, + 0xcc00, 0xcc00, + 0xcc00, 0xcc00, + 0xcc00, 0xcc00 }; +#endif VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000, 0xc1800000, 0xc1800000 }; @@ -63,6 +73,10 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 }; VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xcb80, + 0xcb80, 0xcb80 }; +#endif VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, @@ -90,6 +104,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1 }; VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb80, 0xcb80, + 0xcb80, 0xcb80, + 0xcb80, 0xcb80, + 0xcb80, 0xcb80 }; +#endif VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, 0xc1700000, 0xc1700000 }; @@ -107,6 +127,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0xcb00, + 0xcb00, 0xcb00 }; +#endif VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 }; VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, @@ -134,6 +158,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2 }; VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb00, 0xcb00, + 0xcb00, 0xcb00, + 0xcb00, 0xcb00, + 0xcb00, 0xcb00 }; +#endif VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000, 0xc1600000, 0xc1600000 }; @@ -171,6 +201,9 @@ void exec_vdup_vmov (void) TEST_VDUP(, uint, u, 64, 1); TEST_VDUP(, poly, p, 8, 8); TEST_VDUP(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VDUP(, float, f, 16, 4); +#endif TEST_VDUP(, float, f, 32, 2); TEST_VDUP(q, int, s, 8, 16); @@ -183,8 +216,26 @@ void exec_vdup_vmov (void) TEST_VDUP(q, uint, u, 64, 2); TEST_VDUP(q, 
poly, p, 8, 16); TEST_VDUP(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VDUP(q, float, f, 16, 8); +#endif TEST_VDUP(q, float, f, 32, 4); +#if defined (FP16_SUPPORTED) + switch (i) { + case 0: + CHECK_RESULTS_NAMED (TEST_MSG, expected0, ""); + break; + case 1: + CHECK_RESULTS_NAMED (TEST_MSG, expected1, ""); + break; + case 2: + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); + break; + default: + abort(); + } +#else switch (i) { case 0: CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, ""); @@ -198,6 +249,7 @@ void exec_vdup_vmov (void) default: abort(); } +#endif } /* Do the same tests with vmov. Use the same expected results. */ @@ -216,6 +268,9 @@ void exec_vdup_vmov (void) TEST_VMOV(, uint, u, 64, 1); TEST_VMOV(, poly, p, 8, 8); TEST_VMOV(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VMOV(, float, f, 16, 4); +#endif TEST_VMOV(, float, f, 32, 2); TEST_VMOV(q, int, s, 8, 16); @@ -228,8 +283,26 @@ void exec_vdup_vmov (void) TEST_VMOV(q, uint, u, 64, 2); TEST_VMOV(q, poly, p, 8, 16); TEST_VMOV(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VMOV(q, float, f, 16, 8); +#endif TEST_VMOV(q, float, f, 32, 4); +#if defined (FP16_SUPPORTED) + switch (i) { + case 0: + CHECK_RESULTS_NAMED (TEST_MSG, expected0, ""); + break; + case 1: + CHECK_RESULTS_NAMED (TEST_MSG, expected1, ""); + break; + case 2: + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); + break; + default: + abort(); + } +#else switch (i) { case 0: CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, ""); @@ -243,6 +316,8 @@ void exec_vdup_vmov (void) default: abort(); } +#endif + } } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c @@ -17,6 +17,10 @@ VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 }; VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xca80, 0xca80, + 0xca80, 0xca80 }; +#endif VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, 0xf2, @@ -43,10 +47,16 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5, 0xf5 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xca80, 0xca80, + 0xca80, 0xca80, + 0xca80, 0xca80, + 0xca80, 0xca80 }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, 0xc1700000, 0xc1700000 }; -#define TEST_MSG "VDUP_LANE/VDUP_LANEQ" +#define TEST_MSG "VDUP_LANE/VDUPQ_LANE" void exec_vdup_lane (void) { /* Basic test: vec1=vdup_lane(vec2, lane), then store the result. */ @@ -63,6 +73,9 @@ void exec_vdup_lane (void) clean_results (); TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector, buffer); +#if defined (FP16_SUPPORTED) + VLOAD(vector, buffer, , float, f, 16, 4); +#endif VLOAD(vector, buffer, , float, f, 32, 2); /* Choose lane arbitrarily. 
*/ @@ -76,6 +89,9 @@ void exec_vdup_lane (void) TEST_VDUP_LANE(, uint, u, 64, 1, 1, 0); TEST_VDUP_LANE(, poly, p, 8, 8, 8, 7); TEST_VDUP_LANE(, poly, p, 16, 4, 4, 3); +#if defined (FP16_SUPPORTED) + TEST_VDUP_LANE(, float, f, 16, 4, 4, 3); +#endif TEST_VDUP_LANE(, float, f, 32, 2, 2, 1); TEST_VDUP_LANE(q, int, s, 8, 16, 8, 2); @@ -88,9 +104,133 @@ void exec_vdup_lane (void) TEST_VDUP_LANE(q, uint, u, 64, 2, 1, 0); TEST_VDUP_LANE(q, poly, p, 8, 16, 8, 5); TEST_VDUP_LANE(q, poly, p, 16, 8, 4, 1); +#if defined (FP16_SUPPORTED) + TEST_VDUP_LANE(q, float, f, 16, 8, 4, 3); +#endif TEST_VDUP_LANE(q, float, f, 32, 4, 2, 1); +#if defined (FP16_SUPPORTED) + CHECK_RESULTS (TEST_MSG, ""); +#else CHECK_RESULTS_NO_FP16 (TEST_MSG, ""); +#endif + +#if defined (__aarch64__) + +#undef TEST_MSG +#define TEST_MSG "VDUP_LANEQ/VDUPQ_LANEQ" + + /* Expected results for vdup*_laneq tests. */ +VECT_VAR_DECL(expected2,int,8,8) [] = { 0xfd, 0xfd, 0xfd, 0xfd, + 0xfd, 0xfd, 0xfd, 0xfd }; +VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0xfffffff1 }; +VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }; +VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 }; +VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0xfffffff1 }; +VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7, + 0xf7, 0xf7, 0xf7, 0xf7 }; +VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 }; +VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xca80, 0xca80, + 0xca80, 0xca80 }; +#endif +VECT_VAR_DECL(expected2,int,8,16) [] = { 0xfb, 0xfb, 0xfb, 0xfb, + 0xfb, 0xfb, 0xfb, 0xfb, + 0xfb, 0xfb, 0xfb, 0xfb, + 0xfb, 0xfb, 0xfb, 0xfb }; +VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0xfff7, 0xfff7, 0xfff7, 0xfff7 }; +VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0xfffffff1, + 0xfffffff1, 0xfffffff1 }; +VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5 }; +VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, + 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff0, 0xfffffff0, + 0xfffffff0, 0xfffffff0 }; +VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf5 }; +VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1, + 0xfff1, 0xfff1, 0xfff1, 0xfff1 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xc880, 0xc880, + 0xc880, 0xc880, + 0xc880, 0xc880, + 0xc880, 0xc880 }; +#endif +VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1700000, + 0xc1700000, 0xc1700000 }; + + /* Clean all results for vdup*_laneq tests. */ + clean_results (); + /* Basic test: vec1=vdup_lane(vec2, lane), then store the result. 
*/ +#define TEST_VDUP_LANEQ(Q, T1, T2, W, N, N2, L) \ + VECT_VAR(vector_res, T1, W, N) = \ + vdup##Q##_laneq_##T2##W(VECT_VAR(vector, T1, W, N2), L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + + /* Input vector can only have 64 bits. */ + DECL_VARIABLE_128BITS_VARIANTS(vector); + + clean_results (); + + TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector, buffer); +#if defined (FP16_SUPPORTED) + VLOAD(vector, buffer, q, float, f, 16, 8); +#endif + VLOAD(vector, buffer, q, float, f, 32, 4); + + /* Choose lane arbitrarily. */ + TEST_VDUP_LANEQ(, int, s, 8, 8, 16, 13); + TEST_VDUP_LANEQ(, int, s, 16, 4, 8, 2); + TEST_VDUP_LANEQ(, int, s, 32, 2, 4, 1); + TEST_VDUP_LANEQ(, int, s, 64, 1, 2, 0); + TEST_VDUP_LANEQ(, uint, u, 8, 8, 16, 15); + TEST_VDUP_LANEQ(, uint, u, 16, 4, 8, 3); + TEST_VDUP_LANEQ(, uint, u, 32, 2, 4, 1); + TEST_VDUP_LANEQ(, uint, u, 64, 1, 2, 0); + TEST_VDUP_LANEQ(, poly, p, 8, 8, 16, 7); + TEST_VDUP_LANEQ(, poly, p, 16, 4, 8, 3); +#if defined (FP16_SUPPORTED) + TEST_VDUP_LANEQ(, float, f, 16, 4, 8, 3); +#endif + TEST_VDUP_LANEQ(, float, f, 32, 2, 4, 1); + + TEST_VDUP_LANEQ(q, int, s, 8, 16, 16, 11); + TEST_VDUP_LANEQ(q, int, s, 16, 8, 8, 7); + TEST_VDUP_LANEQ(q, int, s, 32, 4, 4, 1); + TEST_VDUP_LANEQ(q, int, s, 64, 2, 2, 0); + TEST_VDUP_LANEQ(q, uint, u, 8, 16, 16, 5); + TEST_VDUP_LANEQ(q, uint, u, 16, 8, 8, 1); + TEST_VDUP_LANEQ(q, uint, u, 32, 4, 4, 0); + TEST_VDUP_LANEQ(q, uint, u, 64, 2, 2, 0); + TEST_VDUP_LANEQ(q, poly, p, 8, 16, 16, 5); + TEST_VDUP_LANEQ(q, poly, p, 16, 8, 8, 1); +#if defined (FP16_SUPPORTED) + TEST_VDUP_LANEQ(q, float, f, 16, 8, 8, 7); +#endif + TEST_VDUP_LANEQ(q, float, f, 32, 4, 4, 1); + + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); +#if defined (FP16_SUPPORTED) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, ""); +#endif + +#endif /* __aarch64__. */ } int main (void) --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vduph_lane.c @@ -0,0 +1,137 @@ +/* { dg-do run } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define A -16 +#define B -15 +#define C -14 +#define D -13 +#define E -12 +#define F -11 +#define G -10 +#define H -9 + +#define F16_C(a) ((__fp16) a) +#define AF F16_C (A) +#define BF F16_C (B) +#define CF F16_C (C) +#define DF F16_C (D) +#define EF F16_C (E) +#define FF F16_C (F) +#define GF F16_C (G) +#define HF F16_C (H) + +#define S16_C(a) ((int16_t) a) +#define AS S16_C (A) +#define BS S16_C (B) +#define CS S16_C (C) +#define DS S16_C (D) +#define ES S16_C (E) +#define FS S16_C (F) +#define GS S16_C (G) +#define HS S16_C (H) + +#define U16_C(a) ((int16_t) a) +#define AU U16_C (A) +#define BU U16_C (B) +#define CU U16_C (C) +#define DU U16_C (D) +#define EU U16_C (E) +#define FU U16_C (F) +#define GU U16_C (G) +#define HU U16_C (H) + +#define P16_C(a) ((poly16_t) a) +#define AP P16_C (A) +#define BP P16_C (B) +#define CP P16_C (C) +#define DP P16_C (D) +#define EP P16_C (E) +#define FP P16_C (F) +#define GP P16_C (G) +#define HP P16_C (H) + +/* Expected results for vduph_lane. */ +float16_t expected_f16 = AF; +int16_t expected_s16 = DS; +uint16_t expected_u16 = BU; +poly16_t expected_p16 = CP; + +/* Expected results for vduph_laneq. */ +float16_t expected_q_f16 = EF; +int16_t expected_q_s16 = BS; +uint16_t expected_q_u16 = GU; +poly16_t expected_q_p16 = FP; + +void exec_vduph_lane_f16 (void) +{ + /* vduph_lane.
*/ + DECL_VARIABLE(vsrc, float, 16, 4); + DECL_VARIABLE(vsrc, int, 16, 4); + DECL_VARIABLE(vsrc, uint, 16, 4); + DECL_VARIABLE(vsrc, poly, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {AF, BF, CF, DF}; + VECT_VAR_DECL (buf_src, int, 16, 4) [] = {AS, BS, CS, DS}; + VECT_VAR_DECL (buf_src, uint, 16, 4) [] = {AU, BU, CU, DU}; + VECT_VAR_DECL (buf_src, poly, 16, 4) [] = {AP, BP, CP, DP}; + VLOAD (vsrc, buf_src, , int, s, 16, 4); + VLOAD (vsrc, buf_src, , float, f, 16, 4); + VLOAD (vsrc, buf_src, , uint, u, 16, 4); + VLOAD (vsrc, buf_src, , poly, p, 16, 4); + + float16_t res_f = vduph_lane_f16 (VECT_VAR (vsrc, float, 16, 4), 0); + if (* (unsigned short *) &res_f != * (unsigned short *) &expected_f16) + abort (); + + int16_t res_s = vduph_lane_s16 (VECT_VAR (vsrc, int, 16, 4), 3); + if (* (unsigned short *) &res_s != * (unsigned short *) &expected_s16) + abort (); + + uint16_t res_u = vduph_lane_u16 (VECT_VAR (vsrc, uint, 16, 4), 1); + if (* (unsigned short *) &res_u != * (unsigned short *) &expected_u16) + abort (); + + poly16_t res_p = vduph_lane_p16 (VECT_VAR (vsrc, poly, 16, 4), 2); + if (* (unsigned short *) &res_p != * (unsigned short *) &expected_p16) + abort (); + + /* vduph_laneq. */ + DECL_VARIABLE(vsrc, float, 16, 8); + DECL_VARIABLE(vsrc, int, 16, 8); + DECL_VARIABLE(vsrc, uint, 16, 8); + DECL_VARIABLE(vsrc, poly, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {AF, BF, CF, DF, EF, FF, GF, HF}; + VECT_VAR_DECL (buf_src, int, 16, 8) [] = {AS, BS, CS, DS, ES, FS, GS, HS}; + VECT_VAR_DECL (buf_src, uint, 16, 8) [] = {AU, BU, CU, DU, EU, FU, GU, HU}; + VECT_VAR_DECL (buf_src, poly, 16, 8) [] = {AP, BP, CP, DP, EP, FP, GP, HP}; + VLOAD (vsrc, buf_src, q, int, s, 16, 8); + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + VLOAD (vsrc, buf_src, q, uint, u, 16, 8); + VLOAD (vsrc, buf_src, q, poly, p, 16, 8); + + res_f = vduph_laneq_f16 (VECT_VAR (vsrc, float, 16, 8), 4); + if (* (unsigned short *) &res_f != * (unsigned short *) &expected_q_f16) + abort (); + + res_s = vduph_laneq_s16 (VECT_VAR (vsrc, int, 16, 8), 1); + if (* (unsigned short *) &res_s != * (unsigned short *) &expected_q_s16) + abort (); + + res_u = vduph_laneq_u16 (VECT_VAR (vsrc, uint, 16, 8), 6); + if (* (unsigned short *) &res_u != * (unsigned short *) &expected_q_u16) + abort (); + + res_p = vduph_laneq_p16 (VECT_VAR (vsrc, poly, 16, 8), 5); + if (* (unsigned short *) &res_p != * (unsigned short *) &expected_q_p16) + abort (); +} + +int +main (void) +{ + exec_vduph_lane_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c @@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0xf7, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcb00, 0xca80, + 0x4b4d, 0x4b4d }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xfe, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, @@ -39,6 +43,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfc, 0xfd, 0xfe, 0xff, 0x55, 0x55, 0x55, 0x55 }; VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff6, 0xfff7, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xc880, 0x4b4d, + 0x4b4d, 0x4b4d, + 0x4b4d, 0x4b4d, + 0x4b4d, 0x4b4d }; +#endif 
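Note: the hfloat expected tables in these FP16 hunks are IEEE 754 binary16 bit patterns written in hex (the 0x4b4d entries above encode 14.6, the value the vext.c hunk below duplicates with VDUP). A minimal host-side sketch of how such a constant can be reproduced, assuming a GCC with _Float16 support; this snippet is an editorial illustration, not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  /* Round 14.6 to the nearest binary16 value and print its bit pattern.  */
  _Float16 h = 14.6f;
  uint16_t bits;
  memcpy (&bits, &h, sizeof bits);       /* Bit-cast without aliasing trouble.  */
  printf ("0x%04x\n", (unsigned) bits);  /* Prints 0x4b4d.  */
  return 0;
}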
VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1500000, 0x4204cccd, 0x4204cccd, 0x4204cccd }; @@ -60,6 +70,10 @@ void exec_vext (void) clean_results (); TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer); +#ifdef FP16_SUPPORTED + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, q, float, f, 16, 8); +#endif VLOAD(vector1, buffer, , float, f, 32, 2); VLOAD(vector1, buffer, q, float, f, 32, 4); @@ -74,6 +88,9 @@ void exec_vext (void) VDUP(vector2, , uint, u, 64, 1, 0x88); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ +#endif VDUP(vector2, , float, f, 32, 2, 33.6f); VDUP(vector2, q, int, s, 8, 16, 0x11); @@ -86,6 +103,9 @@ void exec_vext (void) VDUP(vector2, q, uint, u, 64, 2, 0x88); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, q, float, f, 16, 8, 14.6f); +#endif VDUP(vector2, q, float, f, 32, 4, 33.2f); /* Choose arbitrary extract offsets. */ @@ -99,6 +119,9 @@ void exec_vext (void) TEST_VEXT(, uint, u, 64, 1, 0); TEST_VEXT(, poly, p, 8, 8, 6); TEST_VEXT(, poly, p, 16, 4, 2); +#if defined (FP16_SUPPORTED) + TEST_VEXT(, float, f, 16, 4, 2); +#endif TEST_VEXT(, float, f, 32, 2, 1); TEST_VEXT(q, int, s, 8, 16, 14); @@ -111,9 +134,16 @@ void exec_vext (void) TEST_VEXT(q, uint, u, 64, 2, 1); TEST_VEXT(q, poly, p, 8, 16, 12); TEST_VEXT(q, poly, p, 16, 8, 6); +#if defined (FP16_SUPPORTED) + TEST_VEXT(q, float, f, 16, 8, 7); +#endif TEST_VEXT(q, float, f, 32, 4, 3); +#if defined (FP16_SUPPORTED) + CHECK_RESULTS (TEST_MSG, ""); +#else CHECK_RESULTS_NO_FP16 (TEST_MSG, ""); +#endif } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c @@ -3,11 +3,19 @@ #include "compute-ref-data.h" #ifdef __ARM_FEATURE_FMA + /* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x61c6, 0x61c8, 0x61ca, 0x61cc }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x6435, 0x6436, 0x6437, 0x6438, + 0x6439, 0x643a, 0x643b, 0x643c }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d }; -VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, + 0x4486deb8, 0x4486feb8 }; #ifdef __aarch64__ -VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, + 0x40890ee1532b8520 }; #endif #define TEST_MSG "VFMA/VFMAQ" @@ -44,6 +52,18 @@ void exec_vfma (void) DECL_VARIABLE(VAR, float, 32, 4); #endif +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector1, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector3, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 4); + + DECL_VARIABLE(vector1, float, 16, 8); + DECL_VARIABLE(vector2, float, 16, 8); + DECL_VARIABLE(vector3, float, 16, 8); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif + DECL_VFMA_VAR(vector1); DECL_VFMA_VAR(vector2); DECL_VFMA_VAR(vector3); @@ -52,6 +72,10 @@ void exec_vfma (void) clean_results (); /* Initialize input "vector1" from "buffer". 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, q, float, f, 16, 8); +#endif VLOAD(vector1, buffer, , float, f, 32, 2); VLOAD(vector1, buffer, q, float, f, 32, 4); #ifdef __aarch64__ @@ -59,13 +83,21 @@ #endif /* Choose init value arbitrarily. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 9.3f); + VDUP(vector2, q, float, f, 16, 8, 29.7f); +#endif VDUP(vector2, , float, f, 32, 2, 9.3f); VDUP(vector2, q, float, f, 32, 4, 29.7f); #ifdef __aarch64__ VDUP(vector2, q, float, f, 64, 2, 15.8f); #endif - + /* Choose init value arbitrarily. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector3, , float, f, 16, 4, 81.2f); + VDUP(vector3, q, float, f, 16, 8, 36.8f); +#endif VDUP(vector3, , float, f, 32, 2, 81.2f); VDUP(vector3, q, float, f, 32, 4, 36.8f); #ifdef __aarch64__ @@ -73,12 +105,20 @@ #endif /* Execute the tests. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VFMA(, float, f, 16, 4); + TEST_VFMA(q, float, f, 16, 8); +#endif TEST_VFMA(, float, f, 32, 2); TEST_VFMA(q, float, f, 32, 4); #ifdef __aarch64__ TEST_VFMA(q, float, f, 64, 2); #endif +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif CHECK_VFMA_RESULTS (TEST_MSG, ""); } #endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmah_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3944 /* 0.658203 */, + 0xcefa /* -27.906250 */, + 0x5369 /* 59.281250 */, + 0x35ba /* 0.357910 */, + 0xc574 /* -5.453125 */, + 0xc5e6 /* -5.898438 */, + 0x3f66 /* 1.849609 */, + 0x5665 /* 102.312500 */, + 0xc02d /* -2.087891 */, + 0x4d79 /* 21.890625 */, + 0x547b /* 71.687500 */, + 0xcdf0 /* -23.750000 */, + 0xc625 /* -6.144531 */, + 0x4cf9 /* 19.890625 */, + 0x7e00 /* nan */, + 0x7e00 /* nan */ +}; + +#define TEST_MSG "VFMAH_F16" +#define INSN_NAME vfmah_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for ternary scalar operations. */ +#include "ternary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmas_lane_f16_1.c @@ -0,0 +1,908 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (123.4) +#define A1 FP16_C (-5.8) +#define A2 FP16_C (-0.0) +#define A3 FP16_C (10) +#define A4 FP16_C (123412.43) +#define A5 FP16_C (-5.8) +#define A6 FP16_C (90.8) +#define A7 FP16_C (24) + +#define B0 FP16_C (23.4) +#define B1 FP16_C (-5.8) +#define B2 FP16_C (8.9) +#define B3 FP16_C (4.0) +#define B4 FP16_C (3.4) +#define B5 FP16_C (-550.8) +#define B6 FP16_C (-31.8) +#define B7 FP16_C (20000.0) + +/* Expected results for vfma_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 4) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0.
*/, + 0x567A /* A3 + B3 * B0. */}; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 4) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 4) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 4) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */ }; + +/* Expected results for vfmaq_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 8) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0. */, + 0x567A /* A3 + B3 * B0. */, + 0x7C00 /* A4 + B4 * B0. */, + 0xF24D /* A5 + B5 * B0. */, + 0xE11B /* A6 + B6 * B0. */, + 0x7C00 /* A7 + B7 * B0. */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 8) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */, + 0x7C00 /* A4 + B4 * B1. */, + 0x6A3B /* A5 + B5 * B1. */, + 0x5C4D /* A6 + B6 * B1. */, + 0xFC00 /* A7 + B7 * B1. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 8) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */, + 0x7C00 /* A4 + B4 * B2. */, + 0xECCB /* A5 + B5 * B2. */, + 0xDA01 /* A6 + B6 * B2. */, + 0x7C00 /* A7 + B7 * B2. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 8) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */, + 0x7C00 /* A4 + B4 * B3. */, + 0xE851 /* A5 + B5 * B3. */, + 0xD08C /* A6 + B6 * B3. */, + 0x7C00 /* A7 + B7 * B3. */ }; + +/* Expected results for vfma_laneq. */ +VECT_VAR_DECL (expected0_laneq_static, hfloat, 16, 4) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0. */, + 0x567A /* A3 + B3 * B0. */ }; + +VECT_VAR_DECL (expected1_laneq_static, hfloat, 16, 4) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */ }; + +VECT_VAR_DECL (expected2_laneq_static, hfloat, 16, 4) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */ }; + +VECT_VAR_DECL (expected3_laneq_static, hfloat, 16, 4) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */ }; + +VECT_VAR_DECL (expected4_laneq_static, hfloat, 16, 4) [] + = { 0x5A58 /* A0 + B0 * B4. */, + 0xCE62 /* A1 + B1 * B4. */, + 0x4F91 /* A2 + B2 * B4. */, + 0x4DE6 /* A3 + B3 * B4. */ }; + +VECT_VAR_DECL (expected5_laneq_static, hfloat, 16, 4) [] + = { 0xF23D /* A0 + B0 * B5. */, + 0x6A3B /* A1 + B1 * B5. */, + 0xECCA /* A2 + B2 * B5. */, + 0xE849 /* A3 + B3 * B5. */ }; + +VECT_VAR_DECL (expected6_laneq_static, hfloat, 16, 4) [] + = { 0xE0DA /* A0 + B0 * B6. */, + 0x5995 /* A1 + B1 * B6. */, + 0xDC6C /* A2 + B2 * B6. */, + 0xD753 /* A3 + B3 * B6. */ }; + +VECT_VAR_DECL (expected7_laneq_static, hfloat, 16, 4) [] + = { 0x7C00 /* A0 + B0 * B7. */, + 0xFC00 /* A1 + B1 * B7. */, + 0x7C00 /* A2 + B2 * B7. */, + 0x7C00 /* A3 + B3 * B7. */ }; + +/* Expected results for vfmaq_laneq. */ +VECT_VAR_DECL (expected0_laneq_static, hfloat, 16, 8) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0. 
*/, + 0x567A /* A3 + B3 * B0. */, + 0x7C00 /* A4 + B4 * B0. */, + 0xF24D /* A5 + B5 * B0. */, + 0xE11B /* A6 + B6 * B0. */, + 0x7C00 /* A7 + B7 * B0. */ }; + +VECT_VAR_DECL (expected1_laneq_static, hfloat, 16, 8) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */, + 0x7C00 /* A4 + B4 * B1. */, + 0x6A3B /* A5 + B5 * B1. */, + 0x5C4D /* A6 + B6 * B1. */, + 0xFC00 /* A7 + B7 * B1. */ }; + +VECT_VAR_DECL (expected2_laneq_static, hfloat, 16, 8) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */, + 0x7C00 /* A4 + B4 * B2. */, + 0xECCB /* A5 + B5 * B2. */, + 0xDA01 /* A6 + B6 * B2. */, + 0x7C00 /* A7 + B7 * B2. */ }; + +VECT_VAR_DECL (expected3_laneq_static, hfloat, 16, 8) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */, + 0x7C00 /* A4 + B4 * B3. */, + 0xE851 /* A5 + B5 * B3. */, + 0xD08C /* A6 + B6 * B3. */, + 0x7C00 /* A7 + B7 * B3. */ }; + +VECT_VAR_DECL (expected4_laneq_static, hfloat, 16, 8) [] + = { 0x5A58 /* A0 + B0 * B4. */, + 0xCE62 /* A1 + B1 * B4. */, + 0x4F91 /* A2 + B2 * B4. */, + 0x4DE6 /* A3 + B3 * B4. */, + 0x7C00 /* A4 + B4 * B4. */, + 0xE757 /* A5 + B5 * B4. */, + 0xCC54 /* A6 + B6 * B4. */, + 0x7C00 /* A7 + B7 * B4. */ }; + +VECT_VAR_DECL (expected5_laneq_static, hfloat, 16, 8) [] + = { 0xF23D /* A0 + B0 * B5. */, + 0x6A3B /* A1 + B1 * B5. */, + 0xECCA /* A2 + B2 * B5. */, + 0xE849 /* A3 + B3 * B5. */, + 0x7C00 /* A4 + B4 * B5. */, + 0x7C00 /* A5 + B5 * B5. */, + 0x744D /* A6 + B6 * B5. */, + 0xFC00 /* A7 + B7 * B5. */ }; + +VECT_VAR_DECL (expected6_laneq_static, hfloat, 16, 8) [] + = { 0xE0DA /* A0 + B0 * B6. */, + 0x5995 /* A1 + B1 * B6. */, + 0xDC6C /* A2 + B2 * B6. */, + 0xD753 /* A3 + B3 * B6. */, + 0x7C00 /* A4 + B4 * B6. */, + 0x7447 /* A5 + B5 * B6. */, + 0x644E /* A6 + B6 * B6. */, + 0xFC00 /* A7 + B7 * B6. */ }; + +VECT_VAR_DECL (expected7_laneq_static, hfloat, 16, 8) [] + = { 0x7C00 /* A0 + B0 * B7. */, + 0xFC00 /* A1 + B1 * B7. */, + 0x7C00 /* A2 + B2 * B7. */, + 0x7C00 /* A3 + B3 * B7. */, + 0x7C00 /* A4 + B4 * B7. */, + 0xFC00 /* A5 + B5 * B7. */, + 0xFC00 /* A6 + B6 * B7. */, + 0x7C00 /* A7 + B7 * B7. */ }; + +/* Expected results for vfms_lane. */ +VECT_VAR_DECL (expected0_fms_static, hfloat, 16, 4) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */ }; + +VECT_VAR_DECL (expected1_fms_static, hfloat, 16, 4) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */ }; + +VECT_VAR_DECL (expected2_fms_static, hfloat, 16, 4) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. */ }; + +VECT_VAR_DECL (expected3_fms_static, hfloat, 16, 4) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */ }; + +/* Expected results for vfmsq_lane. */ +VECT_VAR_DECL (expected0_fms_static, hfloat, 16, 8) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */, + 0x7C00 /* A4 + (-B4) * B0. */, + 0x724B /* A5 + (-B5) * B0. */, + 0x6286 /* A6 + (-B6) * B0. */, + 0xFC00 /* A7 + (-B7) * B0. 
*/ }; + +VECT_VAR_DECL (expected1_fms_static, hfloat, 16, 8) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */, + 0x7C00 /* A4 + (-B4) * B1. */, + 0xEA41 /* A5 + (-B5) * B1. */, + 0xD5DA /* A6 + (-B6) * B1. */, + 0x7C00 /* A7 + (-B7) * B1. */ }; + +VECT_VAR_DECL (expected2_fms_static, hfloat, 16, 8) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. */, + 0x7C00 /* A4 + (-B4) * B2. */, + 0x6CC8 /* A5 + (-B5) * B2. */, + 0x5DD7 /* A6 + (-B6) * B2. */, + 0xFC00 /* A7 + (-B7) * B2. */ }; + +VECT_VAR_DECL (expected3_fms_static, hfloat, 16, 8) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */, + 0x7C00 /* A4 + (-B4) * B3. */, + 0x684B /* A5 + (-B5) * B3. */, + 0x5AD0 /* A6 + (-B6) * B3. */, + 0xFC00 /* A7 + (-B7) * B3. */ }; + +/* Expected results for vfms_laneq. */ +VECT_VAR_DECL (expected0_fms_laneq_static, hfloat, 16, 4) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */ }; + +VECT_VAR_DECL (expected1_fms_laneq_static, hfloat, 16, 4) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */ }; + +VECT_VAR_DECL (expected2_fms_laneq_static, hfloat, 16, 4) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. */ }; + +VECT_VAR_DECL (expected3_fms_laneq_static, hfloat, 16, 4) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */ }; + +VECT_VAR_DECL (expected4_fms_laneq_static, hfloat, 16, 4) [] + = { 0x5179 /* A0 + (-B0) * B4. */, + 0x4AF6 /* A1 + (-B1) * B4. */, + 0xCF91 /* A2 + (-B2) * B4. */, + 0xC334 /* A3 + (-B3) * B4. */ }; + +VECT_VAR_DECL (expected5_fms_laneq_static, hfloat, 16, 4) [] + = { 0x725C /* A0 + (-B0) * B5. */, + 0xEA41 /* A1 + (-B1) * B5. */, + 0x6CCA /* A2 + (-B2) * B5. */, + 0x6853 /* A3 + (-B3) * B5. */ }; + +VECT_VAR_DECL (expected6_fms_laneq_static, hfloat, 16, 4) [] + = { 0x62C7 /* A0 + (-B0) * B6. */, + 0xD9F2 /* A1 + (-B1) * B6. */, + 0x5C6C /* A2 + (-B2) * B6. */, + 0x584A /* A3 + (-B3) * B6. */ }; + +VECT_VAR_DECL (expected7_fms_laneq_static, hfloat, 16, 4) [] + = { 0xFC00 /* A0 + (-B0) * B7. */, + 0x7C00 /* A1 + (-B1) * B7. */, + 0xFC00 /* A2 + (-B2) * B7. */, + 0xFC00 /* A3 + (-B3) * B7. */ }; + +/* Expected results for vfmsq_laneq. */ +VECT_VAR_DECL (expected0_fms_laneq_static, hfloat, 16, 8) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */, + 0x7C00 /* A4 + (-B4) * B0. */, + 0x724B /* A5 + (-B5) * B0. */, + 0x6286 /* A6 + (-B6) * B0. */, + 0xFC00 /* A7 + (-B7) * B0. */ }; + +VECT_VAR_DECL (expected1_fms_laneq_static, hfloat, 16, 8) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */, + 0x7C00 /* A4 + (-B4) * B1. */, + 0xEA41 /* A5 + (-B5) * B1. */, + 0xD5DA /* A6 + (-B6) * B1. */, + 0x7C00 /* A7 + (-B7) * B1. */ }; + +VECT_VAR_DECL (expected2_fms_laneq_static, hfloat, 16, 8) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. 
*/, + 0x7C00 /* A4 + (-B4) * B2. */, + 0x6CC8 /* A5 + (-B5) * B2. */, + 0x5DD7 /* A6 + (-B6) * B2. */, + 0xFC00 /* A7 + (-B7) * B2. */ }; + +VECT_VAR_DECL (expected3_fms_laneq_static, hfloat, 16, 8) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */, + 0x7C00 /* A4 + (-B4) * B3. */, + 0x684B /* A5 + (-B5) * B3. */, + 0x5AD0 /* A6 + (-B6) * B3. */, + 0xFC00 /* A7 + (-B7) * B3. */ }; + +VECT_VAR_DECL (expected4_fms_laneq_static, hfloat, 16, 8) [] + = { 0x5179 /* A0 + (-B0) * B4. */, + 0x4AF6 /* A1 + (-B1) * B4. */, + 0xCF91 /* A2 + (-B2) * B4. */, + 0xC334 /* A3 + (-B3) * B4. */, + 0x7C00 /* A4 + (-B4) * B4. */, + 0x674C /* A5 + (-B5) * B4. */, + 0x5A37 /* A6 + (-B6) * B4. */, + 0xFC00 /* A7 + (-B7) * B4. */ }; + +VECT_VAR_DECL (expected5_fms_laneq_static, hfloat, 16, 8) [] + = { 0x725C /* A0 + (-B0) * B5. */, + 0xEA41 /* A1 + (-B1) * B5. */, + 0x6CCA /* A2 + (-B2) * B5. */, + 0x6853 /* A3 + (-B3) * B5. */, + 0x7C00 /* A4 + (-B4) * B5. */, + 0xFC00 /* A5 + (-B5) * B5. */, + 0xF441 /* A6 + (-B6) * B5. */, + 0x7C00 /* A7 + (-B7) * B5. */ }; + +VECT_VAR_DECL (expected6_fms_laneq_static, hfloat, 16, 8) [] + = { 0x62C7 /* A0 + (-B0) * B6. */, + 0xD9F2 /* A1 + (-B1) * B6. */, + 0x5C6C /* A2 + (-B2) * B6. */, + 0x584A /* A3 + (-B3) * B6. */, + 0x7C00 /* A4 + (-B4) * B6. */, + 0xF447 /* A5 + (-B5) * B6. */, + 0xE330 /* A6 + (-B6) * B6. */, + 0x7C00 /* A7 + (-B7) * B6. */ }; + +VECT_VAR_DECL (expected7_fms_laneq_static, hfloat, 16, 8) [] + = { 0xFC00 /* A0 + (-B0) * B7. */, + 0x7C00 /* A1 + (-B1) * B7. */, + 0xFC00 /* A2 + (-B2) * B7. */, + 0xFC00 /* A3 + (-B3) * B7. */, + 0x7C00 /* A4 + (-B4) * B7. */, + 0x7C00 /* A5 + (-B5) * B7. */, + 0x7C00 /* A6 + (-B6) * B7. */, + 0xFC00 /* A7 + (-B7) * B7. 
*/ }; + +void exec_vfmas_lane_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VFMA_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A0, A1, A2, A3}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {B0, B1, B2, B3}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMAQ_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A0, A1, A2, A3, A4, A5, A6, A7}; + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMA_LANEQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_3, float, 16, 8); + VECT_VAR_DECL 
(buf_src_3, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7}; + VLOAD (vsrc_3, buf_src_3, q, float, f, 16, 8); + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 4); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected4_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 5); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected5_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 6); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected6_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 7); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected7_laneq_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMAQ_LANEQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 
8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected4_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected5_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected6_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected7_laneq_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_LANE (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_fms_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_LANE (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + 
VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_fms_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_fms_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_LANEQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 4); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected4_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 5); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected5_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 6); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected6_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_laneq_f16 (VECT_VAR 
(vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), + VECT_VAR (vsrc_3, float, 16, 8), 7); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected7_fms_laneq_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_LANEQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected4_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected5_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected6_fms_laneq_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), + VECT_VAR (vsrc_3, float, 16, 8), 7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected7_fms_laneq_static, ""); +} + +int +main (void) +{ + exec_vfmas_lane_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmas_n_f16_1.c @@ -0,0 +1,469 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (123.4) +#define A1 FP16_C (-5.8) +#define A2 FP16_C (-0.0) +#define A3 FP16_C (10)
+#define A4 FP16_C (123412.43) +#define A5 FP16_C (-5.8) +#define A6 FP16_C (90.8) +#define A7 FP16_C (24) + +#define B0 FP16_C (23.4) +#define B1 FP16_C (-5.8) +#define B2 FP16_C (8.9) +#define B3 FP16_C (4.0) +#define B4 FP16_C (3.4) +#define B5 FP16_C (-550.8) +#define B6 FP16_C (-31.8) +#define B7 FP16_C (20000.0) + +/* Expected results for vfma_n. */ +VECT_VAR_DECL (expected_fma0_static, hfloat, 16, 4) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0. */, + 0x567A /* A3 + B3 * B0. */ }; + +VECT_VAR_DECL (expected_fma1_static, hfloat, 16, 4) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */ }; + +VECT_VAR_DECL (expected_fma2_static, hfloat, 16, 4) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */ }; + +VECT_VAR_DECL (expected_fma3_static, hfloat, 16, 4) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */ }; + +VECT_VAR_DECL (expected_fma0_static, hfloat, 16, 8) [] + = { 0x613E /* A0 + B0 * B0. */, + 0xD86D /* A1 + B1 * B0. */, + 0x5A82 /* A2 + B2 * B0. */, + 0x567A /* A3 + B3 * B0. */, + 0x7C00 /* A4 + B4 * B0. */, + 0xF24D /* A5 + B5 * B0. */, + 0xE11B /* A6 + B6 * B0. */, + 0x7C00 /* A7 + B7 * B0. */ }; + +VECT_VAR_DECL (expected_fma1_static, hfloat, 16, 8) [] + = { 0xCA33 /* A0 + B0 * B1. */, + 0x4EF6 /* A1 + B1 * B1. */, + 0xD274 /* A2 + B2 * B1. */, + 0xCA9A /* A3 + B3 * B1. */, + 0x7C00 /* A4 + B4 * B1. */, + 0x6A3B /* A5 + B5 * B1. */, + 0x5C4D /* A6 + B6 * B1. */, + 0xFC00 /* A7 + B7 * B1. */ }; + +VECT_VAR_DECL (expected_fma2_static, hfloat, 16, 8) [] + = { 0x5D2F /* A0 + B0 * B2. */, + 0xD32D /* A1 + B1 * B2. */, + 0x54F3 /* A2 + B2 * B2. */, + 0x51B3 /* A3 + B3 * B2. */, + 0x7C00 /* A4 + B4 * B2. */, + 0xECCB /* A5 + B5 * B2. */, + 0xDA01 /* A6 + B6 * B2. */, + 0x7C00 /* A7 + B7 * B2. */ }; + +VECT_VAR_DECL (expected_fma3_static, hfloat, 16, 8) [] + = { 0x5AC8 /* A0 + B0 * B3. */, + 0xCF40 /* A1 + B1 * B3. */, + 0x5073 /* A2 + B2 * B3. */, + 0x4E80 /* A3 + B3 * B3. */, + 0x7C00 /* A4 + B4 * B3. */, + 0xE851 /* A5 + B5 * B3. */, + 0xD08C /* A6 + B6 * B3. */, + 0x7C00 /* A7 + B7 * B3. */ }; + +VECT_VAR_DECL (expected_fma4_static, hfloat, 16, 8) [] + = { 0x5A58 /* A0 + B0 * B4. */, + 0xCE62 /* A1 + B1 * B4. */, + 0x4F91 /* A2 + B2 * B4. */, + 0x4DE6 /* A3 + B3 * B4. */, + 0x7C00 /* A4 + B4 * B4. */, + 0xE757 /* A5 + B5 * B4. */, + 0xCC54 /* A6 + B6 * B4. */, + 0x7C00 /* A7 + B7 * B4. */ }; + +VECT_VAR_DECL (expected_fma5_static, hfloat, 16, 8) [] + = { 0xF23D /* A0 + B0 * B5. */, + 0x6A3B /* A1 + B1 * B5. */, + 0xECCA /* A2 + B2 * B5. */, + 0xE849 /* A3 + B3 * B5. */, + 0x7C00 /* A4 + B4 * B5. */, + 0x7C00 /* A5 + B5 * B5. */, + 0x744D /* A6 + B6 * B5. */, + 0xFC00 /* A7 + B7 * B5. */ }; + +VECT_VAR_DECL (expected_fma6_static, hfloat, 16, 8) [] + = { 0xE0DA /* A0 + B0 * B6. */, + 0x5995 /* A1 + B1 * B6. */, + 0xDC6C /* A2 + B2 * B6. */, + 0xD753 /* A3 + B3 * B6. */, + 0x7C00 /* A4 + B4 * B6. */, + 0x7447 /* A5 + B5 * B6. */, + 0x644E /* A6 + B6 * B6. */, + 0xFC00 /* A7 + B7 * B6. */ }; + +VECT_VAR_DECL (expected_fma7_static, hfloat, 16, 8) [] + = { 0x7C00 /* A0 + B0 * B7. */, + 0xFC00 /* A1 + B1 * B7. */, + 0x7C00 /* A2 + B2 * B7. */, + 0x7C00 /* A3 + B3 * B7. */, + 0x7C00 /* A4 + B4 * B7. */, + 0xFC00 /* A5 + B5 * B7. */, + 0xFC00 /* A6 + B6 * B7. */, + 0x7C00 /* A7 + B7 * B7. 
*/ }; + +/* Expected results for vfms_n. */ +VECT_VAR_DECL (expected_fms0_static, hfloat, 16, 4) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */ }; + +VECT_VAR_DECL (expected_fms1_static, hfloat, 16, 4) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */ }; + +VECT_VAR_DECL (expected_fms2_static, hfloat, 16, 4) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. */ }; + +VECT_VAR_DECL (expected_fms3_static, hfloat, 16, 4) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */ }; + +VECT_VAR_DECL (expected_fms0_static, hfloat, 16, 8) [] + = { 0xDEA2 /* A0 + (-B0) * B0. */, + 0x5810 /* A1 + (-B1) * B0. */, + 0xDA82 /* A2 + (-B2) * B0. */, + 0xD53A /* A3 + (-B3) * B0. */, + 0x7C00 /* A4 + (-B4) * B0. */, + 0x724B /* A5 + (-B5) * B0. */, + 0x6286 /* A6 + (-B6) * B0. */, + 0xFC00 /* A7 + (-B7) * B0. */ }; + +VECT_VAR_DECL (expected_fms1_static, hfloat, 16, 8) [] + = { 0x5C0D /* A0 + (-B0) * B1. */, + 0xD0EE /* A1 + (-B1) * B1. */, + 0x5274 /* A2 + (-B2) * B1. */, + 0x5026 /* A3 + (-B3) * B1. */, + 0x7C00 /* A4 + (-B4) * B1. */, + 0xEA41 /* A5 + (-B5) * B1. */, + 0xD5DA /* A6 + (-B6) * B1. */, + 0x7C00 /* A7 + (-B7) * B1. */ }; + +VECT_VAR_DECL (expected_fms2_static, hfloat, 16, 8) [] + = { 0xD54E /* A0 + (-B0) * B2. */, + 0x51BA /* A1 + (-B1) * B2. */, + 0xD4F3 /* A2 + (-B2) * B2. */, + 0xCE66 /* A3 + (-B3) * B2. */, + 0x7C00 /* A4 + (-B4) * B2. */, + 0x6CC8 /* A5 + (-B5) * B2. */, + 0x5DD7 /* A6 + (-B6) * B2. */, + 0xFC00 /* A7 + (-B7) * B2. */ }; + +VECT_VAR_DECL (expected_fms3_static, hfloat, 16, 8) [] + = { 0x4F70 /* A0 + (-B0) * B3. */, + 0x4C5A /* A1 + (-B1) * B3. */, + 0xD073 /* A2 + (-B2) * B3. */, + 0xC600 /* A3 + (-B3) * B3. */, + 0x7C00 /* A4 + (-B4) * B3. */, + 0x684B /* A5 + (-B5) * B3. */, + 0x5AD0 /* A6 + (-B6) * B3. */, + 0xFC00 /* A7 + (-B7) * B3. */ }; + +VECT_VAR_DECL (expected_fms4_static, hfloat, 16, 8) [] + = { 0x5179 /* A0 + (-B0) * B4. */, + 0x4AF6 /* A1 + (-B1) * B4. */, + 0xCF91 /* A2 + (-B2) * B4. */, + 0xC334 /* A3 + (-B3) * B4. */, + 0x7C00 /* A4 + (-B4) * B4. */, + 0x674C /* A5 + (-B5) * B4. */, + 0x5A37 /* A6 + (-B6) * B4. */, + 0xFC00 /* A7 + (-B7) * B4. */ }; + +VECT_VAR_DECL (expected_fms5_static, hfloat, 16, 8) [] + = { 0x725C /* A0 + (-B0) * B5. */, + 0xEA41 /* A1 + (-B1) * B5. */, + 0x6CCA /* A2 + (-B2) * B5. */, + 0x6853 /* A3 + (-B3) * B5. */, + 0x7C00 /* A4 + (-B4) * B5. */, + 0xFC00 /* A5 + (-B5) * B5. */, + 0xF441 /* A6 + (-B6) * B5. */, + 0x7C00 /* A7 + (-B7) * B5. */ }; + +VECT_VAR_DECL (expected_fms6_static, hfloat, 16, 8) [] + = { 0x62C7 /* A0 + (-B0) * B6. */, + 0xD9F2 /* A1 + (-B1) * B6. */, + 0x5C6C /* A2 + (-B2) * B6. */, + 0x584A /* A3 + (-B3) * B6. */, + 0x7C00 /* A4 + (-B4) * B6. */, + 0xF447 /* A5 + (-B5) * B6. */, + 0xE330 /* A6 + (-B6) * B6. */, + 0x7C00 /* A7 + (-B7) * B6. */ }; + +VECT_VAR_DECL (expected_fms7_static, hfloat, 16, 8) [] + = { 0xFC00 /* A0 + (-B0) * B7. */, + 0x7C00 /* A1 + (-B1) * B7. */, + 0xFC00 /* A2 + (-B2) * B7. */, + 0xFC00 /* A3 + (-B3) * B7. */, + 0x7C00 /* A4 + (-B4) * B7. */, + 0x7C00 /* A5 + (-B5) * B7. */, + 0x7C00 /* A6 + (-B6) * B7. */, + 0xFC00 /* A7 + (-B7) * B7. 
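The hexadecimal constants in the expected arrays above and below are IEEE 754 binary16 encodings of each lane's result; 0x7C00 and 0xFC00 are +Inf and -Inf, which the wide lanes built from A4 and B7 saturate to. As an editorial aside, not part of the patch: a minimal host-side sketch for regenerating such constants, assuming a GCC target that provides the __fp16 type. The test performs its arithmetic lane by lane in half precision, but for these inputs the float-then-round path below yields the same encodings.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>

/* Round a result to half precision and return its bit pattern.  */
static uint16_t
f16_bits (float x)
{
  __fp16 h = (__fp16) x;         /* one rounding, to binary16 */
  uint16_t u;
  memcpy (&u, &h, sizeof u);     /* reinterpret the bits */
  return u;
}

int
main (void)
{
  /* First lane of expected_fma0_static: A0 + B0 * B0.  */
  printf ("%#06" PRIx16 "\n", f16_bits (123.4f + 23.4f * 23.4f));    /* 0x613e */
  /* A4 + B4 * B0 overflows binary16's finite range (max 65504): +Inf.  */
  printf ("%#06" PRIx16 "\n", f16_bits (123412.43f + 3.4f * 23.4f)); /* 0x7c00 */
  return 0;
}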
*/ }; + +void exec_vfmas_n_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VFMA_N (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A0, A1, A2, A3}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {B0, B1, B2, B3}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B0); + + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMAQ_N (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A0, A1, A2, A3, A4, A5, A6, A7}; + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma3_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma4_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 
8), B5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma5_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma6_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma7_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_N (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B0); + + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), B3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_N (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms3_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms4_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR
(vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms5_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms6_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), B7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms7_static, ""); +} + +int +main (void) +{ + exec_vfmas_n_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmash_lane_f16_1.c @@ -0,0 +1,143 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (123.4) +#define B0 FP16_C (-5.8) +#define C0 FP16_C (-3.8) +#define D0 FP16_C (10) + +#define A1 FP16_C (12.4) +#define B1 FP16_C (-5.8) +#define C1 FP16_C (90.8) +#define D1 FP16_C (24) + +#define A2 FP16_C (23.4) +#define B2 FP16_C (-5.8) +#define C2 FP16_C (8.9) +#define D2 FP16_C (4) + +#define E0 FP16_C (3.4) +#define F0 FP16_C (-55.8) +#define G0 FP16_C (-31.8) +#define H0 FP16_C (2) + +#define E1 FP16_C (123.4) +#define F1 FP16_C (-5.8) +#define G1 FP16_C (-3.8) +#define H1 FP16_C (102) + +#define E2 FP16_C (4.9) +#define F2 FP16_C (-15.8) +#define G2 FP16_C (39.8) +#define H2 FP16_C (49) + +extern void abort (); + +float16_t src1[8] = { A0, B0, C0, D0, E0, F0, G0, H0 }; +float16_t src2[8] = { A1, B1, C1, D1, E1, F1, G1, H1 }; +VECT_VAR_DECL (src3, float, 16, 4) [] = { A2, B2, C2, D2 }; +VECT_VAR_DECL (src3, float, 16, 8) [] = { A2, B2, C2, D2, E2, F2, G2, H2 }; + +/* Expected results for vfmah_lane_f16. */ +uint16_t expected[4] = { 0x5E76 /* A0 + A1 * A2. */, + 0x4EF6 /* B0 + B1 * B2. */, + 0x6249 /* C0 + C1 * C2. */, + 0x56A0 /* D0 + D1 * D2. */ }; + +/* Expected results for vfmah_laneq_f16. */ +uint16_t expected_laneq[8] = { 0x5E76 /* A0 + A1 * A2. */, + 0x4EF6 /* B0 + B1 * B2. */, + 0x6249 /* C0 + C1 * C2. */, + 0x56A0 /* D0 + D1 * D2. */, + 0x60BF /* E0 + E1 * E2. */, + 0x507A /* F0 + F1 * F2. */, + 0xD9B9 /* G0 + G1 * G2. */, + 0x6CE2 /* H0 + H1 * H2. */ }; + +/* Expected results for vfmsh_lane_f16. */ +uint16_t expected_fms[4] = { 0xD937 /* A0 + -A1 * A2. */, + 0xD0EE /* B0 + -B1 * B2. */, + 0xE258 /* C0 + -C1 * C2. */, + 0xD560 /* D0 + -D1 * D2. */ }; + +/* Expected results for vfmsh_laneq_f16. */ +uint16_t expected_fms_laneq[8] = { 0xD937 /* A0 + -A1 * A2. */, + 0xD0EE /* B0 + -B1 * B2. */, + 0xE258 /* C0 + -C1 * C2. */, + 0xD560 /* D0 + -D1 * D2. */, + 0xE0B2 /* E0 + -E1 * E2. */, + 0xD89C /* F0 + -F1 * F2. */, + 0x5778 /* G0 + -G1 * G2. */, + 0xECE1 /* H0 + -H1 * H2.
*/ }; + +void exec_vfmash_lane_f16 (void) +{ +#define CHECK_LANE(N) \ + ret = vfmah_lane_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 4), N);\ + if (*(uint16_t *) &ret != expected[N])\ + abort (); + + DECL_VARIABLE(vsrc3, float, 16, 4); + VLOAD (vsrc3, src3, , float, f, 16, 4); + float16_t ret; + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + +#undef CHECK_LANE +#define CHECK_LANE(N) \ + ret = vfmah_laneq_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 8), N);\ + if (*(uint16_t *) &ret != expected_laneq[N]) \ + abort (); + + DECL_VARIABLE(vsrc3, float, 16, 8); + VLOAD (vsrc3, src3, q, float, f, 16, 8); + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + CHECK_LANE(4) + CHECK_LANE(5) + CHECK_LANE(6) + CHECK_LANE(7) + +#undef CHECK_LANE +#define CHECK_LANE(N) \ + ret = vfmsh_lane_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 4), N);\ + if (*(uint16_t *) &ret != expected_fms[N])\ + abort (); + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + +#undef CHECK_LANE +#define CHECK_LANE(N) \ + ret = vfmsh_laneq_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 8), N);\ + if (*(uint16_t *) &ret != expected_fms_laneq[N]) \ + abort (); + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + CHECK_LANE(4) + CHECK_LANE(5) + CHECK_LANE(6) + CHECK_LANE(7) +} + +int +main (void) +{ + exec_vfmash_lane_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c @@ -4,10 +4,17 @@ #ifdef __ARM_FEATURE_FMA /* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xe206, 0xe204, 0xe202, 0xe200 }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe455, 0xe454, 0xe453, 0xe452, + 0xe451, 0xe450, 0xe44f, 0xe44e }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d }; -VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, + 0xc48a5eb8, 0xc48a3eb8 }; #ifdef __aarch64__ -VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, + 0xc089fee1532b8520 }; #endif #define TEST_MSG "VFMS/VFMSQ" @@ -44,6 +51,18 @@ void exec_vfms (void) DECL_VARIABLE(VAR, float, 32, 4); #endif +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector1, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector3, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 4); + + DECL_VARIABLE(vector1, float, 16, 8); + DECL_VARIABLE(vector2, float, 16, 8); + DECL_VARIABLE(vector3, float, 16, 8); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif + DECL_VFMS_VAR(vector1); DECL_VFMS_VAR(vector2); DECL_VFMS_VAR(vector3); @@ -52,6 +71,10 @@ void exec_vfms (void) clean_results (); /* Initialize input "vector1" from "buffer". */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, q, float, f, 16, 8); +#endif VLOAD(vector1, buffer, , float, f, 32, 2); VLOAD(vector1, buffer, q, float, f, 32, 4); #ifdef __aarch64__ @@ -59,13 +82,21 @@ #endif /* Choose init value arbitrarily.
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 9.3f); + VDUP(vector2, q, float, f, 16, 8, 29.7f); +#endif VDUP(vector2, , float, f, 32, 2, 9.3f); VDUP(vector2, q, float, f, 32, 4, 29.7f); #ifdef __aarch64__ VDUP(vector2, q, float, f, 64, 2, 15.8f); #endif - + /* Choose init value arbitrarily. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector3, , float, f, 16, 4, 81.2f); + VDUP(vector3, q, float, f, 16, 8, 36.8f); +#endif VDUP(vector3, , float, f, 32, 2, 81.2f); VDUP(vector3, q, float, f, 32, 4, 36.8f); #ifdef __aarch64__ @@ -73,12 +104,20 @@ #endif /* Execute the tests. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VFMS(, float, f, 16, 4); + TEST_VFMS(q, float, f, 16, 8); +#endif TEST_VFMS(, float, f, 32, 2); TEST_VFMS(q, float, f, 32, 4); #ifdef __aarch64__ TEST_VFMS(q, float, f, 64, 2); #endif +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif CHECK_VFMS_RESULTS (TEST_MSG, ""); } #endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c @@ -0,0 +1,490 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + +#define A0 123.4f +#define A1 -3.8f +#define A2 -29.4f +#define A3 (__builtin_inff ()) +#define A4 0.0f +#define A5 24.0f +#define A6 124.0f +#define A7 1024.0f + +#define B0 -5.8f +#define B1 -0.0f +#define B2 -10.8f +#define B3 10.0f +#define B4 23.4f +#define B5 -1234.8f +#define B6 8.9f +#define B7 4.0f + +#define E0 9.8f +#define E1 -1024.0f +#define E2 (-__builtin_inff ()) +#define E3 479.0f +float32_t elem0 = E0; +float32_t elem1 = E1; +float32_t elem2 = E2; +float32_t elem3 = E3; + +#define DA0 1231234.4 +#define DA1 -3.8 +#define DA2 -2980.4 +#define DA3 -5.8 +#define DA4 0.01123 +#define DA5 24.0 +#define DA6 124.12345 +#define DA7 1024.0 + +#define DB0 -5.8 +#define DB1 (__builtin_inf ()) +#define DB2 -105.8 +#define DB3 10.0 +#define DB4 (-__builtin_inf ()) +#define DB5 -1234.8 +#define DB6 848.9 +#define DB7 44444.0 + +#define DE0 9.8 +#define DE1 -1024.0 +#define DE2 105.8 +#define DE3 479.0 +float64_t delem0 = DE0; +float64_t delem1 = DE1; +float64_t delem2 = DE2; +float64_t delem3 = DE3; + +/* Expected results for vfms_n.
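The fp32/fp64 expected values that follow are written as C constant expressions (A0 + -B0 * E0 and so on) rather than literal hex, and each array is then aliased through an hfloat32_t/hfloat64_t pointer because CHECK_FP compares bit patterns, not floating-point values. An editorial sketch of that idiom, not part of the patch; note that vfms_n fuses the multiply-subtract into a single rounding, so expressions like these match the intrinsic only because the inputs are chosen so that the separately rounded result agrees.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>

int
main (void)
{
  float a = 123.4f, b = -5.8f, e = 9.8f;

  float expected = a + -b * e;       /* folded by the compiler in the test */
  float fused = fmaf (-b, e, a);     /* what vfms_n_f32 computes: one rounding */

  uint32_t eb, fb;
  memcpy (&eb, &expected, sizeof eb);   /* bit-exact view, as CHECK_FP sees it */
  memcpy (&fb, &fused, sizeof fb);
  printf ("expected %#010" PRIx32 ", fused %#010" PRIx32 "%s\n",
          eb, fb, eb == fb ? "" : " (1-ulp difference)");
  return 0;
}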
*/ + +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); + + +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, + A2 + -B2 * E0, A3 + -B3 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, + A6 + -B6 * E1, A7 + -B7 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, + A4 + -B4 * E2, A6 + -B6 * E2}; +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, + A5 + -B5 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, + A2 + B2 * E0, A3 + B3 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, + A6 + B6 * E1, A7 + B7 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, + A4 + B4 * E2, A6 + B6 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, + A5 + B5 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); + +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, + DA1 + -DB1 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = 
{DA2 + -DB2 * DE1, + DA3 + -DB3 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, + DA5 + -DB5 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, + DA7 + -DB7 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, + DA1 + DB1 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, + DA3 + DB3 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, + DA5 + DB5 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, + DA7 + DB7 * DE3}; +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); + +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; + +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); + +void exec_vfma_vfms_n (void) +{ +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 2); + DECL_VARIABLE(vsrc_2, float, 32, 2); + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); + DECL_VARIABLE (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR 
(result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 4); + DECL_VARIABLE(vsrc_2, float, 32, 4); + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); + DECL_VARIABLE (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2,
float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 2); + DECL_VARIABLE(vsrc_2, float, 64, 2); + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); + DECL_VARIABLE (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma0_static, ""); + +
VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 1); + DECL_VARIABLE(vsrc_2, float, 64, 1); + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); + DECL_VARIABLE (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); + VECT_VAR (vector_res, float, 
64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma3_static, ""); +} +#endif + +int +main (void) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + exec_vfma_vfms_n (); +#endif + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmsh_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x42af /* 3.341797 */, + 0x5043 /* 34.093750 */, + 0xccd2 /* -19.281250 */, + 0x3712 /* 0.441895 */, + 0x3acc /* 0.849609 */, + 0x4848 /* 8.562500 */, + 0xcc43 /* -17.046875 */, + 0xd65c /* -101.750000 */, + 0x4185 /* 2.759766 */, + 0xcd39 /* -20.890625 */, + 0xd45b /* -69.687500 */, + 0x5241 /* 50.031250 */, + 0xc675 /* -6.457031 */, + 0x4d07 /* 20.109375 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VFMSH_F16" +#define INSN_NAME vfmsh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for ternary scalar operations.
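vfmsh_f16 (a, b, c) computes a - b * c in half precision with a single rounding; the 18 encodings above were obtained that way, with the final two entries pinned to +Inf (0x7c00) and -Inf (0xfc00). An editorial sketch of a host-side reference, not part of the patch, under the same __fp16 assumption as earlier; fmaf keeps the product exact in float, and the closing cast rounds once to binary16, which can differ from a true half-precision FMA only in rare double-rounding corner cases.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>

/* Reference for vfmsh_f16: a - b * c, then round to binary16.  */
static uint16_t
fmsh_ref (float a, float b, float c)
{
  __fp16 h = (__fp16) fmaf (-b, c, a);
  uint16_t u;
  memcpy (&u, &h, sizeof u);
  return u;
}

int
main (void)
{
  /* 0 - (-1) * Inf = +Inf, the penultimate entry of expected[].  */
  printf ("%#06" PRIx16 "\n", fmsh_ref (0.0f, -1.0f, INFINITY));  /* 0x7c00 */
  return 0;
}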
*/ +#include "ternary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c @@ -63,8 +63,8 @@ void exec_vget_high (void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, ""); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c @@ -13,6 +13,7 @@ uint32_t expected_u32 = 0xfffffff1; uint64_t expected_u64 = 0xfffffffffffffff0; poly8_t expected_p8 = 0xf6; poly16_t expected_p16 = 0xfff2; +hfloat16_t expected_f16 = 0xcb80; hfloat32_t expected_f32 = 0xc1700000; int8_t expectedq_s8 = 0xff; @@ -25,6 +26,7 @@ uint32_t expectedq_u32 = 0xfffffff2; uint64_t expectedq_u64 = 0xfffffffffffffff1; poly8_t expectedq_p8 = 0xfe; poly16_t expectedq_p16 = 0xfff6; +hfloat16_t expectedq_f16 = 0xca80; hfloat32_t expectedq_f32 = 0xc1500000; int error_found = 0; @@ -52,6 +54,12 @@ void exec_vget_lane (void) uint32_t var_int32; float32_t var_float32; } var_int32_float32; +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + union { + uint16_t var_int16; + float16_t var_float16; + } var_int16_float16; +#endif #define TEST_VGET_LANE_FP(Q, T1, T2, W, N, L) \ VAR(var, T1, W) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \ @@ -81,10 +89,17 @@ void exec_vget_lane (void) VAR_DECL(var, uint, 64); VAR_DECL(var, poly, 8); VAR_DECL(var, poly, 16); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + VAR_DECL(var, float, 16); +#endif VAR_DECL(var, float, 32); /* Initialize input values. 
*/ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + VLOAD(vector, buffer, , float, f, 16, 4); + VLOAD(vector, buffer, q, float, f, 16, 8); +#endif VLOAD(vector, buffer, , float, f, 32, 2); VLOAD(vector, buffer, q, float, f, 32, 4); @@ -99,6 +114,9 @@ void exec_vget_lane (void) TEST_VGET_LANE(, uint, u, 64, 1, 0); TEST_VGET_LANE(, poly, p, 8, 8, 6); TEST_VGET_LANE(, poly, p, 16, 4, 2); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VGET_LANE_FP(, float, f, 16, 4, 1); +#endif TEST_VGET_LANE_FP(, float, f, 32, 2, 1); TEST_VGET_LANE(q, int, s, 8, 16, 15); @@ -111,6 +129,9 @@ void exec_vget_lane (void) TEST_VGET_LANE(q, uint, u, 64, 2, 1); TEST_VGET_LANE(q, poly, p, 8, 16, 14); TEST_VGET_LANE(q, poly, p, 16, 8, 6); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VGET_LANE_FP(q, float, f, 16, 8, 3); +#endif TEST_VGET_LANE_FP(q, float, f, 32, 4, 3); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c @@ -63,8 +63,8 @@ void exec_vget_low (void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); #endif --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x4x2_t f_vld2_lane_f16 (float16_t * p, float16x4x2_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x8x2_t f_vld2q_lane_f16 (float16_t * p, float16x8x2_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x4x3_t f_vld3_lane_f16 (float16_t * p, float16x4x3_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x8x3_t f_vld3q_lane_f16 (float16_t * p, float16x8x3_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_f16_indices_1.c +++ 
b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x4x4_t f_vld4_lane_f16 (float16_t * p, float16x4x4_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ float16x8x4_t f_vld4q_lane_f16 (float16_t * p, float16x8x4_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c @@ -528,8 +528,8 @@ void exec_vldX (void) CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ \ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \ @@ -538,8 +538,8 @@ void exec_vldX (void) CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c @@ -270,8 +270,8 @@ void exec_vldX_dup (void) CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c @@ -451,14 +451,14 @@ void exec_vldX_lane (void) CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, 
comment); \ CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment); \ CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment); \ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c @@ -7,6 +7,10 @@ #define HAS_FLOAT_VARIANT +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#define HAS_FLOAT16_VARIANT +#endif + /* Expected results. */ VECT_VAR_DECL(expected,int,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; @@ -16,6 +20,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff1, 0xfff1, 0xfff2, 0xfff3 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcbc0, 0xcb80, 0xcb00, 0xca80 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1780000, 0xc1700000 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xf4, 0xf4, 0xf4, 0xf4, 0xf4, 0xf5, 0xf6, 0xf7, @@ -33,10 +40,36 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7 }; VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff1, 0xfffffff1, 0xfffffff2, 0xfffffff3 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcb40, 0xcb40, 0xcb00, 0xca80, + 0xca00, 0xc980, 0xc900, 0xc880 }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1680000, 0xc1680000, 0xc1600000, 0xc1500000 }; /* Expected results with special FP values. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00 }; +VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_mnan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxh_f16_1.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_fp16.h> + +/* Input values.
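Every scalar test in this series follows the same shape: it defines INSN_NAME, the INPUT_*/EXPECTED arrays and the type macros, then includes a shared driver such as binary_scalar_op.inc, which lives elsewhere in the testsuite. A hypothetical, stripped-down sketch of the loop such a driver implements; the names run_binary_scalar and max2 are made up for illustration, and plain float stands in for float16_t off target.

#include <stdint.h>
#include <string.h>

extern void abort (void);

typedef float scalar_t;        /* stand-in for float16_t on the target */

/* Apply the instruction under test to each input pair and compare the
   result's bit pattern against the expected encoding.  */
static void
run_binary_scalar (scalar_t (*insn) (scalar_t, scalar_t),
                   const scalar_t *in1, const scalar_t *in2,
                   const uint32_t *expected, int n)
{
  for (int i = 0; i < n; i++)
    {
      scalar_t r = insn (in1[i], in2[i]);
      uint32_t bits;
      memcpy (&bits, &r, sizeof bits);
      if (bits != expected[i])
        abort ();               /* any mismatch fails the whole test */
    }
}

static scalar_t
max2 (scalar_t a, scalar_t b)
{
  return a > b ? a : b;        /* models vmaxh_f16 away from NaNs */
}

int
main (void)
{
  scalar_t in1[] = { 123.4f, -567.8f };
  scalar_t in2[] = { 663.1f, 169.1f };
  uint32_t exp_bits[2];
  scalar_t e0 = 663.1f, e1 = 169.1f;   /* lane-wise max, as in vmaxh below */
  memcpy (&exp_bits[0], &e0, sizeof e0);
  memcpy (&exp_bits[1], &e1, sizeof e1);
  run_binary_scalar (max2, in1, in2, exp_bits, 2);
  return 0;
}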
*/ +#define A 123.4 +#define B -567.8 +#define C -34.8 +#define D 1024 +#define E 663.1 +#define F 169.1 +#define G -4.8 +#define H 77 + +float16_t input_1[] = { A, B, C, D }; +float16_t input_2[] = { E, F, G, H }; +float16_t expected[] = { E, F, G, D }; + +#define TEST_MSG "VMAXH_F16" +#define INSN_NAME vmaxh_f16 + +#define INPUT_1 input_1 +#define INPUT_2 input_2 +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. */ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnm_1.c @@ -0,0 +1,47 @@ +/* This file tests an intrinsic which currently has only an f16 variant and that + is only available when FP16 arithmetic instructions are supported. */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define INSN_NAME vmaxnm +#define TEST_MSG "VMAXNM/VMAXNMQ" + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#define HAS_FLOAT16_VARIANT +#endif + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcbc0, 0xcb80, 0xcb00, 0xca80 }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcb40, 0xcb40, 0xcb00, 0xca80, + 0xca00, 0xc980, 0xc900, 0xc880 }; +#endif + +/* Expected results with special FP values. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00 }; +VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif + +#include "binary_op_float.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnmh_f16_1.c @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +#define INFF __builtin_inf () + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x4000 /* 2.000000 */, + 0x5640 /* 100.000000 */, + 0x4f80 /* 30.000000 */, + 0x3666 /* 0.399902 */, + 0x3800 /* 0.500000 */, + 0x3d52 /* 1.330078 */, + 0xc64d /* -6.300781 */, + 0x4d00 /* 20.000000 */, + 0x355d /* 0.335205 */, + 0x409a /* 2.300781 */, + 0x3c00 /* 1.000000 */, + 0x4a91 /* 13.132812 */, + 0x34f6 /* 0.310059 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0x7c00 /* inf */ +}; + +#define TEST_MSG "VMAXNMH_F16" +#define INSN_NAME vmaxnmh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations.
*/ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnmv_f16_1.c @@ -0,0 +1,131 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (34.8) +#define B0 FP16_C (__builtin_nanf ("")) +#define C0 FP16_C (-__builtin_nanf ("")) +#define D0 FP16_C (0.0) + +#define A1 FP16_C (1025.8) +#define B1 FP16_C (13.4) +#define C1 FP16_C (__builtin_nanf ("")) +#define D1 FP16_C (10) +#define E1 FP16_C (-0.0) +#define F1 FP16_C (-__builtin_nanf ("")) +#define G1 FP16_C (0.0) +#define H1 FP16_C (10) + +/* Expected results for vmaxnmv. */ +uint16_t expect = 0x505A /* A0. */; +uint16_t expect_alt = 0x6402 /* A1. */; + +void exec_vmaxnmv_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMAXNMV (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + float16_t vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0}; + VLOAD (vsrc, buf_src1, , float, f, 16, 4); + vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0}; + VLOAD (vsrc, buf_src2, , float, f, 16, 4); + vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0}; + VLOAD (vsrc, buf_src3, , float, f, 16, 4); + vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + +#undef TEST_MSG +#define TEST_MSG "VMAXNMVQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src1, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src2, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src3, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1}; + VLOAD (vsrc, buf_src4, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1}; + VLOAD (vsrc, buf_src5, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *)
&vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1}; + VLOAD (vsrc, buf_src6, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1}; + VLOAD (vsrc, buf_src7, q, float, f, 16, 8); + vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); +} + +int +main (void) +{ + exec_vmaxnmv_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxv_f16_1.c @@ -0,0 +1,131 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (123.4) +#define B0 FP16_C (-567.8) +#define C0 FP16_C (34.8) +#define D0 FP16_C (0.0) + +#define A1 FP16_C (1025.8) +#define B1 FP16_C (13.4) +#define C1 FP16_C (-567.8) +#define D1 FP16_C (10) +#define E1 FP16_C (-0.0) +#define F1 FP16_C (567.8) +#define G1 FP16_C (0.0) +#define H1 FP16_C (10) + +/* Expected results for vmaxv. */ +uint16_t expect = 0x57B6 /* A0. */; +uint16_t expect_alt = 0x6402 /* A1. */; + +void exec_vmaxv_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMAXV (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + float16_t vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0}; + VLOAD (vsrc, buf_src1, , float, f, 16, 4); + vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0}; + VLOAD (vsrc, buf_src2, , float, f, 16, 4); + vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0}; + VLOAD (vsrc, buf_src3, , float, f, 16, 4); + vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + +#undef TEST_MSG +#define TEST_MSG "VMAXVQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src1, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src2, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src3, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != 
expect_alt) + abort (); + + VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1}; + VLOAD (vsrc, buf_src4, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1}; + VLOAD (vsrc, buf_src5, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1}; + VLOAD (vsrc, buf_src6, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1}; + VLOAD (vsrc, buf_src7, q, float, f, 16, 8); + vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); +} + +int +main (void) +{ + exec_vmaxv_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmin.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmin.c @@ -7,6 +7,10 @@ #define HAS_FLOAT_VARIANT +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#define HAS_FLOAT16_VARIANT +#endif + /* Expected results. */ VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf3, 0xf3, 0xf3, 0xf3 }; @@ -16,6 +20,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf3, 0xf3, 0xf3, 0xf3 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff1, 0xfff1 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff0 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcbc0, 0xcbc0, 0xcbc0 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1780000 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf4, 0xf4, 0xf4, @@ -31,11 +38,41 @@ VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf9, 0xf9, 0xf9, 0xf9 }; VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2, 0xfff2 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0xcb40, 0xcb40, + 0xcb40, 0xcb40, 0xcb40, 0xcb40 }; +#endif VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff1, 0xfffffff1, 0xfffffff1 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, 0xc1680000, 0xc1680000 }; /* Expected results with special FP values. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00, + 0xfc00, 0xfc00, + 0xfc00, 0xfc00 }; +VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000 }; +VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000 }; +#endif VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_mnan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminh_f16_1.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_fp16.h> + +/* Input values. */ +#define A 123.4 +#define B -567.8 +#define C -34.8 +#define D 1024 +#define E 663.1 +#define F 169.1 +#define G -4.8 +#define H 77 + +float16_t input_1[] = { A, B, C, D }; +float16_t input_2[] = { E, F, G, H }; +float16_t expected[] = { A, B, C, H }; + +#define TEST_MSG "VMINH_F16" +#define INSN_NAME vminh_f16 + +#define INPUT_1 input_1 +#define INPUT_2 input_2 +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. */ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnm_1.c @@ -0,0 +1,51 @@ +/* This file tests an intrinsic which currently has only an f16 variant and + is only available when FP16 arithmetic instructions are supported. */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define INSN_NAME vminnm +#define TEST_MSG "VMINNM/VMINNMQ" + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#define HAS_FLOAT16_VARIANT +#endif + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcbc0, 0xcbc0, 0xcbc0 }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0xcb40, 0xcb40, + 0xcb40, 0xcb40, 0xcb40, 0xcb40 }; +#endif + +/* Expected results with special FP values.
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00, + 0x3c00, 0x3c00 }; +VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00, + 0xfc00, 0xfc00, + 0xfc00, 0xfc00 }; +VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000 }; +VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000 }; +#endif + +#include "binary_op_float.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnmh_f16_1.c @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +#define INFF __builtin_inf () + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0xc454 /* -4.328125 */, + 0x4233 /* 3.099609 */, + 0x4d00 /* 20.000000 */, + 0xa51f /* -0.020004 */, + 0xc09a /* -2.300781 */, + 0xc73b /* -7.230469 */, + 0xc79a /* -7.601562 */, + 0x34f6 /* 0.310059 */, + 0xc73b /* -7.230469 */, + 0x3800 /* 0.500000 */, + 0xc79a /* -7.601562 */, + 0x451a /* 5.101562 */, + 0xc64d /* -6.300781 */, + 0x3556 /* 0.333496 */, + 0xfc00 /* -inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VMINNMH_F16" +#define INSN_NAME vminnmh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. */ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnmv_f16_1.c @@ -0,0 +1,131 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (-567.8) +#define B0 FP16_C (__builtin_nanf ("")) +#define C0 FP16_C (34.8) +#define D0 FP16_C (-__builtin_nanf ("")) + +#define A1 FP16_C (-567.8) +#define B1 FP16_C (1025.8) +#define C1 FP16_C (-__builtin_nanf ("")) +#define D1 FP16_C (10) +#define E1 FP16_C (-0.0) +#define F1 FP16_C (__builtin_nanf ("")) +#define G1 FP16_C (0.0) +#define H1 FP16_C (10) + +/* Expected results for vminnmv. */ +uint16_t expect = 0xE070 /* A0. */; +uint16_t expect_alt = 0xE070 /* A1.
*/; + +void exec_vminnmv_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMINNMV (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + float16_t vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0}; + VLOAD (vsrc, buf_src1, , float, f, 16, 4); + vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0}; + VLOAD (vsrc, buf_src2, , float, f, 16, 4); + vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0}; + VLOAD (vsrc, buf_src3, , float, f, 16, 4); + vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + +#undef TEST_MSG +#define TEST_MSG "VMINNMVQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src1, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src2, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src3, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1}; + VLOAD (vsrc, buf_src4, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1}; + VLOAD (vsrc, buf_src5, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1}; + VLOAD (vsrc, buf_src6, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1}; + VLOAD (vsrc, buf_src7, q, float, f, 16, 8); + vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); +} + +int +main (void) +{ + exec_vminnmv_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminv_f16_1.c @@ -0,0 +1,131 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include
"arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A0 FP16_C (-567.8) +#define B0 FP16_C (123.4) +#define C0 FP16_C (34.8) +#define D0 FP16_C (0.0) + +#define A1 FP16_C (-567.8) +#define B1 FP16_C (1025.8) +#define C1 FP16_C (13.4) +#define D1 FP16_C (10) +#define E1 FP16_C (-0.0) +#define F1 FP16_C (567.8) +#define G1 FP16_C (0.0) +#define H1 FP16_C (10) + +/* Expected results for vminv. */ +uint16_t expect = 0xE070 /* A0. */; +uint16_t expect_alt = 0xE070 /* A1. */; + +void exec_vminv_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMINV (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + float16_t vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0}; + VLOAD (vsrc, buf_src1, , float, f, 16, 4); + vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0}; + VLOAD (vsrc, buf_src2, , float, f, 16, 4); + vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0}; + VLOAD (vsrc, buf_src3, , float, f, 16, 4); + vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4)); + + if (* (uint16_t *) &vector_res != expect) + abort (); + +#undef TEST_MSG +#define TEST_MSG "VMINVQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src1, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src2, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1}; + VLOAD (vsrc, buf_src3, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1}; + VLOAD (vsrc, buf_src4, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1}; + VLOAD (vsrc, buf_src5, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1}; + VLOAD (vsrc, buf_src6, q, float, f, 16, 8); + vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); + + VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1}; + VLOAD (vsrc, buf_src7, q, float, f, 16, 8); + 
vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8)); + + if (* (uint16_t *) &vector_res != expect_alt) + abort (); +} + +int +main (void) +{ + exec_vminv_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c @@ -35,11 +35,11 @@ void exec_vmovn (void) TEST_VMOVN(uint, u, 32, 16, 4); TEST_VMOVN(uint, u, 64, 32, 2); - CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, ""); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, ""); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c @@ -13,6 +13,10 @@ VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfab0, 0xfb05, 0xfb5a, 0xfbaf }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffff9a0, 0xfffffa06 }; VECT_VAR_DECL(expected,poly,8,8) [] = { 0xc0, 0x84, 0x48, 0xc, 0xd0, 0x94, 0x58, 0x1c }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xe02a, 0xdfcf, + 0xdf4a, 0xdec4 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc4053333, 0xc3f9c000 }; VECT_VAR_DECL(expected,int,8,16) [] = { 0x90, 0x7, 0x7e, 0xf5, 0x6c, 0xe3, 0x5a, 0xd1, @@ -34,13 +38,15 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0xca, 0x34, 0x9e, 0xc8, 0x62, 0x9c, 0x36, 0x30, 0x9a, 0x64, 0xce, 0x98, 0x32, 0xcc, 0x66 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe63a, 0xe5d6, 0xe573, 0xe50f, + 0xe4ac, 0xe448, 0xe3c8, 0xe301 }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4c73333, 0xc4bac000, 0xc4ae4ccd, 0xc4a1d999 }; -#ifndef INSN_NAME #define INSN_NAME vmul #define TEST_MSG "VMUL" -#endif #define FNNAME1(NAME) exec_ ## NAME #define FNNAME(NAME) FNNAME1(NAME) @@ -80,6 +86,17 @@ void FNNAME (INSN_NAME) (void) DECL_VMUL(poly, 8, 16); DECL_VMUL(float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector1, float, 16, 4); + DECL_VARIABLE(vector1, float, 16, 8); + + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 8); + + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif + clean_results (); /* Initialize input "vector1" from "buffer". */ @@ -99,6 +116,10 @@ void FNNAME (INSN_NAME) (void) VLOAD(vector1, buffer, q, uint, u, 32, 4); VLOAD(vector1, buffer, q, poly, p, 8, 16); VLOAD(vector1, buffer, q, float, f, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector1, buffer, , float, f, 16, 4); + VLOAD(vector1, buffer, q, float, f, 16, 8); +#endif /* Choose init value arbitrarily. */ VDUP(vector2, , int, s, 8, 8, 0x11); @@ -117,6 +138,10 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, q, uint, u, 32, 4, 0xCC); VDUP(vector2, q, poly, p, 8, 16, 0xAA); VDUP(vector2, q, float, f, 32, 4, 99.6f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 33.3f); + VDUP(vector2, q, float, f, 16, 8, 99.6f); +#endif /* Execute the tests. 
*/ TEST_VMUL(INSN_NAME, , int, s, 8, 8); @@ -135,6 +160,10 @@ void FNNAME (INSN_NAME) (void) TEST_VMUL(INSN_NAME, q, uint, u, 32, 4); TEST_VMUL(INSN_NAME, q, poly, p, 8, 16); TEST_VMUL(INSN_NAME, q, float, f, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VMUL(INSN_NAME, , float, f, 16, 4); + TEST_VMUL(INSN_NAME, q, float, f, 16, 8); +#endif CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, ""); CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); @@ -142,7 +171,7 @@ void FNNAME (INSN_NAME) (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, ""); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, ""); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); @@ -150,8 +179,12 @@ void FNNAME (INSN_NAME) (void) CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane.c @@ -7,6 +7,9 @@ VECT_VAR_DECL(expected,int,16,4) [] = { 0xffc0, 0xffc4, 0xffc8, 0xffcc }; VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffde0, 0xfffffe02 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xbbc0, 0xc004, 0xc448, 0xc88c }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffface0, 0xffffb212 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xddb3, 0xdd58, 0xdcfd, 0xdca1 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc3b66666, 0xc3ab0000 }; VECT_VAR_DECL(expected,int,16,8) [] = { 0xffc0, 0xffc4, 0xffc8, 0xffcc, 0xffd0, 0xffd4, 0xffd8, 0xffdc }; @@ -16,6 +19,10 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xbbc0, 0xc004, 0xc448, 0xc88c, 0xccd0, 0xd114, 0xd558, 0xd99c }; VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffface0, 0xffffb212, 0xffffb744, 0xffffbc76 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xddb3, 0xdd58, 0xdcfd, 0xdca1, + 0xdc46, 0xdbd6, 0xdb20, 0xda69 }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc3b66666, 0xc3ab0000, 0xc39f9999, 0xc3943333 }; @@ -45,11 +52,20 @@ void exec_vmul_lane (void) DECL_VMUL(vector); DECL_VMUL(vector_res); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector2, int, 16, 4); DECL_VARIABLE(vector2, int, 32, 2); DECL_VARIABLE(vector2, uint, 16, 4); DECL_VARIABLE(vector2, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector2, float, 16, 4); +#endif DECL_VARIABLE(vector2, float, 32, 2); clean_results (); @@ -59,11 +75,17 @@ void exec_vmul_lane (void) VLOAD(vector, buffer, , int, s, 32, 2); VLOAD(vector, buffer, , uint, u, 16, 4); VLOAD(vector, buffer, , uint, u, 32, 2); +#if defined 
(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, , float, f, 16, 4); +#endif VLOAD(vector, buffer, , float, f, 32, 2); VLOAD(vector, buffer, q, int, s, 16, 8); VLOAD(vector, buffer, q, int, s, 32, 4); VLOAD(vector, buffer, q, uint, u, 16, 8); VLOAD(vector, buffer, q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, q, float, f, 16, 8); +#endif VLOAD(vector, buffer, q, float, f, 32, 4); /* Initialize vector2. */ @@ -71,6 +93,9 @@ void exec_vmul_lane (void) VDUP(vector2, , int, s, 32, 2, 0x22); VDUP(vector2, , uint, u, 16, 4, 0x444); VDUP(vector2, , uint, u, 32, 2, 0x532); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 22.8f); +#endif VDUP(vector2, , float, f, 32, 2, 22.8f); /* Choose lane arbitrarily. */ @@ -78,22 +103,34 @@ TEST_VMUL_LANE(, int, s, 32, 2, 2, 1); TEST_VMUL_LANE(, uint, u, 16, 4, 4, 2); TEST_VMUL_LANE(, uint, u, 32, 2, 2, 1); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VMUL_LANE(, float, f, 16, 4, 4, 1); +#endif TEST_VMUL_LANE(, float, f, 32, 2, 2, 1); TEST_VMUL_LANE(q, int, s, 16, 8, 4, 2); TEST_VMUL_LANE(q, int, s, 32, 4, 2, 0); TEST_VMUL_LANE(q, uint, u, 16, 8, 4, 2); TEST_VMUL_LANE(q, uint, u, 32, 4, 2, 1); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VMUL_LANE(q, float, f, 16, 8, 4, 0); +#endif TEST_VMUL_LANE(q, float, f, 32, 4, 2, 0); - CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, ""); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane_f16_1.c @@ -0,0 +1,454 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (-56.8) +#define C FP16_C (-34.8) +#define D FP16_C (12) +#define E FP16_C (63.1) +#define F FP16_C (19.1) +#define G FP16_C (-4.8) +#define H FP16_C (77) + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-8) +#define O FP16_C (-1.1) +#define P FP16_C (-9.7) + +/* Expected results for vmul_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 4) [] + = { 0x629B /* A * E. */, + 0xEB00 /* B * E. */, + 0xE84A /* C * E. */, + 0x61EA /* D * E. */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 4) [] + = { 0x5BFF /* A * F. */, + 0xE43D /* B * F. */, + 0xE131 /* C * F. */, + 0x5B29 /* D * F.
*/ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 4) [] + = { 0xD405 /* A * G. */, + 0x5C43 /* B * G. */, + 0x5939 /* C * G. */, + 0xD334 /* D * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 4) [] + = { 0x6408 /* A * H. */, + 0xEC46 /* B * H. */, + 0xE93C /* C * H. */, + 0x6338 /* D * H. */ }; + +/* Expected results for vmulq_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 8) [] + = { 0x629B /* A * E. */, + 0xEB00 /* B * E. */, + 0xE84A /* C * E. */, + 0x61EA /* D * E. */, + 0x5186 /* I * E. */, + 0xECCE /* J * E. */, + 0x6189 /* K * E. */, + 0x6E0A /* L * E. */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 8) [] + = { 0x5BFF /* A * F. */, + 0xE43D /* B * F. */, + 0xE131 /* C * F. */, + 0x5B29 /* D * F. */, + 0x4AAF /* I * F. */, + 0xE5D1 /* J * F. */, + 0x5AB3 /* K * F. */, + 0x674F /* L * F. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 8) [] + = { 0xD405 /* A * G. */, + 0x5C43 /* B * G. */, + 0x5939 /* C * G. */, + 0xD334 /* D * G. */, + 0xC2B9 /* I * G. */, + 0x5DDA /* J * G. */, + 0xD2BD /* K * G. */, + 0xDF5A /* L * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 8) [] + = { 0x6408 /* A * H. */, + 0xEC46 /* B * H. */, + 0xE93C /* C * H. */, + 0x6338 /* D * H. */, + 0x52BD /* I * H. */, + 0xEDDE /* J * H. */, + 0x62C1 /* K * H. */, + 0x6F5E /* L * H. */ }; + +/* Expected results for vmul_laneq. */ +VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 4) [] + = { 0x629B /* A * E. */, + 0xEB00 /* B * E. */, + 0xE84A /* C * E. */, + 0x61EA /* D * E. */ }; + +VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 4) [] + = { 0x5BFF /* A * F. */, + 0xE43D /* B * F. */, + 0xE131 /* C * F. */, + 0x5B29 /* D * F. */ }; + +VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 4) [] + = { 0xD405 /* A * G. */, + 0x5C43 /* B * G. */, + 0x5939 /* C * G. */, + 0xD334 /* D * G. */ }; + +VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 4) [] + = { 0x6408 /* A * H. */, + 0xEC46 /* B * H. */, + 0xE93C /* C * H. */, + 0x6338 /* D * H. */ }; + +VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 4) [] + = { 0x648F /* A * M. */, + 0xECD5 /* B * M. */, + 0xE9ED /* C * M. */, + 0x6416 /* D * M. */ }; + +VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 4) [] + = { 0xD6B3 /* A * N. */, + 0x5F1A /* B * N. */, + 0x5C5A /* C * N. */, + 0xD600 /* D * N. */ }; + +VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 4) [] + = { 0xCB5E /* A * O. */, + 0x53CF /* B * O. */, + 0x50C9 /* C * O. */, + 0xCA99 /* D * O. */ }; + +VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 4) [] + = { 0xD810 /* A * P. */, + 0x604F /* B * P. */, + 0x5D47 /* C * P. */, + 0xD747 /* D * P. */ }; + +/* Expected results for vmulq_laneq. */ +VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 8) [] + = { 0x629B /* A * E. */, + 0xEB00 /* B * E. */, + 0xE84A /* C * E. */, + 0x61EA /* D * E. */, + 0x5186 /* I * E. */, + 0xECCE /* J * E. */, + 0x6189 /* K * E. */, + 0x6E0A /* L * E. */ }; + +VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 8) [] + = { 0x5BFF /* A * F. */, + 0xE43D /* B * F. */, + 0xE131 /* C * F. */, + 0x5B29 /* D * F. */, + 0x4AAF /* I * F. */, + 0xE5D1 /* J * F. */, + 0x5AB3 /* K * F. */, + 0x674F /* L * F. */ }; + +VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 8) [] + = { 0xD405 /* A * G. */, + 0x5C43 /* B * G. */, + 0x5939 /* C * G. */, + 0xD334 /* D * G. */, + 0xC2B9 /* I * G. */, + 0x5DDA /* J * G. */, + 0xD2BD /* K * G. */, + 0xDF5A /* L * G. */ }; + +VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 8) [] + = { 0x6408 /* A * H. */, + 0xEC46 /* B * H. */, + 0xE93C /* C * H. 
*/, + 0x6338 /* D * H. */, + 0x52BD /* I * H. */, + 0xEDDE /* J * H. */, + 0x62C1 /* K * H. */, + 0x6F5E /* L * H. */ }; + +VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 8) [] + = { 0x648F /* A * M. */, + 0xECD5 /* B * M. */, + 0xE9ED /* C * M. */, + 0x6416 /* D * M. */, + 0x53A0 /* I * M. */, + 0xEEA3 /* J * M. */, + 0x63A4 /* K * M. */, + 0x702B /* L * M. */ }; + +VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 8) [] + = { 0xD6B3 /* A * N. */, + 0x5F1A /* B * N. */, + 0x5C5A /* C * N. */, + 0xD600 /* D * N. */, + 0xC59A /* I * N. */, + 0x60E0 /* J * N. */, + 0xD59D /* K * N. */, + 0xE220 /* L * N. */ }; + +VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 8) [] + = { 0xCB5E /* A * O. */, + 0x53CF /* B * O. */, + 0x50C9 /* C * O. */, + 0xCA99 /* D * O. */, + 0xBA29 /* I * O. */, + 0x555C /* J * O. */, + 0xCA2C /* K * O. */, + 0xD6BC /* L * O. */ }; + +VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 8) [] + = { 0xD810 /* A * P. */, + 0x604F /* B * P. */, + 0x5D47 /* C * P. */, + 0xD747 /* D * P. */, + 0xC6CB /* I * P. */, + 0x61EA /* J * P. */, + 0xD6CF /* K * P. */, + 0xE36E /* L * P. */ }; + +void exec_vmul_lane_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMUL_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULQ_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 0); + + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_lane_f16 
(VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMUL_LANEQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P}; + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq3_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 4); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq4_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 5); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq5_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 6); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq6_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 7); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq7_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULQ_LANEQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), 
+ VECT_VAR (vsrc_2, float, 16, 8), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq3_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq4_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq5_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq6_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq7_static, ""); +} + +int +main (void) +{ + exec_vmul_lane_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_n.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_n.c @@ -7,6 +7,9 @@ VECT_VAR_DECL(expected,int,16,4) [] = { 0xfef0, 0xff01, 0xff12, 0xff23 }; VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffde0, 0xfffffe02 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfcd0, 0xfd03, 0xfd36, 0xfd69 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffbc0, 0xfffffc04 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xdd93, 0xdd3a, 0xdce1, 0xdc87 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc3b26666, 0xc3a74000 }; VECT_VAR_DECL(expected,int,16,8) [] = { 0xfab0, 0xfb05, 0xfb5a, 0xfbaf, 0xfc04, 0xfc59, 0xfcae, 0xfd03 }; @@ -16,6 +19,10 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xf890, 0xf907, 0xf97e, 0xf9f5, 0xfa6c, 0xfae3, 0xfb5a, 0xfbd1 }; VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffff780, 0xfffff808, 0xfffff890, 0xfffff918 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe58e, 0xe535, 0xe4dc, 0xe483, + 0xe42a, 0xe3a3, 0xe2f2, 0xe240 }; +#endif VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4b1cccd, 0xc4a6b000, 0xc49b9333, 0xc4907667 }; @@ -50,6 +57,13 @@ void FNNAME (INSN_NAME) (void) DECL_VMUL(vector); DECL_VMUL(vector_res); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); + DECL_VARIABLE(vector_res, float, 16, 4); + 
DECL_VARIABLE(vector_res, float, 16, 8); +#endif + clean_results (); /* Initialize vector from pre-initialized values. */ @@ -57,11 +71,17 @@ void FNNAME (INSN_NAME) (void) VLOAD(vector, buffer, , int, s, 32, 2); VLOAD(vector, buffer, , uint, u, 16, 4); VLOAD(vector, buffer, , uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, , float, f, 16, 4); +#endif VLOAD(vector, buffer, , float, f, 32, 2); VLOAD(vector, buffer, q, int, s, 16, 8); VLOAD(vector, buffer, q, int, s, 32, 4); VLOAD(vector, buffer, q, uint, u, 16, 8); VLOAD(vector, buffer, q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, q, float, f, 16, 8); +#endif VLOAD(vector, buffer, q, float, f, 32, 4); /* Choose multiplier arbitrarily. */ @@ -69,22 +89,34 @@ TEST_VMUL_N(, int, s, 32, 2, 0x22); TEST_VMUL_N(, uint, u, 16, 4, 0x33); TEST_VMUL_N(, uint, u, 32, 2, 0x44); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VMUL_N(, float, f, 16, 4, 22.3f); +#endif TEST_VMUL_N(, float, f, 32, 2, 22.3f); TEST_VMUL_N(q, int, s, 16, 8, 0x55); TEST_VMUL_N(q, int, s, 32, 4, 0x66); TEST_VMUL_N(q, uint, u, 16, 8, 0x77); TEST_VMUL_N(q, uint, u, 32, 4, 0x88); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VMUL_N(q, float, f, 16, 8, 88.9f); +#endif TEST_VMUL_N(q, float, f, 32, 4, 88.9f); - CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, ""); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulh_f16_1.c @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +#define INFF __builtin_inf () + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0xc854 /* -8.656250 */, + 0x5cd8 /* 310.000000 */, + 0x60b0 /* 600.000000 */, + 0xa019 /* -0.008003 */, + 0xbc9a /* -1.150391 */, + 0xc8cf /* -9.617188 */, + 0x51fd /* 47.906250 */, + 0x4634 /* 6.203125 */, + 0xc0d9 /* -2.423828 */, + 0x3c9a /* 1.150391 */, + 0xc79a /* -7.601562 */, + 0x5430 /* 67.000000 */, + 0xbfd0 /* -1.953125 */, + 0x46ac /* 6.671875 */, + 0xfc00 /* -inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VMULH_F16" +#define INSN_NAME vmulh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations.
*/ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulh_lane_f16_1.c @@ -0,0 +1,90 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (-56.8) +#define C FP16_C (-34.8) +#define D FP16_C (12) +#define E FP16_C (63.1) +#define F FP16_C (19.1) +#define G FP16_C (-4.8) +#define H FP16_C (77) + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-8) +#define O FP16_C (-1.1) +#define P FP16_C (-9.7) + +extern void abort (); + +float16_t src1[8] = { A, B, C, D, I, J, K, L }; +VECT_VAR_DECL (src2, float, 16, 4) [] = { E, F, G, H }; +VECT_VAR_DECL (src2, float, 16, 8) [] = { E, F, G, H, M, N, O, P }; + +/* Expected results for vmulh_lane. */ +uint16_t expected[4] = { 0x629B /* A * E. */, 0xE43D /* B * F. */, + 0x5939 /* C * G. */, 0x6338 /* D * H. */ }; + + +/* Expected results for vmulh_lane. */ +uint16_t expected_laneq[8] = { 0x629B /* A * E. */, + 0xE43D /* B * F. */, + 0x5939 /* C * G. */, + 0x6338 /* D * H. */, + 0x53A0 /* I * M. */, + 0x60E0 /* J * N. */, + 0xCA2C /* K * O. */, + 0xE36E /* L * P. */ }; + +void exec_vmulh_lane_f16 (void) +{ +#define CHECK_LANE(N)\ + ret = vmulh_lane_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 4), N);\ + if (*(uint16_t *) &ret != expected[N])\ + abort (); + + DECL_VARIABLE(vsrc2, float, 16, 4); + VLOAD (vsrc2, src2, , float, f, 16, 4); + float16_t ret; + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + +#undef CHECK_LANE +#define CHECK_LANE(N)\ + ret = vmulh_laneq_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 8), N);\ + if (*(uint16_t *) &ret != expected_laneq[N])\ + abort (); + + DECL_VARIABLE(vsrc2, float, 16, 8); + VLOAD (vsrc2, src2, q, float, f, 16, 8); + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + CHECK_LANE(4) + CHECK_LANE(5) + CHECK_LANE(6) + CHECK_LANE(7) +} + +int +main (void) +{ + exec_vmulh_lane_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c @@ -59,13 +59,13 @@ void exec_vmull (void) TEST_VMULL(uint, u, 32, 64, 2); TEST_VMULL(poly, p, 8, 16, 8); - CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, ""); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, ""); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, ""); } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c @@ -54,9 +54,9 @@ void exec_vmull_lane (void) TEST_VMULL_LANE(uint, u, 32, 64, 2, 1); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, ""); + CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, ""); 
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, ""); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, ""); } int main (void) --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_f16_1.c @@ -0,0 +1,84 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (__builtin_inff ()) +#define C FP16_C (-34.8) +#define D FP16_C (-__builtin_inff ()) +#define E FP16_C (63.1) +#define F FP16_C (0.0) +#define G FP16_C (-4.8) +#define H FP16_C (0.0) + +#define I FP16_C (0.7) +#define J FP16_C (-__builtin_inff ()) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-0.0) +#define O FP16_C (-1.1) +#define P FP16_C (7) + +/* Expected results for vmulx. */ +VECT_VAR_DECL (expected_static, hfloat, 16, 4) [] + = { 0x629B /* A * E. */, 0x4000 /* FP16_C (2.0f). */, + 0x5939 /* C * G. */, 0xC000 /* FP16_C (-2.0f). */ }; + +VECT_VAR_DECL (expected_static, hfloat, 16, 8) [] + = { 0x629B /* A * E. */, 0x4000 /* FP16_C (2.0f). */, + 0x5939 /* C * G. */, 0xC000 /* FP16_C (-2.0f). */, + 0x53A0 /* I * M. */, 0x4000 /* FP16_C (2.0f). */, + 0xCA2C /* K * O. */, 0x615C /* L * P. */ }; + +void exec_vmulx_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMULX (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vmulx_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4)); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULXQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L}; + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vmulxq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8)); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, ""); +} + +int +main (void) +{ + exec_vmulx_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_lane_f16_1.c @@ -0,0 +1,452 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (__builtin_inff ()) +#define C FP16_C (-34.8) +#define D FP16_C (-__builtin_inff ()) +#define E FP16_C (-0.0) +#define F FP16_C (19.1) +#define G FP16_C (-4.8) +#define H FP16_C (0.0) + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C
(-__builtin_inff ()) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-8) +#define O FP16_C (-1.1) +#define P FP16_C (-0.0) + +/* Expected results for vmulx_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 4) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 4) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 4) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 4) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */ }; + +/* Expected results for vmulxq_lane. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 8) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* I * E. */, + 0x0000 /* J * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* L * E. */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 8) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */, + 0x4AAF /* I * F. */, + 0xE5D1 /* J * F. */, + 0xFC00 /* K * F. */, + 0x674F /* L * F. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 8) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */, + 0xC2B9 /* I * G. */, + 0x5DDA /* J * G. */, + 0x7C00 /* K * G. */, + 0xDF5A /* L * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 8) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* I * H. */, + 0x8000 /* J * H. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* L * H. */}; + +/* Expected results for vmulx_laneq. */ +VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 4) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */ }; + +VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 4) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */ }; + +VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 4) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */ }; + +VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 4) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */ }; + +VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 4) [] + = { 0x648F /* A * M. */, + 0x7C00 /* B * M. */, + 0xE9ED /* C * M. */, + 0xFC00 /* D * M. */ }; + +VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 4) [] + = { 0xD6B3 /* A * N. */, + 0xFC00 /* B * N. */, + 0x5C5A /* C * N. */, + 0x7C00 /* D * N. */ }; + +VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 4) [] + = { 0xCB5E /* A * O. */, + 0xFC00 /* B * O. */, + 0x50C9 /* C * O. */, + 0x7C00 /* D * O. */ }; + +VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 4) [] + = { 0x8000 /* A * P. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * P. */, + 0x4000 /* FP16_C (2.0f). */ }; + +VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 8) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* I * E. */, + 0x0000 /* J * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* L * E. 
*/ }; + +VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 8) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */, + 0x4AAF /* I * F. */, + 0xE5D1 /* J * F. */, + 0xFC00 /* K * F. */, + 0x674F /* L * F. */ }; + +VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 8) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */, + 0xC2B9 /* I * G. */, + 0x5DDA /* J * G. */, + 0x7C00 /* K * G. */, + 0xDF5A /* L * G. */ }; + +VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 8) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* I * H. */, + 0x8000 /* J * H. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* L * H. */ }; + +VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 8) [] + = { 0x648F /* A * M. */, + 0x7C00 /* B * M. */, + 0xE9ED /* C * M. */, + 0xFC00 /* D * M. */, + 0x53A0 /* I * M. */, + 0xEEA3 /* J * M. */, + 0xFC00 /* K * M. */, + 0x702B /* L * M. */ }; + +VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 8) [] + = { 0xD6B3 /* A * N. */, + 0xFC00 /* B * N. */, + 0x5C5A /* C * N. */, + 0x7C00 /* D * N. */, + 0xC59A /* I * N. */, + 0x60E0 /* J * N. */, + 0x7C00 /* K * N. */, + 0xE220 /* L * N. */ }; + +VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 8) [] + = { 0xCB5E /* A * O. */, + 0xFC00 /* B * O. */, + 0x50C9 /* C * O. */, + 0x7C00 /* D * O. */, + 0xBA29 /* I * O. */, + 0x555C /* J * O. */, + 0x7C00 /* K * O. */, + 0xD6BC /* L * O. */ }; + +VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 8) [] + = { 0x8000 /* A * P. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * P. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* I * P. */, + 0x0000 /* J * P. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* L * P. 
*/ }; + +void exec_vmulx_lane_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMULX_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULXQ_LANE (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 4), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULX_LANEQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P}; + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 0); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 
(VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 1); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 2); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 3); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq3_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 4); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq4_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 5); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq5_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 6); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq6_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 8), 7); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq7_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULXQ_LANEQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 0); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 1); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 2); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 3); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq3_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 4); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq4_static, ""); + + 
VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 5); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq5_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 6); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq6_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8), 7); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq7_static, ""); +} + +int +main (void) +{ + exec_vmulx_lane_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_n_f16_1.c @@ -0,0 +1,177 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (__builtin_inff ()) +#define C FP16_C (-34.8) +#define D FP16_C (-__builtin_inff ()) +#define E FP16_C (-0.0) +#define F FP16_C (19.1) +#define G FP16_C (-4.8) +#define H FP16_C (0.0) + +float16_t elemE = E; +float16_t elemF = F; +float16_t elemG = G; +float16_t elemH = H; + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-8) +#define O FP16_C (-1.1) +#define P FP16_C (-9.7) + +/* Expected results for vmulx_n. */ +VECT_VAR_DECL (expected0_static, hfloat, 16, 4) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 4) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 4) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 4) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */ }; + +VECT_VAR_DECL (expected0_static, hfloat, 16, 8) [] + = { 0x8000 /* A * E. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* C * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* I * E. */, + 0x0000 /* J * E. */, + 0x8000 /* K * E. */, + 0x8000 /* L * E. */ }; + +VECT_VAR_DECL (expected1_static, hfloat, 16, 8) [] + = { 0x5BFF /* A * F. */, + 0x7C00 /* B * F. */, + 0xE131 /* C * F. */, + 0xFC00 /* D * F. */, + 0x4AAF /* I * F. */, + 0xE5D1 /* J * F. */, + 0x5AB3 /* K * F. */, + 0x674F /* L * F. */ }; + +VECT_VAR_DECL (expected2_static, hfloat, 16, 8) [] + = { 0xD405 /* A * G. */, + 0xFC00 /* B * G. */, + 0x5939 /* C * G. */, + 0x7C00 /* D * G. */, + 0xC2B9 /* I * G. */, + 0x5DDA /* J * G. */, + 0xD2BD /* K * G. */, + 0xDF5A /* L * G. */ }; + +VECT_VAR_DECL (expected3_static, hfloat, 16, 8) [] + = { 0x0000 /* A * H. */, + 0x4000 /* FP16_C (2.0f). */, + 0x8000 /* C * H. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x0000 /* I * H. */, + 0x8000 /* J * H. */, + 0x0000 /* K * H. */, + 0x0000 /* L * H. 
*/ }; + +void exec_vmulx_n_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VMULX_N (FP16)" + clean_results (); + + DECL_VARIABLE (vsrc_1, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemE); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemF); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemG); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 4) + = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemH); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VMULXQ_N (FP16)" + clean_results (); + + DECL_VARIABLE (vsrc_1, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemE); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemF); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemG); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, ""); + + VECT_VAR (vector_res, float, 16, 8) + = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemH); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, ""); +} + +int +main (void) +{ + exec_vmulx_n_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulxh_f16_1.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> + +/* Input values. */ +#define A 13.4 +#define B __builtin_inff () +#define C -34.8 +#define D -__builtin_inff () +#define E 63.1 +#define F 0.0 +#define G -4.8 +#define H 0.0 + +#define I 0.7 +#define J -__builtin_inff () +#define K 11.23 +#define L 98 +#define M 87.1 +#define N -0.0 +#define O -1.1 +#define P 7 + +float16_t input_1[] = { A, B, C, D, I, J, K, L }; +float16_t input_2[] = { E, F, G, H, M, N, O, P }; +uint16_t expected[] = { 0x629B /* A * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x5939 /* C * G. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x53A0 /* I * M. */, + 0x4000 /* FP16_C (2.0f). */, + 0xCA2C /* K * O. */, + 0x615C /* L * P. 
*/ }; + +#define TEST_MSG "VMULXH_F16" +#define INSN_NAME vmulxh_f16 + +#define INPUT_1 input_1 +#define INPUT_2 input_2 +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. */ +#include "binary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulxh_lane_f16_1.c @@ -0,0 +1,91 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (13.4) +#define B FP16_C (__builtin_inff ()) +#define C FP16_C (-34.8) +#define D FP16_C (-__builtin_inff ()) +#define E FP16_C (63.1) +#define F FP16_C (0.0) +#define G FP16_C (-4.8) +#define H FP16_C (0.0) + +#define I FP16_C (0.7) +#define J FP16_C (-__builtin_inff ()) +#define K FP16_C (11.23) +#define L FP16_C (98) +#define M FP16_C (87.1) +#define N FP16_C (-0.0) +#define O FP16_C (-1.1) +#define P FP16_C (7) + +extern void abort (); + +float16_t src1[8] = { A, B, C, D, I, J, K, L }; +VECT_VAR_DECL (src2, float, 16, 4) [] = { E, F, G, H }; +VECT_VAR_DECL (src2, float, 16, 8) [] = { E, F, G, H, M, N, O, P }; + +/* Expected results for vmulxh_lane. */ +uint16_t expected[4] = { 0x629B /* A * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x5939 /* C * G. */, + 0xC000 /* FP16_C (-2.0f). */ }; + +/* Expected results for vmulxh_laneq. */ +uint16_t expected_laneq[8] = { 0x629B /* A * E. */, + 0x4000 /* FP16_C (2.0f). */, + 0x5939 /* C * G. */, + 0xC000 /* FP16_C (-2.0f). */, + 0x53A0 /* I * M. */, + 0x4000 /* FP16_C (2.0f). */, + 0xCA2C /* K * O. */, + 0x615C /* L * P. 
*/ }; + +void exec_vmulxh_lane_f16 (void) +{ +#define CHECK_LANE(N)\ + ret = vmulxh_lane_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 4), N);\ + if (*(uint16_t *) &ret != expected[N])\ + abort (); + + DECL_VARIABLE(vsrc2, float, 16, 4); + VLOAD (vsrc2, src2, , float, f, 16, 4); + float16_t ret; + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + +#undef CHECK_LANE +#define CHECK_LANE(N)\ + ret = vmulxh_laneq_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 8), N);\ + if (*(uint16_t *) &ret != expected_laneq[N])\ + abort (); + + DECL_VARIABLE(vsrc2, float, 16, 8); + VLOAD (vsrc2, src2, q, float, f, 16, 8); + + CHECK_LANE(0) + CHECK_LANE(1) + CHECK_LANE(2) + CHECK_LANE(3) + CHECK_LANE(4) + CHECK_LANE(5) + CHECK_LANE(6) + CHECK_LANE(7) +} + +int +main (void) +{ + exec_vmulxh_lane_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c @@ -120,14 +120,14 @@ FNNAME (INSN_NAME) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, ""); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c @@ -21,24 +21,53 @@ VECT_VAR_DECL(expected,int,32,4) [] = { 0x10, 0xf, 0xe, 0xd }; /* Expected results for float32 variants. Needs to be separated since the generic test function does not test floating-point versions. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0xc09a, 0xc09a, + 0xc09a, 0xc09a }; +VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0xc2cd, 0xc2cd, + 0xc2cd, 0xc2cd, + 0xc2cd, 0xc2cd, + 0xc2cd, 0xc2cd }; +#endif VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0xc0133333, 0xc0133333 }; VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0xc059999a, 0xc059999a, 0xc059999a, 0xc059999a }; void exec_vneg_f32(void) { +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, float, 32, 4); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector_res, float, 32, 2); DECL_VARIABLE(vector_res, float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 2.3f); + VDUP(vector, q, float, f, 16, 8, 3.4f); +#endif VDUP(vector, , float, f, 32, 2, 2.3f); VDUP(vector, q, float, f, 32, 4, 3.4f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_UNARY_OP(INSN_NAME, , float, f, 16, 4); + TEST_UNARY_OP(INSN_NAME, q, float, f, 16, 8); +#endif TEST_UNARY_OP(INSN_NAME, , float, f, 32, 2); TEST_UNARY_OP(INSN_NAME, q, float, f, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, ""); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, ""); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vnegh_f16_1.c @@ -0,0 +1,39 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +uint16_t expected[] = +{ + 0x8000 /* -0.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0xc233 /* -3.099609 */, + 0xcd00 /* -20.000000 */, + 0xb666 /* -0.399902 */, + 0x409a /* 2.300781 */, + 0xbd52 /* -1.330078 */, + 0x479a /* 7.601562 */, + 0xb4f6 /* -0.310059 */, + 0xb55d /* -0.335205 */, + 0xb800 /* -0.500000 */, + 0xbc00 /* -1.000000 */, + 0xca91 /* -13.132812 */, + 0x464d /* 6.300781 */, + 0xcd00 /* -20.000000 */, + 0xfc00 /* -inf */, + 0x7c00 /* inf */ +}; + +#define TEST_MSG "VNEGH_F16" +#define INSN_NAME vnegh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. 
*/ +#include "unary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXX.inc +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXX.inc @@ -21,6 +21,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector, uint, 8, 8); DECL_VARIABLE(vector, uint, 16, 4); DECL_VARIABLE(vector, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector_res, int, 8, 8); @@ -29,6 +32,9 @@ void FNNAME (INSN_NAME) (void) DECL_VARIABLE(vector_res, uint, 8, 8); DECL_VARIABLE(vector_res, uint, 16, 4); DECL_VARIABLE(vector_res, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); +#endif DECL_VARIABLE(vector_res, float, 32, 2); clean_results (); @@ -40,6 +46,9 @@ void FNNAME (INSN_NAME) (void) VLOAD(vector, buffer, , uint, u, 8, 8); VLOAD(vector, buffer, , uint, u, 16, 4); VLOAD(vector, buffer, , uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD(vector, buffer, , float, f, 16, 4); +#endif VLOAD(vector, buffer, , float, f, 32, 2); /* Apply a binary operator named INSN_NAME. */ @@ -49,14 +58,20 @@ void FNNAME (INSN_NAME) (void) TEST_VPXXX(INSN_NAME, uint, u, 8, 8); TEST_VPXXX(INSN_NAME, uint, u, 16, 4); TEST_VPXXX(INSN_NAME, uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VPXXX(INSN_NAME, float, f, 16, 4); +#endif TEST_VPXXX(INSN_NAME, float, f, 32, 2); - CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, ""); - CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, ""); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, ""); - CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, ""); + CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, ""); + CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, ""); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, ""); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpadd.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpadd.c @@ -14,6 +14,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xe1, 0xe5, 0xe9, 0xed, 0xe1, 0xe5, 0xe9, 0xed }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xffe1, 0xffe5, 0xffe1, 0xffe5 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xffffffe1, 0xffffffe1 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcfc0, 0xcec0, 0xcfc0, 0xcec0 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1f80000, 0xc1f80000 }; #include "vpXXX.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmax.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmax.c @@ -15,6 +15,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, 0xf1, 0xf3, 0xf5, 0xf7 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff1, 0xfff3, 0xfff1, 0xfff3 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff1, 0xfffffff1 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcb80, 0xca80, 0xcb80, 0xca80 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 }; #include "vpXXX.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmin.c +++ 
b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmin.c @@ -15,6 +15,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6, 0xf0, 0xf2, 0xf4, 0xf6 }; VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff2, 0xfff0, 0xfff2 }; VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff0 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb00, 0xcc00, 0xcb00 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 }; #include "vpXXX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpminmaxnm_f16_1.c @@ -0,0 +1,114 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (123.4) +#define B FP16_C (__builtin_nanf ("")) /* NaN */ +#define C FP16_C (-34.8) +#define D FP16_C (1024) +#define E FP16_C (663.1) +#define F FP16_C (169.1) +#define G FP16_C (-4.8) +#define H FP16_C (-__builtin_nanf ("")) /* NaN */ + +#define I FP16_C (0.7) +#define J FP16_C (-78) +#define K FP16_C (101.23) +#define L FP16_C (-1098) +#define M FP16_C (870.1) +#define N FP16_C (-8781) +#define O FP16_C (__builtin_inff ()) /* +Inf */ +#define P FP16_C (-__builtin_inff ()) /* -Inf */ + + +/* Expected results for vpminnm. */ +VECT_VAR_DECL (expected_min_static, hfloat, 16, 4) [] + = { 0x57B6 /* A. */, 0xD05A /* C. */, 0x5949 /* F. */, 0xC4CD /* G. */ }; + +VECT_VAR_DECL (expected_min_static, hfloat, 16, 8) [] + = { 0x57B6 /* A. */, 0xD05A /* C. */, 0xD4E0 /* J. */, 0xE44A /* L. */, + 0x5949 /* F. */, 0xC4CD /* G. */, 0xF04A /* N. */, 0xFC00 /* P. */ }; + +/* Expected results for vpmaxnm. */ +VECT_VAR_DECL (expected_max_static, hfloat, 16, 4) [] + = { 0x57B6 /* A. */, 0x6400 /* D. */, 0x612E /* E. */, 0xC4CD /* G. */ }; + +VECT_VAR_DECL (expected_max_static, hfloat, 16, 8) [] + = { 0x57B6 /* A. */, 0x6400 /* D. */, 0x399A /* I. */, 0x5654 /* K. */, + 0x612E /* E. */, 0xC4CD /* G. */, 0x62CC /* M. */, 0x7C00 /* O. 
*/ }; + +void exec_vpminmaxnm_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VPMINNM (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 4); + DECL_VARIABLE(vsrc_2, float, 16, 4); + VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D}; + VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H}; + VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4); + VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vpminnm_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4)); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_min_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VPMINNMQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 16, 8); + DECL_VARIABLE(vsrc_2, float, 16, 8); + VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L}; + VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8); + VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vpminnmq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8)); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_min_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VPMAXNM (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 4) + = vpmaxnm_f16 (VECT_VAR (vsrc_1, float, 16, 4), + VECT_VAR (vsrc_2, float, 16, 4)); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_max_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VPMAXNMQ (FP16)" + clean_results (); + + VECT_VAR (vector_res, float, 16, 8) + = vpmaxnmq_f16 (VECT_VAR (vsrc_1, float, 16, 8), + VECT_VAR (vsrc_2, float, 16, 8)); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_max_static, ""); +} + +int +main (void) +{ + exec_vpminmaxnm_f16 (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c @@ -90,9 +90,9 @@ void vqabs_extra() TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c @@ -63,8 +63,8 @@ void FNNAME (INSN_NAME) (void) TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat, ""); TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat, ""); - CHECK (TEST_MSG, int, 32, 4, PRIx16, expected, ""); - CHECK (TEST_MSG, int, 64, 2, PRIx32, expected, ""); + CHECK (TEST_MSG, int, 32, 4, PRIx32, expected, ""); + CHECK (TEST_MSG, int, 64, 2, 
PRIx64, expected, ""); VDUP(vector, , int, s, 16, 4, 0x8000); VDUP(vector2, , int, s, 16, 4, 0x8000); @@ -75,8 +75,8 @@ void FNNAME (INSN_NAME) (void) TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat2, TEST_MSG2); TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat2, TEST_MSG2); - CHECK (TEST_MSG, int, 32, 4, PRIx16, expected2, TEST_MSG2); - CHECK (TEST_MSG, int, 64, 2, PRIx32, expected2, TEST_MSG2); + CHECK (TEST_MSG, int, 32, 4, PRIx32, expected2, TEST_MSG2); + CHECK (TEST_MSG, int, 64, 2, PRIx64, expected2, TEST_MSG2); } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c @@ -90,9 +90,9 @@ void vqneg_extra() TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG); CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG); - CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG); + CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c @@ -318,13 +318,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl1, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl1, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, ""); #undef TEST_MSG #define TEST_MSG "VQTBL1Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl1q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl1q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, ""); /* Check vqtbl2. */ clean_results (); @@ -334,13 +334,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl2, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl2, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, ""); #undef TEST_MSG #define TEST_MSG "VQTBL2Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl2q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl2q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, ""); /* Check vqtbl3. */ clean_results (); @@ -350,13 +350,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl3, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl3, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, ""); #undef TEST_MSG #define TEST_MSG "VQTBL3Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl3q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl3q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, ""); /* Check vqtbl4. 
*/ clean_results (); @@ -366,13 +366,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl4, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl4, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, ""); #undef TEST_MSG #define TEST_MSG "VQTBL4Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl4q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl4q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, ""); /* Now test VQTBX. */ @@ -455,13 +455,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx1, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx1, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, ""); #undef TEST_MSG #define TEST_MSG "VQTBX1Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx1q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx1q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, ""); /* Check vqtbx2. */ clean_results (); @@ -471,13 +471,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx2, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx2, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, ""); #undef TEST_MSG #define TEST_MSG "VQTBX2Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx2q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx2q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, ""); /* Check vqtbx3. */ clean_results (); @@ -487,13 +487,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx3, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx3, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, ""); #undef TEST_MSG #define TEST_MSG "VQTBX3Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx3q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx3q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, ""); /* Check vqtbx4. 
*/ clean_results (); @@ -503,13 +503,13 @@ void exec_vqtbX (void) CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx4, ""); CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx4, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, ""); #undef TEST_MSG #define TEST_MSG "VQTBX4Q" CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx4q, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx4q, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, ""); } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c @@ -7,6 +7,14 @@ VECT_VAR_DECL(expected_positive,uint,32,2) [] = { 0xffffffff, 0xffffffff }; VECT_VAR_DECL(expected_positive,uint,32,4) [] = { 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_positive, hfloat, 16, 4) [] = { 0x3834, 0x3834, + 0x3834, 0x3834 }; +VECT_VAR_DECL(expected_positive, hfloat, 16, 8) [] = { 0x2018, 0x2018, + 0x2018, 0x2018, + 0x2018, 0x2018, + 0x2018, 0x2018 }; +#endif VECT_VAR_DECL(expected_positive,hfloat,32,2) [] = { 0x3f068000, 0x3f068000 }; VECT_VAR_DECL(expected_positive,hfloat,32,4) [] = { 0x3c030000, 0x3c030000, 0x3c030000, 0x3c030000 }; @@ -15,24 +23,56 @@ VECT_VAR_DECL(expected_positive,hfloat,32,4) [] = { 0x3c030000, 0x3c030000, VECT_VAR_DECL(expected_negative,uint,32,2) [] = { 0x80000000, 0x80000000 }; VECT_VAR_DECL(expected_negative,uint,32,4) [] = { 0xee800000, 0xee800000, 0xee800000, 0xee800000 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_negative, hfloat, 16, 4) [] = { 0xae64, 0xae64, + 0xae64, 0xae64 }; +VECT_VAR_DECL(expected_negative, hfloat, 16, 8) [] = { 0xa018, 0xa018, + 0xa018, 0xa018, + 0xa018, 0xa018, + 0xa018, 0xa018 }; +#endif VECT_VAR_DECL(expected_negative,hfloat,32,2) [] = { 0xbdcc8000, 0xbdcc8000 }; VECT_VAR_DECL(expected_negative,hfloat,32,4) [] = { 0xbc030000, 0xbc030000, 0xbc030000, 0xbc030000 }; /* Expected results with FP special values (NaN, infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results with FP special values (zero, large value). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x7c00, 0x7c00, + 0x7c00, 0x7c00 }; +VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x7f800000, 0x7f800000 }; VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results with FP special values (-0, -infinity). 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00}; +VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000, + 0x8000, 0x8000 }; +#endif VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0xff800000, 0xff800000 }; VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; /* Expected results with FP special large negative value. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp4, hfloat, 16, 4) [] = { 0x8000, 0x8000, + 0x8000, 0x8000 }; +#endif VECT_VAR_DECL(expected_fp4,hfloat,32,2) [] = { 0x80000000, 0x80000000 }; #define TEST_MSG "VRECPE/VRECPEQ" @@ -50,11 +90,19 @@ void exec_vrecpe(void) /* No need for 64 bits variants. */ DECL_VARIABLE(vector, uint, 32, 2); DECL_VARIABLE(vector, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, float, 32, 4); DECL_VARIABLE(vector_res, uint, 32, 2); DECL_VARIABLE(vector_res, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector_res, float, 32, 2); DECL_VARIABLE(vector_res, float, 32, 4); @@ -62,88 +110,165 @@ void exec_vrecpe(void) /* Choose init value arbitrarily, positive. */ VDUP(vector, , uint, u, 32, 2, 0x12345678); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 1.9f); +#endif VDUP(vector, , float, f, 32, 2, 1.9f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, q, float, f, 16, 8, 125.0f); +#endif VDUP(vector, q, uint, u, 32, 4, 0xABCDEF10); VDUP(vector, q, float, f, 32, 4, 125.0f); /* Apply the operator. */ TEST_VRECPE(, uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); +#endif TEST_VRECPE(, float, f, 32, 2); TEST_VRECPE(q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(q, float, f, 16, 8); +#endif TEST_VRECPE(q, float, f, 32, 4); #define CMT " (positive input)" CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_positive, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_positive, CMT); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_positive, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_positive, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_positive, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_positive, CMT); /* Choose init value arbitrarily,negative. */ VDUP(vector, , uint, u, 32, 2, 0xFFFFFFFF); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, -10.0f); +#endif VDUP(vector, , float, f, 32, 2, -10.0f); VDUP(vector, q, uint, u, 32, 4, 0x89081234); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, q, float, f, 16, 8, -125.0f); +#endif VDUP(vector, q, float, f, 32, 4, -125.0f); /* Apply the operator. 
*/ TEST_VRECPE(, uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); +#endif TEST_VRECPE(, float, f, 32, 2); TEST_VRECPE(q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(q, float, f, 16, 8); +#endif TEST_VRECPE(q, float, f, 32, 4); #undef CMT #define CMT " (negative input)" CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_negative, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_negative, CMT); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_negative, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_negative, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_negative, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_negative, CMT); /* Test FP variants with special input values (NaN, infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, NAN); + VDUP(vector, q, float, f, 16, 8, HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, NAN); VDUP(vector, q, float, f, 32, 4, HUGE_VALF); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); + TEST_VRECPE(q, float, f, 16, 8); +#endif TEST_VRECPE(, float, f, 32, 2); TEST_VRECPE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (NaN, infinity)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT); /* Test FP variants with special input values (zero, large value). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 0.0f); + VDUP(vector, q, float, f, 16, 8, 8.97229e37f /*9.0e37f*/); +#endif VDUP(vector, , float, f, 32, 2, 0.0f); VDUP(vector, q, float, f, 32, 4, 8.97229e37f /*9.0e37f*/); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); + TEST_VRECPE(q, float, f, 16, 8); +#endif TEST_VRECPE(, float, f, 32, 2); TEST_VRECPE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (zero, large value)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT); /* Test FP variants with special input values (-0, -infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, -0.0f); + VDUP(vector, q, float, f, 16, 8, -HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, -0.0f); VDUP(vector, q, float, f, 32, 4, -HUGE_VALF); /* Apply the operator. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); + TEST_VRECPE(q, float, f, 16, 8); +#endif TEST_VRECPE(, float, f, 32, 2); TEST_VRECPE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (-0, -infinity)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT); /* Test FP variants with special input values (large negative value). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, -9.0e37f); +#endif VDUP(vector, , float, f, 32, 2, -9.0e37f); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPE(, float, f, 16, 4); +#endif TEST_VRECPE(, float, f, 32, 2); #undef CMT #define CMT " FP special (large negative value)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp4, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp4, CMT); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpeh_f16_1.c @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include + +/* Input values. */ +#define A 123.4 +#define B 567.8 +#define C 34.8 +#define D 1024 +#define E 663.1 +#define F 144.0 +#define G 4.8 +#define H 77 + +#define RECP_A 0x2028 /* 1/A. */ +#define RECP_B 0x1734 /* 1/B. */ +#define RECP_C 0x275C /* 1/C. */ +#define RECP_D 0x13FC /* 1/D. */ +#define RECP_E 0x162C /* 1/E. */ +#define RECP_F 0x1F18 /* 1/F. */ +#define RECP_G 0x32A8 /* 1/G. */ +#define RECP_H 0x22A4 /* 1/H. */ + +float16_t input[] = { A, B, C, D, E, F, G, H }; +uint16_t expected[] = { RECP_A, RECP_B, RECP_C, RECP_D, + RECP_E, RECP_F, RECP_G, RECP_H }; + +#define TEST_MSG "VRECPEH_F16" +#define INSN_NAME vrecpeh_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecps.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecps.c @@ -4,22 +4,51 @@ #include /* Expected results with positive input. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xd70c, 0xd70c, 0xd70c, 0xd70c }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcedc, 0xcedc, 0xcedc, 0xcedc, + 0xcedc, 0xcedc, 0xcedc, 0xcedc }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc2e19eb7, 0xc2e19eb7 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1db851f, 0xc1db851f, 0xc1db851f, 0xc1db851f }; /* Expected results with FP special values (NaN). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +#endif VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; /* Expected results with FP special values (infinity, 0) and normal values. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00 }; +VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x4000, 0x4000, + 0x4000, 0x4000, + 0x4000, 0x4000, + 0x4000, 0x4000 }; +#endif VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0xff800000, 0xff800000 }; VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x40000000, 0x40000000, 0x40000000, 0x40000000 }; /* Expected results with FP special values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0x4000, 0x4000, + 0x4000, 0x4000 }; +VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x4000, 0x4000, + 0x4000, 0x4000, + 0x4000, 0x4000, + 0x4000, 0x4000 }; +#endif VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0x40000000, 0x40000000 }; VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x40000000, 0x40000000, 0x40000000, 0x40000000 }; @@ -38,74 +67,143 @@ void exec_vrecps(void) VECT_VAR(vector_res, T1, W, N)) /* No need for integer variants. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 8); +#endif DECL_VARIABLE(vector2, float, 32, 2); DECL_VARIABLE(vector2, float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector_res, float, 32, 2); DECL_VARIABLE(vector_res, float, 32, 4); clean_results (); /* Choose init value arbitrarily. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 12.9f); + VDUP(vector, q, float, f, 16, 8, 9.2f); +#endif VDUP(vector, , float, f, 32, 2, 12.9f); VDUP(vector, q, float, f, 32, 4, 9.2f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 8.9f); + VDUP(vector2, q, float, f, 16, 8, 3.2f); +#endif VDUP(vector2, , float, f, 32, 2, 8.9f); VDUP(vector2, q, float, f, 32, 4, 3.2f); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPS(, float, f, 16, 4); + TEST_VRECPS(q, float, f, 16, 8); +#endif TEST_VRECPS(, float, f, 32, 2); TEST_VRECPS(q, float, f, 32, 4); #define CMT " (positive input)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT); /* Test FP variants with special input values (NaN). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, NAN); + VDUP(vector2, q, float, f, 16, 8, NAN); +#endif VDUP(vector, , float, f, 32, 2, NAN); VDUP(vector2, q, float, f, 32, 4, NAN); /* Apply the operator. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPS(, float, f, 16, 4); + TEST_VRECPS(q, float, f, 16, 8); +#endif TEST_VRECPS(, float, f, 32, 2); TEST_VRECPS(q, float, f, 32, 4); #undef CMT #define CMT " FP special (NaN)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT); /* Test FP variants with special input values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, HUGE_VALF); + VDUP(vector, q, float, f, 16, 8, 0.0f); + VDUP(vector2, q, float, f, 16, 8, 3.2f); /* Restore a normal value. */ +#endif VDUP(vector, , float, f, 32, 2, HUGE_VALF); VDUP(vector, q, float, f, 32, 4, 0.0f); VDUP(vector2, q, float, f, 32, 4, 3.2f); /* Restore a normal value. */ + /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPS(, float, f, 16, 4); + TEST_VRECPS(q, float, f, 16, 8); +#endif TEST_VRECPS(, float, f, 32, 2); TEST_VRECPS(q, float, f, 32, 4); #undef CMT #define CMT " FP special (infinity, 0) and normal value" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT); /* Test FP variants with only special input values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, HUGE_VALF); + VDUP(vector, q, float, f, 16, 8, 0.0f); + VDUP(vector2, , float, f, 16, 4, 0.0f); + VDUP(vector2, q, float, f, 16, 8, HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, HUGE_VALF); VDUP(vector, q, float, f, 32, 4, 0.0f); VDUP(vector2, , float, f, 32, 2, 0.0f); VDUP(vector2, q, float, f, 32, 4, HUGE_VALF); + /* Apply the operator */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRECPS(, float, f, 16, 4); + TEST_VRECPS(q, float, f, 16, 8); +#endif TEST_VRECPS(, float, f, 32, 2); TEST_VRECPS(q, float, f, 32, 4); #undef CMT #define CMT " FP special (infinity, 0)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpsh_f16_1.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> + +/* Input values. */ +#define A 12.4 +#define B -5.8 +#define C -3.8 +#define D 10 +#define E 66.1 +#define F 16.1 +#define G -4.8 +#define H -77 + +#define I 0.7 +#define J -78 +#define K 10.23 +#define L 98 +#define M 87 +#define N -87.81 +#define O -1.1 +#define P 47.8 + +float16_t input_1[] = { A, B, C, D, I, J, K, L }; +float16_t input_2[] = { E, F, G, H, M, N, O, P }; +uint16_t expected[] = { 0xE264 /* 2.0f - A * E. */, + 0x55F6 /* 2.0f - B * F. */, + 0xCC10 /* 2.0f - C * G. */, + 0x6208 /* 2.0f - D * H. */, + 0xD35D /* 2.0f - I * M. */, + 0xEEB0 /* 2.0f - J * N. */, + 0x4A9F /* 2.0f - K * O. 
+			0xEC93 /* 2.0f - L * P.  */ };
+
+#define TEST_MSG "VRECPSH_F16"
+#define INSN_NAME vrecpsh_f16
+
+#define INPUT_1 input_1
+#define INPUT_2 input_2
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpxh_f16_1.c
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_fp16.h>
+
+/* Input values.  */
+
+float16_t input[] = { 123.4, 567.8, 34.8, 1024, 663.1, 144.0, 4.8, 77 };
+/* Expected results are calculated by:
+   for (index = 0; index < 8; index++)
+     {
+       uint16_t src_cast = * (uint16_t *) &src[index];
+       * (uint16_t *) &expected[index] =
+	 (src_cast & 0x8000) | (~src_cast & 0x7C00);
+     }  */
+uint16_t expected[8] = { 0x2800, 0x1C00, 0x2C00, 0x1800,
+			 0x1C00, 0x2400, 0x3800, 0x2800 };
+
+#define TEST_MSG "VRECPXH_F16"
+#define INSN_NAME vrecpxh_f16
+
+#define INPUT input
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for unary scalar operations.  */
+#include "unary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
@@ -21,6 +21,8 @@ VECT_VAR_DECL(expected_s8_8,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					    0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_s8_9,int,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
 					    0xf2, 0xff, 0xf3, 0xff };
+VECT_VAR_DECL(expected_s8_10,int,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
+					     0x00, 0xcb, 0x80, 0xca };
 
 /* Expected results for vreinterpret_s16_xx.  */
 VECT_VAR_DECL(expected_s16_1,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
@@ -32,6 +34,7 @@ VECT_VAR_DECL(expected_s16_6,int,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
 VECT_VAR_DECL(expected_s16_7,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
 VECT_VAR_DECL(expected_s16_8,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
 VECT_VAR_DECL(expected_s16_9,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_s16_10,int,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 
 /* Expected results for vreinterpret_s32_xx.  */
 VECT_VAR_DECL(expected_s32_1,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
@@ -43,6 +46,7 @@ VECT_VAR_DECL(expected_s32_6,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_s32_7,int,32,2) [] = { 0xfffffff0, 0xffffffff };
 VECT_VAR_DECL(expected_s32_8,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
 VECT_VAR_DECL(expected_s32_9,int,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
+VECT_VAR_DECL(expected_s32_10,int,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
 
 /* Expected results for vreinterpret_s64_xx.  */
 VECT_VAR_DECL(expected_s64_1,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
@@ -54,6 +58,7 @@ VECT_VAR_DECL(expected_s64_6,int,64,1) [] = { 0xfffffff1fffffff0 };
 VECT_VAR_DECL(expected_s64_7,int,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_s64_8,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
 VECT_VAR_DECL(expected_s64_9,int,64,1) [] = { 0xfff3fff2fff1fff0 };
+VECT_VAR_DECL(expected_s64_10,int,64,1) [] = { 0xca80cb00cb80cc00 };
 
 /* Expected results for vreinterpret_u8_xx.  */
 VECT_VAR_DECL(expected_u8_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -74,6 +79,8 @@ VECT_VAR_DECL(expected_u8_8,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					     0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_u8_9,uint,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
 					     0xf2, 0xff, 0xf3, 0xff };
+VECT_VAR_DECL(expected_u8_10,uint,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
+					      0x00, 0xcb, 0x80, 0xca };
 
 /* Expected results for vreinterpret_u16_xx.  */
 VECT_VAR_DECL(expected_u16_1,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
@@ -85,6 +92,7 @@ VECT_VAR_DECL(expected_u16_6,uint,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
 VECT_VAR_DECL(expected_u16_7,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
 VECT_VAR_DECL(expected_u16_8,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
 VECT_VAR_DECL(expected_u16_9,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_u16_10,uint,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 
 /* Expected results for vreinterpret_u32_xx.  */
 VECT_VAR_DECL(expected_u32_1,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
@@ -96,6 +104,7 @@ VECT_VAR_DECL(expected_u32_6,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
 VECT_VAR_DECL(expected_u32_7,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
 VECT_VAR_DECL(expected_u32_8,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
 VECT_VAR_DECL(expected_u32_9,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
+VECT_VAR_DECL(expected_u32_10,uint,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
 
 /* Expected results for vreinterpret_u64_xx.  */
 VECT_VAR_DECL(expected_u64_1,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
@@ -107,6 +116,7 @@ VECT_VAR_DECL(expected_u64_6,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
 VECT_VAR_DECL(expected_u64_7,uint,64,1) [] = { 0xfffffff1fffffff0 };
 VECT_VAR_DECL(expected_u64_8,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
 VECT_VAR_DECL(expected_u64_9,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
+VECT_VAR_DECL(expected_u64_10,uint,64,1) [] = { 0xca80cb00cb80cc00 };
 
 /* Expected results for vreinterpret_p8_xx.  */
 VECT_VAR_DECL(expected_p8_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -127,6 +137,8 @@ VECT_VAR_DECL(expected_p8_8,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
 					     0xff, 0xff, 0xff, 0xff };
 VECT_VAR_DECL(expected_p8_9,poly,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
 					     0xf2, 0xff, 0xf3, 0xff };
+VECT_VAR_DECL(expected_p8_10,poly,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
+					      0x00, 0xcb, 0x80, 0xca };
 
 /* Expected results for vreinterpret_p16_xx.  */
 VECT_VAR_DECL(expected_p16_1,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
@@ -138,6 +150,7 @@ VECT_VAR_DECL(expected_p16_6,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
 VECT_VAR_DECL(expected_p16_7,poly,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
 VECT_VAR_DECL(expected_p16_8,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
 VECT_VAR_DECL(expected_p16_9,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+VECT_VAR_DECL(expected_p16_10,poly,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 
 /* Expected results for vreinterpretq_s8_xx.  */
 VECT_VAR_DECL(expected_q_s8_1,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
@@ -176,6 +189,10 @@ VECT_VAR_DECL(expected_q_s8_9,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
 					       0xf2, 0xff, 0xf3, 0xff,
 					       0xf4, 0xff, 0xf5, 0xff,
 					       0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_s8_10,int,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
+						0x00, 0xcb, 0x80, 0xca,
+						0x00, 0xca, 0x80, 0xc9,
+						0x00, 0xc9, 0x80, 0xc8 };
 
 /* Expected results for vreinterpretq_s16_xx.  */
 VECT_VAR_DECL(expected_q_s16_1,int,16,8) [] = { 0xf1f0, 0xf3f2,
@@ -214,6 +231,10 @@ VECT_VAR_DECL(expected_q_s16_9,int,16,8) [] = { 0xfff0, 0xfff1,
 						0xfff2, 0xfff3,
 						0xfff4, 0xfff5,
 						0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_s16_10,int,16,8) [] = { 0xcc00, 0xcb80,
+						 0xcb00, 0xca80,
+						 0xca00, 0xc980,
+						 0xc900, 0xc880 };
 
 /* Expected results for vreinterpretq_s32_xx.  */
 VECT_VAR_DECL(expected_q_s32_1,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
@@ -234,6 +255,8 @@ VECT_VAR_DECL(expected_q_s32_8,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
 						0xfbfaf9f8, 0xfffefdfc };
 VECT_VAR_DECL(expected_q_s32_9,int,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
 						0xfff5fff4, 0xfff7fff6 };
+VECT_VAR_DECL(expected_q_s32_10,int,32,4) [] = { 0xcb80cc00, 0xca80cb00,
+						 0xc980ca00, 0xc880c900 };
 
 /* Expected results for vreinterpretq_s64_xx.  */
 VECT_VAR_DECL(expected_q_s64_1,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
@@ -254,6 +277,8 @@ VECT_VAR_DECL(expected_q_s64_8,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
 						0xfffefdfcfbfaf9f8 };
 VECT_VAR_DECL(expected_q_s64_9,int,64,2) [] = { 0xfff3fff2fff1fff0,
 						0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(expected_q_s64_10,int,64,2) [] = { 0xca80cb00cb80cc00,
+						 0xc880c900c980ca00 };
 
 /* Expected results for vreinterpretq_u8_xx.  */
 VECT_VAR_DECL(expected_q_u8_1,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -292,6 +317,10 @@ VECT_VAR_DECL(expected_q_u8_9,uint,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
 						0xf2, 0xff, 0xf3, 0xff,
 						0xf4, 0xff, 0xf5, 0xff,
 						0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_u8_10,uint,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
+						 0x00, 0xcb, 0x80, 0xca,
+						 0x00, 0xca, 0x80, 0xc9,
+						 0x00, 0xc9, 0x80, 0xc8 };
 
 /* Expected results for vreinterpretq_u16_xx.  */
 VECT_VAR_DECL(expected_q_u16_1,uint,16,8) [] = { 0xf1f0, 0xf3f2,
@@ -330,6 +359,10 @@ VECT_VAR_DECL(expected_q_u16_9,uint,16,8) [] = { 0xfff0, 0xfff1,
 						 0xfff2, 0xfff3,
 						 0xfff4, 0xfff5,
 						 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_u16_10,uint,16,8) [] = { 0xcc00, 0xcb80,
+						  0xcb00, 0xca80,
+						  0xca00, 0xc980,
+						  0xc900, 0xc880 };
 
 /* Expected results for vreinterpretq_u32_xx.  */
 VECT_VAR_DECL(expected_q_u32_1,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
@@ -350,6 +383,8 @@ VECT_VAR_DECL(expected_q_u32_8,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
 						 0xfbfaf9f8, 0xfffefdfc };
 VECT_VAR_DECL(expected_q_u32_9,uint,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
 						 0xfff5fff4, 0xfff7fff6 };
+VECT_VAR_DECL(expected_q_u32_10,uint,32,4) [] = { 0xcb80cc00, 0xca80cb00,
+						  0xc980ca00, 0xc880c900 };
 
 /* Expected results for vreinterpretq_u64_xx.  */
 VECT_VAR_DECL(expected_q_u64_1,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
@@ -370,6 +405,92 @@ VECT_VAR_DECL(expected_q_u64_8,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
 						 0xfffefdfcfbfaf9f8 };
 VECT_VAR_DECL(expected_q_u64_9,uint,64,2) [] = { 0xfff3fff2fff1fff0,
 						 0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(expected_q_u64_10,uint,64,2) [] = { 0xca80cb00cb80cc00,
+						  0xc880c900c980ca00 };
+
+/* Expected results for vreinterpretq_p8_xx.  */
+VECT_VAR_DECL(expected_q_p8_1,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+						0xf4, 0xf5, 0xf6, 0xf7,
+						0xf8, 0xf9, 0xfa, 0xfb,
+						0xfc, 0xfd, 0xfe, 0xff };
+VECT_VAR_DECL(expected_q_p8_2,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+						0xf2, 0xff, 0xf3, 0xff,
+						0xf4, 0xff, 0xf5, 0xff,
+						0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_p8_3,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+						0xf1, 0xff, 0xff, 0xff,
+						0xf2, 0xff, 0xff, 0xff,
+						0xf3, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_4,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+						0xff, 0xff, 0xff, 0xff,
+						0xf1, 0xff, 0xff, 0xff,
+						0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_5,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+						0xf4, 0xf5, 0xf6, 0xf7,
+						0xf8, 0xf9, 0xfa, 0xfb,
+						0xfc, 0xfd, 0xfe, 0xff };
+VECT_VAR_DECL(expected_q_p8_6,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+						0xf2, 0xff, 0xf3, 0xff,
+						0xf4, 0xff, 0xf5, 0xff,
+						0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_p8_7,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+						0xf1, 0xff, 0xff, 0xff,
+						0xf2, 0xff, 0xff, 0xff,
+						0xf3, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_8,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+						0xff, 0xff, 0xff, 0xff,
+						0xf1, 0xff, 0xff, 0xff,
+						0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_9,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+						0xf2, 0xff, 0xf3, 0xff,
+						0xf4, 0xff, 0xf5, 0xff,
+						0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_p8_10,poly,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
+						 0x00, 0xcb, 0x80, 0xca,
+						 0x00, 0xca, 0x80, 0xc9,
+						 0x00, 0xc9, 0x80, 0xc8 };
+
+/* Expected results for vreinterpretq_p16_xx.  */
+VECT_VAR_DECL(expected_q_p16_1,poly,16,8) [] = { 0xf1f0, 0xf3f2,
+						 0xf5f4, 0xf7f6,
+						 0xf9f8, 0xfbfa,
+						 0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_p16_2,poly,16,8) [] = { 0xfff0, 0xfff1,
+						 0xfff2, 0xfff3,
+						 0xfff4, 0xfff5,
+						 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_p16_3,poly,16,8) [] = { 0xfff0, 0xffff,
+						 0xfff1, 0xffff,
+						 0xfff2, 0xffff,
+						 0xfff3, 0xffff };
+VECT_VAR_DECL(expected_q_p16_4,poly,16,8) [] = { 0xfff0, 0xffff,
+						 0xffff, 0xffff,
+						 0xfff1, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL(expected_q_p16_5,poly,16,8) [] = { 0xf1f0, 0xf3f2,
+						 0xf5f4, 0xf7f6,
+						 0xf9f8, 0xfbfa,
+						 0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_p16_6,poly,16,8) [] = { 0xfff0, 0xfff1,
+						 0xfff2, 0xfff3,
+						 0xfff4, 0xfff5,
+						 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_p16_7,poly,16,8) [] = { 0xfff0, 0xffff,
+						 0xfff1, 0xffff,
+						 0xfff2, 0xffff,
+						 0xfff3, 0xffff };
+VECT_VAR_DECL(expected_q_p16_8,poly,16,8) [] = { 0xfff0, 0xffff,
+						 0xffff, 0xffff,
+						 0xfff1, 0xffff,
+						 0xffff, 0xffff };
+VECT_VAR_DECL(expected_q_p16_9,poly,16,8) [] = { 0xf1f0, 0xf3f2,
+						 0xf5f4, 0xf7f6,
+						 0xf9f8, 0xfbfa,
+						 0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_p16_10,poly,16,8) [] = { 0xcc00, 0xcb80,
+						  0xcb00, 0xca80,
+						  0xca00, 0xc980,
+						  0xc900, 0xc880 };
 
 /* Expected results for vreinterpret_f32_xx.  */
 VECT_VAR_DECL(expected_f32_1,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
@@ -382,6 +503,7 @@ VECT_VAR_DECL(expected_f32_7,hfloat,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_f32_8,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
 VECT_VAR_DECL(expected_f32_9,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
 VECT_VAR_DECL(expected_f32_10,hfloat,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
+VECT_VAR_DECL(expected_f32_11,hfloat,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
 
 /* Expected results for vreinterpretq_f32_xx.  */
 VECT_VAR_DECL(expected_q_f32_1,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
@@ -404,8 +526,10 @@ VECT_VAR_DECL(expected_q_f32_9,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
 						   0xfbfaf9f8, 0xfffefdfc };
 VECT_VAR_DECL(expected_q_f32_10,hfloat,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
 						    0xfff5fff4, 0xfff7fff6 };
+VECT_VAR_DECL(expected_q_f32_11,hfloat,32,4) [] = { 0xcb80cc00, 0xca80cb00,
+						    0xc980ca00, 0xc880c900 };
 
-/* Expected results for vreinterpretq_xx_f32.  */
+/* Expected results for vreinterpret_xx_f32.  */
 VECT_VAR_DECL(expected_xx_f32_1,int,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
 						0x0, 0x0, 0x70, 0xc1 };
 VECT_VAR_DECL(expected_xx_f32_2,int,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
@@ -419,6 +543,7 @@ VECT_VAR_DECL(expected_xx_f32_8,uint,64,1) [] = { 0xc1700000c1800000 };
 VECT_VAR_DECL(expected_xx_f32_9,poly,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
 						 0x0, 0x0, 0x70, 0xc1 };
 VECT_VAR_DECL(expected_xx_f32_10,poly,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
+VECT_VAR_DECL(expected_xx_f32_11,hfloat,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
 
 /* Expected results for vreinterpretq_xx_f32.  */
 VECT_VAR_DECL(expected_q_xx_f32_1,int,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
@@ -447,6 +572,62 @@ VECT_VAR_DECL(expected_q_xx_f32_9,poly,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
 						    0x0, 0x0, 0x50, 0xc1 };
 VECT_VAR_DECL(expected_q_xx_f32_10,poly,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
 						     0x0, 0xc160, 0x0, 0xc150 };
+VECT_VAR_DECL(expected_q_xx_f32_11,hfloat,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
+						       0x0, 0xc160, 0x0, 0xc150 };
+
+/* Expected results for vreinterpret_f16_xx.  */
+VECT_VAR_DECL(expected_f16_1,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+VECT_VAR_DECL(expected_f16_2,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_f16_3,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
+VECT_VAR_DECL(expected_f16_4,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_f16_5,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+VECT_VAR_DECL(expected_f16_6,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_f16_7,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
+VECT_VAR_DECL(expected_f16_8,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(expected_f16_9,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+VECT_VAR_DECL(expected_f16_10,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+
+/* Expected results for vreinterpretq_f16_xx.  */
+VECT_VAR_DECL(expected_q_f16_1,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
+						   0xf5f4, 0xf7f6,
+						   0xf9f8, 0xfbfa,
+						   0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_f16_2,hfloat,16,8) [] = { 0xfff0, 0xfff1,
+						   0xfff2, 0xfff3,
+						   0xfff4, 0xfff5,
+						   0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_f16_3,hfloat,16,8) [] = { 0xfff0, 0xffff,
+						   0xfff1, 0xffff,
+						   0xfff2, 0xffff,
+						   0xfff3, 0xffff };
+VECT_VAR_DECL(expected_q_f16_4,hfloat,16,8) [] = { 0xfff0, 0xffff,
+						   0xffff, 0xffff,
+						   0xfff1, 0xffff,
+						   0xffff, 0xffff };
+VECT_VAR_DECL(expected_q_f16_5,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
+						   0xf5f4, 0xf7f6,
+						   0xf9f8, 0xfbfa,
+						   0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_f16_6,hfloat,16,8) [] = { 0xfff0, 0xfff1,
+						   0xfff2, 0xfff3,
+						   0xfff4, 0xfff5,
+						   0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_f16_7,hfloat,16,8) [] = { 0xfff0, 0xffff,
+						   0xfff1, 0xffff,
+						   0xfff2, 0xffff,
+						   0xfff3, 0xffff };
+VECT_VAR_DECL(expected_q_f16_8,hfloat,16,8) [] = { 0xfff0, 0xffff,
+						   0xffff, 0xffff,
+						   0xfff1, 0xffff,
+						   0xffff, 0xffff };
+VECT_VAR_DECL(expected_q_f16_9,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
+						   0xf5f4, 0xf7f6,
+						   0xf9f8, 0xfbfa,
+						   0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_f16_10,hfloat,16,8) [] = { 0xfff0, 0xfff1,
+						    0xfff2, 0xfff3,
+						    0xfff4, 0xfff5,
+						    0xfff6, 0xfff7 };
 
 
 #define TEST_MSG "VREINTERPRET/VREINTERPRETQ"
@@ -484,6 +665,10 @@ void exec_vreinterpret (void)
   /* Initialize input "vector" from "buffer".  */
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
   VLOAD(vector, buffer, , float, f, 32, 2);
   VLOAD(vector, buffer, q, float, f, 32, 4);
 
@@ -497,6 +682,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, int, s, 8, 8, uint, u, 64, 1, expected_s8_7);
   TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 8, 8, expected_s8_8);
   TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 16, 4, expected_s8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, int, s, 8, 8, float, f, 16, 4, expected_s8_10);
+#endif
 
   /* vreinterpret_s16_xx.  */
   TEST_VREINTERPRET(, int, s, 16, 4, int, s, 8, 8, expected_s16_1);
@@ -508,6 +696,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, int, s, 16, 4, uint, u, 64, 1, expected_s16_7);
   TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 8, 8, expected_s16_8);
   TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 16, 4, expected_s16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, int, s, 16, 4, float, f, 16, 4, expected_s16_10);
+#endif
 
   /* vreinterpret_s32_xx.  */
   TEST_VREINTERPRET(, int, s, 32, 2, int, s, 8, 8, expected_s32_1);
@@ -519,6 +710,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, int, s, 32, 2, uint, u, 64, 1, expected_s32_7);
   TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 8, 8, expected_s32_8);
   TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 16, 4, expected_s32_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, int, s, 32, 2, float, f, 16, 4, expected_s32_10);
+#endif
 
   /* vreinterpret_s64_xx.  */
   TEST_VREINTERPRET(, int, s, 64, 1, int, s, 8, 8, expected_s64_1);
@@ -530,6 +724,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, int, s, 64, 1, uint, u, 64, 1, expected_s64_7);
   TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 8, 8, expected_s64_8);
   TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 16, 4, expected_s64_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, int, s, 64, 1, float, f, 16, 4, expected_s64_10);
+#endif
 
   /* vreinterpret_u8_xx.  */
   TEST_VREINTERPRET(, uint, u, 8, 8, int, s, 8, 8, expected_u8_1);
@@ -541,6 +738,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, uint, u, 8, 8, uint, u, 64, 1, expected_u8_7);
   TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 8, 8, expected_u8_8);
   TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 16, 4, expected_u8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, uint, u, 8, 8, float, f, 16, 4, expected_u8_10);
+#endif
 
   /* vreinterpret_u16_xx.  */
   TEST_VREINTERPRET(, uint, u, 16, 4, int, s, 8, 8, expected_u16_1);
@@ -552,6 +752,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, uint, u, 16, 4, uint, u, 64, 1, expected_u16_7);
   TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 8, 8, expected_u16_8);
   TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 16, 4, expected_u16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, uint, u, 16, 4, float, f, 16, 4, expected_u16_10);
+#endif
 
   /* vreinterpret_u32_xx.  */
   TEST_VREINTERPRET(, uint, u, 32, 2, int, s, 8, 8, expected_u32_1);
@@ -563,6 +766,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, uint, u, 32, 2, uint, u, 64, 1, expected_u32_7);
   TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 8, 8, expected_u32_8);
   TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 16, 4, expected_u32_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, uint, u, 32, 2, float, f, 16, 4, expected_u32_10);
+#endif
 
   /* vreinterpret_u64_xx.  */
   TEST_VREINTERPRET(, uint, u, 64, 1, int, s, 8, 8, expected_u64_1);
@@ -574,6 +780,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, uint, u, 64, 1, uint, u, 32, 2, expected_u64_7);
   TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 8, 8, expected_u64_8);
   TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 16, 4, expected_u64_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 16, 4, expected_u64_10);
+#endif
 
   /* vreinterpret_p8_xx.  */
   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, int, s, 8, 8, expected_p8_1);
@@ -585,6 +794,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 32, 2, expected_p8_7);
   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 64, 1, expected_p8_8);
   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, poly, p, 16, 4, expected_p8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 16, 4, expected_p8_10);
+#endif
 
   /* vreinterpret_p16_xx.  */
   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, int, s, 8, 8, expected_p16_1);
@@ -596,6 +808,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 32, 2, expected_p16_7);
   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 64, 1, expected_p16_8);
   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, poly, p, 8, 8, expected_p16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 16, 4, expected_p16_10);
+#endif
 
   /* vreinterpretq_s8_xx.  */
   TEST_VREINTERPRET(q, int, s, 8, 16, int, s, 16, 8, expected_q_s8_1);
@@ -607,6 +822,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, int, s, 8, 16, uint, u, 64, 2, expected_q_s8_7);
   TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 8, 16, expected_q_s8_8);
   TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 16, 8, expected_q_s8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 16, 8, expected_q_s8_10);
+#endif
 
   /* vreinterpretq_s16_xx.  */
   TEST_VREINTERPRET(q, int, s, 16, 8, int, s, 8, 16, expected_q_s16_1);
@@ -618,6 +836,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, int, s, 16, 8, uint, u, 64, 2, expected_q_s16_7);
   TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 8, 16, expected_q_s16_8);
   TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 16, 8, expected_q_s16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, int, s, 16, 8, float, f, 16, 8, expected_q_s16_10);
+#endif
 
   /* vreinterpretq_s32_xx.  */
   TEST_VREINTERPRET(q, int, s, 32, 4, int, s, 8, 16, expected_q_s32_1);
@@ -629,6 +850,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, int, s, 32, 4, uint, u, 64, 2, expected_q_s32_7);
   TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 8, 16, expected_q_s32_8);
   TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 16, 8, expected_q_s32_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, int, s, 32, 4, float, f, 16, 8, expected_q_s32_10);
+#endif
 
   /* vreinterpretq_s64_xx.  */
   TEST_VREINTERPRET(q, int, s, 64, 2, int, s, 8, 16, expected_q_s64_1);
@@ -640,6 +864,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, int, s, 64, 2, uint, u, 64, 2, expected_q_s64_7);
   TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 8, 16, expected_q_s64_8);
   TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 16, 8, expected_q_s64_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, int, s, 64, 2, float, f, 16, 8, expected_q_s64_10);
+#endif
 
   /* vreinterpretq_u8_xx.  */
   TEST_VREINTERPRET(q, uint, u, 8, 16, int, s, 8, 16, expected_q_u8_1);
@@ -651,6 +878,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, uint, u, 8, 16, uint, u, 64, 2, expected_q_u8_7);
   TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 8, 16, expected_q_u8_8);
   TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 16, 8, expected_q_u8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, uint, u, 8, 16, float, f, 16, 8, expected_q_u8_10);
+#endif
 
   /* vreinterpretq_u16_xx.  */
   TEST_VREINTERPRET(q, uint, u, 16, 8, int, s, 8, 16, expected_q_u16_1);
@@ -662,6 +892,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, uint, u, 16, 8, uint, u, 64, 2, expected_q_u16_7);
   TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 8, 16, expected_q_u16_8);
   TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 16, 8, expected_q_u16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, uint, u, 16, 8, float, f, 16, 8, expected_q_u16_10);
+#endif
 
   /* vreinterpretq_u32_xx.  */
   TEST_VREINTERPRET(q, uint, u, 32, 4, int, s, 8, 16, expected_q_u32_1);
@@ -673,6 +906,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, uint, u, 32, 4, uint, u, 64, 2, expected_q_u32_7);
   TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 8, 16, expected_q_u32_8);
   TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 16, 8, expected_q_u32_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, uint, u, 32, 4, float, f, 16, 8, expected_q_u32_10);
+#endif
 
   /* vreinterpretq_u64_xx.  */
   TEST_VREINTERPRET(q, uint, u, 64, 2, int, s, 8, 16, expected_q_u64_1);
@@ -684,6 +920,37 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, uint, u, 64, 2, uint, u, 32, 4, expected_q_u64_7);
   TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 8, 16, expected_q_u64_8);
   TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 16, 8, expected_q_u64_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 16, 8, expected_q_u64_10);
+#endif
+
+  /* vreinterpretq_p8_xx.  */
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 8, 16, expected_q_p8_1);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 16, 8, expected_q_p8_2);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 32, 4, expected_q_p8_3);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 64, 2, expected_q_p8_4);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 8, 16, expected_q_p8_5);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 16, 8, expected_q_p8_6);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 32, 4, expected_q_p8_7);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 64, 2, expected_q_p8_8);
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, poly, p, 16, 8, expected_q_p8_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 16, 8, expected_q_p8_10);
+#endif
+
+  /* vreinterpretq_p16_xx.  */
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 8, 16, expected_q_p16_1);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 16, 8, expected_q_p16_2);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 32, 4, expected_q_p16_3);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 64, 2, expected_q_p16_4);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 8, 16, expected_q_p16_5);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 16, 8, expected_q_p16_6);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 32, 4, expected_q_p16_7);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 64, 2, expected_q_p16_8);
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, poly, p, 8, 16, expected_q_p16_9);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 16, 8, expected_q_p16_10);
+#endif
 
   /* vreinterpret_f32_xx.  */
   TEST_VREINTERPRET_FP(, float, f, 32, 2, int, s, 8, 8, expected_f32_1);
@@ -696,6 +963,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET_FP(, float, f, 32, 2, uint, u, 64, 1, expected_f32_8);
   TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 8, 8, expected_f32_9);
   TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 16, 4, expected_f32_10);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_FP(, float, f, 32, 2, float, f, 16, 4, expected_f32_11);
+#endif
 
   /* vreinterpretq_f32_xx.  */
   TEST_VREINTERPRET_FP(q, float, f, 32, 4, int, s, 8, 16, expected_q_f32_1);
@@ -708,6 +978,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET_FP(q, float, f, 32, 4, uint, u, 64, 2, expected_q_f32_8);
   TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 8, 16, expected_q_f32_9);
   TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 16, 8, expected_q_f32_10);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_FP(q, float, f, 32, 4, float, f, 16, 8, expected_q_f32_11);
+#endif
 
   /* vreinterpret_xx_f32.  */
   TEST_VREINTERPRET(, int, s, 8, 8, float, f, 32, 2, expected_xx_f32_1);
@@ -720,6 +993,9 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 32, 2, expected_xx_f32_8);
   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 32, 2, expected_xx_f32_9);
   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 32, 2, expected_xx_f32_10);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, float, f, 32, 2, expected_xx_f32_11);
+#endif
 
   /* vreinterpretq_xx_f32.  */
   TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 32, 4, expected_q_xx_f32_1);
@@ -732,6 +1008,33 @@ void exec_vreinterpret (void)
   TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 32, 4, expected_q_xx_f32_8);
   TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 32, 4, expected_q_xx_f32_9);
   TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 32, 4, expected_q_xx_f32_10);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, float, f, 32, 4, expected_q_xx_f32_11);
+
+  /* vreinterpret_f16_xx.  */
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 8, 8, expected_f16_1);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 16, 4, expected_f16_2);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 32, 2, expected_f16_3);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 64, 1, expected_f16_4);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 8, 8, expected_f16_5);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 16, 4, expected_f16_6);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 32, 2, expected_f16_7);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 64, 1, expected_f16_8);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 8, 8, expected_f16_9);
+  TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 16, 4, expected_f16_10);
+
+  /* vreinterpretq_f16_xx.  */
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 8, 16, expected_q_f16_1);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 16, 8, expected_q_f16_2);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 32, 4, expected_q_f16_3);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 64, 2, expected_q_f16_4);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 8, 16, expected_q_f16_5);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 16, 8, expected_q_f16_6);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 32, 4, expected_q_f16_7);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 64, 2, expected_q_f16_8);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 8, 16, expected_q_f16_9);
+  TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 16, 8, expected_q_f16_10);
+#endif
 }
 
 int main (void)
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p128.c
@@ -0,0 +1,165 @@
+/* This file contains tests for the vreinterpret *p128 intrinsics.  */
+
+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results: vreinterpretq_p128_*.  */
+VECT_VAR_DECL(vreint_expected_q_p128_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							  0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p128_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							   0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p128_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
+							   0xfffffff3fffffff2 };
+VECT_VAR_DECL(vreint_expected_q_p128_s64,poly,64,2) [] = { 0xfffffffffffffff0,
+							   0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p128_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							  0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p128_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							   0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p128_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
+							   0xfffffff3fffffff2 };
+VECT_VAR_DECL(vreint_expected_q_p128_u64,poly,64,2) [] = { 0xfffffffffffffff0,
+							   0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p128_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							  0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p128_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							   0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p128_f32,poly,64,2) [] = { 0xc1700000c1800000,
+							   0xc1500000c1600000 };
+VECT_VAR_DECL(vreint_expected_q_p128_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
+							   0xc880c900c980ca00 };
+
+/* Expected results: vreinterpretq_*_p128.  */
+VECT_VAR_DECL(vreint_expected_q_s8_p128,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff,
+							 0xf1, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_s16_p128,int,16,8) [] = { 0xfff0, 0xffff,
+							  0xffff, 0xffff,
+							  0xfff1, 0xffff,
+							  0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_s32_p128,int,32,4) [] = { 0xfffffff0, 0xffffffff,
+							  0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_s64_p128,int,64,2) [] = { 0xfffffffffffffff0,
+							  0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_u8_p128,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							  0xff, 0xff, 0xff, 0xff,
+							  0xf1, 0xff, 0xff, 0xff,
+							  0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_u16_p128,uint,16,8) [] = { 0xfff0, 0xffff,
+							   0xffff, 0xffff,
+							   0xfff1, 0xffff,
+							   0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_u32_p128,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
+							   0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_u64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
+							   0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p8_p128,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							  0xff, 0xff, 0xff, 0xff,
+							  0xf1, 0xff, 0xff, 0xff,
+							  0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_p16_p128,poly,16,8) [] = { 0xfff0, 0xffff,
+							   0xffff, 0xffff,
+							   0xfff1, 0xffff,
+							   0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_p64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
+							   0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_f32_p128,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
+							     0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_f16_p128,hfloat,16,8) [] = { 0xfff0, 0xffff,
+							     0xffff, 0xffff,
+							     0xfff1, 0xffff,
+							     0xffff, 0xffff };
+
+int main (void)
+{
+  DECL_VARIABLE_128BITS_VARIANTS(vreint_vector);
+  DECL_VARIABLE_128BITS_VARIANTS(vreint_vector_res);
+
+  clean_results ();
+
+  TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
+  VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
+
+  /* vreinterpretq_p128_* tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VREINTERPRETQ_P128_*"
+
+  /* Since there is no way to store a poly128_t value, convert to
+     poly64x2_t before storing.  This means that we are not able to
+     test vreinterpretq_p128* alone, and that errors in
+     vreinterpretq_p64_p128 could compensate for errors in
+     vreinterpretq_p128*.  */
+#define TEST_VREINTERPRET128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED)	\
+  VECT_VAR(vreint_vector_res, poly, 64, 2) = vreinterpretq_p64_p128(		\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS))); \
+  vst1##Q##_##T2##64(VECT_VAR(result, poly, 64, 2),				\
+		     VECT_VAR(vreint_vector_res, poly, 64, 2));			\
+  CHECK_POLY(TEST_MSG, T1, 64, 2, PRIx##64, EXPECTED, "");
+
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 8, 16, vreint_expected_q_p128_s8);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 16, 8, vreint_expected_q_p128_s16);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 32, 4, vreint_expected_q_p128_s32);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 64, 2, vreint_expected_q_p128_s64);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 8, 16, vreint_expected_q_p128_u8);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 16, 8, vreint_expected_q_p128_u16);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 32, 4, vreint_expected_q_p128_u32);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 64, 2, vreint_expected_q_p128_u64);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 8, 16, vreint_expected_q_p128_p8);
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 16, 8, vreint_expected_q_p128_p16);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 16, 8, vreint_expected_q_p128_f16);
+#endif
+  TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 32, 4, vreint_expected_q_p128_f32);
+
+  /* vreinterpretq_*_p128 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VREINTERPRETQ_*_P128"
+
+  /* Since there is no way to load a poly128_t value, load a
+     poly64x2_t and convert it to poly128_t.  This means that we are
+     not able to test vreinterpretq_*_p128 alone, and that errors in
+     vreinterpretq_p128_p64 could compensate for errors in
+     vreinterpretq_*_p128*.  */
+#define TEST_VREINTERPRET_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
+  VECT_VAR(vreint_vector_res, T1, W, N) =					\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(					\
+      vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2)));		\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),					\
+		    VECT_VAR(vreint_vector_res, T1, W, N));			\
+  CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+#define TEST_VREINTERPRET_FP_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
+  VECT_VAR(vreint_vector_res, T1, W, N) =					\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(					\
+      vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2)));		\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),					\
+		    VECT_VAR(vreint_vector_res, T1, W, N));			\
+  CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+  TEST_VREINTERPRET_FROM_P128(q, int, s, 8, 16, poly, p, 128, 1, vreint_expected_q_s8_p128);
+  TEST_VREINTERPRET_FROM_P128(q, int, s, 16, 8, poly, p, 128, 1, vreint_expected_q_s16_p128);
+  TEST_VREINTERPRET_FROM_P128(q, int, s, 32, 4, poly, p, 128, 1, vreint_expected_q_s32_p128);
+  TEST_VREINTERPRET_FROM_P128(q, int, s, 64, 2, poly, p, 128, 1, vreint_expected_q_s64_p128);
+  TEST_VREINTERPRET_FROM_P128(q, uint, u, 8, 16, poly, p, 128, 1, vreint_expected_q_u8_p128);
+  TEST_VREINTERPRET_FROM_P128(q, uint, u, 16, 8, poly, p, 128, 1, vreint_expected_q_u16_p128);
+  TEST_VREINTERPRET_FROM_P128(q, uint, u, 32, 4, poly, p, 128, 1, vreint_expected_q_u32_p128);
+  TEST_VREINTERPRET_FROM_P128(q, uint, u, 64, 2, poly, p, 128, 1, vreint_expected_q_u64_p128);
+  TEST_VREINTERPRET_FROM_P128(q, poly, p, 8, 16, poly, p, 128, 1, vreint_expected_q_p8_p128);
+  TEST_VREINTERPRET_FROM_P128(q, poly, p, 16, 8, poly, p, 128, 1, vreint_expected_q_p16_p128);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 16, 8, poly, p, 128, 1, vreint_expected_q_f16_p128);
+#endif
+  TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 32, 4, poly, p, 128, 1, vreint_expected_q_f32_p128);
+
+  return 0;
+}
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c
@@ -0,0 +1,216 @@
+/* This file contains tests for the vreinterpret *p64 intrinsics.  */
+
+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results: vreinterpret_p64_*.  */
+VECT_VAR_DECL(vreint_expected_p64_s8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+VECT_VAR_DECL(vreint_expected_p64_s16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
+VECT_VAR_DECL(vreint_expected_p64_s32,poly,64,1) [] = { 0xfffffff1fffffff0 };
+VECT_VAR_DECL(vreint_expected_p64_s64,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vreint_expected_p64_u8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+VECT_VAR_DECL(vreint_expected_p64_u16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
+VECT_VAR_DECL(vreint_expected_p64_u32,poly,64,1) [] = { 0xfffffff1fffffff0 };
+VECT_VAR_DECL(vreint_expected_p64_u64,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vreint_expected_p64_p8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+VECT_VAR_DECL(vreint_expected_p64_p16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
+VECT_VAR_DECL(vreint_expected_p64_f32,poly,64,1) [] = { 0xc1700000c1800000 };
+VECT_VAR_DECL(vreint_expected_p64_f16,poly,64,1) [] = { 0xca80cb00cb80cc00 };
+
+/* Expected results: vreinterpretq_p64_*.  */
+VECT_VAR_DECL(vreint_expected_q_p64_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							 0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p64_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							  0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p64_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
+							  0xfffffff3fffffff2 };
+VECT_VAR_DECL(vreint_expected_q_p64_s64,poly,64,2) [] = { 0xfffffffffffffff0,
+							  0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p64_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							 0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p64_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							  0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p64_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
+							  0xfffffff3fffffff2 };
+VECT_VAR_DECL(vreint_expected_q_p64_u64,poly,64,2) [] = { 0xfffffffffffffff0,
+							  0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p64_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+							 0xfffefdfcfbfaf9f8 };
+VECT_VAR_DECL(vreint_expected_q_p64_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
+							  0xfff7fff6fff5fff4 };
+VECT_VAR_DECL(vreint_expected_q_p64_f32,poly,64,2) [] = { 0xc1700000c1800000,
+							  0xc1500000c1600000 };
+VECT_VAR_DECL(vreint_expected_q_p64_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
+							  0xc880c900c980ca00 };
+
+/* Expected results: vreinterpret_*_p64.  */
+VECT_VAR_DECL(vreint_expected_s8_p64,int,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
+						     0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_s16_p64,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_s32_p64,int,32,2) [] = { 0xfffffff0, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_s64_p64,int,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vreint_expected_u8_p64,uint,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
+						      0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_u16_p64,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_u32_p64,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_u64_p64,uint,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vreint_expected_p8_p64,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
+						      0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_p16_p64,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_f32_p64,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_f16_p64,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+
+/* Expected results: vreinterpretq_*_p64.  */
+VECT_VAR_DECL(vreint_expected_q_s8_p64,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							0xff, 0xff, 0xff, 0xff,
+							0xf1, 0xff, 0xff, 0xff,
+							0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_s16_p64,int,16,8) [] = { 0xfff0, 0xffff,
+							 0xffff, 0xffff,
+							 0xfff1, 0xffff,
+							 0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_s32_p64,int,32,4) [] = { 0xfffffff0, 0xffffffff,
+							 0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_s64_p64,int,64,2) [] = { 0xfffffffffffffff0,
+							 0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_u8_p64,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff,
+							 0xf1, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_u16_p64,uint,16,8) [] = { 0xfff0, 0xffff,
+							  0xffff, 0xffff,
+							  0xfff1, 0xffff,
+							  0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_u32_p64,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
+							  0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_u64_p64,uint,64,2) [] = { 0xfffffffffffffff0,
+							  0xfffffffffffffff1 };
+VECT_VAR_DECL(vreint_expected_q_p8_p64,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff,
+							 0xf1, 0xff, 0xff, 0xff,
+							 0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(vreint_expected_q_p16_p64,poly,16,8) [] = { 0xfff0, 0xffff,
+							  0xffff, 0xffff,
+							  0xfff1, 0xffff,
+							  0xffff, 0xffff };
+VECT_VAR_DECL(vreint_expected_q_f32_p64,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
+							    0xfffffff1, 0xffffffff };
+VECT_VAR_DECL(vreint_expected_q_f16_p64,hfloat,16,8) [] = { 0xfff0, 0xffff,
+							    0xffff, 0xffff,
+							    0xfff1, 0xffff,
+							    0xffff, 0xffff };
+
+int main (void)
+{
+#define TEST_VREINTERPRET(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED)	\
+  VECT_VAR(vreint_vector_res, T1, W, N) =				\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
+		    VECT_VAR(vreint_vector_res, T1, W, N));		\
+  CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+#define TEST_VREINTERPRET_TO_POLY(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
+  VECT_VAR(vreint_vector_res, T1, W, N) =				\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
+		    VECT_VAR(vreint_vector_res, T1, W, N));		\
+  CHECK_POLY(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+#define TEST_VREINTERPRET_FP(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
+  VECT_VAR(vreint_vector_res, T1, W, N) =				\
+    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
+		    VECT_VAR(vreint_vector_res, T1, W, N));		\
+  CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+  DECL_VARIABLE_ALL_VARIANTS(vreint_vector);
+  DECL_VARIABLE_ALL_VARIANTS(vreint_vector_res);
+
+  clean_results ();
+
+  TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
+  VLOAD(vreint_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  VLOAD(vreint_vector, buffer, , float, f, 16, 4);
+  VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vreint_vector, buffer, , float, f, 32, 2);
+  VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
+
+  /* vreinterpret_p64_* tests.  */
*/ +#undef TEST_MSG +#define TEST_MSG "VREINTERPRET_P64_*" + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 8, 8, vreint_expected_p64_s8); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 16, 4, vreint_expected_p64_s16); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 32, 2, vreint_expected_p64_s32); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 64, 1, vreint_expected_p64_s64); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 8, 8, vreint_expected_p64_u8); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 16, 4, vreint_expected_p64_u16); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 32, 2, vreint_expected_p64_u32); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 64, 1, vreint_expected_p64_u64); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 8, 8, vreint_expected_p64_p8); + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 16, 4, vreint_expected_p64_p16); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 16, 4, vreint_expected_p64_f16); +#endif + TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 32, 2, vreint_expected_p64_f32); + + /* vreinterpretq_p64_* tests. */ +#undef TEST_MSG +#define TEST_MSG "VREINTERPRETQ_P64_*" + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 8, 16, vreint_expected_q_p64_s8); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 16, 8, vreint_expected_q_p64_s16); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 32, 4, vreint_expected_q_p64_s32); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 64, 2, vreint_expected_q_p64_s64); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 8, 16, vreint_expected_q_p64_u8); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 16, 8, vreint_expected_q_p64_u16); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 32, 4, vreint_expected_q_p64_u32); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 64, 2, vreint_expected_q_p64_u64); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 8, 16, vreint_expected_q_p64_p8); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 16, 8, vreint_expected_q_p64_p16); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 16, 8, vreint_expected_q_p64_f16); +#endif + TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 32, 4, vreint_expected_q_p64_f32); + + /* vreinterpret_*_p64 tests. 
*/ +#undef TEST_MSG +#define TEST_MSG "VREINTERPRET_*_P64" + + TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 64, 1, vreint_expected_s8_p64); + TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 64, 1, vreint_expected_s16_p64); + TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 64, 1, vreint_expected_s32_p64); + TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 64, 1, vreint_expected_s64_p64); + TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 64, 1, vreint_expected_u8_p64); + TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 64, 1, vreint_expected_u16_p64); + TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 64, 1, vreint_expected_u32_p64); + TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 64, 1, vreint_expected_u64_p64); + TEST_VREINTERPRET_TO_POLY(, poly, p, 8, 8, poly, p, 64, 1, vreint_expected_p8_p64); + TEST_VREINTERPRET_TO_POLY(, poly, p, 16, 4, poly, p, 64, 1, vreint_expected_p16_p64); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 64, 1, vreint_expected_f16_p64); +#endif + TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 64, 1, vreint_expected_f32_p64); + TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 64, 2, vreint_expected_q_s8_p64); + TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 64, 2, vreint_expected_q_s16_p64); + TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 64, 2, vreint_expected_q_s32_p64); + TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 64, 2, vreint_expected_q_s64_p64); + TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 64, 2, vreint_expected_q_u8_p64); + TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 64, 2, vreint_expected_q_u16_p64); + TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 64, 2, vreint_expected_q_u32_p64); + TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 64, 2, vreint_expected_q_u64_p64); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 8, 16, poly, p, 64, 2, vreint_expected_q_p8_p64); + TEST_VREINTERPRET_TO_POLY(q, poly, p, 16, 8, poly, p, 64, 2, vreint_expected_q_p16_p64); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 64, 2, vreint_expected_q_f16_p64); +#endif + TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 64, 2, vreint_expected_q_f32_p64); + + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c @@ -63,6 +63,10 @@ VECT_VAR_DECL(expected_vrev64,uint,32,2) [] = { 0xfffffff1, 0xfffffff0 }; VECT_VAR_DECL(expected_vrev64,poly,8,8) [] = { 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 }; VECT_VAR_DECL(expected_vrev64,poly,16,4) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected_vrev64, hfloat, 16, 4) [] = { 0xca80, 0xcb00, + 0xcb80, 0xcc00 }; +#endif VECT_VAR_DECL(expected_vrev64,hfloat,32,2) [] = { 0xc1700000, 0xc1800000 }; VECT_VAR_DECL(expected_vrev64,int,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0, @@ -86,6 +90,12 @@ VECT_VAR_DECL(expected_vrev64,poly,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4, 0xfb, 0xfa, 0xf9, 0xf8 }; VECT_VAR_DECL(expected_vrev64,poly,16,8) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0, 0xfff7, 0xfff6, 0xfff5, 0xfff4 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected_vrev64, hfloat, 16, 8) [] = { 0xca80, 0xcb00, + 0xcb80, 0xcc00, + 0xc880, 0xc900, + 0xc980, 0xca00 }; +#endif VECT_VAR_DECL(expected_vrev64,hfloat,32,4) [] = { 0xc1700000, 0xc1800000, 0xc1500000, 0xc1600000 }; @@ -104,6 +114,10 @@ void exec_vrev (void) /* Initialize input "vector" from "buffer". 
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (FP16_SUPPORTED)
+  VLOAD (vector, buffer, , float, f, 16, 4);
+  VLOAD (vector, buffer, q, float, f, 16, 8);
+#endif
   VLOAD(vector, buffer, , float, f, 32, 2);
   VLOAD(vector, buffer, q, float, f, 32, 4);
 
@@ -118,10 +132,10 @@ void exec_vrev (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev16, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev16, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev16, "");
   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev16, "");
-  CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
 
 #undef TEST_MSG
 #define TEST_MSG "VREV32"
@@ -142,14 +156,14 @@ void exec_vrev (void)
   CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_vrev32, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev32, "");
   CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev32, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
-  CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev32, "");
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev32, "");
   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev32, "");
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev32, "");
-  CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
-  CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
 
 #undef TEST_MSG
 #define TEST_MSG "VREV64"
@@ -176,17 +190,23 @@ void exec_vrev (void)
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev64, "");
   CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev64, "");
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_vrev64, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
-  CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev64, "");
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev64, "");
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_vrev64, "");
   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev64, "");
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev64, "");
   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_vrev64, "");
-  CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
-  CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
+#if defined (FP16_SUPPORTED)
+  TEST_VREV (, float, f, 16, 4, 64);
+  TEST_VREV (q, float, f, 16, 8, 64);
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx32, expected_vrev64, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx32, expected_vrev64, "");
+#endif
 
   TEST_VREV(, float, f, 32, 2, 64);
   TEST_VREV(q, float, f, 32, 4, 64);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_vrev64, "");
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnd.c
@@ -0,0 +1,24 @@
+/* { dg-require-effective-target arm_v8_neon_hw } */
+/* { dg-add-options arm_v8_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
--- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnd.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrnd +#define TEST_MSG "VRND" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndX.inc @@ -0,0 +1,63 @@ +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1 (NAME) + +void FNNAME (INSN) (void) +{ + /* vector_res = vrndX (vector), then store the result. */ +#define TEST_VRND2(INSN, Q, T1, T2, W, N) \ + VECT_VAR (vector_res, T1, W, N) = \ + INSN##Q##_##T2##W (VECT_VAR (vector, T1, W, N)); \ + vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N), \ + VECT_VAR (vector_res, T1, W, N)) + + /* Two auxiliary macros are necessary to expand INSN. */ +#define TEST_VRND1(INSN, Q, T1, T2, W, N) \ + TEST_VRND2 (INSN, Q, T1, T2, W, N) + +#define TEST_VRND(Q, T1, T2, W, N) \ + TEST_VRND1 (INSN, Q, T1, T2, W, N) + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); +#endif + DECL_VARIABLE (vector, float, 32, 2); + DECL_VARIABLE (vector, float, 32, 4); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif + DECL_VARIABLE (vector_res, float, 32, 2); + DECL_VARIABLE (vector_res, float, 32, 4); + + clean_results (); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VLOAD (vector, buffer, , float, f, 16, 4); + VLOAD (vector, buffer, q, float, f, 16, 8); +#endif + VLOAD (vector, buffer, , float, f, 32, 2); + VLOAD (vector, buffer, q, float, f, 32, 4); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRND ( , float, f, 16, 4); + TEST_VRND (q, float, f, 16, 8); +#endif + TEST_VRND ( , float, f, 32, 2); + TEST_VRND (q, float, f, 32, 4); + +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected, ""); + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected, ""); +#endif + CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expected, ""); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected, ""); +} + +int +main (void) +{ + FNNAME (INSN) (); + return 0; +}
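The vrndX.inc comment about "two auxiliary macros" refers to a standard preprocessor idiom: the extra level of indirection forces INSN to be expanded to its value (vrnd, vrnda, ...) before ## token-pasting builds the intrinsic name. A self-contained sketch of the same idiom (CALL/CALL1/CALL2 are illustrative names):

  #include <stdio.h>

  #define INSN vrnd
  /* CALL2 pastes its (already expanded) arguments together.  */
  #define CALL2(NAME, Q, T) NAME##Q##_##T
  /* CALL1 exists only so that NAME is macro-expanded first.  */
  #define CALL1(NAME, Q, T) CALL2 (NAME, Q, T)
  #define CALL(Q, T) CALL1 (INSN, Q, T)

  #define STR2(x) #x
  #define STR(x) STR2 (x)

  int
  main (void)
  {
    puts (STR (CALL (q, f32)));  /* prints "vrndq_f32" */
    puts (STR (CALL (, f32)));   /* prints "vrnd_f32" */
    return 0;
  }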
--- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnda.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrnda +#define TEST_MSG "VRNDA" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndah_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0x3c00 /* 1.000000 */, + 0xc800 /* -8.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDAH_F16" +#define INSN_NAME vrndah_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndh_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0x3c00 /* 1.000000 */, + 0xc700 /* -7.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDH_F16" +#define INSN_NAME vrndh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc"
--- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndi_f16_1.c @@ -0,0 +1,71 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (123.4) +#define RNDI_A 0x57B0 /* FP16_C (123). */ +#define B FP16_C (-567.5) +#define RNDI_B 0xE070 /* FP16_C (-568). */ +#define C FP16_C (-34.8) +#define RNDI_C 0xD060 /* FP16_C (-35). */ +#define D FP16_C (1024) +#define RNDI_D 0x6400 /* FP16_C (1024). */ +#define E FP16_C (663.1) +#define RNDI_E 0x612E /* FP16_C (663). */ +#define F FP16_C (169.1) +#define RNDI_F 0x5948 /* FP16_C (169). */ +#define G FP16_C (-4.8) +#define RNDI_G 0xC500 /* FP16_C (-5). */ +#define H FP16_C (77.5) +#define RNDI_H 0x54E0 /* FP16_C (78). */ + +/* Expected results for vrndi. */ +VECT_VAR_DECL (expected_static, hfloat, 16, 4) [] + = { RNDI_A, RNDI_B, RNDI_C, RNDI_D }; + +VECT_VAR_DECL (expected_static, hfloat, 16, 8) [] + = { RNDI_A, RNDI_B, RNDI_C, RNDI_D, RNDI_E, RNDI_F, RNDI_G, RNDI_H }; + +void exec_vrndi_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VRNDI (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A, B, C, D}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vrndi_f16 (VECT_VAR (vsrc, float, 16, 4)); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VRNDIQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A, B, C, D, E, F, G, H}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vrndiq_f16 (VECT_VAR (vsrc, float, 16, 8)); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, ""); +} + +int +main (void) +{ + exec_vrndi_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndih_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0x3c00 /* 1.000000 */, + 0xc800 /* -8.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDIH_F16" +#define INSN_NAME vrndih_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc"
--- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndm.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrndm +#define TEST_MSG "VRNDM" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndmh_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc200 /* -3.000000 */, + 0x3c00 /* 1.000000 */, + 0xc800 /* -8.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc700 /* -7.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDMH_F16" +#define INSN_NAME vrndmh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndn.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrndn +#define TEST_MSG "VRNDN" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndnh_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0x3c00 /* 1.000000 */, + 0xc800 /* -8.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDNH_F16" +#define INSN_NAME vrndnh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc"
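All six vector tests reuse vrndX.inc and differ only in INSN, so the behavioural differences live entirely in the scalar expected[] tables: entry 7 above, for instance, is 0xc000 (-2.0) for VRNDAH/VRNDH/VRNDNH but 0xc200 (-3.0) for VRNDMH, consistent with an input between -2.5 and -2.0. The C library analogues give the same picture; a rough host-side cross-check (an approximation only, since the instructions operate on __fp16 under the default rounding mode):

  #include <math.h>
  #include <stdio.h>

  /* Scalar models of the variants: vrnd ~ trunc (toward zero),
     vrnda ~ round (nearest, ties away), vrndm ~ floor, vrndp ~ ceil,
     vrndn/vrndx ~ rint under the default to-nearest-even mode.  */
  int
  main (void)
  {
    float x = -2.4f;
    printf ("trunc %g round %g floor %g ceil %g rint %g\n",
            truncf (x), roundf (x), floorf (x), ceilf (x), rintf (x));
    /* Prints: trunc -2 round -2 floor -3 ceil -2 rint -2 */
    return 0;
  }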
--- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndp.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrndp +#define TEST_MSG "VRNDP" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndph_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4400 /* 4.000000 */, + 0x4d00 /* 20.000000 */, + 0x3c00 /* 1.000000 */, + 0xc000 /* -2.000000 */, + 0x4000 /* 2.000000 */, + 0xc700 /* -7.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x3c00 /* 1.000000 */, + 0x4b00 /* 14.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDPH_F16" +#define INSN_NAME vrndph_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndx.c @@ -0,0 +1,24 @@ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-add-options arm_v8_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80 }; +VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, + 0xcb00, 0xca80, + 0xca00, 0xc980, + 0xc900, 0xc880 }; +#endif +VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 }; +VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000, + 0xc1600000, 0xc1500000 }; + +#define INSN vrndx +#define TEST_MSG "VRNDX" + +#include "vrndX.inc" --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndxh_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x4000 /* 2.000000 */, + 0x4200 /* 3.000000 */, + 0x4d00 /* 20.000000 */, + 0x0000 /* 0.000000 */, + 0xc000 /* -2.000000 */, + 0x3c00 /* 1.000000 */, + 0xc800 /* -8.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x0000 /* 0.000000 */, + 0x3c00 /* 1.000000 */, + 0x4a80 /* 13.000000 */, + 0xc600 /* -6.000000 */, + 0x4d00 /* 20.000000 */, + 0x7c00 /* inf */, + 0xfc00 /* -inf */ +}; + +#define TEST_MSG "VRNDXH_F16" +#define INSN_NAME vrndxh_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. 
*/ +#include "unary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrte.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrte.c @@ -7,6 +7,11 @@ VECT_VAR_DECL(expected,uint,32,2) [] = { 0xffffffff, 0xffffffff }; VECT_VAR_DECL(expected,uint,32,4) [] = { 0x9c800000, 0x9c800000, 0x9c800000, 0x9c800000 }; +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x324c, 0x324c, 0x324c, 0x324c }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x3380, 0x3380, 0x3380, 0x3380, + 0x3380, 0x3380, 0x3380, 0x3380 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x3e498000, 0x3e498000 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x3e700000, 0x3e700000, 0x3e700000, 0x3e700000 }; @@ -22,17 +27,39 @@ VECT_VAR_DECL(expected_2,uint,32,4) [] = { 0xed000000, 0xed000000, 0xed000000, 0xed000000 }; /* Expected results with FP special inputs values (NaNs, ...). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00, + 0x7c00, 0x7c00 }; +#endif VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; /* Expected results with FP special inputs values (negative, infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 }; +#endif VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results with FP special inputs values (-0, -infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00 }; +VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +#endif VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0xff800000, 0xff800000 }; VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; @@ -50,32 +77,60 @@ void exec_vrsqrte(void) VECT_VAR(vector_res, T1, W, N)) DECL_VARIABLE(vector, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 4); DECL_VARIABLE(vector_res, uint, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); +#endif DECL_VARIABLE(vector_res, float, 32, 2); DECL_VARIABLE(vector_res, uint, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector_res, float, 32, 4); clean_results (); /* Choose init value arbitrarily. 
*/ VDUP(vector, , uint, u, 32, 2, 0x12345678); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 25.799999f); +#endif VDUP(vector, , float, f, 32, 2, 25.799999f); VDUP(vector, q, uint, u, 32, 4, 0xABCDEF10); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, q, float, f, 16, 8, 18.2f); +#endif VDUP(vector, q, float, f, 32, 4, 18.2f); /* Apply the operator. */ TEST_VRSQRTE(, uint, u, 32, 2); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTE(, float, f, 16, 4); +#endif TEST_VRSQRTE(, float, f, 32, 2); TEST_VRSQRTE(q, uint, u, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTE(q, float, f, 16, 8); +#endif TEST_VRSQRTE(q, float, f, 32, 4); #define CMT "" CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT); @@ -110,42 +165,78 @@ void exec_vrsqrte(void) /* Test FP variants with special input values (NaNs, ...). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, NAN); + VDUP(vector, q, float, f, 16, 8, 0.0f); +#endif VDUP(vector, , float, f, 32, 2, NAN); VDUP(vector, q, float, f, 32, 4, 0.0f); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTE(, float, f, 16, 4); + TEST_VRSQRTE(q, float, f, 16, 8); +#endif TEST_VRSQRTE(, float, f, 32, 2); TEST_VRSQRTE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (NaN, 0)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT); /* Test FP variants with special input values (negative, infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, -1.0f); + VDUP(vector, q, float, f, 16, 8, HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, -1.0f); VDUP(vector, q, float, f, 32, 4, HUGE_VALF); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTE(, float, f, 16, 4); + TEST_VRSQRTE(q, float, f, 16, 8); +#endif TEST_VRSQRTE(, float, f, 32, 2); TEST_VRSQRTE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (negative, infinity)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT); /* Test FP variants with special input values (-0, -infinity). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, -0.0f); + VDUP(vector, q, float, f, 16, 8, -HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, -0.0f); VDUP(vector, q, float, f, 32, 4, -HUGE_VALF); /* Apply the operator. 
*/ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTE(, float, f, 16, 4); + TEST_VRSQRTE(q, float, f, 16, 8); +#endif TEST_VRSQRTE(, float, f, 32, 2); TEST_VRSQRTE(q, float, f, 32, 4); #undef CMT #define CMT " FP special (-0, -infinity)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrteh_f16_1.c @@ -0,0 +1,30 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_fp16.h> + +/* Input values. */ +float16_t input[] = { 123.4, 67.8, 34.8, 24.0, 66.1, 144.0, 4.8, 77.0 }; +uint16_t expected[] = { 0x2DC4 /* FP16_C (1/__builtin_sqrtf (123.4)). */, + 0x2FC8 /* FP16_C (1/__builtin_sqrtf (67.8)). */, + 0x316C /* FP16_C (1/__builtin_sqrtf (34.8)). */, + 0x3288 /* FP16_C (1/__builtin_sqrtf (24.0)). */, + 0x2FDC /* FP16_C (1/__builtin_sqrtf (66.1)). */, + 0x2D54 /* FP16_C (1/__builtin_sqrtf (144.0)). */, + 0x3750 /* FP16_C (1/__builtin_sqrtf (4.8)). */, + 0x2F48 /* FP16_C (1/__builtin_sqrtf (77.0)). */ }; + +#define TEST_MSG "VRSQRTEH_F16" +#define INSN_NAME vrsqrteh_f16 + +#define INPUT input +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. */ +#include "unary_scalar_op.inc"
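vrsqrte only provides a table-based initial estimate, which is why its expected values are fixed bit patterns rather than exact reciprocal square roots, while vrsqrts returns the Newton-Raphson step factor (3 - a*b)/2 that the expected[] comments in vrsqrtsh_f16_1.c below spell out. A sketch of how the two are conventionally combined, with an illustrative software stand-in for the hardware estimate:

  #include <math.h>
  #include <stdio.h>

  /* vrsqrts model: the step factor (3 - a*b) / 2.  */
  static float
  rsqrts (float a, float b)
  {
    return (3.0f - a * b) / 2.0f;
  }

  int
  main (void)
  {
    float a = 25.799999f;   /* Input used by the vrsqrte test above.  */
    float x = 0.2f;         /* Stand-in for the vrsqrte estimate.  */
    /* One refinement step: x' = x * (3 - a*x^2) / 2.  */
    x = x * rsqrts (a, x * x);
    printf ("refined %f exact %f\n", x, 1.0f / sqrtf (a));
    return 0;
  }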
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrts.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrts.c @@ -4,22 +4,51 @@ #include <arm_neon.h> /* Expected results. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xd3cb, 0xd3cb, 0xd3cb, 0xd3cb }; +VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xc726, 0xc726, 0xc726, 0xc726, + 0xc726, 0xc726, 0xc726, 0xc726 }; +#endif VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc2796b84, 0xc2796b84 }; VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc0e4a3d8, 0xc0e4a3d8, 0xc0e4a3d8, 0xc0e4a3d8 }; /* Expected results with input=NaN. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_nan, hfloat, 16, 4) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00, + 0x7e00, 0x7e00 }; +#endif VECT_VAR_DECL(expected_nan,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 }; VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; /* Expected results with FP special inputs values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0xfc00, 0xfc00, + 0xfc00, 0xfc00 }; +VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x3e00, 0x3e00, + 0x3e00, 0x3e00, + 0x3e00, 0x3e00, + 0x3e00, 0x3e00 }; +#endif VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0xff800000, 0xff800000 }; VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000 }; /* Expected results with only FP special inputs values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x3e00, 0x3e00, + 0x3e00, 0x3e00 }; +VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x3e00, 0x3e00, + 0x3e00, 0x3e00, + 0x3e00, 0x3e00, + 0x3e00, 0x3e00 }; +#endif VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x3fc00000, 0x3fc00000 }; VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000 }; @@ -38,75 +67,143 @@ void exec_vrsqrts(void) VECT_VAR(vector_res, T1, W, N)) /* No need for integer variants. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector, float, 16, 4); + DECL_VARIABLE(vector, float, 16, 8); +#endif DECL_VARIABLE(vector, float, 32, 2); DECL_VARIABLE(vector, float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector2, float, 16, 4); + DECL_VARIABLE(vector2, float, 16, 8); +#endif DECL_VARIABLE(vector2, float, 32, 2); DECL_VARIABLE(vector2, float, 32, 4); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + DECL_VARIABLE(vector_res, float, 16, 4); + DECL_VARIABLE(vector_res, float, 16, 8); +#endif DECL_VARIABLE(vector_res, float, 32, 2); DECL_VARIABLE(vector_res, float, 32, 4); clean_results (); /* Choose init value arbitrarily. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, 12.9f); + VDUP(vector, q, float, f, 16, 8, 9.1f); +#endif VDUP(vector, , float, f, 32, 2, 12.9f); VDUP(vector, q, float, f, 32, 4, 9.1f); +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector2, , float, f, 16, 4, 9.9f); + VDUP(vector2, q, float, f, 16, 8, 1.9f); +#endif VDUP(vector2, , float, f, 32, 2, 9.9f); VDUP(vector2, q, float, f, 32, 4, 1.9f); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTS(, float, f, 16, 4); + TEST_VRSQRTS(q, float, f, 16, 8); +#endif TEST_VRSQRTS(, float, f, 32, 2); TEST_VRSQRTS(q, float, f, 32, 4); #define CMT "" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT); /* Test FP variants with special input values (NaN). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, NAN); + VDUP(vector2, q, float, f, 16, 8, NAN); +#endif VDUP(vector, , float, f, 32, 2, NAN); VDUP(vector2, q, float, f, 32, 4, NAN); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTS(, float, f, 16, 4); + TEST_VRSQRTS(q, float, f, 16, 8); +#endif TEST_VRSQRTS(, float, f, 32, 2); TEST_VRSQRTS(q, float, f, 32, 4); #undef CMT #define CMT " FP special (NAN) and normal values" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_nan, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_nan, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_nan, CMT); /* Test FP variants with special input values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, HUGE_VALF); + VDUP(vector, q, float, f, 16, 8, 0.0f); + /* Restore a normal value in vector2. */ + VDUP(vector2, q, float, f, 16, 8, 3.2f); +#endif VDUP(vector, , float, f, 32, 2, HUGE_VALF); VDUP(vector, q, float, f, 32, 4, 0.0f); /* Restore a normal value in vector2. 
*/ VDUP(vector2, q, float, f, 32, 4, 3.2f); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTS(, float, f, 16, 4); + TEST_VRSQRTS(q, float, f, 16, 8); +#endif TEST_VRSQRTS(, float, f, 32, 2); TEST_VRSQRTS(q, float, f, 32, 4); #undef CMT #define CMT " FP special (infinity, 0) and normal values" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT); /* Test FP variants with only special input values (infinity, 0). */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + VDUP(vector, , float, f, 16, 4, HUGE_VALF); + VDUP(vector, q, float, f, 16, 8, 0.0f); + VDUP(vector2, , float, f, 16, 4, -0.0f); + VDUP(vector2, q, float, f, 16, 8, HUGE_VALF); +#endif VDUP(vector, , float, f, 32, 2, HUGE_VALF); VDUP(vector, q, float, f, 32, 4, 0.0f); VDUP(vector2, , float, f, 32, 2, -0.0f); VDUP(vector2, q, float, f, 32, 4, HUGE_VALF); /* Apply the operator. */ +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + TEST_VRSQRTS(, float, f, 16, 4); + TEST_VRSQRTS(q, float, f, 16, 8); +#endif TEST_VRSQRTS(, float, f, 32, 2); TEST_VRSQRTS(q, float, f, 32, 4); #undef CMT #define CMT " only FP special (infinity, 0)" +#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT); +#endif CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrtsh_f16_1.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_fp16.h> + +/* Input values. */ +#define A 12.4 +#define B -5.8 +#define C -3.8 +#define D 10 +#define E 66.1 +#define F 16.1 +#define G -4.8 +#define H -77 + +#define I 0.7 +#define J -78 +#define K 10.23 +#define L 98 +#define M 87 +#define N -87.81 +#define O -1.1 +#define P 47.8 + +float16_t input_1[] = { A, B, C, D, I, J, K, L }; +float16_t input_2[] = { E, F, G, H, M, N, O, P }; +uint16_t expected[] = { 0xDE62 /* (3.0f + (-A) * E) / 2.0f. */, + 0x5206 /* (3.0f + (-B) * F) / 2.0f. */, + 0xC7A0 /* (3.0f + (-C) * G) / 2.0f. */, + 0x5E0A /* (3.0f + (-D) * H) / 2.0f. */, + 0xCF3D /* (3.0f + (-I) * M) / 2.0f. */, + 0xEAB0 /* (3.0f + (-J) * N) / 2.0f. */, + 0x471F /* (3.0f + (-K) * O) / 2.0f. */, + 0xE893 /* (3.0f + (-L) * P) / 2.0f. */ }; + +#define TEST_MSG "VRSQRTSH_F16" +#define INSN_NAME vrsqrtsh_f16 + +#define INPUT_1 input_1 +#define INPUT_2 input_2 +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for binary scalar operations. 
*/ +#include "binary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc @@ -76,16 +76,16 @@ void FNNAME (INSN_NAME) (void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, ""); CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, ""); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, ""); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, ""); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, ""); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, ""); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, ""); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, ""); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, ""); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, ""); #ifdef EXTRA_TESTS EXTRA_TESTS(); --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c @@ -101,10 +101,8 @@ VECT_VAR_DECL(expected_negative_shift,uint,64,2) [] = { 0x7ffffffffffffff, 0x7ffffffffffffff }; -#ifndef INSN_NAME #define INSN_NAME vshl #define TEST_MSG "VSHL/VSHLQ" -#endif #define FNNAME1(NAME) exec_ ## NAME #define FNNAME(NAME) FNNAME1(NAME) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc @@ -53,9 +53,17 @@ void FNNAME (INSN_NAME) (void) DECL_VSHUFFLE(float, 32, 4) DECL_ALL_VSHUFFLE(); +#if defined (FP16_SUPPORTED) + DECL_VSHUFFLE (float, 16, 4); + DECL_VSHUFFLE (float, 16, 8); +#endif /* Initialize input "vector" from "buffer". */ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer); +#if defined (FP16_SUPPORTED) + VLOAD (vector1, buffer, , float, f, 16, 4); + VLOAD (vector1, buffer, q, float, f, 16, 8); +#endif VLOAD(vector1, buffer, , float, f, 32, 2); VLOAD(vector1, buffer, q, float, f, 32, 4); @@ -68,6 +76,9 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, , uint, u, 32, 2, 0x77); VDUP(vector2, , poly, p, 8, 8, 0x55); VDUP(vector2, , poly, p, 16, 4, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. 
*/ +#endif VDUP(vector2, , float, f, 32, 2, 33.6f); VDUP(vector2, q, int, s, 8, 16, 0x11); @@ -78,8 +89,11 @@ void FNNAME (INSN_NAME) (void) VDUP(vector2, q, uint, u, 32, 4, 0x77); VDUP(vector2, q, poly, p, 8, 16, 0x55); VDUP(vector2, q, poly, p, 16, 8, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, q, float, f, 16, 8, 14.6f); +#endif VDUP(vector2, q, float, f, 32, 4, 33.8f); - + #define TEST_ALL_VSHUFFLE(INSN) \ TEST_VSHUFFLE(INSN, , int, s, 8, 8); \ TEST_VSHUFFLE(INSN, , int, s, 16, 4); \ @@ -100,6 +114,10 @@ void FNNAME (INSN_NAME) (void) TEST_VSHUFFLE(INSN, q, poly, p, 16, 8); \ TEST_VSHUFFLE(INSN, q, float, f, 32, 4) +#define TEST_VSHUFFLE_FP16(INSN) \ + TEST_VSHUFFLE(INSN, , float, f, 16, 4); \ + TEST_VSHUFFLE(INSN, q, float, f, 16, 8); + #define TEST_ALL_EXTRA_CHUNKS() \ TEST_EXTRA_CHUNK(int, 8, 8, 1); \ TEST_EXTRA_CHUNK(int, 16, 4, 1); \ @@ -130,8 +148,8 @@ void FNNAME (INSN_NAME) (void) CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \ \ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \ @@ -140,20 +158,40 @@ void FNNAME (INSN_NAME) (void) CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \ - CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ - CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \ + CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \ - } \ + } + +#define CHECK_RESULTS_VSHUFFLE_FP16(test_name,EXPECTED,comment) \ + { \ + CHECK_FP (test_name, float, 16, 4, PRIx16, EXPECTED, comment); \ + CHECK_FP (test_name, float, 16, 8, PRIx16, EXPECTED, comment); \ + } clean_results (); /* Execute the tests. 
*/ TEST_ALL_VSHUFFLE(INSN_NAME); +#if defined (FP16_SUPPORTED) + TEST_VSHUFFLE_FP16 (INSN_NAME); +#endif CHECK_RESULTS_VSHUFFLE (TEST_MSG, expected0, "(chunk 0)"); +#if defined (FP16_SUPPORTED) + CHECK_RESULTS_VSHUFFLE_FP16 (TEST_MSG, expected0, "(chunk 0)"); +#endif TEST_ALL_EXTRA_CHUNKS(); +#if defined (FP16_SUPPORTED) + TEST_EXTRA_CHUNK (float, 16, 4, 1); + TEST_EXTRA_CHUNK (float, 16, 8, 1); +#endif + CHECK_RESULTS_VSHUFFLE (TEST_MSG, expected1, "(chunk 1)"); +#if defined (FP16_SUPPORTED) + CHECK_RESULTS_VSHUFFLE_FP16 (TEST_MSG, expected1, "(chunk 1)"); +#endif } int main (void) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c @@ -161,14 +161,16 @@ void vsli_extra(void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT); + CHECK(TEST_MSG, int, 64, 2, PRIx64, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT); } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsqrt_f16_1.c @@ -0,0 +1,72 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-skip-if "" { arm*-*-* } } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define FP16_C(a) ((__fp16) a) +#define A FP16_C (123.4) +#define B FP16_C (567.8) +#define C FP16_C (34.8) +#define D FP16_C (1024) +#define E FP16_C (663.1) +#define F FP16_C (144.0) +#define G FP16_C (4.8) +#define H FP16_C (77) + +#define SQRT_A 0x498E /* FP16_C (__builtin_sqrtf (123.4)). */ +#define SQRT_B 0x4DF5 /* FP16_C (__builtin_sqrtf (567.8)). */ +#define SQRT_C 0x45E6 /* FP16_C (__builtin_sqrtf (34.8)). */ +#define SQRT_D 0x5000 /* FP16_C (__builtin_sqrtf (1024)). */ +#define SQRT_E 0x4E70 /* FP16_C (__builtin_sqrtf (663.1)). */ +#define SQRT_F 0x4A00 /* FP16_C (__builtin_sqrtf (144.0)). */ +#define SQRT_G 0x4062 /* FP16_C (__builtin_sqrtf (4.8)). */ +#define SQRT_H 0x4863 /* FP16_C (__builtin_sqrtf (77)). */ + +/* Expected results for vsqrt. 
*/ +VECT_VAR_DECL (expected_static, hfloat, 16, 4) [] + = { SQRT_A, SQRT_B, SQRT_C, SQRT_D }; + +VECT_VAR_DECL (expected_static, hfloat, 16, 8) [] + = { SQRT_A, SQRT_B, SQRT_C, SQRT_D, SQRT_E, SQRT_F, SQRT_G, SQRT_H }; + +void exec_vsqrt_f16 (void) +{ +#undef TEST_MSG +#define TEST_MSG "VSQRT (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 4); + VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A, B, C, D}; + VLOAD (vsrc, buf_src, , float, f, 16, 4); + DECL_VARIABLE (vector_res, float, 16, 4) + = vsqrt_f16 (VECT_VAR (vsrc, float, 16, 4)); + vst1_f16 (VECT_VAR (result, float, 16, 4), + VECT_VAR (vector_res, float, 16, 4)); + + CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VSQRTQ (FP16)" + clean_results (); + + DECL_VARIABLE(vsrc, float, 16, 8); + VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A, B, C, D, E, F, G, H}; + VLOAD (vsrc, buf_src, q, float, f, 16, 8); + DECL_VARIABLE (vector_res, float, 16, 8) + = vsqrtq_f16 (VECT_VAR (vsrc, float, 16, 8)); + vst1q_f16 (VECT_VAR (result, float, 16, 8), + VECT_VAR (vector_res, float, 16, 8)); + + CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, ""); +} + +int +main (void) +{ + exec_vsqrt_f16 (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsqrth_f16_1.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +#include <arm_fp16.h> + +/* Expected results (16-bit hexadecimal representation). */ +uint16_t expected[] = +{ + 0x0000 /* 0.000000 */, + 0x8000 /* -0.000000 */, + 0x3da8 /* 1.414062 */, + 0x3f0b /* 1.760742 */, + 0x4479 /* 4.472656 */, + 0x390f /* 0.632324 */, + 0x7e00 /* nan */, + 0x3c9d /* 1.153320 */, + 0x7e00 /* nan */, + 0x3874 /* 0.556641 */, + 0x38a2 /* 0.579102 */, + 0x39a8 /* 0.707031 */, + 0x3c00 /* 1.000000 */, + 0x433f /* 3.623047 */, + 0x7e00 /* nan */, + 0x4479 /* 4.472656 */, + 0x7c00 /* inf */, + 0x7e00 /* nan */ +}; + +#define TEST_MSG "VSQRTH_F16" +#define INSN_NAME vsqrth_f16 + +#define EXPECTED expected + +#define INPUT_TYPE float16_t +#define OUTPUT_TYPE float16_t +#define OUTPUT_TYPE_SIZE 16 + +/* Include the template for unary scalar operations. 
*/ +#include "unary_scalar_op.inc" --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c @@ -163,14 +163,14 @@ void vsri_extra(void) CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT); } --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst2_lane_f16 (float16_t * p, float16x4x2_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst2q_lane_f16 (float16_t * p, float16x8x2_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst3_lane_f16 (float16_t * p, float16x4x3_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst3q_lane_f16 (float16_t * p, float16x8x3_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_f16_indices_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst4_lane_f16 (float16_t * p, float16x4x4_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_f16_indices_1.c +++ 
b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_f16_indices_1.c @@ -2,6 +2,7 @@ /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */ void f_vst4q_lane_f16 (float16_t * p, float16x8x4_t v) --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c +++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c @@ -14,6 +14,7 @@ VECT_VAR_DECL(expected_st2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st2_0,poly,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st2_0,int,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -24,6 +25,8 @@ VECT_VAR_DECL(expected_st2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, 0x0, 0x0 }; @@ -39,6 +42,7 @@ VECT_VAR_DECL(expected_st2_1,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -48,6 +52,8 @@ VECT_VAR_DECL(expected_st2_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st2_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st2_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results for vst3, chunk 0. 
*/ @@ -62,6 +68,7 @@ VECT_VAR_DECL(expected_st3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0 }; +VECT_VAR_DECL(expected_st3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0 }; VECT_VAR_DECL(expected_st3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st3_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -73,6 +80,8 @@ VECT_VAR_DECL(expected_st3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1, 0xfffffff2, 0x0 }; VECT_VAR_DECL(expected_st3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, 0xc1600000, 0x0 }; @@ -88,6 +97,7 @@ VECT_VAR_DECL(expected_st3_1,uint,32,2) [] = { 0xfffffff2, 0x0 }; VECT_VAR_DECL(expected_st3_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,hfloat,32,2) [] = { 0xc1600000, 0x0 }; VECT_VAR_DECL(expected_st3_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -97,6 +107,8 @@ VECT_VAR_DECL(expected_st3_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st3_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results for vst3, chunk 2. */ @@ -111,6 +123,7 @@ VECT_VAR_DECL(expected_st3_2,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -120,6 +133,8 @@ VECT_VAR_DECL(expected_st3_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st3_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st3_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results for vst4, chunk 0. 
*/ @@ -134,6 +149,7 @@ VECT_VAR_DECL(expected_st4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 }; VECT_VAR_DECL(expected_st4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 }; +VECT_VAR_DECL(expected_st4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 }; VECT_VAR_DECL(expected_st4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 }; VECT_VAR_DECL(expected_st4_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0x0, 0x0, 0x0, 0x0 }; @@ -145,6 +161,8 @@ VECT_VAR_DECL(expected_st4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1, 0xfffffff2, 0xfffffff3 }; VECT_VAR_DECL(expected_st4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000, 0xc1600000, 0xc1500000 }; @@ -160,6 +178,7 @@ VECT_VAR_DECL(expected_st4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 }; VECT_VAR_DECL(expected_st4_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 }; VECT_VAR_DECL(expected_st4_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -169,6 +188,8 @@ VECT_VAR_DECL(expected_st4_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st4_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results for vst4, chunk 2. */ @@ -183,6 +204,7 @@ VECT_VAR_DECL(expected_st4_2,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -192,6 +214,8 @@ VECT_VAR_DECL(expected_st4_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st4_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Expected results for vst4, chunk 3. 
*/ @@ -206,6 +230,7 @@ VECT_VAR_DECL(expected_st4_3,uint,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_3,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,hfloat,32,2) [] = { 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; @@ -215,6 +240,8 @@ VECT_VAR_DECL(expected_st4_3,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0, VECT_VAR_DECL(expected_st4_3,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_3,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; VECT_VAR_DECL(expected_st4_3,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 }; /* Declare additional input buffers as needed. */ @@ -229,6 +256,9 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2); VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2); +#endif VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2); /* Input buffers for vld3_lane. */ @@ -242,6 +272,9 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3); VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3); +#endif VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3); /* Input buffers for vld4_lane. */ @@ -255,6 +288,9 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4); VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4); +#endif VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4); void exec_vstX_lane (void) @@ -302,7 +338,7 @@ void exec_vstX_lane (void) /* We need all variants in 64 bits, but there is no 64x2 variant, nor 128 bits vectors of int8/uint8/poly8. */ -#define DECL_ALL_VSTX_LANE(X) \ +#define DECL_ALL_VSTX_LANE_NO_FP16(X) \ DECL_VSTX_LANE(int, 8, 8, X); \ DECL_VSTX_LANE(int, 16, 4, X); \ DECL_VSTX_LANE(int, 32, 2, X); \ @@ -319,11 +355,20 @@ void exec_vstX_lane (void) DECL_VSTX_LANE(poly, 16, 8, X); \ DECL_VSTX_LANE(float, 32, 4, X) +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#define DECL_ALL_VSTX_LANE(X) \ + DECL_ALL_VSTX_LANE_NO_FP16(X); \ + DECL_VSTX_LANE(float, 16, 4, X); \ + DECL_VSTX_LANE(float, 16, 8, X) +#else +#define DECL_ALL_VSTX_LANE(X) DECL_ALL_VSTX_LANE_NO_FP16(X) +#endif + #define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L] /* Use the same lanes regardless of the size of the array (X), for simplicity. 
*/ -#define TEST_ALL_VSTX_LANE(X) \ +#define TEST_ALL_VSTX_LANE_NO_FP16(X) \ TEST_VSTX_LANE(, int, s, 8, 8, X, 7); \ TEST_VSTX_LANE(, int, s, 16, 4, X, 2); \ TEST_VSTX_LANE(, int, s, 32, 2, X, 0); \ @@ -340,7 +385,16 @@ void exec_vstX_lane (void) TEST_VSTX_LANE(q, poly, p, 16, 8, X, 5); \ TEST_VSTX_LANE(q, float, f, 32, 4, X, 2) -#define TEST_ALL_EXTRA_CHUNKS(X, Y) \ +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#define TEST_ALL_VSTX_LANE(X) \ + TEST_ALL_VSTX_LANE_NO_FP16(X); \ + TEST_VSTX_LANE(, float, f, 16, 4, X, 2); \ + TEST_VSTX_LANE(q, float, f, 16, 8, X, 6) +#else +#define TEST_ALL_VSTX_LANE(X) TEST_ALL_VSTX_LANE_NO_FP16(X) +#endif + +#define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y) \ TEST_EXTRA_CHUNK(int, 8, 8, X, Y); \ TEST_EXTRA_CHUNK(int, 16, 4, X, Y); \ TEST_EXTRA_CHUNK(int, 32, 2, X, Y); \ @@ -357,6 +411,15 @@ void exec_vstX_lane (void) TEST_EXTRA_CHUNK(poly, 16, 8, X, Y); \ TEST_EXTRA_CHUNK(float, 32, 4, X, Y) +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#define TEST_ALL_EXTRA_CHUNKS(X,Y) \ + TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y); \ + TEST_EXTRA_CHUNK(float, 16, 4, X, Y); \ + TEST_EXTRA_CHUNK(float, 16, 8, X, Y) +#else +#define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y) +#endif + /* Declare the temporary buffers / variables. */ DECL_ALL_VSTX_LANE(2); DECL_ALL_VSTX_LANE(3); @@ -371,12 +434,18 @@ void exec_vstX_lane (void) DUMMY_ARRAY(buffer_src, uint, 32, 2, 4); DUMMY_ARRAY(buffer_src, poly, 8, 8, 4); DUMMY_ARRAY(buffer_src, poly, 16, 4, 4); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + DUMMY_ARRAY(buffer_src, float, 16, 4, 4); +#endif DUMMY_ARRAY(buffer_src, float, 32, 2, 4); DUMMY_ARRAY(buffer_src, int, 16, 8, 4); DUMMY_ARRAY(buffer_src, int, 32, 4, 4); DUMMY_ARRAY(buffer_src, uint, 16, 8, 4); DUMMY_ARRAY(buffer_src, uint, 32, 4, 4); DUMMY_ARRAY(buffer_src, poly, 16, 8, 4); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + DUMMY_ARRAY(buffer_src, float, 16, 8, 4); +#endif DUMMY_ARRAY(buffer_src, float, 32, 4, 4); /* Check vst2_lane/vst2q_lane. 
*/ @@ -391,15 +460,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_0, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_0, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_0, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_0, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_0, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_0, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_0, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_0, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_0, CMT); +#endif TEST_ALL_EXTRA_CHUNKS(2, 1); #undef CMT @@ -410,15 +483,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_1, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_1, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_1, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_1, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_1, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_1, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_1, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_1, CMT); +#endif /* Check vst3_lane/vst3q_lane. 
*/ @@ -435,15 +512,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_0, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_0, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_0, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_0, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_0, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_0, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_0, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_0, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_0, CMT); +#endif TEST_ALL_EXTRA_CHUNKS(3, 1); @@ -455,15 +536,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_1, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_1, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_1, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_1, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_1, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_1, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_1, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_1, CMT); +#endif TEST_ALL_EXTRA_CHUNKS(3, 2); @@ -475,15 +560,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_2, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_2, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_2, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_2, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_2, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_2, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_2, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_2, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_2, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_2, CMT); + CHECK_FP(TEST_MSG, 
float, 16, 8, PRIx16, expected_st3_2, CMT); +#endif /* Check vst4_lane/vst4q_lane. */ @@ -500,15 +589,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_0, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_0, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_0, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_0, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_0, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_0, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_0, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_0, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_0, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_0, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_0, CMT); +#endif TEST_ALL_EXTRA_CHUNKS(4, 1); @@ -520,15 +613,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_1, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_1, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_1, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_1, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_1, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_1, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_1, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_1, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_1, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_1, CMT); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_1, CMT); +#endif TEST_ALL_EXTRA_CHUNKS(4, 2); @@ -540,15 +637,19 @@ void exec_vstX_lane (void) CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_2, CMT); CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_2, CMT); CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_2, CMT); - CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT); - CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT); + CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT); CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_2, CMT); CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_2, CMT); CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_2, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_2, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_2, CMT); - CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT); + CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT); CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_2, CMT); +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) + 
CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_2, CMT);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_2, CMT);
+#endif
 
   TEST_ALL_EXTRA_CHUNKS(4, 3);
 
@@ -560,15 +661,19 @@ void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_3, CMT);
   CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_3, CMT);
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_3, CMT);
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
-  CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
+  CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_3, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_3, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_3, CMT);
   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_3, CMT);
   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_3, CMT);
-  CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
+  CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_3, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_3, CMT);
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_3, CMT);
+#endif
 }
 
 int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
@@ -44,6 +44,14 @@ VECT_VAR_DECL(expected,uint,64,2) [] = { 0xffffffffffffffed,
 VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0xc00ccccd, 0xc00ccccd };
 VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0xc00ccccc, 0xc00ccccc,
                                                    0xc00ccccc, 0xc00ccccc };
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0xc066, 0xc066,
+                                                      0xc066, 0xc066 };
+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0xc067, 0xc067,
+                                                      0xc067, 0xc067,
+                                                      0xc067, 0xc067,
+                                                      0xc067, 0xc067 };
+#endif
 
 void exec_vsub_f32(void)
 {
@@ -67,4 +75,27 @@ void exec_vsub_f32(void)
 
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
+
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector, float, 16, 4);
+  DECL_VARIABLE(vector, float, 16, 8);
+
+  DECL_VARIABLE(vector2, float, 16, 4);
+  DECL_VARIABLE(vector2, float, 16, 8);
+
+  DECL_VARIABLE(vector_res, float, 16, 4);
+  DECL_VARIABLE(vector_res, float, 16, 8);
+
+  VDUP(vector, , float, f, 16, 4, 2.3f);
+  VDUP(vector, q, float, f, 16, 8, 3.4f);
+
+  VDUP(vector2, , float, f, 16, 4, 4.5f);
+  VDUP(vector2, q, float, f, 16, 8, 5.6f);
+
+  TEST_BINARY_OP(INSN_NAME, , float, f, 16, 4);
+  TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
+
+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
+#endif
 }
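# Annotation, not part of the patch: the FP16 "expected" constants in the
# vsub.c hunk above and in vsubh_f16_1.c below are raw IEEE binary16 bit
# patterns.  The standalone sketch below (helper name and driver are ours)
# encodes a float as binary16 with round-to-nearest-even and reproduces a few
# of them, e.g. 2.3f - 4.5f -> 0xc066 and 14.6f -> 0x4b4d (the value VDUPed
# into vector2 by the vtrn/vuzp/vzip tests further down).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Convert a binary32 float to its IEEE binary16 bit pattern using
   round-to-nearest-even.  Results that would be binary16 subnormals are
   flushed to signed zero and NaNs are not distinguished from infinities;
   neither case occurs in the constants checked here.  */
static uint16_t
float_to_half_bits (float f)
{
  uint32_t x;
  memcpy (&x, &f, sizeof x);                  /* Raw binary32 bits.  */
  uint32_t sign = (x >> 16) & 0x8000;         /* Sign moves to bit 15.  */
  int32_t exp = (int32_t) ((x >> 23) & 0xff) - 127 + 15;  /* Rebias 8->5.  */
  uint32_t man = x & 0x7fffff;                /* 23 mantissa bits.  */

  if (exp >= 31)
    return sign | 0x7c00;                     /* Overflow or inf -> inf.  */
  if (exp <= 0)
    return sign;                              /* Would be subnormal.  */

  uint16_t half = sign | (exp << 10) | (man >> 13);
  uint32_t rest = man & 0x1fff;               /* The 13 dropped bits.  */
  if (rest > 0x1000 || (rest == 0x1000 && (half & 1)))
    half++;            /* Round up; a carry into the exponent is correct.  */
  return half;
}

int
main (void)
{
  printf ("%#06x\n", (unsigned) float_to_half_bits (2.3f - 4.5f)); /* 0xc066 */
  printf ("%#06x\n", (unsigned) float_to_half_bits (14.6f));       /* 0x4b4d */
  printf ("%#06x\n", (unsigned) float_to_half_bits (-16.0f));      /* 0xcc00 */
  return 0;
}

# The 128-bit vsub case expects 0xc067 rather than 0xc066 because the inputs
# are first rounded to binary16 (3.4 -> 3.400390625, 5.6 -> 5.6015625), and
# their exact difference, -2.201171875, encodes as 0xc067.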
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubh_f16_1.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
+/* { dg-add-options arm_v8_2a_fp16_scalar }  */
+
+#include <arm_fp16.h>
+
+#define INFF __builtin_inf ()
+
+/* Expected results (16-bit hexadecimal representation).  */
+uint16_t expected[] =
+{
+  0xbc00 /* -1.000000 */,
+  0xbc00 /* -1.000000 */,
+  0x4654 /* 6.328125 */,
+  0xd60e /* -96.875000 */,
+  0xc900 /* -10.000000 */,
+  0x36b8 /* 0.419922 */,
+  0xc19a /* -2.800781 */,
+  0x4848 /* 8.562500 */,
+  0xbd34 /* -1.300781 */,
+  0xccec /* -19.687500 */,
+  0x4791 /* 7.566406 */,
+  0xbf34 /* -1.800781 */,
+  0x484d /* 8.601562 */,
+  0x4804 /* 8.031250 */,
+  0xc69c /* -6.609375 */,
+  0x4ceb /* 19.671875 */,
+  0x7c00 /* inf */,
+  0xfc00 /* -inf */
+};
+
+#define TEST_MSG "VSUB_F16"
+#define INSN_NAME vsubh_f16
+
+#define EXPECTED expected
+
+#define INPUT_TYPE float16_t
+#define OUTPUT_TYPE float16_t
+#define OUTPUT_TYPE_SIZE 16
+
+/* Include the template for binary scalar operations.  */
+#include "binary_scalar_op.inc"
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
@@ -167,7 +167,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl1, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl1, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
 
   /* Check vtbl2.  */
   clean_results ();
@@ -177,7 +177,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl2, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl2, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
 
   /* Check vtbl3.  */
   clean_results ();
@@ -187,7 +187,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl3, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl3, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
 
   /* Check vtbl4.  */
   clean_results ();
@@ -197,7 +197,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl4, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl4, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
 
   /* Now test VTBX.  */
@@ -249,7 +249,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx1, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx1, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
 
   /* Check vtbx2.  */
   clean_results ();
@@ -259,7 +259,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx2, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx2, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
 
   /* Check vtbx3.  */
   clean_results ();
@@ -269,7 +269,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx3, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx3, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
 
   /* Check vtbx4.  */
   clean_results ();
@@ -279,7 +279,7 @@ void exec_vtbX (void)
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx4, "");
   CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx4, "");
-  CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
+  CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
 }
 
 int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
@@ -15,6 +15,10 @@ VECT_VAR_DECL(expected0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0x55, 0x55,
                                          0xf2, 0xf3, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0x11, 0x11,
                                          0xf2, 0xf3, 0x11, 0x11,
@@ -36,6 +40,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf1, 0x55, 0x55,
                                           0xf6, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x66, 0x66,
                                           0xfff2, 0xfff3, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
+                                                0x4b4d, 0x4b4d,
+                                                0xcb00, 0xca80,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
                                             0x42073333, 0x42073333 };
 
@@ -51,6 +61,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf4, 0xf5, 0x55, 0x55,
                                          0xf6, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb00, 0xca80,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf8, 0xf9, 0x11, 0x11,
                                          0xfa, 0xfb, 0x11, 0x11,
@@ -72,6 +86,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf8, 0xf9, 0x55, 0x55,
                                           0xfe, 0xff, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff4, 0xfff5, 0x66, 0x66,
                                           0xfff6, 0xfff7, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xca00, 0xc980,
+                                                0x4b4d, 0x4b4d,
+                                                0xc900, 0xc880,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1600000, 0xc1500000,
                                             0x42073333, 0x42073333 };
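# Annotation, not part of the patch: vtrn_half.c below exercises the AArch64
# vtrn1/vtrn2 intrinsics.  TRN1 interleaves the even-indexed elements of its
# two inputs, TRN2 the odd-indexed ones, which is exactly the shape of the
# expected/expected2 tables.  A scalar model of the int16x4 case (names and
# driver are ours):

#include <stdio.h>

/* Scalar model of the AArch64 TRN1/TRN2 permutes on n-element vectors.  */
static void
trn (const int *a, const int *b, int *out1, int *out2, int n)
{
  for (int i = 0; i < n / 2; i++)
    {
      out1[2 * i]     = a[2 * i];       /* TRN1: a0, b0, a2, b2, ...  */
      out1[2 * i + 1] = b[2 * i];
      out2[2 * i]     = a[2 * i + 1];   /* TRN2: a1, b1, a3, b3, ...  */
      out2[2 * i + 1] = b[2 * i + 1];
    }
}

int
main (void)
{
  /* Mirrors vtrn_half.c's int16x4 case: 'vector' is loaded with
     0xfff0..0xfff3 and 'vector2' is VDUPed to 0x22.  */
  int a[4] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
  int b[4] = { 0x22, 0x22, 0x22, 0x22 };
  int t1[4], t2[4];

  trn (a, b, t1, t2, 4);
  for (int i = 0; i < 4; i++)
    printf ("%#x ", t1[i]);     /* 0xfff0 0x22 0xfff2 0x22 -> expected  */
  printf ("\n");
  for (int i = 0; i < 4; i++)
    printf ("%#x ", t2[i]);     /* 0xfff1 0x22 0xfff3 0x22 -> expected2  */
  printf ("\n");
  return 0;
}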
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c
@@ -0,0 +1,263 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0x11, 0xf2, 0x11,
+                                       0xf4, 0x11, 0xf6, 0x11 };
+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0x22, 0xfff2, 0x22 };
+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0x55, 0xf2, 0x55,
+                                        0xf4, 0x55, 0xf6, 0x55 };
+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0x66, 0xfff2, 0x66 };
+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf2, 0x55,
+                                        0xf4, 0x55, 0xf6, 0x55 };
+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff2, 0x66 };
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d,
+                                               0xcb00, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0x11, 0xf2, 0x11,
+                                        0xf4, 0x11, 0xf6, 0x11,
+                                        0xf8, 0x11, 0xfa, 0x11,
+                                        0xfc, 0x11, 0xfe, 0x11 };
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0x22, 0xfff2, 0x22,
+                                        0xfff4, 0x22, 0xfff6, 0x22 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0x33,
+                                        0xfffffff2, 0x33 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
+                                        0x44 };
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0x55, 0xf2, 0x55,
+                                         0xf4, 0x55, 0xf6, 0x55,
+                                         0xf8, 0x55, 0xfa, 0x55,
+                                         0xfc, 0x55, 0xfe, 0x55 };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0x66, 0xfff2, 0x66,
+                                         0xfff4, 0x66, 0xfff6, 0x66 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0x77,
+                                         0xfffffff2, 0x77 };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
+                                         0x88 };
+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf2, 0x55,
+                                         0xf4, 0x55, 0xf6, 0x55,
+                                         0xf8, 0x55, 0xfa, 0x55,
+                                         0xfc, 0x55, 0xfe, 0x55 };
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0x66, 0xfff2, 0x66,
+                                         0xfff4, 0x66, 0xfff6, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d,
+                                               0xcb00, 0x4b4d,
+                                               0xca00, 0x4b4d,
+                                               0xc900, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0x42073333,
+                                           0xc1600000, 0x42073333 };
+
+#define TEST_MSG "VTRN1"
+void exec_vtrn_half (void)
+{
+#define TEST_VTRN(PART, Q, T1, T2, W, N) \
+  VECT_VAR(vector_res, T1, W, N) = \
+    vtrn##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
+                            VECT_VAR(vector2, T1, W, N)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define TEST_VTRN1(Q, T1, T2, W, N) TEST_VTRN(1, Q, T1, T2, W, N)
+
+  /* Input vector can only have 64 bits.  */
+  DECL_VARIABLE_ALL_VARIANTS(vector);
+  DECL_VARIABLE_ALL_VARIANTS(vector2);
+  DECL_VARIABLE(vector, float, 64, 2);
+  DECL_VARIABLE(vector2, float, 64, 2);
+
+  DECL_VARIABLE_ALL_VARIANTS(vector_res);
+  DECL_VARIABLE(vector_res, float, 64, 2);
+
+  clean_results ();
+  /* We don't have vtrn1_T64x1, so set expected to the clean value.  */
+  CLEAN(expected, int, 64, 1);
+  CLEAN(expected, uint, 64, 1);
+
+  TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (FP16_SUPPORTED)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vector, buffer, , float, f, 32, 2);
+  VLOAD(vector, buffer, q, float, f, 32, 4);
+  VLOAD(vector, buffer, q, float, f, 64, 2);
+
+  /* Choose arbitrary initialization values.
*/ + VDUP(vector2, , int, s, 8, 8, 0x11); + VDUP(vector2, , int, s, 16, 4, 0x22); + VDUP(vector2, , int, s, 32, 2, 0x33); + VDUP(vector2, , uint, u, 8, 8, 0x55); + VDUP(vector2, , uint, u, 16, 4, 0x66); + VDUP(vector2, , uint, u, 32, 2, 0x77); + VDUP(vector2, , poly, p, 8, 8, 0x55); + VDUP(vector2, , poly, p, 16, 4, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ +#endif + VDUP(vector2, , float, f, 32, 2, 33.6f); + + VDUP(vector2, q, int, s, 8, 16, 0x11); + VDUP(vector2, q, int, s, 16, 8, 0x22); + VDUP(vector2, q, int, s, 32, 4, 0x33); + VDUP(vector2, q, int, s, 64, 2, 0x44); + VDUP(vector2, q, uint, u, 8, 16, 0x55); + VDUP(vector2, q, uint, u, 16, 8, 0x66); + VDUP(vector2, q, uint, u, 32, 4, 0x77); + VDUP(vector2, q, uint, u, 64, 2, 0x88); + VDUP(vector2, q, poly, p, 8, 16, 0x55); + VDUP(vector2, q, poly, p, 16, 8, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, q, float, f, 16, 8, 14.6f); +#endif + VDUP(vector2, q, float, f, 32, 4, 33.8f); + VDUP(vector2, q, float, f, 64, 2, 33.8f); + + TEST_VTRN1(, int, s, 8, 8); + TEST_VTRN1(, int, s, 16, 4); + TEST_VTRN1(, int, s, 32, 2); + TEST_VTRN1(, uint, u, 8, 8); + TEST_VTRN1(, uint, u, 16, 4); + TEST_VTRN1(, uint, u, 32, 2); + TEST_VTRN1(, poly, p, 8, 8); + TEST_VTRN1(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VTRN1(, float, f, 16, 4); +#endif + TEST_VTRN1(, float, f, 32, 2); + + TEST_VTRN1(q, int, s, 8, 16); + TEST_VTRN1(q, int, s, 16, 8); + TEST_VTRN1(q, int, s, 32, 4); + TEST_VTRN1(q, int, s, 64, 2); + TEST_VTRN1(q, uint, u, 8, 16); + TEST_VTRN1(q, uint, u, 16, 8); + TEST_VTRN1(q, uint, u, 32, 4); + TEST_VTRN1(q, uint, u, 64, 2); + TEST_VTRN1(q, poly, p, 8, 16); + TEST_VTRN1(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VTRN1(q, float, f, 16, 8); +#endif + TEST_VTRN1(q, float, f, 32, 4); + TEST_VTRN1(q, float, f, 64, 2); + +#if defined (FP16_SUPPORTED) + CHECK_RESULTS (TEST_MSG, ""); +#else + CHECK_RESULTS_NO_FP16 (TEST_MSG, ""); +#endif + +#undef TEST_MSG +#define TEST_MSG "VTRN2" + +#define TEST_VTRN2(Q, T1, T2, W, N) TEST_VTRN(2, Q, T1, T2, W, N) + +/* Expected results. 
*/ +VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf1, 0x11, 0xf3, 0x11, + 0xf5, 0x11, 0xf7, 0x11 }; +VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff1, 0x22, 0xfff3, 0x22 }; +VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 }; +VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf1, 0x55, 0xf3, 0x55, + 0xf5, 0x55, 0xf7, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff1, 0x66, 0xfff3, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0x55, 0xf3, 0x55, + 0xf5, 0x55, 0xf7, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0x66, 0xfff3, 0x66 }; +VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0x4b4d, + 0xca80, 0x4b4d }; +#endif +VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf1, 0x11, 0xf3, 0x11, + 0xf5, 0x11, 0xf7, 0x11, + 0xf9, 0x11, 0xfb, 0x11, + 0xfd, 0x11, 0xff, 0x11 }; +VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff1, 0x22, 0xfff3, 0x22, + 0xfff5, 0x22, 0xfff7, 0x22 }; +VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0x33, + 0xfffffff3, 0x33 }; +VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1, + 0x44 }; +VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf1, 0x55, 0xf3, 0x55, + 0xf5, 0x55, 0xf7, 0x55, + 0xf9, 0x55, 0xfb, 0x55, + 0xfd, 0x55, 0xff, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0x66, 0xfff3, 0x66, + 0xfff5, 0x66, 0xfff7, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff1, 0x77, + 0xfffffff3, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1, + 0x88 }; +VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0x55, 0xf3, 0x55, + 0xf5, 0x55, 0xf7, 0x55, + 0xf9, 0x55, 0xfb, 0x55, + 0xfd, 0x55, 0xff, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0x66, 0xfff3, 0x66, + 0xfff5, 0x66, 0xfff7, 0x66 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0x4b4d, + 0xca80, 0x4b4d, + 0xc980, 0x4b4d, + 0xc880, 0x4b4d }; +#endif +VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0x42073333, + 0xc1500000, 0x42073333 }; + clean_results (); + CLEAN(expected2, int, 64, 1); + CLEAN(expected2, uint, 64, 1); + + TEST_VTRN2(, int, s, 8, 8); + TEST_VTRN2(, int, s, 16, 4); + TEST_VTRN2(, int, s, 32, 2); + TEST_VTRN2(, uint, u, 8, 8); + TEST_VTRN2(, uint, u, 16, 4); + TEST_VTRN2(, uint, u, 32, 2); + TEST_VTRN2(, poly, p, 8, 8); + TEST_VTRN2(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VTRN2(, float, f, 16, 4); +#endif + TEST_VTRN2(, float, f, 32, 2); + + TEST_VTRN2(q, int, s, 8, 16); + TEST_VTRN2(q, int, s, 16, 8); + TEST_VTRN2(q, int, s, 32, 4); + TEST_VTRN2(q, int, s, 64, 2); + TEST_VTRN2(q, uint, u, 8, 16); + TEST_VTRN2(q, uint, u, 16, 8); + TEST_VTRN2(q, uint, u, 32, 4); + TEST_VTRN2(q, uint, u, 64, 2); + TEST_VTRN2(q, poly, p, 8, 16); + TEST_VTRN2(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VTRN2(q, float, f, 16, 8); +#endif + TEST_VTRN2(q, float, f, 32, 4); + TEST_VTRN2(q, float, f, 64, 2); + + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); +#if defined (FP16_SUPPORTED) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, ""); +#endif +} + +int main (void) +{ + exec_vtrn_half (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c +++ 
b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c @@ -32,10 +32,21 @@ VECT_VAR_DECL(expected_unsigned,uint,16,8) [] = { 0x0, 0xffff, VECT_VAR_DECL(expected_unsigned,uint,32,4) [] = { 0x0, 0xffffffff, 0x0, 0xffffffff }; -#ifndef INSN_NAME +/* Expected results with poly input. */ +VECT_VAR_DECL(expected_poly,uint,8,8) [] = { 0x0, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }; +VECT_VAR_DECL(expected_poly,uint,8,16) [] = { 0x0, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }; +VECT_VAR_DECL(expected_poly,uint,16,4) [] = { 0x0, 0xffff, 0x0, 0xffff }; +VECT_VAR_DECL(expected_poly,uint,16,8) [] = { 0x0, 0xffff, + 0x0, 0xffff, + 0xffff, 0xffff, + 0xffff, 0xffff }; + #define INSN_NAME vtst #define TEST_MSG "VTST/VTSTQ" -#endif /* We can't use the standard ref_v_binary_op.c template because vtst has no 64 bits variant, and outputs are always of uint type. */ @@ -73,12 +84,16 @@ FNNAME (INSN_NAME) VDUP(vector2, , uint, u, 8, 8, 15); VDUP(vector2, , uint, u, 16, 4, 5); VDUP(vector2, , uint, u, 32, 2, 1); + VDUP(vector2, , poly, p, 8, 8, 15); + VDUP(vector2, , poly, p, 16, 4, 5); VDUP(vector2, q, int, s, 8, 16, 15); VDUP(vector2, q, int, s, 16, 8, 5); VDUP(vector2, q, int, s, 32, 4, 1); VDUP(vector2, q, uint, u, 8, 16, 15); VDUP(vector2, q, uint, u, 16, 8, 5); VDUP(vector2, q, uint, u, 32, 4, 1); + VDUP(vector2, q, poly, p, 8, 16, 15); + VDUP(vector2, q, poly, p, 16, 8, 5); #define TEST_MACRO_NO64BIT_VARIANT_1_5(MACRO, VAR, T1, T2) \ MACRO(VAR, , T1, T2, 8, 8); \ @@ -111,6 +126,18 @@ FNNAME (INSN_NAME) CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_unsigned, CMT); CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_unsigned, CMT); CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_unsigned, CMT); + + /* Now, test the variants with poly8 and poly16 as input. 
*/
+#undef CMT
+#define CMT " (poly input)"
+  TEST_BINARY_OP(INSN_NAME, , poly, p, 8, 8);
+  TEST_BINARY_OP(INSN_NAME, , poly, p, 16, 4);
+  TEST_BINARY_OP(INSN_NAME, q, poly, p, 8, 16);
+  TEST_BINARY_OP(INSN_NAME, q, poly, p, 16, 8);
+  CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_poly, CMT);
+  CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_poly, CMT);
+  CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_poly, CMT);
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_poly, CMT);
 }
 
 int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
@@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
                                          0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1,
                                           0xfff2, 0xfff3 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
+                                                0xcb00, 0xca80 };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
                                          0xf4, 0xf5, 0xf6, 0xf7,
@@ -48,6 +52,12 @@ VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
                                           0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
+                                                0xcb00, 0xca80,
+                                                0xca00, 0xc980,
+                                                0xc900, 0xc880 };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
                                             0xc1600000, 0xc1500000 };
 
@@ -63,6 +73,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0x55, 0x55, 0x55, 0x55,
                                          0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0x66, 0x66, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0x4b4d, 0x4b4d,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0x11, 0x11, 0x11, 0x11,
                                          0x11, 0x11, 0x11, 0x11,
@@ -84,6 +98,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0x55, 0x55, 0x55, 0x55,
                                           0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0x66, 0x66, 0x66, 0x66,
                                           0x66, 0x66, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0x4b4d, 0x4b4d,
+                                                0x4b4d, 0x4b4d,
+                                                0x4b4d, 0x4b4d,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0x42073333, 0x42073333,
                                             0x42073333, 0x42073333 };
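# Annotation, not part of the patch: vuzp_half.c below exercises the AArch64
# vuzp1/vuzp2 intrinsics.  UZP1 concatenates the even-indexed elements of the
# first input and then of the second; UZP2 does the same with the odd-indexed
# elements.  A scalar model of the int16x4 case (names and driver are ours):

#include <stdio.h>

/* Scalar model of the AArch64 UZP1/UZP2 de-interleave on n elements.  */
static void
uzp (const int *a, const int *b, int *out1, int *out2, int n)
{
  for (int i = 0; i < n / 2; i++)
    {
      out1[i]         = a[2 * i];       /* UZP1: evens of a, then of b.  */
      out1[n / 2 + i] = b[2 * i];
      out2[i]         = a[2 * i + 1];   /* UZP2: odds of a, then of b.  */
      out2[n / 2 + i] = b[2 * i + 1];
    }
}

int
main (void)
{
  /* Mirrors vuzp_half.c's int16x4 case: buffer 0xfff0.. vs VDUPed 0x22.  */
  int a[4] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
  int b[4] = { 0x22, 0x22, 0x22, 0x22 };
  int u1[4], u2[4];

  uzp (a, b, u1, u2, 4);
  for (int i = 0; i < 4; i++)
    printf ("%#x ", u1[i]);     /* 0xfff0 0xfff2 0x22 0x22 -> expected  */
  printf ("\n");
  for (int i = 0; i < 4; i++)
    printf ("%#x ", u2[i]);     /* 0xfff1 0xfff3 0x22 0x22 -> expected2  */
  printf ("\n");
  return 0;
}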
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c
@@ -0,0 +1,259 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                       0x11, 0x11, 0x11, 0x11 };
+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0xfff2, 0x22, 0x22 };
+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                        0x55, 0x55, 0x55, 0x55 };
+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 };
+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                        0x55, 0x55, 0x55, 0x55 };
+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 };
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb00,
+                                               0x4b4d, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                        0xf8, 0xfa, 0xfc, 0xfe,
+                                        0x11, 0x11, 0x11, 0x11,
+                                        0x11, 0x11, 0x11, 0x11 };
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
+                                        0x22, 0x22, 0x22, 0x22 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0xfffffff2,
+                                        0x33, 0x33 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
+                                        0x44 };
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                         0xf8, 0xfa, 0xfc, 0xfe,
+                                         0x55, 0x55, 0x55, 0x55,
+                                         0x55, 0x55, 0x55, 0x55 };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
+                                         0x66, 0x66, 0x66, 0x66 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff2, 0x77, 0x77 };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
+                                         0x88 };
+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
+                                         0xf8, 0xfa, 0xfc, 0xfe,
+                                         0x55, 0x55, 0x55, 0x55,
+                                         0x55, 0x55, 0x55, 0x55 };
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
+                                         0x66, 0x66, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb00, 0xca00, 0xc900,
+                                               0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1600000,
+                                           0x42073333, 0x42073333 };
+
+#define TEST_MSG "VUZP1"
+void exec_vuzp_half (void)
+{
+#define TEST_VUZP(PART, Q, T1, T2, W, N) \
+  VECT_VAR(vector_res, T1, W, N) = \
+    vuzp##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
+                            VECT_VAR(vector2, T1, W, N)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define TEST_VUZP1(Q, T1, T2, W, N) TEST_VUZP(1, Q, T1, T2, W, N)
+
+  /* Input vector can only have 64 bits.  */
+  DECL_VARIABLE_ALL_VARIANTS(vector);
+  DECL_VARIABLE_ALL_VARIANTS(vector2);
+  DECL_VARIABLE(vector, float, 64, 2);
+  DECL_VARIABLE(vector2, float, 64, 2);
+
+  DECL_VARIABLE_ALL_VARIANTS(vector_res);
+  DECL_VARIABLE(vector_res, float, 64, 2);
+
+  clean_results ();
+  /* We don't have vuzp1_T64x1, so set expected to the clean value.  */
+  CLEAN(expected, int, 64, 1);
+  CLEAN(expected, uint, 64, 1);
+
+  TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (FP16_SUPPORTED)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vector, buffer, , float, f, 32, 2);
+  VLOAD(vector, buffer, q, float, f, 32, 4);
+  VLOAD(vector, buffer, q, float, f, 64, 2);
+
+  /* Choose arbitrary initialization values.
*/ + VDUP(vector2, , int, s, 8, 8, 0x11); + VDUP(vector2, , int, s, 16, 4, 0x22); + VDUP(vector2, , int, s, 32, 2, 0x33); + VDUP(vector2, , uint, u, 8, 8, 0x55); + VDUP(vector2, , uint, u, 16, 4, 0x66); + VDUP(vector2, , uint, u, 32, 2, 0x77); + VDUP(vector2, , poly, p, 8, 8, 0x55); + VDUP(vector2, , poly, p, 16, 4, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ +#endif + VDUP(vector2, , float, f, 32, 2, 33.6f); + + VDUP(vector2, q, int, s, 8, 16, 0x11); + VDUP(vector2, q, int, s, 16, 8, 0x22); + VDUP(vector2, q, int, s, 32, 4, 0x33); + VDUP(vector2, q, int, s, 64, 2, 0x44); + VDUP(vector2, q, uint, u, 8, 16, 0x55); + VDUP(vector2, q, uint, u, 16, 8, 0x66); + VDUP(vector2, q, uint, u, 32, 4, 0x77); + VDUP(vector2, q, uint, u, 64, 2, 0x88); + VDUP(vector2, q, poly, p, 8, 16, 0x55); + VDUP(vector2, q, poly, p, 16, 8, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, q, float, f, 16, 8, 14.6f); +#endif + VDUP(vector2, q, float, f, 32, 4, 33.8f); + VDUP(vector2, q, float, f, 64, 2, 33.8f); + + TEST_VUZP1(, int, s, 8, 8); + TEST_VUZP1(, int, s, 16, 4); + TEST_VUZP1(, int, s, 32, 2); + TEST_VUZP1(, uint, u, 8, 8); + TEST_VUZP1(, uint, u, 16, 4); + TEST_VUZP1(, uint, u, 32, 2); + TEST_VUZP1(, poly, p, 8, 8); + TEST_VUZP1(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VUZP1(, float, f, 16, 4); +#endif + TEST_VUZP1(, float, f, 32, 2); + + TEST_VUZP1(q, int, s, 8, 16); + TEST_VUZP1(q, int, s, 16, 8); + TEST_VUZP1(q, int, s, 32, 4); + TEST_VUZP1(q, int, s, 64, 2); + TEST_VUZP1(q, uint, u, 8, 16); + TEST_VUZP1(q, uint, u, 16, 8); + TEST_VUZP1(q, uint, u, 32, 4); + TEST_VUZP1(q, uint, u, 64, 2); + TEST_VUZP1(q, poly, p, 8, 16); + TEST_VUZP1(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VUZP1(q, float, f, 16, 8); +#endif + TEST_VUZP1(q, float, f, 32, 4); + TEST_VUZP1(q, float, f, 64, 2); + +#if defined (FP16_SUPPORTED) + CHECK_RESULTS (TEST_MSG, ""); +#else + CHECK_RESULTS_NO_FP16 (TEST_MSG, ""); +#endif + +#undef TEST_MSG +#define TEST_MSG "VUZP2" + +#define TEST_VUZP2(Q, T1, T2, W, N) TEST_VUZP(2, Q, T1, T2, W, N) + +/* Expected results. 
*/ +VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0x11, 0x11, 0x11, 0x11 }; +VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff1, 0xfff3, 0x22, 0x22 }; +VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 }; +VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0x55, 0x55, 0x55, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0x55, 0x55, 0x55, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 }; +VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0xca80, + 0x4b4d, 0x4b4d }; +#endif +VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0xf9, 0xfb, 0xfd, 0xff, + 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11 }; +VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7, + 0x22, 0x22, 0x22, 0x22 }; +VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0xfffffff3, + 0x33, 0x33 }; +VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1, + 0x44 }; +VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0xf9, 0xfb, 0xfd, 0xff, + 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7, + 0x66, 0x66, 0x66, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff1, 0xfffffff3, 0x77, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1, + 0x88 }; +VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7, + 0xf9, 0xfb, 0xfd, 0xff, + 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7, + 0x66, 0x66, 0x66, 0x66 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0xca80, 0xc980, 0xc880, + 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d + }; +#endif +VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1500000, + 0x42073333, 0x42073333 }; + + clean_results (); + CLEAN(expected2, int, 64, 1); + CLEAN(expected2, uint, 64, 1); + + TEST_VUZP2(, int, s, 8, 8); + TEST_VUZP2(, int, s, 16, 4); + TEST_VUZP2(, int, s, 32, 2); + TEST_VUZP2(, uint, u, 8, 8); + TEST_VUZP2(, uint, u, 16, 4); + TEST_VUZP2(, uint, u, 32, 2); + TEST_VUZP2(, poly, p, 8, 8); + TEST_VUZP2(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VUZP2(, float, f, 16, 4); +#endif + TEST_VUZP2(, float, f, 32, 2); + + TEST_VUZP2(q, int, s, 8, 16); + TEST_VUZP2(q, int, s, 16, 8); + TEST_VUZP2(q, int, s, 32, 4); + TEST_VUZP2(q, int, s, 64, 2); + TEST_VUZP2(q, uint, u, 8, 16); + TEST_VUZP2(q, uint, u, 16, 8); + TEST_VUZP2(q, uint, u, 32, 4); + TEST_VUZP2(q, uint, u, 64, 2); + TEST_VUZP2(q, poly, p, 8, 16); + TEST_VUZP2(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VUZP2(q, float, f, 16, 8); +#endif + TEST_VUZP2(q, float, f, 32, 4); + TEST_VUZP2(q, float, f, 64, 2); + + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); +#if defined (FP16_SUPPORTED) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, ""); +#endif +} + +int main (void) +{ + exec_vuzp_half (); + return 0; +} --- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c +++ 
b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
@@ -18,6 +18,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf4, 0x55, 0x55,
                                          0xf1, 0xf5, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb00,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf8, 0x11, 0x11,
                                          0xf1, 0xf9, 0x11, 0x11,
@@ -41,6 +45,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf8, 0x55, 0x55,
                                           0xf3, 0xfb, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff4, 0x66, 0x66,
                                           0xfff1, 0xfff5, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xca00,
+                                                0x4b4d, 0x4b4d,
+                                                0xcb80, 0xc980,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1600000,
                                             0x42073333, 0x42073333 };
 
@@ -59,6 +69,10 @@ VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf2, 0xf6, 0x55, 0x55,
                                          0xf3, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xca80,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf4, 0xfc, 0x11, 0x11,
                                          0xf5, 0xfd, 0x11, 0x11,
@@ -82,6 +96,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf4, 0xfc, 0x55, 0x55,
                                           0xf7, 0xff, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff2, 0xfff6, 0x66, 0x66,
                                           0xfff3, 0xfff7, 0x66, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb00, 0xc900,
+                                                0x4b4d, 0x4b4d,
+                                                0xca80, 0xc880,
+                                                0x4b4d, 0x4b4d };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1500000,
                                             0x42073333, 0x42073333 };
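# Annotation, not part of the patch: vzip_half.c below exercises the AArch64
# vzip1/vzip2 intrinsics.  ZIP1 interleaves the low halves of the two inputs
# element by element, ZIP2 the high halves.  A scalar model of the int16x4
# case (names and driver are ours):

#include <stdio.h>

/* Scalar model of the AArch64 ZIP1/ZIP2 interleave on n elements.  */
static void
zip (const int *a, const int *b, int *out1, int *out2, int n)
{
  for (int i = 0; i < n / 2; i++)
    {
      out1[2 * i]     = a[i];           /* ZIP1: a0, b0, a1, b1, ...  */
      out1[2 * i + 1] = b[i];
      out2[2 * i]     = a[n / 2 + i];   /* ZIP2: a2, b2, a3, b3, ...  */
      out2[2 * i + 1] = b[n / 2 + i];
    }
}

int
main (void)
{
  /* Mirrors vzip_half.c's int16x4 case: buffer 0xfff0.. vs VDUPed 0x22.  */
  int a[4] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
  int b[4] = { 0x22, 0x22, 0x22, 0x22 };
  int z1[4], z2[4];

  zip (a, b, z1, z2, 4);
  for (int i = 0; i < 4; i++)
    printf ("%#x ", z1[i]);     /* 0xfff0 0x22 0xfff1 0x22 -> expected  */
  printf ("\n");
  for (int i = 0; i < 4; i++)
    printf ("%#x ", z2[i]);     /* 0xfff2 0x22 0xfff3 0x22 -> expected2  */
  printf ("\n");
  return 0;
}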
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c
@@ -0,0 +1,263 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0x11, 0xf1, 0x11,
+                                       0xf2, 0x11, 0xf3, 0x11 };
+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0x22, 0xfff1, 0x22 };
+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0x55, 0xf1, 0x55,
+                                        0xf2, 0x55, 0xf3, 0x55 };
+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0x66, 0xfff1, 0x66 };
+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf1, 0x55,
+                                        0xf2, 0x55, 0xf3, 0x55 };
+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff1, 0x66 };
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d,
+                                               0xcb80, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0x11, 0xf1, 0x11,
+                                        0xf2, 0x11, 0xf3, 0x11,
+                                        0xf4, 0x11, 0xf5, 0x11,
+                                        0xf6, 0x11, 0xf7, 0x11 };
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0x22, 0xfff1, 0x22,
+                                        0xfff2, 0x22, 0xfff3, 0x22 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0x33,
+                                        0xfffffff1, 0x33 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
+                                        0x44 };
+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0x55, 0xf1, 0x55,
+                                         0xf2, 0x55, 0xf3, 0x55,
+                                         0xf4, 0x55, 0xf5, 0x55,
+                                         0xf6, 0x55, 0xf7, 0x55 };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0x66, 0xfff1, 0x66,
+                                         0xfff2, 0x66, 0xfff3, 0x66 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0x77,
+                                         0xfffffff1, 0x77 };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
+                                         0x88 };
+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf1, 0x55,
+                                         0xf2, 0x55, 0xf3, 0x55,
+                                         0xf4, 0x55, 0xf5, 0x55,
+                                         0xf6, 0x55, 0xf7, 0x55 };
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0x66, 0xfff1, 0x66,
+                                         0xfff2, 0x66, 0xfff3, 0x66 };
+#if defined (FP16_SUPPORTED)
+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d,
+                                               0xcb80, 0x4b4d,
+                                               0xcb00, 0x4b4d,
+                                               0xca80, 0x4b4d };
+#endif
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0x42073333,
+                                           0xc1700000, 0x42073333 };
+
+#define TEST_MSG "VZIP1"
+void exec_vzip_half (void)
+{
+#define TEST_VZIP(PART, Q, T1, T2, W, N) \
+  VECT_VAR(vector_res, T1, W, N) = \
+    vzip##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
+                            VECT_VAR(vector2, T1, W, N)); \
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define TEST_VZIP1(Q, T1, T2, W, N) TEST_VZIP(1, Q, T1, T2, W, N)
+
+  /* Input vector can only have 64 bits.  */
+  DECL_VARIABLE_ALL_VARIANTS(vector);
+  DECL_VARIABLE_ALL_VARIANTS(vector2);
+  DECL_VARIABLE(vector, float, 64, 2);
+  DECL_VARIABLE(vector2, float, 64, 2);
+
+  DECL_VARIABLE_ALL_VARIANTS(vector_res);
+  DECL_VARIABLE(vector_res, float, 64, 2);
+
+  clean_results ();
+  /* We don't have vzip1_T64x1, so set expected to the clean value.  */
+  CLEAN(expected, int, 64, 1);
+  CLEAN(expected, uint, 64, 1);
+
+  TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (FP16_SUPPORTED)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vector, buffer, , float, f, 32, 2);
+  VLOAD(vector, buffer, q, float, f, 32, 4);
+  VLOAD(vector, buffer, q, float, f, 64, 2);
+
+  /* Choose arbitrary initialization values.
*/ + VDUP(vector2, , int, s, 8, 8, 0x11); + VDUP(vector2, , int, s, 16, 4, 0x22); + VDUP(vector2, , int, s, 32, 2, 0x33); + VDUP(vector2, , uint, u, 8, 8, 0x55); + VDUP(vector2, , uint, u, 16, 4, 0x66); + VDUP(vector2, , uint, u, 32, 2, 0x77); + VDUP(vector2, , poly, p, 8, 8, 0x55); + VDUP(vector2, , poly, p, 16, 4, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */ +#endif + VDUP(vector2, , float, f, 32, 2, 33.6f); + + VDUP(vector2, q, int, s, 8, 16, 0x11); + VDUP(vector2, q, int, s, 16, 8, 0x22); + VDUP(vector2, q, int, s, 32, 4, 0x33); + VDUP(vector2, q, int, s, 64, 2, 0x44); + VDUP(vector2, q, uint, u, 8, 16, 0x55); + VDUP(vector2, q, uint, u, 16, 8, 0x66); + VDUP(vector2, q, uint, u, 32, 4, 0x77); + VDUP(vector2, q, uint, u, 64, 2, 0x88); + VDUP(vector2, q, poly, p, 8, 16, 0x55); + VDUP(vector2, q, poly, p, 16, 8, 0x66); +#if defined (FP16_SUPPORTED) + VDUP (vector2, q, float, f, 16, 8, 14.6f); +#endif + VDUP(vector2, q, float, f, 32, 4, 33.8f); + VDUP(vector2, q, float, f, 64, 2, 33.8f); + + TEST_VZIP1(, int, s, 8, 8); + TEST_VZIP1(, int, s, 16, 4); + TEST_VZIP1(, int, s, 32, 2); + TEST_VZIP1(, uint, u, 8, 8); + TEST_VZIP1(, uint, u, 16, 4); + TEST_VZIP1(, uint, u, 32, 2); + TEST_VZIP1(, poly, p, 8, 8); + TEST_VZIP1(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VZIP1(, float, f, 16, 4); +#endif + TEST_VZIP1(, float, f, 32, 2); + + TEST_VZIP1(q, int, s, 8, 16); + TEST_VZIP1(q, int, s, 16, 8); + TEST_VZIP1(q, int, s, 32, 4); + TEST_VZIP1(q, int, s, 64, 2); + TEST_VZIP1(q, uint, u, 8, 16); + TEST_VZIP1(q, uint, u, 16, 8); + TEST_VZIP1(q, uint, u, 32, 4); + TEST_VZIP1(q, uint, u, 64, 2); + TEST_VZIP1(q, poly, p, 8, 16); + TEST_VZIP1(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VZIP1(q, float, f, 16, 8); +#endif + TEST_VZIP1(q, float, f, 32, 4); + TEST_VZIP1(q, float, f, 64, 2); + +#if defined (FP16_SUPPORTED) + CHECK_RESULTS (TEST_MSG, ""); +#else + CHECK_RESULTS_NO_FP16 (TEST_MSG, ""); +#endif + +#undef TEST_MSG +#define TEST_MSG "VZIP2" + +#define TEST_VZIP2(Q, T1, T2, W, N) TEST_VZIP(2, Q, T1, T2, W, N) + +/* Expected results. 
*/ +VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf4, 0x11, 0xf5, 0x11, + 0xf6, 0x11, 0xf7, 0x11 }; +VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff2, 0x22, 0xfff3, 0x22 }; +VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 }; +VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf4, 0x55, 0xf5, 0x55, + 0xf6, 0x55, 0xf7, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff2, 0x66, 0xfff3, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf4, 0x55, 0xf5, 0x55, + 0xf6, 0x55, 0xf7, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0x66, 0xfff3, 0x66 }; +VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0x4b4d, + 0xca80, 0x4b4d }; +#endif +VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf8, 0x11, 0xf9, 0x11, + 0xfa, 0x11, 0xfb, 0x11, + 0xfc, 0x11, 0xfd, 0x11, + 0xfe, 0x11, 0xff, 0x11 }; +VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff4, 0x22, 0xfff5, 0x22, + 0xfff6, 0x22, 0xfff7, 0x22 }; +VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff2, 0x33, + 0xfffffff3, 0x33 }; +VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1, + 0x44 }; +VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf8, 0x55, 0xf9, 0x55, + 0xfa, 0x55, 0xfb, 0x55, + 0xfc, 0x55, 0xfd, 0x55, + 0xfe, 0x55, 0xff, 0x55 }; +VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff4, 0x66, 0xfff5, 0x66, + 0xfff6, 0x66, 0xfff7, 0x66 }; +VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff2, 0x77, + 0xfffffff3, 0x77 }; +VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1, + 0x88 }; +VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf8, 0x55, 0xf9, 0x55, + 0xfa, 0x55, 0xfb, 0x55, + 0xfc, 0x55, 0xfd, 0x55, + 0xfe, 0x55, 0xff, 0x55 }; +VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff4, 0x66, 0xfff5, 0x66, + 0xfff6, 0x66, 0xfff7, 0x66 }; +#if defined (FP16_SUPPORTED) +VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xca00, 0x4b4d, + 0xc980, 0x4b4d, + 0xc900, 0x4b4d, + 0xc880, 0x4b4d }; +#endif +VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0x42073333, + 0xc1500000, 0x42073333 }; + clean_results (); + CLEAN(expected2, int, 64, 1); + CLEAN(expected2, uint, 64, 1); + + TEST_VZIP2(, int, s, 8, 8); + TEST_VZIP2(, int, s, 16, 4); + TEST_VZIP2(, int, s, 32, 2); + TEST_VZIP2(, uint, u, 8, 8); + TEST_VZIP2(, uint, u, 16, 4); + TEST_VZIP2(, uint, u, 32, 2); + TEST_VZIP2(, poly, p, 8, 8); + TEST_VZIP2(, poly, p, 16, 4); +#if defined (FP16_SUPPORTED) + TEST_VZIP2(, float, f, 16, 4); +#endif + TEST_VZIP2(, float, f, 32, 2); + + TEST_VZIP2(q, int, s, 8, 16); + TEST_VZIP2(q, int, s, 16, 8); + TEST_VZIP2(q, int, s, 32, 4); + TEST_VZIP2(q, int, s, 64, 2); + TEST_VZIP2(q, uint, u, 8, 16); + TEST_VZIP2(q, uint, u, 16, 8); + TEST_VZIP2(q, uint, u, 32, 4); + TEST_VZIP2(q, uint, u, 64, 2); + TEST_VZIP2(q, poly, p, 8, 16); + TEST_VZIP2(q, poly, p, 16, 8); +#if defined (FP16_SUPPORTED) + TEST_VZIP2(q, float, f, 16, 8); +#endif + TEST_VZIP2(q, float, f, 32, 4); + TEST_VZIP2(q, float, f, 64, 2); + + CHECK_RESULTS_NAMED (TEST_MSG, expected2, ""); +#if defined (FP16_SUPPORTED) + CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, ""); + CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, ""); +#endif +} + +int main (void) +{ + exec_vzip_half (); + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/ands_3.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { 
dg-options "-O2" } */ + +int +f9 (unsigned char x, int y) +{ + if (y > 1 && x == 0) + return 10; + return x; +} + +/* { dg-final { scan-assembler "ands\t(x|w)\[0-9\]+,\[ \t\]*(x|w)\[0-9\]+,\[ \t\]*255" } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c @@ -1,4 +1,5 @@ /* { dg-error "unknown" "" {target "aarch64*-*-*" } } */ +/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */ /* { dg-options "-O2 -mcpu=dummy" } */ void f () --- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c +++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c @@ -1,4 +1,5 @@ /* { dg-error "missing" "" {target "aarch64*-*-*" } } */ +/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */ /* { dg-options "-O2 -mcpu=cortex-a53+no" } */ void f () --- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c +++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c @@ -1,4 +1,5 @@ /* { dg-error "invalid feature" "" {target "aarch64*-*-*" } } */ +/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */ /* { dg-options "-O2 -mcpu=cortex-a53+dummy" } */ void f () --- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c +++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c @@ -1,4 +1,5 @@ /* { dg-error "missing" "" {target "aarch64*-*-*" } } */ +/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */ /* { dg-options "-O2 -mcpu=+dummy" } */ void f () --- a/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -ftree-vectorize -fno-inline -save-temps" } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -fno-vect-cost-model -save-temps" } */ extern void abort (void); --- a/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c @@ -110,6 +110,6 @@ main (int argc, char **argv) /* vfmaq_lane_f64. vfma_laneq_f64. vfmaq_laneq_f64. */ -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c @@ -111,6 +111,6 @@ main (int argc, char **argv) /* vfmsq_lane_f64. vfms_laneq_f64. vfmsq_laneq_f64. 
*/ -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c @@ -8,4 +8,4 @@ foo (void) bar (0.0); } -/* { dg-final { scan-assembler "fmov\\td0, xzr" } } */ +/* { dg-final { scan-assembler "movi\\td0, #0" } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c @@ -8,4 +8,4 @@ foo (void) bar (0.0); } -/* { dg-final { scan-assembler "fmov\\ts0, wzr" } } */ +/* { dg-final { scan-assembler "movi\\tv0\.2s, #0" } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c +++ b/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline" } */ +/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline -fno-vect-cost-model" } */ #define N 1024 --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_subreg_1.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-ce1" } */ + +/* Check that the inner if is transformed into CSELs. */ + +int +foo (int *x, int *z, int a) +{ + int b = 0; + int c = 0; + int d = 0; + int i; + + for (i = 0; i < a; i++) + { + if (x[i] < c) + { + b = z[i]; + if (c < b) + { + c = b; + d = i; + } + } + } + + return c + d; +} + +/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_convert_multiple_sets" "ce1" } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2" } */ +/* { dg-options "-O2 -mcpu=generic" } */ int arr[4][4]; --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_unaligned_1.c @@ -0,0 +1,20 @@ +/* { dg-options "-O2" } */ + +/* Check that we can use a REG + IMM addressing mode when moving an unaligned + TImode value to and from memory. */ + +struct foo +{ + long long b; + __int128 a; +} __attribute__ ((packed)); + +void +bar (struct foo *p, struct foo *q) +{ + p->a = q->a; +} + +/* { dg-final { scan-assembler-not "add\tx\[0-9\]+, x\[0-9\]+" } } */ +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\], .*8" 1 } } */ +/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\], .*8" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/popcnt.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int +foo (int x) +{ + return __builtin_popcount (x); +} + +long +foo1 (long x) +{ + return __builtin_popcountl (x); +} + +long long +foo2 (long long x) +{ + return __builtin_popcountll (x); +} + +/* { dg-final { scan-assembler-not "popcount" } } */ +/* { dg-final { scan-assembler-times "cnt\t" 3 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/pr37780_1.c @@ -0,0 +1,46 @@ +/* Test that we can remove the conditional move due to CLZ + and CTZ being defined at zero. */ + +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int +fooctz (int i) +{ + return (i == 0) ? 32 : __builtin_ctz (i); +} + +int +fooctz2 (int i) +{ + return (i != 0) ? __builtin_ctz (i) : 32; +} + +unsigned int +fooctz3 (unsigned int i) +{ + return (i > 0) ? __builtin_ctz (i) : 32; +} + +/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */ + +int +fooclz (int i) +{ + return (i == 0) ? 
32 : __builtin_clz (i); +} + +int +fooclz2 (int i) +{ + return (i != 0) ? __builtin_clz (i) : 32; +} + +unsigned int +fooclz3 (unsigned int i) +{ + return (i > 0) ? __builtin_clz (i) : 32; +} + +/* { dg-final { scan-assembler-times "clz\t" 6 } } */ +/* { dg-final { scan-assembler-not "cmp\t.*0" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/pr63874.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-skip-if "Not applicable for mcmodel=large" { aarch64*-*-* } { "-mcmodel=large" } { "" } } */ + +extern void __attribute__((weak)) foo_weakref (void); +void __attribute__((weak, noinline)) bar (void) +{ + return; +} +void (*f) (void); +void (*g) (void); + +int +main (void) +{ + f = &foo_weakref; + g = &bar; + return 0; +} + +/* { dg-final { scan-assembler-not "adr*foo_weakref" } } */ +/* { dg-final { scan-assembler-not "\\.(word|xword)\tbar" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/pr71727.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-mstrict-align -O3" } */ + +struct test_struct_s +{ + long a; + long b; + long c; + long d; + unsigned long e; +}; + + +char _a; +struct test_struct_s xarray[128]; + +void +_start (void) +{ + struct test_struct_s *new_entry; + + new_entry = &xarray[0]; + new_entry->a = 1; + new_entry->b = 2; + new_entry->c = 3; + new_entry->d = 4; + new_entry->e = 5; + + return; +} + +/* { dg-final { scan-assembler-times "mov\tx" 5 {target lp64} } } */ +/* { dg-final { scan-assembler-not "add\tx0, x0, :" {target lp64} } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/pr78382.c @@ -0,0 +1,10 @@ +/* { dg-require-effective-target fpic } */ +/* { dg-options "-mtls-dialect=trad -fpic" } */ + +__thread int abc; +void +foo () +{ + int *p; + p = &abc; +} --- a/src/gcc/testsuite/gcc.target/aarch64/simd/vminmaxnm_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vminmaxnm_1.c @@ -1,4 +1,4 @@ -/* Test the `v[min|max]nm{q}_f*' AArch64 SIMD intrinsic. */ +/* Test the `v[min|max]{nm}{q}_f*' AArch64 SIMD intrinsic. */ /* { dg-do run } */ /* { dg-options "-O2" } */ @@ -18,6 +18,7 @@ extern void abort (); int main (int argc, char **argv) { + /* v{min|max}nm_f32 normal. */ float32x2_t f32x2_input1 = vdup_n_f32 (-1.0); float32x2_t f32x2_input2 = vdup_n_f32 (0.0); float32x2_t f32x2_exp_minnm = vdup_n_f32 (-1.0); @@ -28,6 +29,7 @@ main (int argc, char **argv) CHECK (uint32_t, 2, f32x2_ret_minnm, f32x2_exp_minnm); CHECK (uint32_t, 2, f32x2_ret_maxnm, f32x2_exp_maxnm); + /* v{min|max}nm_f32 NaN. */ f32x2_input1 = vdup_n_f32 (__builtin_nanf ("")); f32x2_input2 = vdup_n_f32 (1.0); f32x2_exp_minnm = vdup_n_f32 (1.0); @@ -38,6 +40,7 @@ main (int argc, char **argv) CHECK (uint32_t, 2, f32x2_ret_minnm, f32x2_exp_minnm); CHECK (uint32_t, 2, f32x2_ret_maxnm, f32x2_exp_maxnm); + /* v{min|max}nmq_f32 normal. */ float32x4_t f32x4_input1 = vdupq_n_f32 (-1024.0); float32x4_t f32x4_input2 = vdupq_n_f32 (77.0); float32x4_t f32x4_exp_minnm = vdupq_n_f32 (-1024.0); @@ -48,6 +51,7 @@ main (int argc, char **argv) CHECK (uint32_t, 4, f32x4_ret_minnm, f32x4_exp_minnm); CHECK (uint32_t, 4, f32x4_ret_maxnm, f32x4_exp_maxnm); + /* v{min|max}nmq_f32 NaN. */ f32x4_input1 = vdupq_n_f32 (-__builtin_nanf ("")); f32x4_input2 = vdupq_n_f32 (-1.0); f32x4_exp_minnm = vdupq_n_f32 (-1.0); @@ -58,16 +62,57 @@ main (int argc, char **argv) CHECK (uint32_t, 4, f32x4_ret_minnm, f32x4_exp_minnm); CHECK (uint32_t, 4, f32x4_ret_maxnm, f32x4_exp_maxnm); + /* v{min|max}nm_f64 normal. 
*/ + float64x1_t f64x1_input1 = vdup_n_f64 (1.23); + float64x1_t f64x1_input2 = vdup_n_f64 (4.56); + float64x1_t f64x1_exp_minnm = vdup_n_f64 (1.23); + float64x1_t f64x1_exp_maxnm = vdup_n_f64 (4.56); + float64x1_t f64x1_ret_minnm = vminnm_f64 (f64x1_input1, f64x1_input2); + float64x1_t f64x1_ret_maxnm = vmaxnm_f64 (f64x1_input1, f64x1_input2); + CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm); + CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm); + + /* v{min|max}_f64 normal. */ + float64x1_t f64x1_exp_min = vdup_n_f64 (1.23); + float64x1_t f64x1_exp_max = vdup_n_f64 (4.56); + float64x1_t f64x1_ret_min = vmin_f64 (f64x1_input1, f64x1_input2); + float64x1_t f64x1_ret_max = vmax_f64 (f64x1_input1, f64x1_input2); + CHECK (uint64_t, 1, f64x1_ret_min, f64x1_exp_min); + CHECK (uint64_t, 1, f64x1_ret_max, f64x1_exp_max); + + /* v{min|max}nmq_f64 normal. */ float64x2_t f64x2_input1 = vdupq_n_f64 (1.23); float64x2_t f64x2_input2 = vdupq_n_f64 (4.56); float64x2_t f64x2_exp_minnm = vdupq_n_f64 (1.23); float64x2_t f64x2_exp_maxnm = vdupq_n_f64 (4.56); float64x2_t f64x2_ret_minnm = vminnmq_f64 (f64x2_input1, f64x2_input2); float64x2_t f64x2_ret_maxnm = vmaxnmq_f64 (f64x2_input1, f64x2_input2); - CHECK (uint64_t, 2, f64x2_ret_minnm, f64x2_exp_minnm); CHECK (uint64_t, 2, f64x2_ret_maxnm, f64x2_exp_maxnm); + /* v{min|max}nm_f64 NaN. */ + f64x1_input1 = vdup_n_f64 (-__builtin_nanf ("")); + f64x1_input2 = vdup_n_f64 (1.0); + f64x1_exp_minnm = vdup_n_f64 (1.0); + f64x1_exp_maxnm = vdup_n_f64 (1.0); + f64x1_ret_minnm = vminnm_f64 (f64x1_input1, f64x1_input2); + f64x1_ret_maxnm = vmaxnm_f64 (f64x1_input1, f64x1_input2); + + CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm); + CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm); + + /* v{min|max}_f64 NaN. */ + f64x1_input1 = vdup_n_f64 (-__builtin_nanf ("")); + f64x1_input2 = vdup_n_f64 (1.0); + f64x1_exp_minnm = vdup_n_f64 (-__builtin_nanf ("")); + f64x1_exp_maxnm = vdup_n_f64 (-__builtin_nanf ("")); + f64x1_ret_minnm = vmin_f64 (f64x1_input1, f64x1_input2); + f64x1_ret_maxnm = vmax_f64 (f64x1_input1, f64x1_input2); + + CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm); + CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm); + + /* v{min|max}nmq_f64 NaN. */ f64x2_input1 = vdupq_n_f64 (-__builtin_nan ("")); f64x2_input2 = vdupq_n_f64 (1.0); f64x2_exp_minnm = vdupq_n_f64 (1.0); --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c @@ -0,0 +1,541 @@ +/* Test the vmul_n and vmulq_n AArch64 SIMD intrinsics.
*/ + +/* { dg-do run } */ +/* { dg-options "-O2 --save-temps" } */ + +#include "arm_neon.h" + +extern void abort (void); + +#define A (132.4f) +#define B (-0.0f) +#define C (-34.8f) +#define D (289.34f) +float32_t expected2_1[2] = {A * A, B * A}; +float32_t expected2_2[2] = {A * B, B * B}; +float32_t expected4_1[4] = {A * A, B * A, C * A, D * A}; +float32_t expected4_2[4] = {A * B, B * B, C * B, D * B}; +float32_t expected4_3[4] = {A * C, B * C, C * C, D * C}; +float32_t expected4_4[4] = {A * D, B * D, C * D, D * D}; +float32_t _elemA = A; +float32_t _elemB = B; +float32_t _elemC = C; +float32_t _elemD = D; + +#define AD (1234.5) +#define BD (-0.0) +#define CD (71.3) +#define DD (-1024.4) +float64_t expectedd2_1[2] = {AD * CD, BD * CD}; +float64_t expectedd2_2[2] = {AD * DD, BD * DD}; +float64_t _elemdC = CD; +float64_t _elemdD = DD; + + +#define AS (1024) +#define BS (-31) +#define CS (0) +#define DS (655) +int32_t expecteds2_1[2] = {AS * AS, BS * AS}; +int32_t expecteds2_2[2] = {AS * BS, BS * BS}; +int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS}; +int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS}; +int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS}; +int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS}; +int32_t _elemsA = AS; +int32_t _elemsB = BS; +int32_t _elemsC = CS; +int32_t _elemsD = DS; + +#define AH ((int16_t) 0) +#define BH ((int16_t) -32) +#define CH ((int16_t) 102) +#define DH ((int16_t) -51) +#define EH ((int16_t) 71) +#define FH ((int16_t) -91) +#define GH ((int16_t) 48) +#define HH ((int16_t) 255) +int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH}; +int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH}; +int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH}; +int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH}; +int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH, + EH * AH, FH * AH, GH * AH, HH * AH}; +int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH, + EH * BH, FH * BH, GH * BH, HH * BH}; +int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH, + EH * CH, FH * CH, GH * CH, HH * CH}; +int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH, + EH * DH, FH * DH, GH * DH, HH * DH}; +int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH, + EH * EH, FH * EH, GH * EH, HH * EH}; +int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH, + EH * FH, FH * FH, GH * FH, HH * FH}; +int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH, + EH * GH, FH * GH, GH * GH, HH * GH}; +int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH, + EH * HH, FH * HH, GH * HH, HH * HH}; +int16_t _elemhA = AH; +int16_t _elemhB = BH; +int16_t _elemhC = CH; +int16_t _elemhD = DH; +int16_t _elemhE = EH; +int16_t _elemhF = FH; +int16_t _elemhG = GH; +int16_t _elemhH = HH; + +#define AUS (1024) +#define BUS (31) +#define CUS (0) +#define DUS (655) +uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS}; +uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS}; +uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS}; +uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS}; +uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS}; +uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS}; +uint32_t _elemusA = AUS; +uint32_t _elemusB = BUS; +uint32_t _elemusC = CUS; +uint32_t _elemusD = DUS; + +#define AUH ((uint16_t) 0) +#define BUH ((uint16_t) 32) +#define CUH ((uint16_t) 102) +#define DUH 
((uint16_t) 51) +#define EUH ((uint16_t) 71) +#define FUH ((uint16_t) 91) +#define GUH ((uint16_t) 48) +#define HUH ((uint16_t) 255) +uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH}; +uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH}; +uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH}; +uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH}; +uint16_t expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH, + EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH}; +uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH, + EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH}; +uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH, + EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH}; +uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH, + EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH}; +uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH, + EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH}; +uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH, + EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH}; +uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH, + EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH}; +uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH, + EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH}; +uint16_t _elemuhA = AUH; +uint16_t _elemuhB = BUH; +uint16_t _elemuhC = CUH; +uint16_t _elemuhD = DUH; +uint16_t _elemuhE = EUH; +uint16_t _elemuhF = FUH; +uint16_t _elemuhG = GUH; +uint16_t _elemuhH = HUH; + +void +check_v2sf (float32_t elemA, float32_t elemB) +{ + int32_t indx; + const float32_t vec32x2_buf[2] = {A, B}; + float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf); + float32_t vec32x2_res[2]; + + vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA)); + + for (indx = 0; indx < 2; indx++) + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx]) + abort (); + + vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB)); + + for (indx = 0; indx < 2; indx++) + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 2 } } */ +} + +void +check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD) +{ + int32_t indx; + const float32_t vec32x4_buf[4] = {A, B, C, D}; + float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf); + float32_t vec32x4_res[4]; + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx]) + abort (); + + vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD)); + + for (indx = 0; indx < 4; indx++) + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */ +} + +void +check_v2df (float64_t elemdC, float64_t elemdD) +{ + int32_t indx; + const float64_t vec64x2_buf[2] = {AD, BD}; + float64x2_t vec64x2_src = 
vld1q_f64 (vec64x2_buf); + float64_t vec64x2_res[2]; + + vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC)); + + for (indx = 0; indx < 2; indx++) + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx]) + abort (); + + vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD)); + + for (indx = 0; indx < 2; indx++) + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[0\\\]" 2 } } */ +} + +void +check_v2si (int32_t elemsA, int32_t elemsB) +{ + int32_t indx; + const int32_t vecs32x2_buf[2] = {AS, BS}; + int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf); + int32_t vecs32x2_res[2]; + + vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA)); + + for (indx = 0; indx < 2; indx++) + if (vecs32x2_res[indx] != expecteds2_1[indx]) + abort (); + + vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB)); + + for (indx = 0; indx < 2; indx++) + if (vecs32x2_res[indx] != expecteds2_2[indx]) + abort (); +} + +void +check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB) +{ + int indx; + const uint32_t vecus32x2_buf[2] = {AUS, BUS}; + uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf); + uint32_t vecus32x2_res[2]; + + vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA)); + + for (indx = 0; indx < 2; indx++) + if (vecus32x2_res[indx] != expectedus2_1[indx]) + abort (); + + vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB)); + + for (indx = 0; indx < 2; indx++) + if (vecus32x2_res[indx] != expectedus2_2[indx]) + abort (); + +/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */ +} + +void +check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD) +{ + int32_t indx; + const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS}; + int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf); + int32_t vecs32x4_res[4]; + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_1[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_2[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_3[indx]) + abort (); + + vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD)); + + for (indx = 0; indx < 4; indx++) + if (vecs32x4_res[indx] != expecteds4_4[indx]) + abort (); +} + +void +check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC, + uint32_t elemusD) +{ + int indx; + const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS}; + uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf); + uint32_t vecus32x4_res[4]; + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_1[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_2[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_3[indx]) + abort (); + + vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD)); + + for (indx = 0; indx < 4; indx++) + if (vecus32x4_res[indx] != expectedus4_4[indx]) + abort (); + 
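+/* vmulq_n_s32 and vmulq_n_u32 map to the same mul-by-element
+   instruction, so the scan below expects eight matches: four from
+   the vmulq_n_s32 calls in check_v4si plus the four vmulq_n_u32
+   calls above.  */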
+/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 8 } } */ +} + + +void +check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD) +{ + int32_t indx; + const int16_t vech16x4_buf[4] = {AH, BH, CH, DH}; + int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf); + int16_t vech16x4_res[4]; + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_1[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_2[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_3[indx]) + abort (); + + vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD)); + + for (indx = 0; indx < 4; indx++) + if (vech16x4_res[indx] != expectedh4_4[indx]) + abort (); +} + +void +check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, + uint16_t elemuhD) +{ + int indx; + const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH}; + uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf); + uint16_t vecuh16x4_res[4]; + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhA)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_1[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_2[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_3[indx]) + abort (); + + vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD)); + + for (indx = 0; indx < 4; indx++) + if (vecuh16x4_res[indx] != expecteduh4_4[indx]) + abort (); + +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 8 } } */ +} + +void +check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD, + int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH) +{ + int32_t indx; + const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH}; + int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf); + int16_t vech16x8_res[8]; + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_1[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_2[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_3[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_4[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_5[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_6[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_7[indx]) + abort (); + + vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, 
elemhH)); + + for (indx = 0; indx < 8; indx++) + if (vech16x8_res[indx] != expectedh8_8[indx]) + abort (); +} + +void +check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, + uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF, + uint16_t elemuhG, uint16_t elemuhH) +{ + int indx; + const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH}; + uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf); + uint16_t vecuh16x8_res[8]; + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_1[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_2[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_3[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_4[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_5[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_6[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_7[indx]) + abort (); + + vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH)); + + for (indx = 0; indx < 8; indx++) + if (vecuh16x8_res[indx] != expecteduh8_8[indx]) + abort (); + +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[0\\\]" 16 } } */ +} + +int +main (void) +{ + check_v2sf (_elemA, _elemB); + check_v4sf (_elemA, _elemB, _elemC, _elemD); + check_v2df (_elemdC, _elemdD); + check_v2si (_elemsA, _elemsB); + check_v4si (_elemsA, _elemsB, _elemsC, _elemsD); + check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD); + check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD, + _elemhE, _elemhF, _elemhG, _elemhH); + check_v2si_unsigned (_elemusA, _elemusB); + check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD); + check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD); + check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD, + _elemuhE, _elemuhF, _elemuhG, _elemuhH); + + return 0; +} + --- a/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2" } */ +/* { dg-options "-O2 -mcpu=generic" } */ int f(int *a, int b) { --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=generic" } */ + +void +foo (unsigned long long *a) +{ + a[0] = 0x0140c0da0140c0daULL; +} + +/* { dg-final { scan-assembler-times "movk\\tw.*" 1 } } */ +/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-Os" } */ + +/* Check that for -Os we synthesize only the bottom half and then + store it twice with an STP rather than synthesizing it twice in each + half of an X-reg. 
*/ + +void +foo (unsigned long long *a) +{ + a[0] = 0xc0da0000c0daULL; +} + +/* { dg-final { scan-assembler-times "mov\\tw.*" 1 } } */ +/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/struct_return.c @@ -0,0 +1,31 @@ +/* Test the absence of a spurious move from x8 to x0 for functions + returning structures. */ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +struct s +{ + long x; + long y; + long z; +}; + +struct s __attribute__((noinline)) +foo (long a, long d, long c) +{ + struct s b; + b.x = a; + b.y = d; + b.z = c; + return b; +} + +int +main (void) +{ + struct s x; + x = foo (10, 20, 30); + return x.x + x.y + x.z; +} + +/* { dg-final { scan-assembler-not "mov\tx0, x8" } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_10.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_10.c @@ -4,8 +4,7 @@ * total frame size > 512. area except outgoing <= 512 * number of callee-saved reg >= 2. - * Split stack adjustment into two subtractions. - the first subtractions could be optimized into "stp !". */ + * Use a single stack adjustment, no writeback. */ /* { dg-do run } */ /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ @@ -15,6 +14,6 @@ t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10]) t_frame_run (test10) -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */ -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_12.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_12.c @@ -13,6 +13,6 @@ t_frame_run (test12) /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ -/* Check epilogue using write-back. */ -/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 } } */ +/* Check epilogue using no write-back. */ +/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_13.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_13.c @@ -2,8 +2,7 @@ * without outgoing. * total frame size > 512. * number of callee-save reg >= 2. - * split the stack adjustment into two substractions, - the second could be optimized into "stp !". */ + * Use a single stack adjustment, no writeback. */ /* { dg-do run } */ /* { dg-options "-O2 --save-temps" } */ @@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, ) t_frame_run (test13) /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */ +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_15.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_15.c @@ -3,8 +3,7 @@ * total frame size > 512. area except outgoing <= 512 * number of callee-save reg >= 2. - * split the stack adjustment into two substractions, - the first could be optimized into "stp !". */ + * Use a single stack adjustment, no writeback.
*/ /* { dg-do run } */ /* { dg-options "-O2 --save-temps" } */ @@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8]) t_frame_run (test15) /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */ +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_16.c @@ -0,0 +1,25 @@ +/* Verify: + * with outgoing. + * single int register push. + * varargs and callee-save size >= 256 + * Use 2 stack adjustments. */ + +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ + +#define REP8(X) X,X,X,X,X,X,X,X +#define REP64(X) REP8(REP8(X)) + +void outgoing (__builtin_va_list, ...); + +double vararg_outgoing (int x1, ...) +{ + double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = x1 * 6; + __builtin_va_list vl; + __builtin_va_start (vl, x1); + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1)); + __builtin_va_end (vl); + return a1 + a2 + a3 + a4 + a5 + a6; +} + +/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_17.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps" } */ + +/* Test reuse of stack adjustment temporaries. */ + +void foo (); + +int reuse_mov (int i) +{ + int arr[1025]; + return arr[i]; +} + +int no_reuse_mov (int i) +{ + int arr[1025]; + foo (); + return arr[i]; +} + +/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c @@ -3,8 +3,7 @@ * without outgoing. * total frame size > 512. * number of callee-saved reg == 1. - * split stack adjustment into two subtractions. - the second subtraction should use "str !". */ + * use a single stack adjustment, no writeback. */ /* { dg-do run } */ /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ @@ -14,6 +13,7 @@ t_frame_pattern (test6, 700, ) t_frame_run (test6) -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */ -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */ +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_7.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_7.c @@ -3,8 +3,7 @@ * without outgoing. * total frame size > 512. * number of callee-saved reg == 2. - * split stack adjustment into two subtractions. - the second subtraction should use "stp !". */ + * use a single stack adjustment, no writeback. */ /* { dg-do run } */ /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ @@ -14,6 +13,6 @@ t_frame_pattern (test7, 700, "x19") t_frame_run (test7) -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 
1 } } */ -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */ +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_8.c +++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_8.c @@ -12,6 +12,6 @@ t_frame_pattern_outgoing (test8, 700, , 8, a[8]) t_frame_run (test8) -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */ -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } } */ +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/thunderxloadpair.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=thunderx" } */ + +struct ldp +{ + long long c; + int a, b; +}; + + +int f(struct ldp *a) +{ + return a->a + a->b; +} + + +/* We know the alignment of a->a to be 8 bytes, so it is profitable + to do ldp. */ +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/thunderxnoloadpair.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=thunderx" } */ + +struct noldp +{ + int a, b; +}; + + +int f(struct noldp *a) +{ + return a->a + a->b; +} + +/* We know the alignment of a->a to be 4 bytes, so it is not profitable + to do ldp. */ +/* { dg-final { scan-assembler-not "ldp\tw\[0-9\]+, w\[0-9\]" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/ubfiz_lsl_1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* Check that an X-reg UBFIZ can be simplified into a W-reg LSL. */ + +long long +f2 (long long x) +{ + return (x << 5) & 0xffffffff; +} + +/* { dg-final { scan-assembler "lsl\tw" } } */ +/* { dg-final { scan-assembler-not "ubfiz\tx" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/ubfx_lsr_1.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* Check that an X-reg UBFX can be simplified into a W-reg LSR. */ + +int +f (unsigned long long x) +{ + x = (x >> 24) & 255; + return x + 1; +} + +/* { dg-final { scan-assembler "lsr\tw" } } */ +/* { dg-final { scan-assembler-not "ubfx\tx" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps" } */ + +int +f (int a, ...) +{ + /* { dg-final { scan-assembler-not "str" } } */ + return a; +} + +/* { dg-final { cleanup-saved-temps } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_2.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps" } */ + +int +foo (char *fmt, ...)
+{ + int d, e; + double f, g; + __builtin_va_list ap; + + __builtin_va_start (ap, fmt); + d = __builtin_va_arg (ap, int); + f = __builtin_va_arg (ap, double); + g = __builtin_va_arg (ap, double); + d += d2i (f); + d += d2i (g); + __builtin_va_end (ap); + + /* { dg-final { scan-assembler-not "x7" } } */ + /* { dg-final { scan-assembler-not "q7" } } */ + return d; +} + +/* { dg-final { cleanup-saved-temps } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ -/* { dg-options "-O3" } */ +/* { dg-options "-O3 -fno-vect-cost-model" } */ #define N 16 --- a/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O3 -save-temps -fno-inline" } */ +/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */ extern void abort (); --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */ #define FTYPE double #define ITYPE long --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */ #define FTYPE double #define ITYPE long --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */ #define FTYPE double #define ITYPE long --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */ #define N 32 --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */ #define N 32 --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */ #define N 32 --- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */ #define N 32 --- 
/dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c @@ -0,0 +1,86 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" + +#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \ +TYPE1 __attribute__((noinline,noclone)) \ +test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \ +{ \ + return vcopy##Q1##_lane##Q2##_##SUFFIX (a, INDEX1, b, INDEX2); \ +} + +/* vcopy_lane. */ +BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6) +BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6) +BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2) +BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2) +BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) +BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) +BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ +BUILD_TEST (int64x1_t, int64x1_t, , , s64, 0, 0) +BUILD_TEST (uint64x1_t, uint64x1_t, , , u64, 0, 0) +BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0) +/* { dg-final { scan-assembler-times "fmov\\td0, d1" 3 } } */ + +/* vcopy_laneq. */ + +BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) +BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) +BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7) +BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7) +BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3) +BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3) +BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x1_t, float64x2_t, , q, f64, 0, 1) +BUILD_TEST (int64x1_t, int64x2_t, , q, s64, 0, 1) +BUILD_TEST (uint64x1_t, uint64x2_t, , q, u64, 0, 1) +/* XFAIL due to PR 71307. */ +/* { dg-final { scan-assembler-times "dup\\td0, v1.d\\\[1\\\]" 3 { xfail *-*-* } } } */ + +/* vcopyq_lane. */ +BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7) +BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7) +BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3) +BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3) +BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1) +BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1) +BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0) +BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0) +BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */ + +/* vcopyq_laneq. 
*/ + +BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15) +BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15) +BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7) +BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7) +BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3) +BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3) +BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1) +BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1) +BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */ --- a/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O3 -save-temps -fno-inline" } */ +/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */ extern void abort (); --- a/src/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c +++ b/src/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c @@ -38,14 +38,14 @@ f11 (void) return sum; } -char arr_c[100][100]; +char arr_c[100]; char f12 (void) { int i; char sum = 0; for (i = 0; i < 100; i++) - sum += arr_c[i][0] * arr_c[0][i]; + sum += arr_c[i] * arr_c[i]; return sum; } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \ +TYPE1 __attribute__((noinline,noclone)) \ +test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \ +{ \ + return vset##Q1##_lane_##SUFFIX (vget##Q2##_lane_##SUFFIX (b, INDEX2),\ + a, INDEX1); \ +} + +BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6) +BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6) +BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2) +BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2) +BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) +BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) +BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ + +BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) +BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) +BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7) +BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7) +BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3) +BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3) +BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */ + +BUILD_TEST 
(poly8x16_t, poly8x8_t, q, , p8, 15, 7) +BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7) +BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3) +BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3) +BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1) +BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1) +BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0) +BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0) +BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */ + +BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15) +BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15) +BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7) +BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7) +BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3) +BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3) +BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1) +BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1) +BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/aarch64/vminmaxnm.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +/* For each of these intrinsics, we map directly to an unspec in RTL. + We're just using the argument directly and returning the result, so we + can precisely specify the exact instruction pattern and register + allocations we expect. 
*/ + +float64x1_t +test_vmaxnm_f64 (float64x1_t a, float64x1_t b) +{ + /* { dg-final { scan-assembler-times "fmaxnm\td0, d0, d1" 1 } } */ + return vmaxnm_f64 (a, b); +} + +float64x1_t +test_vminnm_f64 (float64x1_t a, float64x1_t b) +{ + /* { dg-final { scan-assembler-times "fminnm\td0, d0, d1" 1 } } */ + return vminnm_f64 (a, b); +} + +float64x1_t +test_vmax_f64 (float64x1_t a, float64x1_t b) +{ + /* { dg-final { scan-assembler-times "fmax\td0, d0, d1" 1 } } */ + return vmax_f64 (a, b); +} + +float64x1_t +test_vmin_f64 (float64x1_t a, float64x1_t b) +{ + /* { dg-final { scan-assembler-times "fmin\td0, d0, d1" 1 } } */ + return vmin_f64 (a, b); +} \ No newline at end of file --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/neon-vect10.c @@ -0,0 +1,32 @@ +/* Test AAPCS layout (VFP variant for Neon types) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_neon_fp16_hw } */ +/* { dg-add-options arm_neon_fp16 } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define NEON +#define TESTFILE "neon-vect10.c" +#include "neon-constants.h" + +#include "abitest.h" +#else + +ARG (int32x4_t, i32x4_constvec2, Q0) /* D0, D1. */ +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 3.0f, S4 + 2) /* D2, Q1. */ +#else +ARG (__fp16, 3.0f, S4) /* D2, Q1. */ +#endif +ARG (int32x4x2_t, i32x4x2_constvec1, Q2) /* Q2, Q3 - D4-D6 , s5-s12. */ +ARG (double, 12.0, D3) /* Backfill this particular argument. */ +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 5.0f, S5 + 2) /* Backfill in S5. */ +#else +ARG (__fp16, 5.0f, S5) /* Backfill in S5. */ +#endif +ARG (int32x4x2_t, i32x4x2_constvec2, STACK) +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/neon-vect9.c @@ -0,0 +1,24 @@ +/* Test AAPCS layout (VFP variant for Neon types) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_neon_fp16_hw } */ +/* { dg-add-options arm_neon_fp16 } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define NEON +#define TESTFILE "neon-vect9.c" +#include "neon-constants.h" + +#include "abitest.h" +#else + +ARG (int32x4_t, i32x4_constvec2, Q0) /* D0, D1. */ +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 3.0f, S4 + 2) /* D2, Q1 occupied. */ +#else +ARG (__fp16, 3.0f, S4) /* D2, Q1 occupied. 
*/ +#endif +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp18.c @@ -0,0 +1,28 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_ieee } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp18.c" +#include "abitest.h" + +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S0 + 2) +#else +ARG (__fp16, 1.0f, S0) +#endif +ARG (float, 2.0f, S1) +ARG (double, 4.0, D1) +ARG (float, 2.0f, S4) +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S5 + 2) +#else +ARG (__fp16, 1.0f, S5) +#endif +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp19.c @@ -0,0 +1,30 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_ieee } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp19.c" + +__complex__ x = 1.0+2.0i; + +#include "abitest.h" +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S0 + 2) +#else +ARG (__fp16, 1.0f, S0) +#endif +ARG (float, 2.0f, S1) +ARG (__complex__ double, x, D1) +ARG (float, 3.0f, S6) +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 2.0f, S7 + 2) +#else +ARG (__fp16, 2.0f, S7) +#endif +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp20.c @@ -0,0 +1,22 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_ieee } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp20.c" + +#define PCSATTR __attribute__((pcs("aapcs"))) + +#include "abitest.h" +#else +ARG (float, 1.0f, R0) +ARG (double, 2.0, R2) +ARG (float, 3.0f, STACK) +ARG (__fp16, 2.0f, STACK+4) +LAST_ARG (double, 4.0, STACK+8) +#endif + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp21.c @@ -0,0 +1,26 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_ieee } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp21.c" + +#define PCSATTR __attribute__((pcs("aapcs"))) + +#include "abitest.h" +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, R0 + 2) +#else +ARG (__fp16, 1.0f, R0) +#endif +ARG (double, 2.0, R2) +ARG (__fp16, 3.0f, STACK) +ARG (float, 2.0f, STACK+4) +LAST_ARG (double, 4.0, STACK+8) +#endif + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp22.c @@ -0,0 +1,28 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_alternative } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp22.c" +#include "abitest.h" + +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S0 + 2) +#else +ARG (__fp16, 1.0f, S0) +#endif +ARG (float, 2.0f, S1) +ARG (double, 4.0, D1) +ARG (float, 2.0f, S4) +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S5 + 2) +#else +ARG (__fp16, 1.0f, S5) +#endif +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp23.c @@ 
-0,0 +1,30 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_alternative } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp23.c" + +__complex__ x = 1.0+2.0i; + +#include "abitest.h" +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, S0 + 2) +#else +ARG (__fp16, 1.0f, S0) +#endif +ARG (float, 2.0f, S1) +ARG (__complex__ double, x, D1) +ARG (float, 3.0f, S6) +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 2.0f, S7 + 2) +#else +ARG (__fp16, 2.0f, S7) +#endif +LAST_ARG (int, 3, R0) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp24.c @@ -0,0 +1,21 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_alternative } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp24.c" + +#define PCSATTR __attribute__((pcs("aapcs"))) + +#include "abitest.h" +#else +ARG (float, 1.0f, R0) +ARG (double, 2.0, R2) +ARG (float, 3.0f, STACK) +ARG (__fp16, 2.0f, STACK+4) +LAST_ARG (double, 4.0, STACK+8) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp25.c @@ -0,0 +1,25 @@ +/* Test AAPCS layout (VFP variant) */ + +/* { dg-do run { target arm_eabi } } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_hw } */ +/* { dg-add-options arm_fp16_alternative } */ + +#ifndef IN_FRAMEWORK +#define VFP +#define TESTFILE "vfp25.c" + +#define PCSATTR __attribute__((pcs("aapcs"))) + +#include "abitest.h" +#else +#if defined (__ARM_BIG_ENDIAN) +ARG (__fp16, 1.0f, R0 + 2) +#else +ARG (__fp16, 1.0f, R0) +#endif +ARG (double, 2.0, R2) +ARG (__fp16, 3.0f, STACK) +ARG (float, 2.0f, STACK+4) +LAST_ARG (double, 4.0, STACK+8) +#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv5_thumb_isa.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target arm_arch_v5_ok } */ +/* { dg-add-options arm_arch_v5 } */ + +#if __ARM_ARCH_ISA_THUMB +#error "__ARM_ARCH_ISA_THUMB defined for ARMv5" +#endif + +int foo; --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ + +/* Test instructions generated for half-precision arithmetic. */ + +typedef __fp16 float16_t; +typedef __simd64_float16_t float16x4_t; +typedef __simd128_float16_t float16x8_t; + +typedef short int16x4_t __attribute__ ((vector_size (8))); +typedef short int int16x8_t __attribute__ ((vector_size (16))); + +float16_t +fp16_abs (float16_t a) +{ + return (a < 0) ? -a : a; +} + +#define TEST_UNOP(NAME, OPERATOR, TY) \ + TY test_##NAME##_##TY (TY a) \ + { \ + return OPERATOR (a); \ + } + +#define TEST_BINOP(NAME, OPERATOR, TY) \ + TY test_##NAME##_##TY (TY a, TY b) \ + { \ + return a OPERATOR b; \ + } + +#define TEST_CMP(NAME, OPERATOR, RTY, TY) \ + RTY test_##NAME##_##TY (TY a, TY b) \ + { \ + return a OPERATOR b; \ + } + +/* Scalars. 
*/ + +TEST_UNOP (neg, -, float16_t) +TEST_UNOP (abs, fp16_abs, float16_t) + +TEST_BINOP (add, +, float16_t) +TEST_BINOP (sub, -, float16_t) +TEST_BINOP (mult, *, float16_t) +TEST_BINOP (div, /, float16_t) + +TEST_CMP (equal, ==, int, float16_t) +TEST_CMP (unequal, !=, int, float16_t) +TEST_CMP (lessthan, <, int, float16_t) +TEST_CMP (greaterthan, >, int, float16_t) +TEST_CMP (lessthanequal, <=, int, float16_t) +TEST_CMP (greaterthanqual, >=, int, float16_t) + +/* Vectors of size 4. */ + +TEST_UNOP (neg, -, float16x4_t) + +TEST_BINOP (add, +, float16x4_t) +TEST_BINOP (sub, -, float16x4_t) +TEST_BINOP (mult, *, float16x4_t) +TEST_BINOP (div, /, float16x4_t) + +TEST_CMP (equal, ==, int16x4_t, float16x4_t) +TEST_CMP (unequal, !=, int16x4_t, float16x4_t) +TEST_CMP (lessthan, <, int16x4_t, float16x4_t) +TEST_CMP (greaterthan, >, int16x4_t, float16x4_t) +TEST_CMP (lessthanequal, <=, int16x4_t, float16x4_t) +TEST_CMP (greaterthanqual, >=, int16x4_t, float16x4_t) + +/* Vectors of size 8. */ + +TEST_UNOP (neg, -, float16x8_t) + +TEST_BINOP (add, +, float16x8_t) +TEST_BINOP (sub, -, float16x8_t) +TEST_BINOP (mult, *, float16x8_t) +TEST_BINOP (div, /, float16x8_t) + +TEST_CMP (equal, ==, int16x8_t, float16x8_t) +TEST_CMP (unequal, !=, int16x8_t, float16x8_t) +TEST_CMP (lessthan, <, int16x8_t, float16x8_t) +TEST_CMP (greaterthan, >, int16x8_t, float16x8_t) +TEST_CMP (lessthanequal, <=, int16x8_t, float16x8_t) +TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t) + +/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */ + +/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */ +/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */ + +/* { dg-final { scan-assembler-not {vadd\.f32} } } */ +/* { dg-final { scan-assembler-not {vsub\.f32} } } */ +/* { dg-final { scan-assembler-not {vmul\.f32} } } */ +/* { dg-final { scan-assembler-not {vdiv\.f32} } } */ +/* { dg-final { scan-assembler-not {vcmp\.f16} } } */ +/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-conv-1.c @@ -0,0 +1,101 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +/* Test ARMv8.2 FP16 conversions. 
*/ +#include <arm_fp16.h> + +float +f16_to_f32 (__fp16 a) +{ + return (float)a; +} + +float +f16_to_pf32 (__fp16* a) +{ + return (float)*a; +} + +short +f16_to_s16 (__fp16 a) +{ + return (short)a; +} + +short +pf16_to_s16 (__fp16* a) +{ + return (short)*a; +} + +/* { dg-final { scan-assembler-times {vcvtb\.f32\.f16\ts[0-9]+, s[0-9]+} 4 } } */ + +__fp16 +f32_to_f16 (float a) +{ + return (__fp16)a; +} + +void +f32_to_pf16 (__fp16* x, float a) +{ + *x = (__fp16)a; +} + +__fp16 +s16_to_f16 (short a) +{ + return (__fp16)a; +} + +void +s16_to_pf16 (__fp16* x, short a) +{ + *x = (__fp16)a; +} + +/* { dg-final { scan-assembler-times {vcvtb\.f16\.f32\ts[0-9]+, s[0-9]+} 4 } } */ + +float +s16_to_f32 (short a) +{ + return (float)a; +} + +/* { dg-final { scan-assembler-times {vcvt\.f32\.s32\ts[0-9]+, s[0-9]+} 3 } } */ + +short +f32_to_s16 (float a) +{ + return (short)a; +} + +/* { dg-final { scan-assembler-times {vcvt\.s32\.f32\ts[0-9]+, s[0-9]+} 3 } } */ + +unsigned short +f32_to_u16 (float a) +{ + return (unsigned short)a; +} + +/* { dg-final { scan-assembler-times {vcvt\.u32\.f32\ts[0-9]+, s[0-9]+} 1 } } */ + +short +f64_to_s16 (double a) +{ + return (short)a; +} + +/* { dg-final { scan-assembler-times {vcvt\.s32\.f64\ts[0-9]+, d[0-9]+} 1 } } */ + +unsigned short +f64_to_u16 (double a) +{ + return (unsigned short)a; +} + +/* { dg-final { scan-assembler-times {vcvt\.u32\.f64\ts[0-9]+, d[0-9]+} 1 } } */ + + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c @@ -0,0 +1,165 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +__fp16 +test_load_1 (__fp16* a) +{ + return *a; +} + +__fp16 +test_load_2 (__fp16* a, int i) +{ + return a[i]; +} + +/* { dg-final { scan-assembler-times {vld1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]+\]} 2 } } */ + +void +test_store_1 (__fp16* a, __fp16 b) +{ + *a = b; +} + +void +test_store_2 (__fp16* a, int i, __fp16 b) +{ + a[i] = b; +} + +/* { dg-final { scan-assembler-times {vst1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]+\]} 2 } } */ + +__fp16 +test_load_store_1 (__fp16* a, int i, __fp16* b) +{ + a[i] = b[i]; +} + +__fp16 +test_load_store_2 (__fp16* a, int i, __fp16* b) +{ + a[i] = b[i + 2]; + return a[i]; +} +/* { dg-final { scan-assembler-times {ldrh\tr[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {strh\tr[0-9]+} 2 } } */ + +__fp16 +test_select_1 (int sel, __fp16 a, __fp16 b) +{ + if (sel) + return a; + else + return b; +} + +__fp16 +test_select_2 (int sel, __fp16 a, __fp16 b) +{ + return sel ? a : b; +} + +__fp16 +test_select_3 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a == b) ? b : c; +} + +__fp16 +test_select_4 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a != b) ? b : c; +} + +__fp16 +test_select_5 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a < b) ? b : c; +} + +__fp16 +test_select_6 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a <= b) ? b : c; +} + +__fp16 +test_select_7 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a > b) ? b : c; +} + +__fp16 +test_select_8 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a >= b) ?
b : c; +} + +/* { dg-final { scan-assembler-times {vseleq\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 4 } } */ +/* { dg-final { scan-assembler-times {vselgt\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vselge\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {vmov\.f16\ts[0-9]+, r[0-9]+} 4 } } */ +/* { dg-final { scan-assembler-times {vmov\.f16\tr[0-9]+, s[0-9]+} 4 } } */ + +int +test_compare_1 (__fp16 a, __fp16 b) +{ + if (a == b) + return -1; + else + return 0; +} + +int +test_compare_ (__fp16 a, __fp16 b) +{ + if (a != b) + return -1; + else + return 0; +} + +int +test_compare_2 (__fp16 a, __fp16 b) +{ + if (a > b) + return -1; + else + return 0; +} + +int +test_compare_3 (__fp16 a, __fp16 b) +{ + if (a >= b) + return -1; + else + return 0; +} + +int +test_compare_4 (__fp16 a, __fp16 b) +{ + if (a < b) + return -1; + else + return 0; +} + +int +test_compare_5 (__fp16 a, __fp16 b) +{ + if (a <= b) + return -1; + else + return 0; +} + +/* { dg-final { scan-assembler-not {vcmp\.f16} } } */ +/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */ + +/* { dg-final { scan-assembler-times {vcmp\.f32} 4 } } */ +/* { dg-final { scan-assembler-times {vcmpe\.f32} 8 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-1.c @@ -0,0 +1,490 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ + +/* Test instructions generated for the FP16 vector intrinsics. */ + +#include <arm_neon.h> + +#define MSTRCAT(L, str) L##str + +#define UNOP_TEST(insn) \ + float16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a) \ + { \ + return MSTRCAT (insn, _f16) (a); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a) \ + { \ + return MSTRCAT (insn, q_f16) (a); \ + } + +#define BINOP_TEST(insn) \ + float16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b) \ + { \ + return MSTRCAT (insn, _f16) (a, b); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, q_f16) (a, b); \ + } + +#define BINOP_LANE_TEST(insn, I) \ + float16x4_t \ + MSTRCAT (test_##insn##_lane, _16x4) (float16x4_t a, float16x4_t b) \ + { \ + return MSTRCAT (insn, _lane_f16) (a, b, I); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn##_lane, _16x8) (float16x8_t a, float16x4_t b) \ + { \ + return MSTRCAT (insn, q_lane_f16) (a, b, I); \ + } + +#define BINOP_LANEQ_TEST(insn, I) \ + float16x4_t \ + MSTRCAT (test_##insn##_laneq, _16x4) (float16x4_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, _laneq_f16) (a, b, I); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn##_laneq, _16x8) (float16x8_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, q_laneq_f16) (a, b, I); \ + } \ + +#define BINOP_N_TEST(insn) \ + float16x4_t \ + MSTRCAT (test_##insn##_n, _16x4) (float16x4_t a, float16_t b) \ + { \ + return MSTRCAT (insn, _n_f16) (a, b); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn##_n, _16x8) (float16x8_t a, float16_t b) \ + { \ + return MSTRCAT (insn, q_n_f16) (a, b); \ + } + +#define TERNOP_TEST(insn) \ + float16_t \ + MSTRCAT (test_##insn, _16) (float16_t a, float16_t b, float16_t c) \ + { \ + return MSTRCAT (insn, h_f16) (a, b, c); \ + } \ + float16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b, \ + float16x4_t c) \ + { \ + return MSTRCAT (insn, _f16) (a, b, c); \ + } \ + float16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b, \ + float16x8_t c) \
+ { \ + return MSTRCAT (insn, q_f16) (a, b, c); \ + } + +#define VCMP1_TEST(insn) \ + uint16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a) \ + { \ + return MSTRCAT (insn, _f16) (a); \ + } \ + uint16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a) \ + { \ + return MSTRCAT (insn, q_f16) (a); \ + } + +#define VCMP2_TEST(insn) \ + uint16x4_t \ + MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b) \ + { \ + return MSTRCAT (insn, _f16) (a, b); \ + } \ + uint16x8_t \ + MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b) \ + { \ + return MSTRCAT (insn, q_f16) (a, b); \ + } + +#define VCVT_TEST(insn, TY, TO, FR) \ + MSTRCAT (TO, 16x4_t) \ + MSTRCAT (test_##insn, TY) (MSTRCAT (FR, 16x4_t) a) \ + { \ + return MSTRCAT (insn, TY) (a); \ + } \ + MSTRCAT (TO, 16x8_t) \ + MSTRCAT (test_##insn##_q, TY) (MSTRCAT (FR, 16x8_t) a) \ + { \ + return MSTRCAT (insn, q##TY) (a); \ + } + +#define VCVT_N_TEST(insn, TY, TO, FR) \ + MSTRCAT (TO, 16x4_t) \ + MSTRCAT (test_##insn##_n, TY) (MSTRCAT (FR, 16x4_t) a) \ + { \ + return MSTRCAT (insn, _n##TY) (a, 1); \ + } \ + MSTRCAT (TO, 16x8_t) \ + MSTRCAT (test_##insn##_n_q, TY) (MSTRCAT (FR, 16x8_t) a) \ + { \ + return MSTRCAT (insn, q_n##TY) (a, 1); \ + } + +VCMP1_TEST (vceqz) +/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcgtz) +/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcgez) +/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vcltz) +/* { dg-final { scan-assembler-times {vclt.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vclt.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCMP1_TEST (vclez) +/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */ +/* { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */ + +VCVT_TEST (vcvt, _f16_s16, float, int) +VCVT_N_TEST (vcvt, _f16_s16, float, int) +/* { dg-final { scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _f16_u16, float, uint) +VCVT_N_TEST (vcvt, _f16_u16, float, uint) +/* { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _s16_f16, int, float) +VCVT_N_TEST (vcvt, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvt, _u16_f16, uint, float) +VCVT_N_TEST (vcvt, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+} 2 } } + { dg-final {
scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+} 2 } } + { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+, #1} 1 } } + { dg-final { scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */ + +VCVT_TEST (vcvta, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvta\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvta\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvta, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvta\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvta\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtm, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtm\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtm\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtm, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtm\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtm\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtn, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtn\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtn\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtn, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtn\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtn\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtp, _s16_f16, int, float) +/* { dg-final { scan-assembler-times {vcvtp\.s16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtp\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +VCVT_TEST (vcvtp, _u16_f16, uint, float) +/* { dg-final { scan-assembler-times {vcvtp\.u16\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcvtp\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } } +*/ + +UNOP_TEST (vabs) +/* { dg-final { scan-assembler-times {vabs\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vabs\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vneg) +/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrecpe) +/* { dg-final { scan-assembler-times {vrecpe\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrecpe\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrnd) +/* { dg-final { scan-assembler-times {vrintz\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintz\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrnda) +/* { dg-final { scan-assembler-times {vrinta\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrinta\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndm) +/* { dg-final { scan-assembler-times {vrintm\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintm\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndn) +/* { dg-final { scan-assembler-times {vrintn\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintn\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndp) +/* { dg-final { scan-assembler-times {vrintp\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintp\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrndx) +/* { dg-final { scan-assembler-times {vrintx\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrintx\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +UNOP_TEST (vrsqrte) +/* { dg-final { scan-assembler-times {vrsqrte\.f16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { 
scan-assembler-times {vrsqrte\.f16\tq[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vadd) +/* { dg-final { scan-assembler-times {vadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vadd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vabd) +/* { dg-final { scan-assembler-times {vabd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vabd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcage) +/* { dg-final { scan-assembler-times {vacge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcagt) +/* { dg-final { scan-assembler-times {vacgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcale) +/* { dg-final { scan-assembler-times {vacle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vacle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcalt) +/* { dg-final { scan-assembler-times {vaclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vaclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vceq) +/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcge) +/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcgt) +/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vcle) +/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +VCMP2_TEST (vclt) +/* { dg-final { scan-assembler-times {vclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmax) +/* { dg-final { scan-assembler-times {vmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmax\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmin) +/* { dg-final { scan-assembler-times {vmin\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmin\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmaxnm) +/* { dg-final { scan-assembler-times {vmaxnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vmaxnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vminnm) +/* { dg-final { scan-assembler-times {vminnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vminnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vmul) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +BINOP_LANE_TEST (vmul, 2) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */ +BINOP_N_TEST (vmul) +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/ + +float16x4_t +test_vpadd_16x4 (float16x4_t a, 
float16x4_t b) +{ + return vpadd_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x4_t +test_vpmax_16x4 (float16x4_t a, float16x4_t b) +{ + return vpmax_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x4_t +test_vpmin_16x4 (float16x4_t a, float16x4_t b) +{ + return vpmin_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vpmin\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +BINOP_TEST (vsub) +/* { dg-final { scan-assembler-times {vsub\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vsub\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vrecps) +/* { dg-final { scan-assembler-times {vrecps\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrecps\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +BINOP_TEST (vrsqrts) +/* { dg-final { scan-assembler-times {vrsqrts\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrsqrts\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +TERNOP_TEST (vfma) +/* { dg-final { scan-assembler-times {vfma\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vfma\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +TERNOP_TEST (vfms) +/* { dg-final { scan-assembler-times {vfms\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vfms\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +float16x4_t +test_vmov_n_f16 (float16_t a) +{ + return vmov_n_f16 (a); +} + +float16x4_t +test_vdup_n_f16 (float16_t a) +{ + return vdup_n_f16 (a); +} +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */ + +float16x8_t +test_vmovq_n_f16 (float16_t a) +{ + return vmovq_n_f16 (a); +} + +float16x8_t +test_vdupq_n_f16 (float16_t a) +{ + return vdupq_n_f16 (a); +} +/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } } */ + +float16x4_t +test_vdup_lane_f16 (float16x4_t a) +{ + return vdup_lane_f16 (a, 1); +} +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */ + +float16x8_t +test_vdupq_lane_f16 (float16x4_t a) +{ + return vdupq_lane_f16 (a, 1); +} +/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */ + +float16x4_t +test_vext_f16 (float16x4_t a, float16x4_t b) +{ + return vext_f16 (a, b, 1); +} +/* { dg-final { scan-assembler-times {vext\.16\td[0-9]+, d[0-9]+, d[0-9]+, #1} 1 } } */ + +float16x8_t +test_vextq_f16 (float16x8_t a, float16x8_t b) +{ + return vextq_f16 (a, b, 1); +} +/* { dg-final { scan-assembler-times {vext\.16\tq[0-9]+, q[0-9]+, q[0-9]+, #1} 1 } } */ + +UNOP_TEST (vrev64) +/* { dg-final { scan-assembler-times {vrev64\.16\td[0-9]+, d[0-9]+} 1 } } + { dg-final { scan-assembler-times {vrev64\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4_t +test_vbsl16x4 (uint16x4_t a, float16x4_t b, float16x4_t c) +{ + return vbsl_f16 (a, b, c); +} +/* { dg-final { scan-assembler-times {vbsl\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */ + +float16x8_t +test_vbslq16x8 (uint16x8_t a, float16x8_t b, float16x8_t c) +{ + return vbslq_f16 (a, b, c); +} +/*{ dg-final { scan-assembler-times {vbsl\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t +test_vzip16x4 (float16x4_t a, float16x4_t b) +{ + return vzip_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vzip\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vzipq16x8 (float16x8_t a, float16x8_t b) +{ + return vzipq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vzip\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t 
+test_vuzp16x4 (float16x4_t a, float16x4_t b) +{ + return vuzp_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vuzp\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vuzpq16x8 (float16x8_t a, float16x8_t b) +{ + return vuzpq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vuzp\.16\tq[0-9]+, q[0-9]+} 1 } } */ + +float16x4x2_t +test_vtrn16x4 (float16x4_t a, float16x4_t b) +{ + return vtrn_f16 (a, b); +} +/* { dg-final { scan-assembler-times {vtrn\.16\td[0-9]+, d[0-9]+} 1 } } */ + +float16x8x2_t +test_vtrnq16x8 (float16x8_t a, float16x8_t b) +{ + return vtrnq_f16 (a, b); +} +/*{ dg-final { scan-assembler-times {vtrn\.16\tq[0-9]+, q[0-9]+} 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-scalar-1.c @@ -0,0 +1,203 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +/* Test instructions generated for the FP16 scalar intrinsics. */ +#include <arm_fp16.h> + +#define MSTRCAT(L, str) L##str + +#define UNOP_TEST(insn) \ + float16_t \ + MSTRCAT (test_##insn, 16) (float16_t a) \ + { \ + return MSTRCAT (insn, h_f16) (a); \ + } + +#define BINOP_TEST(insn) \ + float16_t \ + MSTRCAT (test_##insn, 16) (float16_t a, float16_t b) \ + { \ + return MSTRCAT (insn, h_f16) (a, b); \ + } + +#define TERNOP_TEST(insn) \ + float16_t \ + MSTRCAT (test_##insn, 16) (float16_t a, float16_t b, float16_t c) \ + { \ + return MSTRCAT (insn, h_f16) (a, b, c); \ + } + +float16_t +test_vcvth_f16_s32 (int32_t a) +{ + return vcvth_f16_s32 (a); +} + +float16_t +test_vcvth_n_f16_s32 (int32_t a) +{ + return vcvth_n_f16_s32 (a, 1); +} +/* { dg-final { scan-assembler-times {vcvt\.f16\.s32\ts[0-9]+, s[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vcvt\.f16\.s32\ts[0-9]+, s[0-9]+, #1} 1 } } */ + +float16_t +test_vcvth_f16_u32 (uint32_t a) +{ + return vcvth_f16_u32 (a); +} + +float16_t +test_vcvth_n_f16_u32 (uint32_t a) +{ + return vcvth_n_f16_u32 (a, 1); +} + +/* { dg-final { scan-assembler-times {vcvt\.f16\.u32\ts[0-9]+, s[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vcvt\.f16\.u32\ts[0-9]+, s[0-9]+, #1} 1 } } */ + +uint32_t +test_vcvth_u32_f16 (float16_t a) +{ + return vcvth_u32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvt\.u32\.f16\ts[0-9]+, s[0-9]+} 2 } } */ + +uint32_t +test_vcvth_n_u32_f16 (float16_t a) +{ + return vcvth_n_u32_f16 (a, 1); +} +/* { dg-final { scan-assembler-times {vcvt\.u32\.f16\ts[0-9]+, s[0-9]+, #1} 1 } } */ + +int32_t +test_vcvth_s32_f16 (float16_t a) +{ + return vcvth_s32_f16 (a); +} + +int32_t +test_vcvth_n_s32_f16 (float16_t a) +{ + return vcvth_n_s32_f16 (a, 1); +} + +/* { dg-final { scan-assembler-times {vcvt\.s32\.f16\ts[0-9]+, s[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vcvt\.s32\.f16\ts[0-9]+, s[0-9]+, #1} 1 } } */ + +int32_t +test_vcvtah_s32_f16 (float16_t a) +{ + return vcvtah_s32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvta\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +uint32_t +test_vcvtah_u32_f16 (float16_t a) +{ + return vcvtah_u32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvta\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +int32_t +test_vcvtmh_s32_f16 (float16_t a) +{ + return vcvtmh_s32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtm\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +uint32_t +test_vcvtmh_u32_f16 (float16_t a) +{ + return vcvtmh_u32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtm\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } } + */ + +int32_t +test_vcvtnh_s32_f16 (float16_t a) +{ +
return vcvtnh_s32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtn\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } + */ + +uint32_t +test_vcvtnh_u32_f16 (float16_t a) +{ + return vcvtnh_u32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtn\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } } + */ + +int32_t +test_vcvtph_s32_f16 (float16_t a) +{ + return vcvtph_s32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtp\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } + */ + +uint32_t +test_vcvtph_u32_f16 (float16_t a) +{ + return vcvtph_u32_f16 (a); +} +/* { dg-final { scan-assembler-times {vcvtp\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } } + */ + +UNOP_TEST (vabs) +/* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vneg) +/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrnd) +/* { dg-final { scan-assembler-times {vrintz\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrndi) +/* { dg-final { scan-assembler-times {vrintr\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrnda) +/* { dg-final { scan-assembler-times {vrinta\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrndm) +/* { dg-final { scan-assembler-times {vrintm\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrndn) +/* { dg-final { scan-assembler-times {vrintn\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrndp) +/* { dg-final { scan-assembler-times {vrintp\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vrndx) +/* { dg-final { scan-assembler-times {vrintx\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +UNOP_TEST (vsqrt) +/* { dg-final { scan-assembler-times {vsqrt\.f16\ts[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vadd) +/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vdiv) +/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vmaxnm) +/* { dg-final { scan-assembler-times {vmaxnm\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vminnm) +/* { dg-final { scan-assembler-times {vminnm\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vmul) +/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +BINOP_TEST (vsub) +/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +TERNOP_TEST (vfma) +/* { dg-final { scan-assembler-times {vfma\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +TERNOP_TEST (vfms) +/* { dg-final { scan-assembler-times {vfms\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-scalar-2.c @@ -0,0 +1,71 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-options "-O2 -std=c11" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +/* Test compiler use of FP16 instructions.
*/ +#include <arm_fp16.h> + +float16_t +test_mov_imm_1 (float16_t a) +{ + return 1.0; +} + +float16_t +test_mov_imm_2 (float16_t a) +{ + float16_t b = 1.0; + return b; +} + +float16_t +test_vmov_imm_3 (float16_t a) +{ + float16_t b = 1.0; + return vaddh_f16 (a, b); +} + +float16_t +test_vmov_imm_4 (float16_t a) +{ + return vaddh_f16 (a, 1.0); +} + +/* { dg-final { scan-assembler-times {vmov.f16\ts[0-9]+, #1\.0e\+0} 4 } } + { dg-final { scan-assembler-times {vadd.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 2 } } */ + +float16_t +test_vmla_1 (float16_t a, float16_t b, float16_t c) +{ + return vaddh_f16 (vmulh_f16 (a, b), c); +} +/* { dg-final { scan-assembler-times {vmla\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +float16_t +test_vmla_2 (float16_t a, float16_t b, float16_t c) +{ + return vsubh_f16 (vmulh_f16 (vnegh_f16 (a), b), c); +} +/* { dg-final { scan-assembler-times {vnmla\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +float16_t +test_vmls_1 (float16_t a, float16_t b, float16_t c) +{ + return vsubh_f16 (c, vmulh_f16 (a, b)); +} + +float16_t +test_vmls_2 (float16_t a, float16_t b, float16_t c) +{ + return vsubh_f16 (a, vmulh_f16 (b, c)); +} +/* { dg-final { scan-assembler-times {vmls\.f16} 2 } } */ + +float16_t +test_vnmls_1 (float16_t a, float16_t b, float16_t c) +{ + return vsubh_f16 (vmulh_f16 (a, b), c); +} +/* { dg-final { scan-assembler-times {vnmls\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2 -fno-ipa-icf" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-comp-swap-release-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex" 4 } } */ +/* { dg-final { scan-assembler-times "stlex" 4 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2 -fno-ipa-icf" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-comp-swap-release-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex" 4 } } */ +/* { dg-final { scan-assembler-times "stlex" 4 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2 -fno-ipa-icf" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-comp-swap-release-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex" 4 } } */ +/* { dg-final { scan-assembler-times "stlex" 4 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2 -fno-ipa-icf" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-comp-swap-release-acquire.x" - -/* { dg-final { scan-assembler-times "ldaex" 4 } } */ -/* { dg-final { scan-assembler-times "stlex" 4 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* {
dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-acq_rel.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-acq_rel.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-acq_rel.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-acq_rel.x" - -/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-acquire.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do 
compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-acquire.x" - -/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-char.x" + +/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-char.x" + +/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-char.x" + +/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-char.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-char.x" - -/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-consume.x" + +/* Scan for ldaex is a PR59448 consume workaround. */ +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-consume.x" + +/* Scan for ldaex is a PR59448 consume workaround. 
*/ +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-consume.x" + +/* Scan for ldaex is a PR59448 consume workaround. */ +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-consume.c +++ b/src//dev/null @@ -1,11 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-consume.x" - -/* Scan for ldaex is a PR59448 consume workaround. */ -/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-int.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-int.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-int.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-int.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-int.x" - -/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { 
dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-relaxed.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-relaxed.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-relaxed.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-relaxed.x" - -/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-release.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-release.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-release.x" + +/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- 
a/src/gcc/testsuite/gcc.target/arm/atomic-op-release.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-release.x" - -/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-seq_cst.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-seq_cst.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-seq_cst.x" + +/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-seq_cst.x" - -/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "../aarch64/atomic-op-short.x" + +/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_main } */ + +#include "../aarch64/atomic-op-short.x" + +/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } 
*/ +/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v8m_base } */ + +#include "../aarch64/atomic-op-short.x" + +/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ +/* { dg-final { scan-assembler-not "dmb" } } */ --- a/src/gcc/testsuite/gcc.target/arm/atomic-op-short.c +++ b/src//dev/null @@ -1,10 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_arch_v8a_ok } */ -/* { dg-options "-O2" } */ -/* { dg-add-options arm_arch_v8a } */ - -#include "../aarch64/atomic-op-short.x" - -/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */ -/* { dg-final { scan-assembler-not "dmb" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/attr-fp16-arith-1.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +/* Reset fpu to a value compatible with the next pragmas. */ +#pragma GCC target ("fpu=vfp") + +#pragma GCC push_options +#pragma GCC target ("fpu=fp-armv8") + +#ifndef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +#error __ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined. +#endif + +#pragma GCC push_options +#pragma GCC target ("fpu=neon-fp-armv8") + +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#error __ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined. +#endif + +#ifndef __ARM_NEON +#error __ARM_NEON not defined. +#endif + +#if !defined (__ARM_FP) || !(__ARM_FP & 0x2) +#error Invalid value for __ARM_FP +#endif + +#include "arm_neon.h" + +float16_t +foo (float16x4_t b) +{ + float16x4_t a = {2.0, 3.0, 4.0, 5.0}; + float16x4_t res = vadd_f16 (a, b); + + return res[0]; +} + +/* { dg-final { scan-assembler "vadd\\.f16\td\[0-9\]+, d\[0-9\]+" } } */ + +#pragma GCC pop_options + +/* Check that the FP version is correctly reset to mfpu=fp-armv8. */ + +#if !defined (__ARM_FP) || !(__ARM_FP & 0x2) +#error __ARM_FP should record FP16 support. +#endif + +#pragma GCC pop_options + +/* Check that the FP version is correctly reset to mfpu=vfp. */ + +#if !defined (__ARM_FP) || (__ARM_FP & 0x2) +#error Unexpected value for __ARM_FP. 
+#endif --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddl.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +long overflow_add (long x, long y) +{ + long r; + + int ovr = __builtin_saddl_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "adds" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddll.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +long long overflow_add (long long x, long long y) +{ + long long r; + + int ovr = __builtin_saddll_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "adds" } } */ +/* { dg-final { scan-assembler "adcs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubl.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +long overflow_sub (long x, long y) +{ + long r; + + int ovr = __builtin_ssubl_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "subs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubll.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +long long overflow_sub (long long x, long long y) +{ + long long r; + + int ovr = __builtin_ssubll_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "subs" } } */ +/* { dg-final { scan-assembler "sbcs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddl.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +unsigned long overflow_add (unsigned long x, unsigned long y) +{ + unsigned long r; + + int ovr = __builtin_uaddl_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "adds" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddll.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +unsigned long long overflow_add (unsigned long long x, unsigned long long y) +{ + unsigned long long r; + + int ovr = __builtin_uaddll_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "adds" } } */ +/* { dg-final { scan-assembler "adcs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubl.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +unsigned long overflow_sub (unsigned long x, unsigned long y) +{ + unsigned long r; + + int ovr = __builtin_usubl_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "subs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubll.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm32 } */ +extern void overflow_handler (); + +unsigned long long overflow_sub (unsigned long long x, unsigned long long y) +{ 
+ unsigned long long r; + + int ovr = __builtin_usubll_overflow (x, y, &r); + if (ovr) + overflow_handler (); + + return r; +} + +/* { dg-final { scan-assembler "subs" } } */ +/* { dg-final { scan-assembler "sbcs" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cbz.c @@ -0,0 +1,12 @@ +/* { dg-do compile {target { arm_thumb2 || arm_thumb1_cbz_ok } } } */ +/* { dg-options "-O2" } */ + +int +foo (int a, int *b) +{ + if (a) + *b = 1; + return 0; +} + +/* { dg-final { scan-assembler-times "cbz\\tr\\d" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b:5; + unsigned int c:11, :0, d:8; + struct { unsigned int ee:2; } e; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +extern void foo (test_st st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #3" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #2047" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + 
unsigned int b : 3; + unsigned int c : 14; + unsigned int d : 1; + struct { + unsigned int ee : 2; + unsigned short ff : 15; + } e; + unsigned char g : 1; + unsigned char : 4; + unsigned char h : 3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 1023" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #3" } } */ +/* { dg-final { scan-assembler "movt\tr4, 32767" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #2047" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler 
"mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #1" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + char a:3; +} test_st3; + +typedef struct +{ + char a:3; +} test_st2; + +typedef struct +{ + test_st2 st2; + test_st3 st3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #1799" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c @@ -0,0 +1,96 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned short a :11; +} test_st_4; + +typedef union +{ + char a; + test_st_4 st4; +}test_un_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef struct +{ + unsigned char a :3; + unsigned int b :13; + test_un_2 un2; +} test_st_2; + +typedef union +{ + test_st_2 st2; + test_st_3 st3; +}test_un_1; + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned char c :4; + test_un_1 un1; +} test_st_1; + +typedef union +{ + test_st_1 st1; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st_1; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1); + +int +main (void) +{ + read_st_1 r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st1); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #7939" } } */ +/* { dg-final { scan-assembler "movt\tr4, 15" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 2047" } } */ +/* 
{ dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #1" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr3, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (int); + +int +foo (int a) +{ + return bar (bar (a + 1)); +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "movs\tr0, r4" } } */ +/* { dg-final { scan-assembler "\n\tmovs\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "\n\tmovs\tr2, r4\n\tmovs\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "movs\tr1, r0" } } */ +/* { dg-final { scan-assembler "movs\tr2, r0" } } */ +/* { dg-final { scan-assembler "movs\tr3, r0" } } */ +/* { dg-final { scan-assembler "mov\tip, r0" } } */ +/* { dg-final { scan-assembler "mov\tlr, r0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq," } } */ +/* { dg-final { scan-assembler "bxns" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Remember dont clear r0 and r1, because we are passing the double parameter + * for bar in them. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse -mfloat-abi=softfp" } */ + +double __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +double +foo (double a) +{ + return bar (1.0f, 2.0) + a; +} + +float __attribute__ ((cmse_nonsecure_entry)) +baz (float a, double b) +{ + return (float) bar (a, b); +} + +/* Make sure we are not using FP instructions, since ARMv8-M Baseline does not + support such instructions. */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ +/* { dg-final { scan-assembler-not "vmrs" } } */ + +/* Just double checking that we are still doing cmse though. 
*/ +/* { dg-final { scan-assembler-not "vmrs" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c @@ -0,0 +1,71 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8063" } } */ +/* { dg-final { scan-assembler "movt\tr4, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #511" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c @@ -0,0 +1,86 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; + test_st_3 st_3; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #511" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { 
dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c @@ -0,0 +1,39 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + unsigned short a : 6; + unsigned char b : 3; + unsigned char c; + unsigned short d : 8; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = 63u; + t.b = 7u; + t.c = 255u; + t.d = 255u; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != 63u + || t.b != 7u + || t.c != 255u + || t.d != 255u) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #1855" } } */ +/* { dg-final { scan-assembler "movt\tr1, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c @@ -0,0 +1,36 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + short a : 7; + signed char b : 3; + short c : 11; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = -64; + t.b = -4 ; + t.c = -1024; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != -64 + || t.b != -4 + || t.c != -1024) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #1919" } } */ +/* { dg-final { scan-assembler "movt\tr1, 2047" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + short a; + signed char b : 2; + short : 1; + signed char c : 3; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = -32768; + t.b = -2; + t.c = -4; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != -32768 + || t.b != -2 + || t.c != -4) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr1, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c @@ -0,0 +1,106 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mcmse -fdump-rtl-expand" } */ + +#include + +extern int a; +extern int bar (void); + +int foo (char * p) +{ + cmse_address_info_t cait; + + cait = cmse_TT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTA (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTA_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTAT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTAT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), 0); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + 
CMSE_MPU_UNPRIV); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_MPU_READWRITE); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_MPU_UNPRIV | CMSE_MPU_READ); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_AU_NONSECURE + | CMSE_MPU_NONSECURE); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_NONSECURE | CMSE_MPU_UNPRIV); + + p = (char *) cmse_check_pointed_object (p, CMSE_NONSECURE | CMSE_MPU_UNPRIV); + + return a; +} +/* { dg-final { scan-assembler-times "\ttt " 2 } } */ +/* { dg-final { scan-assembler-times "ttt " 2 } } */ +/* { dg-final { scan-assembler-times "tta " 2 } } */ +/* { dg-final { scan-assembler-times "ttat " 2 } } */ +/* { dg-final { scan-assembler-times "bl.cmse_check_address_range" 7 } } */ +/* { dg-final { scan-assembler-not "cmse_check_pointed_object" } } */ + +int __attribute__ ((cmse_nonsecure_entry)) +baz (void) +{ + return cmse_nonsecure_caller (); +} + +typedef int __attribute__ ((cmse_nonsecure_call)) (int_nsfunc_t) (void); + +int default_callback (void) +{ + return 0; +} + +int_nsfunc_t * fp = (int_nsfunc_t *) default_callback; + +void __attribute__ ((cmse_nonsecure_entry)) +qux (int_nsfunc_t * callback) +{ + fp = cmse_nsfptr_create (callback); +} + +int call_callback (void) +{ + if (cmse_is_nsfptr (fp)) + return fp (); + else + return default_callback (); +} +/* { dg-final { scan-assembler "baz:" } } */ +/* { dg-final { scan-assembler "__acle_se_baz:" } } */ +/* { dg-final { scan-assembler "qux:" } } */ +/* { dg-final { scan-assembler "__acle_se_qux:" } } */ +/* { dg-final { scan-assembler-not "\tcmse_nonsecure_caller" } } */ +/* { dg-final { scan-rtl-dump "and.*reg.*const_int 1" expand } } */ +/* { dg-final { scan-assembler "bic" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq" } } */ +/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +void +foo (void) {} + +/* { dg-final { scan-assembler-not "bxns" } } */ +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ +#include <arm_cmse.h> + +char * +foo (char * p) +{ + if (!cmse_is_nsfptr (p)) + return cmse_nsfptr_create (p); +} + +/* Checks for saving and clearing prior to function call. 
*/ +/* { dg-final { scan-assembler-not "cmse_is_nsfptr" } } */ +/* { dg-final { scan-assembler-not "cmse_nsfptr_create" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int foo (void) +{ + return bar (); +} + +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ +/* { dg-final { scan-assembler-not "b\[^ y\n\]*\\s+bar" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*ns_foo) (void); +int (*s_bar) (void); +int __attribute__ ((cmse_nonsecure_call)) (**ns_foo2) (void); +int (**s_bar2) (void); + +typedef int __attribute__ ((cmse_nonsecure_call)) ns_foo_t (void); +typedef int s_bar_t (void); +typedef int __attribute__ ((cmse_nonsecure_call)) (* ns_foo_ptr) (void); +typedef int (*s_bar_ptr) (void); + +int nonsecure0 (ns_foo_t * ns_foo_p) +{ + return ns_foo_p (); +} + +int nonsecure1 (ns_foo_t ** ns_foo_p) +{ + return (*ns_foo_p) (); +} + +int nonsecure2 (ns_foo_ptr ns_foo_p) +{ + return ns_foo_p (); +} +int nonsecure3 (ns_foo_ptr * ns_foo_p) +{ + return (*ns_foo_p) (); +} + +int secure0 (s_bar_t * s_bar_p) +{ + return s_bar_p (); +} + +int secure1 (s_bar_t ** s_bar_p) +{ + return (*s_bar_p) (); +} + +int secure2 (s_bar_ptr s_bar_p) +{ + return s_bar_p (); +} + +int secure3 (s_bar_ptr * s_bar_p) +{ + return (*s_bar_p) (); +} + +int nonsecure4 (void) +{ + return ns_foo (); +} + +int nonsecure5 (void) +{ + return (*ns_foo2) (); +} + +int secure4 (void) +{ + return s_bar (); +} + +int secure5 (void) +{ + return (*s_bar2) (); +} +/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 6 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +struct span { + int a, b; +}; +struct span2 { + float a, b, c, d; +}; + +union test_union +{ + long long a; + int b; + struct span2 c; +} test_union; + +void __attribute__ ((cmse_nonsecure_entry)) +foo (long long a, int b, long long c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +void __attribute__ ((cmse_nonsecure_entry)) +bar (long long a, int b, struct span c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +void __attribute__ ((cmse_nonsecure_entry)) +baz (int a, ...) 
{} /* { dg-error "not available to functions with variable number of arguments" } */ + +struct span __attribute__ ((cmse_nonsecure_entry)) +qux (void) { /* { dg-error "not available to functions that return value on the stack" } */ + struct span ret = {0, 0}; + return ret; +} + +void __attribute__ ((cmse_nonsecure_entry)) +norf (struct span2 a) {} + +void __attribute__ ((cmse_nonsecure_entry)) +foo2 (long long a, int b, union test_union c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) bar2 (long long a, int b, long long c); /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) baz2 (long long a, int b, struct span c); /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef struct span __attribute__ ((cmse_nonsecure_call)) qux2 (void); /* { dg-error "not available to functions that return value on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) norf2 (int a, ...); /* { dg-error "not available to functions with variable number of arguments" } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +struct span { + int a, b; +}; + +extern int qux (void); + +void __attribute__ ((cmse_nonsecure_entry)) +foo (void) {} + +static void __attribute__ ((cmse_nonsecure_entry)) +bar (void) {} /* { dg-warning "has no effect on functions with static linkage" } */ + +int __attribute__ ((cmse_nonsecure_entry)) +baz (void) +{ + return qux (); +} + +void __attribute__ ((cmse_nonsecure_call)) +quux (void) {} /* { dg-warning "attribute only applies to base type of a function pointer" } */ + +int __attribute__ ((cmse_nonsecure_call)) norf; /* { dg-warning "attribute only applies to base type of a function pointer" } */ + +/* { dg-final { scan-assembler-times "bxns" 2 } } */ +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_bar:" } } */ +/* { dg-final { scan-assembler "baz:" } } */ +/* { dg-final { scan-assembler "__acle_se_baz:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_quux:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_norf:" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-skip-if "Testing exclusion of -mcmse" { arm-*-* } { "-mcmse" } { "" } } */ + + +void __attribute__ ((cmse_nonsecure_call)) (*bar) (int); /* { dg-warning "attribute ignored without -mcmse option" } */ +typedef void __attribute__ ((cmse_nonsecure_call)) baz (int); /* { dg-warning "attribute ignored without -mcmse option" } */ + +int __attribute__ ((cmse_nonsecure_entry)) +foo (int a, baz b) +{ /* { dg-warning "attribute ignored without -mcmse option" } */ + bar (a); + b (a); + return a + 1; +} + +/* { dg-final { scan-assembler-not "bxns" } } */ +/* { dg-final { scan-assembler-not "blxns" } } */ +/* { dg-final { scan-assembler-not "bl\t__gnu_cmse_nonsecure_call" } } */ +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse.exp @@ -0,0 +1,72 @@ +# Copyright (C) 1997-2016 Free Software Foundation, Inc. 
+ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +# GCC testsuite for ARMv8-M Security Extensions using the `dg.exp' driver. + +# Load support procs. +load_lib gcc-dg.exp + +# Exit immediately if the target does not support -mcmse. +if ![check_effective_target_arm_cmse_ok] then { + return +} + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +set saved-dg-do-what-default ${dg-do-what-default} +set dg-do-what-default "assemble" + +set saved-lto_torture_options ${LTO_TORTURE_OPTIONS} +set LTO_TORTURE_OPTIONS "" + +# These are for both baseline and mainline. +gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] \ + "" $DEFAULT_CFLAGS + +if {[check_effective_target_arm_arch_v8m_base_ok]} then { + # Baseline only + gcc-dg-runtest [lsort [glob $srcdir/$subdir/baseline/*.c]] \ + "" $DEFAULT_CFLAGS +} + +if {[check_effective_target_arm_arch_v8m_main_ok]} then { + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/*.c]] \ + "" $DEFAULT_CFLAGS + # Mainline -mfloat-abi=soft + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/soft/*.c]] \ + "-mfloat-abi=soft" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp-sp/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard-sp/*.c]] \ + "" $DEFAULT_CFLAGS +} + +set LTO_TORTURE_OPTIONS ${saved-lto_torture_options} +set dg-do-what-default ${saved-dg-do-what-default} + +# All done. 
+dg-finish --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b:5; + unsigned int c:11, :0, d:8; + struct { unsigned int ee:2; } e; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +extern void foo (test_st st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #3" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b : 3; + unsigned int c : 14; + unsigned int d : 1; + struct { + unsigned int ee : 2; + unsigned short ff : 15; + } e; + unsigned char g : 1; + unsigned char : 4; + unsigned char h : 3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { 
dg-final { scan-assembler "movt\tip, 1023" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #3" } } */ +/* { dg-final { scan-assembler "movt\tip, 32767" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #1" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + char a:3; +} test_st3; + +typedef struct +{ + char a:3; +} test_st2; + +typedef struct +{ 
+ test_st2 st2; + test_st3 st3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #1799" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned short a :11; +} test_st_4; + +typedef union +{ + char a; + test_st_4 st4; +}test_un_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef struct +{ + unsigned char a :3; + unsigned int b :13; + test_un_2 un2; +} test_st_2; + +typedef union +{ + test_st_2 st2; + test_st_3 st3; +}test_un_1; + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned char c :4; + test_un_1 un1; +} test_st_1; + +typedef union +{ + test_st_1 st1; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st_1; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1); + +int +main (void) +{ + read_st_1 r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st1); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #7939" } } */ +/* { dg-final { scan-assembler "movt\tip, 15" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #1" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr3, r3, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int 
+foo (int a) +{ + return bar (3.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! 
arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. 
*/ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (3.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. 
*/ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. 
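   Here the double argument is passed in d0 under -mfloat-abi=hard, so d0
   must keep its value while the unused FP argument registers d1-d7 are
   loaded with a constant from the literal pool; that is what the
   vldr ... .L patterns below verify.  Shape of the call under test
   (illustrative sketch):

     int __attribute__ ((cmse_nonsecure_call)) (*ns_fn) (double);

     int
     pass_one_double (int a)
     {
       return ns_fn (2.0) + a;   // d0 stays live, d1-d7 are scrubbed
     }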
*/ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} + +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "bxns" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. 
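   With -mfloat-abi=soft the double argument travels in core registers,
   low word in r0 and high word in r1 on little-endian targets, so those
   two registers hold live data and must not be scrubbed; only r2 and r3
   are expected to be cleared from r4, and no VFP instructions may appear
   at all (hence the vmov/vmsr scan-assembler-nots).  Sketch of the call
   under test (illustrative):

     int __attribute__ ((cmse_nonsecure_call)) (*ns_fn) (double);

     int
     pass_double_soft (int a)
     {
       return ns_fn (2.0) + a;   // r0/r1 carry 2.0, r2/r3 get cleared
     }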
*/ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! 
arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "\n\tmov\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "\n\tmov\tr2, r4\n\tmov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! 
arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8063" } } */ +/* { dg-final { scan-assembler "movt\tip, 63" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #511" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c @@ -0,0 +1,84 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; + test_st_3 st_3; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 63" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #511" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/cmse/struct-1.c @@ -0,0 
+1,33 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + unsigned char a; + unsigned short b; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = 255u; + t.b = 32767u; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != 255u || t.b != 32767u) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movs\tr1, #255" } } */ +/* { dg-final { scan-assembler "movt\tr1, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + + --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/data-rel-1.c @@ -0,0 +1,12 @@ +/* { dg-options "-fPIC -mno-pic-data-is-text-relative" } */ +/* { dg-final { scan-assembler-not "j-\\(.LPIC" } } */ +/* { dg-final { scan-assembler-not "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */ +/* { dg-final { scan-assembler "j\\(GOT\\)" } } */ +/* { dg-final { scan-assembler "(ldr|mov)\tr\[0-9\]+, \\\[?r9" } } */ + +static int j; + +int *Foo () +{ + return &j; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/data-rel-2.c @@ -0,0 +1,11 @@ +/* { dg-options "-fPIC -mno-pic-data-is-text-relative -mno-single-pic-base" } */ +/* { dg-final { scan-assembler-not "j-\\(.LPIC" } } */ +/* { dg-final { scan-assembler "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */ +/* { dg-final { scan-assembler "j\\(GOT\\)" } } */ + +static int j; + +int *Foo () +{ + return &j; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/data-rel-3.c @@ -0,0 +1,11 @@ +/* { dg-options "-fPIC -mpic-data-is-text-relative" } */ +/* { dg-final { scan-assembler "j-\\(.LPIC" } } */ +/* { dg-final { scan-assembler-not "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */ +/* { dg-final { scan-assembler-not "j\\(GOT\\)" } } */ + +static int j; + +int *Foo () +{ + return &j; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_fp16_ieee } */ + +/* Test __fp16 arguments and return value in registers (hard-float). */ + +void +swap (__fp16, __fp16); + +__fp16 +F (__fp16 a, __fp16 b, __fp16 c) +{ + swap (b, a); + return c; +} + +/* { dg-final { scan-assembler {vmov(\.f16)?\tr[0-9]+, s[0-9]+} } } */ +/* { dg-final { scan-assembler {vmov(\.f32)?\ts1, s0} } } */ +/* { dg-final { scan-assembler {vmov(\.f16)?\ts0, r[0-9]+} } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_ok } */ +/* { dg-options "-mfloat-abi=softfp -O2" } */ +/* { dg-add-options arm_fp16_ieee } */ +/* { dg-skip-if "incompatible float-abi" { arm*-*-* } { "-mfloat-abi=hard" } } */ + +/* Test __fp16 arguments and return value in registers (softfp). 
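   With softfp the half-precision values stay in core registers, which is
   why this test counts plain mov instructions, whereas the hard-float
   counterpart (fp16-aapcs-1.c above) expects vmov transfers because each
   __fp16 argument occupies a VFP s register.  A function with several
   __fp16 parameters makes the assignment visible (illustrative sketch):

     __fp16
     pick_third (__fp16 a, __fp16 b, __fp16 c)
     {
       return c;   // hard: a/b/c arrive in s0-s2; softfp: in r0-r2
     }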
*/ + +void +swap (__fp16, __fp16); + +__fp16 +F (__fp16 a, __fp16 b, __fp16 c) +{ + swap (b, a); + return c; +} + +/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */ +/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */ +/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-3.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_hard_vfp_ok } */ +/* { dg-require-effective-target arm_fp16_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_fp16_alternative } */ + +/* Test __fp16 arguments and return value in registers (hard-float). */ + +void +swap (__fp16, __fp16); + +__fp16 +F (__fp16 a, __fp16 b, __fp16 c) +{ + swap (b, a); + return c; +} + +/* { dg-final { scan-assembler-times {vmov\tr[0-9]+, s[0-2]} 2 } } */ +/* { dg-final { scan-assembler-times {vmov.f32\ts1, s0} 1 } } */ +/* { dg-final { scan-assembler-times {vmov\ts0, r[0-9]+} 2 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_ok } */ +/* { dg-options "-mfloat-abi=softfp -O2" } */ +/* { dg-add-options arm_fp16_alternative } */ +/* { dg-skip-if "incompatible float-abi" { arm*-*-* } { "-mfloat-abi=hard" } } */ + +/* Test __fp16 arguments and return value in registers (softfp). */ + +void +swap (__fp16, __fp16); + +__fp16 +F (__fp16 a, __fp16 b, __fp16 c) +{ + swap (b, a); + return c; +} + +/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */ +/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */ +/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ __fp16 xx = 0.0; --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-10.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-10.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative -pedantic -std=gnu99" } */ #include --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-11.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-11.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative -pedantic -std=gnu99" } */ #include --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-12.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-12.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ float xx __attribute__((mode(HF))) = 0.0; --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-2.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-2.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-3.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-3.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken 
from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-4.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-4.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-5.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-5.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-6.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-6.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* This number is the maximum value representable in the alternative --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-7.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-7.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative -pedantic" } */ /* This number overflows the range of the alternative encoding. Since this --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-8.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-8.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-9.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-9.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */ --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-1.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-1.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_none_ok } */ /* { dg-options "-mfp16-format=none" } */ /* __fp16 type name is not recognized unless you explicitly enable it --- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-2.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-2.c @@ -1,4 +1,5 @@ /* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16_none_ok } */ /* { dg-options "-mfp16-format=none" } */ /* mode(HF) attributes are not recognized unless you explicitly enable --- a/src/gcc/testsuite/gcc.target/arm/fp16-param-1.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-param-1.c @@ -1,10 +1,14 @@ /* { dg-do compile } */ /* { dg-options "-mfp16-format=ieee" } */ -/* Functions cannot have parameters of type __fp16. */ -extern void f (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */ -extern void (*pf) (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */ +/* Test that the ACLE macro is defined. */ +#if __ARM_FP16_ARGS != 1 +#error Unexpected value for __ARM_FP16_ARGS +#endif + +/* Test that __fp16 is supported as a parameter type. */ +extern void f (__fp16); +extern void (*pf) (__fp16); -/* These should be OK. 
*/ extern void g (__fp16 *); extern void (*pg) (__fp16 *); --- a/src/gcc/testsuite/gcc.target/arm/fp16-return-1.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-return-1.c @@ -1,10 +1,9 @@ /* { dg-do compile } */ /* { dg-options "-mfp16-format=ieee" } */ -/* Functions cannot return type __fp16. */ -extern __fp16 f (void); /* { dg-error "cannot return __fp16" } */ -extern __fp16 (*pf) (void); /* { dg-error "cannot return __fp16" } */ +/* Test that __fp16 is supported as a return type. */ +extern __fp16 f (void); +extern __fp16 (*pf) (void); -/* These should be OK. */ extern __fp16 *g (void); extern __fp16 *(*pg) (void); --- a/src/gcc/testsuite/gcc.target/arm/fp16-rounding-alt-1.c +++ b/src/gcc/testsuite/gcc.target/arm/fp16-rounding-alt-1.c @@ -3,6 +3,7 @@ from double to __fp16. */ /* { dg-do run } */ +/* { dg-require-effective-target arm_fp16_alternative_ok } */ /* { dg-options "-mfp16-format=alternative" } */ #include --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/movdi_movw.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */ +/* { dg-options "-O2" } */ + +long long +movdi (int a) +{ + return 0xF0F0; +} + +/* Accept r1 because big endian targets put the low bits in the highest + numbered register of a pair. */ +/* { dg-final { scan-assembler-times "movw\tr\[01\], #61680" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/movhi_movw.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */ +/* { dg-options "-O2" } */ + +short +movsi (void) +{ + return (short) 0x7070; +} + +/* { dg-final { scan-assembler-times "movw\tr0, #28784" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/movsi_movw.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */ +/* { dg-options "-O2" } */ + +int +movsi (void) +{ + return 0xF0F0; +} + +/* { dg-final { scan-assembler-times "movw\tr0, #61680" 1 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws16.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O3" } */ +/* { dg-add-options arm_neon } */ + + + +int +t6 (int len, void * dummy, short * __restrict x) +{ + len = len & ~31; + int result = 0; + __asm volatile (""); + for (int i = 0; i < len; i++) + result += x[i]; + return result; +} + +/* { dg-final { scan-assembler "vaddw\.s16" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws32.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O3" } */ +/* { dg-add-options arm_neon } */ + + +int +t6 (int len, void * dummy, int * __restrict x) +{ + len = len & ~31; + long long result = 0; + __asm volatile (""); + for (int i = 0; i < len; i++) + result += x[i]; + return result; +} + +/* { dg-final { scan-assembler "vaddw\.s32" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O3" } */ +/* { dg-add-options arm_neon } */ + + +int +t6 (int len, void * dummy, unsigned short * __restrict x) +{ + len = len & ~31; + unsigned int result = 0; + __asm volatile (""); + for (int i = 0; i < len; i++) + result += x[i]; + return result; +} + +/* { dg-final { scan-assembler "vaddw.u16" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ 
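/* The neon-vaddw* tests in this group share one pattern: a reduction
   loop whose accumulator is one step wider than the element type, which
   the vectorizer is expected to turn into a single widening accumulate
   per vector.  The equivalent intrinsic form (illustrative sketch, not
   part of the patch):

     #include <arm_neon.h>

     uint32x4_t
     widen_accumulate (uint32x4_t sum, uint16x4_t v)
     {
       return vaddw_u16 (sum, v);   // a single vaddw.u16
     }
*/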
+/* { dg-options "-O3" } */ +/* { dg-add-options arm_neon } */ + + +int +t6 (int len, void * dummy, unsigned int * __restrict x) +{ + len = len & ~31; + unsigned long long result = 0; + __asm volatile (""); + for (int i = 0; i < len; i++) + result += x[i]; + return result; +} + +/* { dg-final { scan-assembler "vaddw\.u32" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O3" } */ +/* { dg-add-options arm_neon } */ + + + +int +t6 (int len, void * dummy, char * __restrict x) +{ + len = len & ~31; + unsigned short result = 0; + __asm volatile (""); + for (int i = 0; i < len; i++) + result += x[i]; + return result; +} + +/* { dg-final { scan-assembler "vaddw\.u8" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/neon.exp +++ b/src//dev/null @@ -1,35 +0,0 @@ -# Copyright (C) 1997-2016 Free Software Foundation, Inc. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GCC; see the file COPYING3. If not see -# . - -# GCC testsuite that uses the `dg.exp' driver. - -# Exit immediately if this isn't an ARM target. -if ![istarget arm*-*-*] then { - return -} - -# Load support procs. -load_lib gcc-dg.exp - -# Initialize `dg'. -dg-init - -# Main loop. -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cCS\]]] \ - "" "" - -# All done. -dg-finish --- a/src/gcc/testsuite/gcc.target/arm/neon/polytypes.c +++ b/src//dev/null @@ -1,48 +0,0 @@ -/* Check that NEON polynomial vector types are suitably incompatible with - integer vector types of the same layout. 
*/ - -/* { dg-do compile } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-add-options arm_neon } */ - -#include <arm_neon.h> - -void s64_8 (int8x8_t a) {} -void u64_8 (uint8x8_t a) {} -void p64_8 (poly8x8_t a) {} -void s64_16 (int16x4_t a) {} -void u64_16 (uint16x4_t a) {} -void p64_16 (poly16x4_t a) {} - -void s128_8 (int8x16_t a) {} -void u128_8 (uint8x16_t a) {} -void p128_8 (poly8x16_t a) {} -void s128_16 (int16x8_t a) {} -void u128_16 (uint16x8_t a) {} -void p128_16 (poly16x8_t a) {} - -void foo () -{ - poly8x8_t v64_8; - poly16x4_t v64_16; - poly8x16_t v128_8; - poly16x8_t v128_16; - - s64_8 (v64_8); /* { dg-message "use -flax-vector-conversions" } */ - /* { dg-error "incompatible type for argument 1 of 's64_8'" "" { target *-*-* } 31 } */ - u64_8 (v64_8); /* { dg-error "incompatible type for argument 1 of 'u64_8'" } */ - p64_8 (v64_8); - - s64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 's64_16'" } */ - u64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 'u64_16'" } */ - p64_16 (v64_16); - - s128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 's128_8'" } */ - u128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 'u128_8'" } */ - p128_8 (v128_8); - - s128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 's128_16'" } */ - u128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 'u128_16'" } */ - p128_16 (v128_16); -} -/* { dg-message "note: expected '\[^'\n\]*' but argument is of type '\[^'\n\]*'" "note: expected" { target *-*-* } 0 } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/pr51534.c +++ b/src//dev/null @@ -1,83 +0,0 @@ -/* Test the vector comparison intrinsics when comparing to immediate zero. - */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -mfloat-abi=hard -O3" } */ -/* { dg-add-options arm_neon } */ - -#include <arm_neon.h> - -#define GEN_TEST(T, D, C, R) \ - R test_##C##_##T (T a) { return C (a, D (0)); } - -#define GEN_DOUBLE_TESTS(S, T, C) \ - GEN_TEST (T, vdup_n_s##S, C##_s##S, u##T) \ - GEN_TEST (u##T, vdup_n_u##S, C##_u##S, u##T) - -#define GEN_QUAD_TESTS(S, T, C) \ - GEN_TEST (T, vdupq_n_s##S, C##q_s##S, u##T) \ - GEN_TEST (u##T, vdupq_n_u##S, C##q_u##S, u##T) - -#define GEN_COND_TESTS(C) \ - GEN_DOUBLE_TESTS (8, int8x8_t, C) \ - GEN_DOUBLE_TESTS (16, int16x4_t, C) \ - GEN_DOUBLE_TESTS (32, int32x2_t, C) \ - GEN_QUAD_TESTS (8, int8x16_t, C) \ - GEN_QUAD_TESTS (16, int16x8_t, C) \ - GEN_QUAD_TESTS (32, int32x4_t, C) - -GEN_COND_TESTS(vcgt) -GEN_COND_TESTS(vcge) -GEN_COND_TESTS(vclt) -GEN_COND_TESTS(vcle) -GEN_COND_TESTS(vceq) - -/* Scan for expected outputs. 
*/ -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ -/* { dg-final { scan-assembler-times 
"vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ -/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ -/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ -/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ -/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ -/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ - -/* And ensure we don't have unexpected output too. */ -/* { dg-final { scan-assembler-not "vc\[gl\]\[te\]\.u\[0-9\]+\[ \]+\[qQdD\]\[0-9\]+, \[qQdD\]\[0-9\]+, #0" } } */ - -/* Tidy up. */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int8x8_t = vraddhn_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int16x4_t = vraddhn_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int32x2_t = vraddhn_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhnu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhnu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint8x8_t = vraddhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhnu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhnu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint16x4_t = vraddhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRaddhnu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRaddhnu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint32x2_t = vraddhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "vraddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vrhaddq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vrhaddq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vrhaddq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vrhaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vrhaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vrhaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhadds16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhadds16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vrhadd_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhadds32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhadds32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vrhadd_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhadds8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhadds8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vrhadd_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vrhadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vrhadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vrhadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRhaddu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRhaddu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vrhadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vrhadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vrshlq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vrshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vrshlq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vrshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQs64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vrshlq_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vrshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vrshlq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vrshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vrshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vrshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vrshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vrshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - int64x2_t arg1_int64x2_t; - - out_uint64x2_t = vrshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vrshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vrshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vrshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshls16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vrshl_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vrshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshls32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vrshl_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vrshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshls64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshls64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vrshl_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vrshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshls8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vrshl_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vrshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vrshl_u16 (arg0_uint16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vrshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vrshl_u32 (arg0_uint32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vrshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - int64x1_t arg1_int64x1_t; - - out_uint64x1_t = vrshl_u64 (arg0_uint64x1_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vrshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRshlu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshlu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vrshl_u8 (arg0_uint8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vrshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - - out_int16x8_t = vrshrq_n_s16 (arg0_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - - out_int32x4_t = vrshrq_n_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - - out_int64x2_t = vrshrq_n_s64 (arg0_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - - out_int8x16_t = vrshrq_n_s8 (arg0_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint16x8_t = vrshrq_n_u16 (arg0_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32x4_t = vrshrq_n_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - - out_uint64x2_t = vrshrq_n_u64 (arg0_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8x16_t = vrshrq_n_u8 (arg0_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - - out_int16x4_t = vrshr_n_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - - out_int32x2_t = vrshr_n_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - - out_int64x1_t = vrshr_n_s64 (arg0_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vrshr_n_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16x4_t = vrshr_n_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x2_t = vrshr_n_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64x1_t = vrshr_n_u64 (arg0_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshr_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshr_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vrshr_n_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshr\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_ns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - - out_int8x8_t = vrshrn_n_s16 (arg0_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_ns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - - out_int16x4_t = vrshrn_n_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_ns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - - out_int32x2_t = vrshrn_n_s64 (arg0_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_nu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint8x8_t = vrshrn_n_u16 (arg0_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_nu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint16x4_t = vrshrn_n_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vRshrn_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRshrn_nu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - - out_uint32x2_t = vrshrn_n_u64 (arg0_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vrsraq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vrsraq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vrsraq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vrsraq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vrsraq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vrsraq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vrsraq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsraQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsraQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vrsraq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vrsra_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vrsra_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vrsra_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vrsra_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vrsra_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vrsra_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vrsra_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsra_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsra_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vrsra_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vrsra\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int8x8_t = vrsubhn_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int16x4_t = vrsubhn_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int32x2_t = vrsubhn_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhnu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhnu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint8x8_t = vrsubhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhnu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhnu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint16x4_t = vrsubhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vRsubhnu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vRsubhnu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint32x2_t = vrsubhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "vrsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - int16x8_t arg2_int16x8_t; - - out_int16x8_t = vabaq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t); -} - -/* { dg-final { scan-assembler "vaba\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - int32x4_t arg2_int32x4_t; - - out_int32x4_t = vabaq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t); -} - -/* { dg-final { scan-assembler "vaba\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - int8x16_t arg2_int8x16_t; - - out_int8x16_t = vabaq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t); -} - -/* { dg-final { scan-assembler "vaba\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - uint16x8_t arg2_uint16x8_t; - - out_uint16x8_t = vabaq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t); -} - -/* { dg-final { scan-assembler "vaba\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - uint32x4_t arg2_uint32x4_t; - - out_uint32x4_t = vabaq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t); -} - -/* { dg-final { scan-assembler "vaba\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabaQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabaQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - uint8x16_t arg2_uint8x16_t; - - out_uint8x16_t = vabaq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t); -} - -/* { dg-final { scan-assembler "vaba\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabals16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabals16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabals16 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int16x4_t arg1_int16x4_t; - int16x4_t arg2_int16x4_t; - - out_int32x4_t = vabal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t); -} - -/* { dg-final { scan-assembler "vabal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabals32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabals32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabals32 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int32x2_t arg1_int32x2_t; - int32x2_t arg2_int32x2_t; - - out_int64x2_t = vabal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t); -} - -/* { dg-final { scan-assembler "vabal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabals8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabals8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabals8 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int8x8_t arg1_int8x8_t; - int8x8_t arg2_int8x8_t; - - out_int16x8_t = vabal_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vabal\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabalu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabalu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint16x4_t arg1_uint16x4_t; - uint16x4_t arg2_uint16x4_t; - - out_uint32x4_t = vabal_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t); -} - -/* { dg-final { scan-assembler "vabal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabalu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabalu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint32x2_t arg1_uint32x2_t; - uint32x2_t arg2_uint32x2_t; - - out_uint64x2_t = vabal_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t); -} - -/* { dg-final { scan-assembler "vabal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabalu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabalu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint8x8_t arg1_uint8x8_t; - uint8x8_t arg2_uint8x8_t; - - out_uint16x8_t = vabal_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vabal\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabas16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabas16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabas16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - int16x4_t arg2_int16x4_t; - - out_int16x4_t = vaba_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t); -} - -/* { dg-final { scan-assembler "vaba\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabas32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabas32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabas32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - int32x2_t arg2_int32x2_t; - - out_int32x2_t = vaba_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t); -} - -/* { dg-final { scan-assembler "vaba\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabas8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabas8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabas8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vaba_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vaba\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabau16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabau16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabau16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - uint16x4_t arg2_uint16x4_t; - - out_uint16x4_t = vaba_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t); -} - -/* { dg-final { scan-assembler "vaba\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabau32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabau32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabau32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - uint32x2_t arg2_uint32x2_t; - - out_uint32x2_t = vaba_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t); -} - -/* { dg-final { scan-assembler "vaba\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabau8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vabau8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabau8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vaba_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vaba\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vabdq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vabd\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vabdq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vabd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vabdq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vabd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vabdq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vabd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vabdq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vabd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vabdq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vabd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vabdq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vabd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vabd_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vabd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdls16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int32x4_t = vabdl_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vabdl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdls32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int64x2_t = vabdl_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vabdl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdls8 (void) -{ - int16x8_t out_int16x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int16x8_t = vabdl_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vabdl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdlu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdlu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint32x4_t = vabdl_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vabdl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdlu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vabdlu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint64x2_t = vabdl_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vabdl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vabdlu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabdlu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint16x8_t = vabdl_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vabdl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabds16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabds16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabds16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vabd_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabds32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabds32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabds32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vabd_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabds8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabds8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabds8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vabd_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabdu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabdu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vabd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabdu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabdu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vabd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vabdu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabdu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vabd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vabd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabsQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabsQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vabsq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabsQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabsQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vabsq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabsQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabsQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vabsq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabsQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabsQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vabsq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabsf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabsf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabsf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vabs_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabss16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabss16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabss16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vabs_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabss32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabss32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabss32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vabs_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vabss8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vabss8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vabss8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vabs_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vabs\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-
-  out_float32x4_t = vaddq_f32 (arg0_float32x4_t, arg1_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vaddq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vaddq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhns16 (void)
-{
-  int8x8_t out_int8x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int8x8_t = vaddhn_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhns32 (void)
-{
-  int16x4_t out_int16x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int16x4_t = vaddhn_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhns64 (void)
-{
-  int32x2_t out_int32x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int32x2_t = vaddhn_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhnu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhnu16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint8x8_t = vaddhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhnu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhnu32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint16x4_t = vaddhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddhnu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddhnu64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint32x2_t = vaddhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddls16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int32x4_t = vaddl_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddls32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int64x2_t = vaddl_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddls8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddls8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int16x8_t = vaddl_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddlu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddlu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint32x4_t = vaddl_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddlu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddlu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint64x2_t = vaddl_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddlu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddlu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint16x8_t = vaddl_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vadds16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vadds16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vadds16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vadds32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vadds32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vadds32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vadds64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vadds64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vadds64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vadds8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vadds8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vadds8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vaddu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  uint64x1_t arg1_uint64x1_t;
-
-  out_uint64x1_t = vadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddws16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddws16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int32x4_t = vaddw_s16 (arg0_int32x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddws32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddws32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int64x2_t = vaddw_s32 (arg0_int64x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddws8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddws8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int16x8_t = vaddw_s8 (arg0_int16x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddwu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddwu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint32x4_t = vaddw_u16 (arg0_uint32x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddwu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddwu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint64x2_t = vaddw_u32 (arg0_uint64x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vaddwu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vaddwu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint16x8_t = vaddw_u8 (arg0_uint16x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vaddw\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vandq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vandq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vandq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vandq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vandq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vandq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vandq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vandq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vands16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vands16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vands16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vand_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vands32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vands32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vands32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vand_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vands64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vands64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vands64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vand_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vands8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vands8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vands8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vand_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vand_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vand_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vandu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  uint64x1_t arg1_uint64x1_t;
-
-  out_uint64x1_t = vand_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vandu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vandu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vandu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vand_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int16x8_t out_int16x8_t;
-int16x8_t arg0_int16x8_t;
-int16x8_t arg1_int16x8_t;
-void test_vbicQs16 (void)
-{
-
-  out_int16x8_t = vbicq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int32x4_t out_int32x4_t;
-int32x4_t arg0_int32x4_t;
-int32x4_t arg1_int32x4_t;
-void test_vbicQs32 (void)
-{
-
-  out_int32x4_t = vbicq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int64x2_t out_int64x2_t;
-int64x2_t arg0_int64x2_t;
-int64x2_t arg1_int64x2_t;
-void test_vbicQs64 (void)
-{
-
-  out_int64x2_t = vbicq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int8x16_t out_int8x16_t;
-int8x16_t arg0_int8x16_t;
-int8x16_t arg1_int8x16_t;
-void test_vbicQs8 (void)
-{
-
-  out_int8x16_t = vbicq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint16x8_t out_uint16x8_t;
-uint16x8_t arg0_uint16x8_t;
-uint16x8_t arg1_uint16x8_t;
-void test_vbicQu16 (void)
-{
-
-  out_uint16x8_t = vbicq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint32x4_t out_uint32x4_t;
-uint32x4_t arg0_uint32x4_t;
-uint32x4_t arg1_uint32x4_t;
-void test_vbicQu32 (void)
-{
-
-  out_uint32x4_t = vbicq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint64x2_t out_uint64x2_t;
-uint64x2_t arg0_uint64x2_t;
-uint64x2_t arg1_uint64x2_t;
-void test_vbicQu64 (void)
-{
-
-  out_uint64x2_t = vbicq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint8x16_t out_uint8x16_t;
-uint8x16_t arg0_uint8x16_t;
-uint8x16_t arg1_uint8x16_t;
-void test_vbicQu8 (void)
-{
-
-  out_uint8x16_t = vbicq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbics16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbics16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int16x4_t out_int16x4_t;
-int16x4_t arg0_int16x4_t;
-int16x4_t arg1_int16x4_t;
-void test_vbics16 (void)
-{
-
-  out_int16x4_t = vbic_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbics32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbics32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int32x2_t out_int32x2_t;
-int32x2_t arg0_int32x2_t;
-int32x2_t arg1_int32x2_t;
-void test_vbics32 (void)
-{
-
-  out_int32x2_t = vbic_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbics64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vbics64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int64x1_t out_int64x1_t;
-int64x1_t arg0_int64x1_t;
-int64x1_t arg1_int64x1_t;
-void test_vbics64 (void)
-{
-
-  out_int64x1_t = vbic_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbics8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbics8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int8x8_t out_int8x8_t;
-int8x8_t arg0_int8x8_t;
-int8x8_t arg1_int8x8_t;
-void test_vbics8 (void)
-{
-
-  out_int8x8_t = vbic_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint16x4_t out_uint16x4_t;
-uint16x4_t arg0_uint16x4_t;
-uint16x4_t arg1_uint16x4_t;
-void test_vbicu16 (void)
-{
-
-  out_uint16x4_t = vbic_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint32x2_t out_uint32x2_t;
-uint32x2_t arg0_uint32x2_t;
-uint32x2_t arg1_uint32x2_t;
-void test_vbicu32 (void)
-{
-
-  out_uint32x2_t = vbic_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vbicu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint64x1_t out_uint64x1_t;
-uint64x1_t arg0_uint64x1_t;
-uint64x1_t arg1_uint64x1_t;
-void test_vbicu64 (void)
-{
-
-  out_uint64x1_t = vbic_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vbicu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint8x8_t out_uint8x8_t;
-uint8x8_t arg0_uint8x8_t;
-uint8x8_t arg1_uint8x8_t;
-void test_vbicu8 (void)
-{
-
-  out_uint8x8_t = vbic_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vbslQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vbslQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  float32x4_t arg1_float32x4_t;
-  float32x4_t arg2_float32x4_t;
-
-  out_float32x4_t = vbslq_f32 (arg0_uint32x4_t, arg1_float32x4_t, arg2_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vbslQp16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vbslQp16 (void)
-{
-  poly16x8_t out_poly16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  poly16x8_t arg1_poly16x8_t;
-  poly16x8_t arg2_poly16x8_t;
-
-  out_poly16x8_t = vbslq_p16 (arg0_uint16x8_t, arg1_poly16x8_t, arg2_poly16x8_t);
-}
-
-/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp64.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vbslQp64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vbslQp64 (void) -{ - poly64x2_t out_poly64x2_t; - uint64x2_t arg0_uint64x2_t; - poly64x2_t arg1_poly64x2_t; - poly64x2_t arg2_poly64x2_t; - - out_poly64x2_t = vbslq_p64 (arg0_uint64x2_t, arg1_poly64x2_t, arg2_poly64x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQp8 (void) -{ - poly8x16_t out_poly8x16_t; - uint8x16_t arg0_uint8x16_t; - poly8x16_t arg1_poly8x16_t; - poly8x16_t arg2_poly8x16_t; - - out_poly8x16_t = vbslq_p8 (arg0_uint8x16_t, arg1_poly8x16_t, arg2_poly8x16_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQs16 (void) -{ - int16x8_t out_int16x8_t; - uint16x8_t arg0_uint16x8_t; - int16x8_t arg1_int16x8_t; - int16x8_t arg2_int16x8_t; - - out_int16x8_t = vbslq_s16 (arg0_uint16x8_t, arg1_int16x8_t, arg2_int16x8_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQs32 (void) -{ - int32x4_t out_int32x4_t; - uint32x4_t arg0_uint32x4_t; - int32x4_t arg1_int32x4_t; - int32x4_t arg2_int32x4_t; - - out_int32x4_t = vbslq_s32 (arg0_uint32x4_t, arg1_int32x4_t, arg2_int32x4_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs64.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQs64 (void) -{ - int64x2_t out_int64x2_t; - uint64x2_t arg0_uint64x2_t; - int64x2_t arg1_int64x2_t; - int64x2_t arg2_int64x2_t; - - out_int64x2_t = vbslq_s64 (arg0_uint64x2_t, arg1_int64x2_t, arg2_int64x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQs8 (void) -{ - int8x16_t out_int8x16_t; - uint8x16_t arg0_uint8x16_t; - int8x16_t arg1_int8x16_t; - int8x16_t arg2_int8x16_t; - - out_int8x16_t = vbslq_s8 (arg0_uint8x16_t, arg1_int8x16_t, arg2_int8x16_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - uint16x8_t arg2_uint16x8_t; - - out_uint16x8_t = vbslq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - uint32x4_t arg2_uint32x4_t; - - out_uint32x4_t = vbslq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu64.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - uint64x2_t arg2_uint64x2_t; - - out_uint64x2_t = vbslq_u64 (arg0_uint64x2_t, arg1_uint64x2_t, arg2_uint64x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - uint8x16_t arg2_uint8x16_t; - - out_uint8x16_t = vbslq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslf32 (void) -{ - float32x2_t out_float32x2_t; - uint32x2_t arg0_uint32x2_t; - float32x2_t arg1_float32x2_t; - float32x2_t arg2_float32x2_t; - - out_float32x2_t = vbsl_f32 (arg0_uint32x2_t, arg1_float32x2_t, arg2_float32x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslp16 (void) -{ - poly16x4_t out_poly16x4_t; - uint16x4_t arg0_uint16x4_t; - poly16x4_t arg1_poly16x4_t; - poly16x4_t arg2_poly16x4_t; - - out_poly16x4_t = vbsl_p16 (arg0_uint16x4_t, arg1_poly16x4_t, arg2_poly16x4_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp64.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vbslp64 (void) -{ - poly64x1_t out_poly64x1_t; - uint64x1_t arg0_uint64x1_t; - poly64x1_t arg1_poly64x1_t; - poly64x1_t arg2_poly64x1_t; - - out_poly64x1_t = vbsl_p64 (arg0_uint64x1_t, arg1_poly64x1_t, arg2_poly64x1_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslp8 (void) -{ - poly8x8_t out_poly8x8_t; - uint8x8_t arg0_uint8x8_t; - poly8x8_t arg1_poly8x8_t; - poly8x8_t arg2_poly8x8_t; - - out_poly8x8_t = vbsl_p8 (arg0_uint8x8_t, arg1_poly8x8_t, arg2_poly8x8_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbsls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbsls16 (void) -{ - int16x4_t out_int16x4_t; - uint16x4_t arg0_uint16x4_t; - int16x4_t arg1_int16x4_t; - int16x4_t arg2_int16x4_t; - - out_int16x4_t = vbsl_s16 (arg0_uint16x4_t, arg1_int16x4_t, arg2_int16x4_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbsls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbsls32 (void) -{ - int32x2_t out_int32x2_t; - uint32x2_t arg0_uint32x2_t; - int32x2_t arg1_int32x2_t; - int32x2_t arg2_int32x2_t; - - out_int32x2_t = vbsl_s32 (arg0_uint32x2_t, arg1_int32x2_t, arg2_int32x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls64.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbsls64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbsls64 (void) -{ - int64x1_t out_int64x1_t; - uint64x1_t arg0_uint64x1_t; - int64x1_t arg1_int64x1_t; - int64x1_t arg2_int64x1_t; - - out_int64x1_t = vbsl_s64 (arg0_uint64x1_t, arg1_int64x1_t, arg2_int64x1_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbsls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbsls8 (void) -{ - int8x8_t out_int8x8_t; - uint8x8_t arg0_uint8x8_t; - int8x8_t arg1_int8x8_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vbsl_s8 (arg0_uint8x8_t, arg1_int8x8_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - uint16x4_t arg2_uint16x4_t; - - out_uint16x4_t = vbsl_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - uint32x2_t arg2_uint32x2_t; - - out_uint32x2_t = vbsl_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu64.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - uint64x1_t arg2_uint64x1_t; - - out_uint64x1_t = vbsl_u64 (arg0_uint64x1_t, arg1_uint64x1_t, arg2_uint64x1_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vbslu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vbslu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vbsl_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcageQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcageQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcageQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcageq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcagef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcagef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcagef32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcage_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcagtQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcagtQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcagtQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcagtq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcagtf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcagtf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcagtf32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcagt_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcaleQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcaleQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcaleQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcaleq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcalef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcalef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcalef32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcale_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcaltQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcaltQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcaltQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcaltq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcaltf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcaltf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcaltf32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcalt_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vceqq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vceq\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQp8 (void) -{ - uint8x16_t out_uint8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_uint8x16_t = vceqq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vceqq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vceqq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vceqq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vceqq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vceqq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vceqq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqf32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vceq_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vceq\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqp8 (void) -{ - uint8x8_t out_uint8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_uint8x8_t = vceq_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqs16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vceq_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqs32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vceq_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vceqs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vceqs8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vceq_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcequ16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcequ16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vceq_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcequ32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcequ32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vceq_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcequ8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcequ8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vceq_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcgeq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vcgeq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vcgeq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vcgeq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vcgeq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vcgeq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vcgeq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgef32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcge_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcges16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcges16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcges16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vcge_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcges32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcges32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcges32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vcge_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcges8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcges8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcges8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vcge_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vcge_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vcge_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgeu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgeu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vcge_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcgtq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vcgtq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vcgtq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vcgtq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vcgtq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vcgtq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vcgtq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtf32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcgt_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgts16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgts16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vcgt_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgts32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgts32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vcgt_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgts8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgts8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vcgt_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vcgt_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vcgt_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcgtu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcgtu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vcgt_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcleq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vcleq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vcleq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vcleq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vcleq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vcleq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vcleq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vclef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclef32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vcle_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcles16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcles16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcles16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vcle_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcles32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcles32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcles32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vcle_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcles8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcles8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcles8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vcle_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vcle_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vcle_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcleu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcleu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vcle_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclsQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclsQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - - out_int16x8_t = vclsq_s16 (arg0_int16x8_t); -} - -/* { dg-final { scan-assembler "vcls\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclsQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclsQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - - out_int32x4_t = vclsq_s32 (arg0_int32x4_t); -} - -/* { dg-final { scan-assembler "vcls\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclsQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclsQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - - out_int8x16_t = vclsq_s8 (arg0_int8x16_t); -} - -/* { dg-final { scan-assembler "vcls\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclss16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclss16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclss16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - - out_int16x4_t = vcls_s16 (arg0_int16x4_t); -} - -/* { dg-final { scan-assembler "vcls\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclss32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclss32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclss32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - - out_int32x2_t = vcls_s32 (arg0_int32x2_t); -} - -/* { dg-final { scan-assembler "vcls\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclss8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclss8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclss8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vcls_s8 (arg0_int8x8_t); -} - -/* { dg-final { scan-assembler "vcls\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQf32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_uint32x4_t = vcltq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vcltq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vcltq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vcltq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vcltq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vcltq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vcltq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltf32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_uint32x2_t = vclt_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclts16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vclts16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclts16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vclt_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclts32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vclts32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclts32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vclt_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclts8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vclts8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclts8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vclt_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vclt_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vclt_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vcltu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcltu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vclt_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - - out_int16x8_t = vclzq_s16 (arg0_int16x8_t); -} - -/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - - out_int32x4_t = vclzq_s32 (arg0_int32x4_t); -} - -/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - - out_int8x16_t = vclzq_s8 (arg0_int8x16_t); -} - -/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint16x8_t = vclzq_u16 (arg0_uint16x8_t); -} - -/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32x4_t = vclzq_u32 (arg0_uint32x4_t); -} - -/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8x16_t = vclzq_u8 (arg0_uint8x16_t); -} - -/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzs16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - - out_int16x4_t = vclz_s16 (arg0_int16x4_t); -} - -/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzs32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - - out_int32x2_t = vclz_s32 (arg0_int32x2_t); -} - -/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzs8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vclz_s8 (arg0_int8x8_t); -} - -/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16x4_t = vclz_u16 (arg0_uint16x4_t); -} - -/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x2_t = vclz_u32 (arg0_uint32x2_t); -} - -/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vclzu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vclzu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vclz_u8 (arg0_uint8x8_t); -} - -/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcntQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcntQp8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg0_poly8x16_t; - - out_poly8x16_t = vcntq_p8 (arg0_poly8x16_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcntQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcntQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - - out_int8x16_t = vcntq_s8 (arg0_int8x16_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcntQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcntQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8x16_t = vcntq_u8 (arg0_uint8x16_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcntp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcntp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcntp8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - - out_poly8x8_t = vcnt_p8 (arg0_poly8x8_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcnts8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcnts8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcnts8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vcnt_s8 (arg0_int8x8_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcntu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcntu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcntu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vcnt_u8 (arg0_uint8x8_t); -} - -/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombinef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombinef32 (void) -{ - float32x4_t out_float32x4_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x4_t = vcombine_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombinep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombinep16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x8_t = vcombine_p16 (arg0_poly16x4_t, arg1_poly16x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombinep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vcombinep64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x1_t arg0_poly64x1_t; - poly64x1_t arg1_poly64x1_t; - - out_poly64x2_t = vcombine_p64 (arg0_poly64x1_t, arg1_poly64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombinep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombinep8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x16_t = vcombine_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombines16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombines16 (void) -{ - int16x8_t out_int16x8_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x8_t = vcombine_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombines32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombines32 (void) -{ - int32x4_t out_int32x4_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x4_t = vcombine_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombines64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombines64 (void) -{ - int64x2_t out_int64x2_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x2_t = vcombine_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombines8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombines8 (void) -{ - int8x16_t out_int8x16_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x16_t = vcombine_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombineu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombineu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x8_t = vcombine_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombineu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombineu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x4_t = vcombine_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombineu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombineu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x2_t = vcombine_u64 (arg0_uint64x1_t, arg1_uint64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcombineu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcombineu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x16_t = vcombine_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatef32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreatef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreatef32 (void) -{ - float32x2_t out_float32x2_t; - uint64_t arg0_uint64_t; - - out_float32x2_t = vcreate_f32 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreatep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreatep16 (void) -{ - poly16x4_t out_poly16x4_t; - uint64_t arg0_uint64_t; - - out_poly16x4_t = vcreate_p16 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreatep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vcreatep64 (void) -{ - poly64x1_t out_poly64x1_t; - uint64_t arg0_uint64_t; - - out_poly64x1_t = vcreate_p64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreatep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreatep8 (void) -{ - poly8x8_t out_poly8x8_t; - uint64_t arg0_uint64_t; - - out_poly8x8_t = vcreate_p8 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreates16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreates16 (void) -{ - int16x4_t out_int16x4_t; - uint64_t arg0_uint64_t; - - out_int16x4_t = vcreate_s16 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreates32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreates32 (void) -{ - int32x2_t out_int32x2_t; - uint64_t arg0_uint64_t; - - out_int32x2_t = vcreate_s32 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreates64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreates64 (void) -{ - int64x1_t out_int64x1_t; - uint64_t arg0_uint64_t; - - out_int64x1_t = vcreate_s64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreates8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreates8 (void) -{ - int8x8_t out_int8x8_t; - uint64_t arg0_uint64_t; - - out_int8x8_t = vcreate_s8 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreateu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreateu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint64_t arg0_uint64_t; - - out_uint16x4_t = vcreate_u16 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreateu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreateu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint64_t arg0_uint64_t; - - out_uint32x2_t = vcreate_u32 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreateu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreateu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64_t arg0_uint64_t; - - out_uint64x1_t = vcreate_u64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vcreateu8' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcreateu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint64_t arg0_uint64_t; - - out_uint8x8_t = vcreate_u8 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nf32_s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQ_nf32_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQ_nf32_s32 (void) -{ - float32x4_t out_float32x4_t; - int32x4_t arg0_int32x4_t; - - out_float32x4_t = vcvtq_n_f32_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nf32_u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQ_nf32_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQ_nf32_u32 (void) -{ - float32x4_t out_float32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_float32x4_t = vcvtq_n_f32_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_ns32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQ_ns32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQ_ns32_f32 (void) -{ - int32x4_t out_int32x4_t; - float32x4_t arg0_float32x4_t; - - out_int32x4_t = vcvtq_n_s32_f32 (arg0_float32x4_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nu32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQ_nu32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQ_nu32_f32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - - out_uint32x4_t = vcvtq_n_u32_f32 (arg0_float32x4_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQf32_s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQf32_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQf32_s32 (void) -{ - float32x4_t out_float32x4_t; - int32x4_t arg0_int32x4_t; - - out_float32x4_t = vcvtq_f32_s32 (arg0_int32x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQf32_u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQf32_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQf32_u32 (void) -{ - float32x4_t out_float32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_float32x4_t = vcvtq_f32_u32 (arg0_uint32x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQs32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQs32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQs32_f32 (void) -{ - int32x4_t out_int32x4_t; - float32x4_t arg0_float32x4_t; - - out_int32x4_t = vcvtq_s32_f32 (arg0_float32x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQu32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtQu32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtQu32_f32 (void) -{ - uint32x4_t out_uint32x4_t; - float32x4_t arg0_float32x4_t; - - out_uint32x4_t = vcvtq_u32_f32 (arg0_float32x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nf32_s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvt_nf32_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvt_nf32_s32 (void) -{ - float32x2_t out_float32x2_t; - int32x2_t arg0_int32x2_t; - - out_float32x2_t = vcvt_n_f32_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nf32_u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvt_nf32_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvt_nf32_u32 (void) -{ - float32x2_t out_float32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_float32x2_t = vcvt_n_f32_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_ns32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvt_ns32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvt_ns32_f32 (void) -{ - int32x2_t out_int32x2_t; - float32x2_t arg0_float32x2_t; - - out_int32x2_t = vcvt_n_s32_f32 (arg0_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nu32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvt_nu32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvt_nu32_f32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - - out_uint32x2_t = vcvt_n_u32_f32 (arg0_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf16_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtf16_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_fp16_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon_fp16 } */ - -#include "arm_neon.h" - -void test_vcvtf16_f32 (void) -{ - float16x4_t out_float16x4_t; - float32x4_t arg0_float32x4_t; - - out_float16x4_t = vcvt_f16_f32 (arg0_float32x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.f16.f32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_f16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtf32_f16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_fp16_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon_fp16 } */ - -#include "arm_neon.h" - -void test_vcvtf32_f16 (void) -{ - float32x4_t out_float32x4_t; - float16x4_t arg0_float16x4_t; - - out_float32x4_t = vcvt_f32_f16 (arg0_float16x4_t); -} - -/* { dg-final { scan-assembler "vcvt\.f32.f16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtf32_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtf32_s32 (void) -{ - float32x2_t out_float32x2_t; - int32x2_t arg0_int32x2_t; - - out_float32x2_t = vcvt_f32_s32 (arg0_int32x2_t); -} - -/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtf32_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtf32_u32 (void) -{ - float32x2_t out_float32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_float32x2_t = vcvt_f32_u32 (arg0_uint32x2_t); -} - -/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvts32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvts32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvts32_f32 (void) -{ - int32x2_t out_int32x2_t; - float32x2_t arg0_float32x2_t; - - out_int32x2_t = vcvt_s32_f32 (arg0_float32x2_t); -} - -/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtu32_f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vcvtu32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vcvtu32_f32 (void) -{ - uint32x2_t out_uint32x2_t; - float32x2_t arg0_float32x2_t; - - out_uint32x2_t = vcvt_u32_f32 (arg0_float32x2_t); -} - -/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanef32 (void) -{ - float32x4_t out_float32x4_t; - float32x2_t arg0_float32x2_t; - - out_float32x4_t = vdupq_lane_f32 (arg0_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanep16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x4_t arg0_poly16x4_t; - - out_poly16x8_t = vdupq_lane_p16 (arg0_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_lanep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vdupQ_lanep64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x1_t arg0_poly64x1_t; - - out_poly64x2_t = vdupq_lane_p64 (arg0_poly64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanep8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x8_t arg0_poly8x8_t; - - out_poly8x16_t = vdupq_lane_p8 (arg0_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanes16 (void) -{ - int16x8_t out_int16x8_t; - int16x4_t arg0_int16x4_t; - - out_int16x8_t = vdupq_lane_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanes32 (void) -{ - int32x4_t out_int32x4_t; - int32x2_t arg0_int32x2_t; - - out_int32x4_t = vdupq_lane_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanes64 (void) -{ - int64x2_t out_int64x2_t; - int64x1_t arg0_int64x1_t; - - out_int64x2_t = vdupq_lane_s64 (arg0_int64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_lanes8 (void) -{ - int8x16_t out_int8x16_t; - int8x8_t arg0_int8x8_t; - - out_int8x16_t = vdupq_lane_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_laneu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16x8_t = vdupq_lane_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_laneu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x4_t = vdupq_lane_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_laneu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64x2_t = vdupq_lane_u64 (arg0_uint64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_laneu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x16_t = vdupq_lane_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_nf32 (void) -{ - float32x4_t out_float32x4_t; - float32_t arg0_float32_t; - - out_float32x4_t = vdupq_n_f32 (arg0_float32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_np16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16_t arg0_poly16_t; - - out_poly16x8_t = vdupq_n_p16 (arg0_poly16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vdupQ_np64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64_t arg0_poly64_t; - - out_poly64x2_t = vdupq_n_p64 (arg0_poly64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_np8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8_t arg0_poly8_t; - - out_poly8x16_t = vdupq_n_p8 (arg0_poly8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16_t arg0_int16_t; - - out_int16x8_t = vdupq_n_s16 (arg0_int16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32_t arg0_int32_t; - - out_int32x4_t = vdupq_n_s32 (arg0_int32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64_t arg0_int64_t; - - out_int64x2_t = vdupq_n_s64 (arg0_int64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8_t arg0_int8_t; - - out_int8x16_t = vdupq_n_s8 (arg0_int8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16_t arg0_uint16_t; - - out_uint16x8_t = vdupq_n_u16 (arg0_uint16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32_t arg0_uint32_t; - - out_uint32x4_t = vdupq_n_u32 (arg0_uint32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdupQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64_t arg0_uint64_t; - - out_uint64x2_t = vdupq_n_u64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdupQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdupQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8_t arg0_uint8_t; - - out_uint8x16_t = vdupq_n_u8 (arg0_uint8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanef32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - - out_float32x2_t = vdup_lane_f32 (arg0_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanep16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x4_t arg0_poly16x4_t; - - out_poly16x4_t = vdup_lane_p16 (arg0_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_lanep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vdup_lanep64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x1_t arg0_poly64x1_t; - - out_poly64x1_t = vdup_lane_p64 (arg0_poly64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanep8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - - out_poly8x8_t = vdup_lane_p8 (arg0_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanes16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - - out_int16x4_t = vdup_lane_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanes32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - - out_int32x2_t = vdup_lane_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanes64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - - out_int64x1_t = vdup_lane_s64 (arg0_int64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_lanes8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vdup_lane_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_laneu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16x4_t = vdup_lane_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_laneu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x2_t = vdup_lane_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_laneu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64x1_t = vdup_lane_u64 (arg0_uint64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_laneu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vdup_lane_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_nf32 (void) -{ - float32x2_t out_float32x2_t; - float32_t arg0_float32_t; - - out_float32x2_t = vdup_n_f32 (arg0_float32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_np16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16_t arg0_poly16_t; - - out_poly16x4_t = vdup_n_p16 (arg0_poly16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vdup_np64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64_t arg0_poly64_t; - - out_poly64x1_t = vdup_n_p64 (arg0_poly64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_np8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8_t arg0_poly8_t; - - out_poly8x8_t = vdup_n_p8 (arg0_poly8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16_t arg0_int16_t; - - out_int16x4_t = vdup_n_s16 (arg0_int16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32_t arg0_int32_t; - - out_int32x2_t = vdup_n_s32 (arg0_int32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64_t arg0_int64_t; - - out_int64x1_t = vdup_n_s64 (arg0_int64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8_t arg0_int8_t; - - out_int8x8_t = vdup_n_s8 (arg0_int8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16_t arg0_uint16_t; - - out_uint16x4_t = vdup_n_u16 (arg0_uint16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32_t arg0_uint32_t; - - out_uint32x2_t = vdup_n_u32 (arg0_uint32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vdup_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64_t arg0_uint64_t; - - out_uint64x1_t = vdup_n_u64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vdup_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vdup_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8_t arg0_uint8_t; - - out_uint8x8_t = vdup_n_u8 (arg0_uint8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c +++ b/src//dev/null @@ -1,27 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -mvectorize-with-neon-double" } */ -/* { dg-add-options arm_neon } */ - -#define N 32 - -int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; -float fa[N]; -int ia[N]; - -int convert() -{ - int i; - - /* int -> float */ - for (i = 0; i < N; i++) - fa[i] = (float) ib[i]; - - /* float -> int */ - for (i = 0; i < N; i++) - ia[i] = (int) fa[i]; - - return 0; -} - -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c +++ b/src//dev/null @@ -1,27 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */ -/* { dg-add-options arm_neon } */ - -#define N 32 - -int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; -float fa[N]; -int ia[N]; - -int convert() -{ - int i; - - /* int -> float */ - for (i = 0; i < N; i++) - fa[i] = (float) ib[i]; - - /* float -> int */ - for (i = 0; i < N; i++) - ia[i] = (int) fa[i]; - - return 0; -} - -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = veorq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = veorq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQs64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = veorq_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = veorq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = veorq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = veorq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = veorq_u64 (arg0_uint64x2_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veorQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veorQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = veorq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veors16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veors16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veors16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = veor_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veors32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veors32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veors32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = veor_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veors64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `veors64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veors64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = veor_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/veors8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veors8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veors8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = veor_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veoru16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veoru16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veoru16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = veor_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veoru32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veoru32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veoru32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = veor_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/veoru64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `veoru64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veoru64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = veor_u64 (arg0_uint64x1_t, arg1_uint64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/veoru8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `veoru8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_veoru8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = veor_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vextq_f32 (arg0_float32x4_t, arg1_float32x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQp16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8_t = vextq_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vextQp64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x2_t arg0_poly64x2_t; - poly64x2_t arg1_poly64x2_t; - - out_poly64x2_t = vextq_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQp8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16_t = vextq_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vextq_s16 (arg0_int16x8_t, arg1_int16x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vextq_s32 (arg0_int32x4_t, arg1_int32x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQs64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vextq_s64 (arg0_int64x2_t, arg1_int64x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vextq_s8 (arg0_int8x16_t, arg1_int8x16_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vextq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vextq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vextq_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vextq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vext_f32 (arg0_float32x2_t, arg1_float32x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextp16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4_t = vext_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextp64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vextp64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x1_t arg0_poly64x1_t; - poly64x1_t arg1_poly64x1_t; - - out_poly64x1_t = vext_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextp8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8_t = vext_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vexts16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vexts16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vexts16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vext_s16 (arg0_int16x4_t, arg1_int16x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vexts32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vexts32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vexts32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vext_s32 (arg0_int32x2_t, arg1_int32x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vexts64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vexts64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vexts64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vext_s64 (arg0_int64x1_t, arg1_int64x1_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vexts8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vexts8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vexts8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vext_s8 (arg0_int8x8_t, arg1_int8x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vext_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 0); -} - -/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vext_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 0); -} - -/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vext_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 0); -} - -/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vextu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vextu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vextu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vext_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 0); -} - -/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vfmaQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neonv2_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neonv2 } */ - -#include "arm_neon.h" - -void test_vfmaQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - float32x4_t arg2_float32x4_t; - - out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t); -} - -/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vfmaf32' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neonv2_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neonv2 } */ - -#include "arm_neon.h" - -void test_vfmaf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - float32x2_t arg2_float32x2_t; - - out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t); -} - -/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vfmsQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neonv2_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neonv2 } */ - -#include "arm_neon.h" - -void test_vfmsQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - float32x4_t arg2_float32x4_t; - - out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t); -} - -/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vfmsf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neonv2_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neonv2 } */ - -#include "arm_neon.h" - -void test_vfmsf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - float32x2_t arg2_float32x2_t; - - out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t); -} - -/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vfp-shift-a2t2.c +++ b/src//dev/null @@ -1,27 +0,0 @@ -/* Check that NEON vector shifts support immediate values == size. */ - -/* { dg-do compile } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps" } */ -/* { dg-add-options arm_neon } */ - -#include <arm_neon.h> - -uint16x8_t test_vshll_n_u8 (uint8x8_t a) -{ - return vshll_n_u8(a, 8); -} - -uint32x4_t test_vshll_n_u16 (uint16x4_t a) -{ - return vshll_n_u16(a, 16); -} - -uint64x2_t test_vshll_n_u32 (uint32x2_t a) -{ - return vshll_n_u32(a, 32); -} - -/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanef32 (void) -{ - float32_t out_float32_t; - float32x4_t arg0_float32x4_t; - - out_float32_t = vgetq_lane_f32 (arg0_float32x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanep16 (void) -{ - poly16_t out_poly16_t; - poly16x8_t arg0_poly16x8_t; - - out_poly16_t = vgetq_lane_p16 (arg0_poly16x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanep8 (void) -{ - poly8_t out_poly8_t; - poly8x16_t arg0_poly8x16_t; - - out_poly8_t = vgetq_lane_p8 (arg0_poly8x16_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanes16 (void) -{ - int16_t out_int16_t; - int16x8_t arg0_int16x8_t; - - out_int16_t = vgetq_lane_s16 (arg0_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.s16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanes32 (void) -{ - int32_t out_int32_t; - int32x4_t arg0_int32x4_t; - - out_int32_t = vgetq_lane_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanes64 (void) -{ - register int64_t out_int64_t asm ("r0"); - int64x2_t arg0_int64x2_t; - - out_int64_t = vgetq_lane_s64 (arg0_int64x2_t, 0); -} - -/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ \]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_lanes8 (void) -{ - int8_t out_int8_t; - int8x16_t arg0_int8x16_t; - - out_int8_t = vgetq_lane_s8 (arg0_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.s8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_laneu16 (void) -{ - uint16_t out_uint16_t; - uint16x8_t arg0_uint16x8_t; - - out_uint16_t = vgetq_lane_u16 (arg0_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_laneu32 (void) -{ - uint32_t out_uint32_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32_t = vgetq_lane_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_laneu64 (void) -{ - register uint64_t out_uint64_t asm ("r0"); - uint64x2_t arg0_uint64x2_t; - - out_uint64_t = vgetq_lane_u64 (arg0_uint64x2_t, 0); -} - -/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ \]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vgetQ_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vgetQ_laneu8 (void) -{ - uint8_t out_uint8_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8_t = vgetq_lane_u8 (arg0_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highf32 (void) -{ - float32x2_t out_float32x2_t; - float32x4_t arg0_float32x4_t; - - out_float32x2_t = vget_high_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highp16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x8_t arg0_poly16x8_t; - - out_poly16x4_t = vget_high_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vget_highp64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x2_t arg0_poly64x2_t; - - out_poly64x1_t = vget_high_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highp8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x16_t arg0_poly8x16_t; - - out_poly8x8_t = vget_high_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highs16 (void) -{ - int16x4_t out_int16x4_t; - int16x8_t arg0_int16x8_t; - - out_int16x4_t = vget_high_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highs32 (void) -{ - int32x2_t out_int32x2_t; - int32x4_t arg0_int32x4_t; - - out_int32x2_t = vget_high_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highs64 (void) -{ - int64x1_t out_int64x1_t; - int64x2_t arg0_int64x2_t; - - out_int64x1_t = vget_high_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highs8 (void) -{ - int8x8_t out_int8x8_t; - int8x16_t arg0_int8x16_t; - - out_int8x8_t = vget_high_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x8_t arg0_uint16x8_t; - - out_uint16x4_t = vget_high_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32x2_t = vget_high_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x2_t arg0_uint64x2_t; - - out_uint64x1_t = vget_high_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_highu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_highu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8x8_t = vget_high_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanef32 (void) -{ - float32_t out_float32_t; - float32x2_t arg0_float32x2_t; - - out_float32_t = vget_lane_f32 (arg0_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanep16 (void) -{ - poly16_t out_poly16_t; - poly16x4_t arg0_poly16x4_t; - - out_poly16_t = vget_lane_p16 (arg0_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanep8 (void) -{ - poly8_t out_poly8_t; - poly8x8_t arg0_poly8x8_t; - - out_poly8_t = vget_lane_p8 (arg0_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanes16 (void) -{ - int16_t out_int16_t; - int16x4_t arg0_int16x4_t; - - out_int16_t = vget_lane_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.s16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanes32 (void) -{ - int32_t out_int32_t; - int32x2_t arg0_int32x2_t; - - out_int32_t = vget_lane_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanes64 (void) -{ - int64_t out_int64_t; - int64x1_t arg0_int64x1_t; - - out_int64_t = vget_lane_s64 (arg0_int64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lanes8 (void) -{ - int8_t out_int8_t; - int8x8_t arg0_int8x8_t; - - out_int8_t = vget_lane_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.s8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_laneu16 (void) -{ - uint16_t out_uint16_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16_t = vget_lane_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_laneu32 (void) -{ - uint32_t out_uint32_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32_t = vget_lane_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_laneu64 (void) -{ - uint64_t out_uint64_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64_t = vget_lane_u64 (arg0_uint64x1_t, 0); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_laneu8 (void) -{ - uint8_t out_uint8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8_t = vget_lane_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowf32 (void) -{ - register float32x2_t out_float32x2_t asm ("d18"); - float32x4_t arg0_float32x4_t; - - out_float32x2_t = vget_low_f32 (arg0_float32x4_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowp16 (void) -{ - register poly16x4_t out_poly16x4_t asm ("d18"); - poly16x8_t arg0_poly16x8_t; - - out_poly16x4_t = vget_low_p16 (arg0_poly16x8_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_lowp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vget_lowp64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x2_t arg0_poly64x2_t; - - out_poly64x1_t = vget_low_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowp8 (void) -{ - register poly8x8_t out_poly8x8_t asm ("d18"); - poly8x16_t arg0_poly8x16_t; - - out_poly8x8_t = vget_low_p8 (arg0_poly8x16_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lows16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lows16 (void) -{ - register int16x4_t out_int16x4_t asm ("d18"); - int16x8_t arg0_int16x8_t; - - out_int16x4_t = vget_low_s16 (arg0_int16x8_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lows32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lows32 (void) -{ - register int32x2_t out_int32x2_t asm ("d18"); - int32x4_t arg0_int32x4_t; - - out_int32x2_t = vget_low_s32 (arg0_int32x4_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_lows64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lows64 (void) -{ - int64x1_t out_int64x1_t; - int64x2_t arg0_int64x2_t; - - out_int64x1_t = vget_low_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lows8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lows8 (void) -{ - register int8x8_t out_int8x8_t asm ("d18"); - int8x16_t arg0_int8x16_t; - - out_int8x8_t = vget_low_s8 (arg0_int8x16_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowu16 (void) -{ - register uint16x4_t out_uint16x4_t asm ("d18"); - uint16x8_t arg0_uint16x8_t; - - out_uint16x4_t = vget_low_u16 (arg0_uint16x8_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowu32 (void) -{ - register uint32x2_t out_uint32x2_t asm ("d18"); - uint32x4_t arg0_uint32x4_t; - - out_uint32x2_t = vget_low_u32 (arg0_uint32x4_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vget_lowu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x2_t arg0_uint64x2_t; - - out_uint64x1_t = vget_low_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vget_lowu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vget_lowu8 (void) -{ - register uint8x8_t out_uint8x8_t asm ("d18"); - uint8x16_t arg0_uint8x16_t; - - out_uint8x8_t = vget_low_u8 (arg0_uint8x16_t); -} - -/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vhaddq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vhadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vhaddq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vhadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vhaddq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vhadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vhaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vhadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vhaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vhadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vhaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vhadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhadds16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhadds16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vhadd_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vhadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhadds32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhadds32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vhadd_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vhadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhadds8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhadds8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vhadd_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vhadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vhadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vhadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vhadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vhadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhaddu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhaddu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vhadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vhadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vhsubq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vhsub\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vhsubq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vhsub\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vhsubq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vhsub\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vhsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vhsub\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vhsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vhsub\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vhsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vhsub\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubs16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vhsub_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vhsub\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubs32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vhsub_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vhsub\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubs8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vhsub_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vhsub\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vhsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vhsub\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vhsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vhsub\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vhsubu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vhsubu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vhsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vhsub\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupf32 (void) -{ - float32x4_t out_float32x4_t; - - out_float32x4_t = vld1q_dup_f32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupp16 (void) -{ - poly16x8_t out_poly16x8_t; - - out_poly16x8_t = vld1q_dup_p16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1Q_dupp64 (void) -{ - poly64x2_t out_poly64x2_t; - - out_poly64x2_t = vld1q_dup_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupp8 (void) -{ - poly8x16_t out_poly8x16_t; - - out_poly8x16_t = vld1q_dup_p8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dups16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dups16 (void) -{ - int16x8_t out_int16x8_t; - - out_int16x8_t = vld1q_dup_s16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dups32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dups32 (void) -{ - int32x4_t out_int32x4_t; - - out_int32x4_t = vld1q_dup_s32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dups64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dups64 (void) -{ - int64x2_t out_int64x2_t; - - out_int64x2_t = vld1q_dup_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dups8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dups8 (void) -{ - int8x16_t out_int8x16_t; - - out_int8x16_t = vld1q_dup_s8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupu16 (void) -{ - uint16x8_t out_uint16x8_t; - - out_uint16x8_t = vld1q_dup_u16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupu32 (void) -{ - uint32x4_t out_uint32x4_t; - - out_uint32x4_t = vld1q_dup_u32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupu64 (void) -{ - uint64x2_t out_uint64x2_t; - - out_uint64x2_t = vld1q_dup_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Q_dupu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_dupu8 (void) -{ - uint8x16_t out_uint8x16_t; - - out_uint8x16_t = vld1q_dup_u8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanef32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vld1q_lane_f32 (0, arg1_float32x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanep16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8_t = vld1q_lane_p16 (0, arg1_poly16x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1Q_lanep64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x2_t arg1_poly64x2_t; - - out_poly64x2_t = vld1q_lane_p64 (0, arg1_poly64x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanep8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16_t = vld1q_lane_p8 (0, arg1_poly8x16_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanes16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vld1q_lane_s16 (0, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanes32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vld1q_lane_s32 (0, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanes64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vld1q_lane_s64 (0, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_lanes8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vld1q_lane_s8 (0, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_laneu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vld1q_lane_u16 (0, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_laneu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vld1q_lane_u32 (0, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_laneu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vld1q_lane_u64 (0, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1Q_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Q_laneu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vld1q_lane_u8 (0, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qf32 (void) -{ - float32x4_t out_float32x4_t; - - out_float32x4_t = vld1q_f32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qp16' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qp16 (void) -{ - poly16x8_t out_poly16x8_t; - - out_poly16x8_t = vld1q_p16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1Qp64 (void) -{ - poly64x2_t out_poly64x2_t; - - out_poly64x2_t = vld1q_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qp8 (void) -{ - poly8x16_t out_poly8x16_t; - - out_poly8x16_t = vld1q_p8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qs16 (void) -{ - int16x8_t out_int16x8_t; - - out_int16x8_t = vld1q_s16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qs32 (void) -{ - int32x4_t out_int32x4_t; - - out_int32x4_t = vld1q_s32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qs64 (void) -{ - int64x2_t out_int64x2_t; - - out_int64x2_t = vld1q_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qs8 (void) -{ - int8x16_t out_int8x16_t; - - out_int8x16_t = vld1q_s8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qu16 (void) -{ - uint16x8_t out_uint16x8_t; - - out_uint16x8_t = vld1q_u16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qu32 (void) -{ - uint32x4_t out_uint32x4_t; - - out_uint32x4_t = vld1q_u32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qu64 (void) -{ - uint64x2_t out_uint64x2_t; - - out_uint64x2_t = vld1q_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1Qu8 (void) -{ - uint8x16_t out_uint8x16_t; - - out_uint8x16_t = vld1q_u8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupf32 (void) -{ - float32x2_t out_float32x2_t; - - out_float32x2_t = vld1_dup_f32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupp16 (void) -{ - poly16x4_t out_poly16x4_t; - - out_poly16x4_t = vld1_dup_p16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1_dupp64 (void) -{ - poly64x1_t out_poly64x1_t; - - out_poly64x1_t = vld1_dup_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupp8 (void) -{ - poly8x8_t out_poly8x8_t; - - out_poly8x8_t = vld1_dup_p8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dups16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dups16 (void) -{ - int16x4_t out_int16x4_t; - - out_int16x4_t = vld1_dup_s16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dups32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dups32 (void) -{ - int32x2_t out_int32x2_t; - - out_int32x2_t = vld1_dup_s32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dups64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dups64 (void) -{ - int64x1_t out_int64x1_t; - - out_int64x1_t = vld1_dup_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dups8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dups8 (void) -{ - int8x8_t out_int8x8_t; - - out_int8x8_t = vld1_dup_s8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupu16 (void) -{ - uint16x4_t out_uint16x4_t; - - out_uint16x4_t = vld1_dup_u16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupu32 (void) -{ - uint32x2_t out_uint32x2_t; - - out_uint32x2_t = vld1_dup_u32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupu64 (void) -{ - uint64x1_t out_uint64x1_t; - - out_uint64x1_t = vld1_dup_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1_dupu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_dupu8 (void) -{ - uint8x8_t out_uint8x8_t; - - out_uint8x8_t = vld1_dup_u8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanef32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vld1_lane_f32 (0, arg1_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanep16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4_t = vld1_lane_p16 (0, arg1_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1_lanep64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x1_t arg1_poly64x1_t; - - out_poly64x1_t = vld1_lane_p64 (0, arg1_poly64x1_t, 0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanep8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8_t = vld1_lane_p8 (0, arg1_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanes16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vld1_lane_s16 (0, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanes32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vld1_lane_s32 (0, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanes64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vld1_lane_s64 (0, arg1_int64x1_t, 0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_lanes8' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_lanes8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vld1_lane_s8 (0, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_laneu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vld1_lane_u16 (0, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_laneu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vld1_lane_u32 (0, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_laneu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vld1_lane_u64 (0, arg1_uint64x1_t, 0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld1_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1_laneu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vld1_lane_u8 (0, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1f32 (void) -{ - float32x2_t out_float32x2_t; - - out_float32x2_t = vld1_f32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1p16 (void) -{ - poly16x4_t out_poly16x4_t; - - out_poly16x4_t = vld1_p16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld1p64 (void) -{ - poly64x1_t out_poly64x1_t; - - out_poly64x1_t = vld1_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1p8 (void) -{ - poly8x8_t out_poly8x8_t; - - out_poly8x8_t = vld1_p8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1s16 (void) -{ - int16x4_t out_int16x4_t; - - out_int16x4_t = vld1_s16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1s32 (void) -{ - int32x2_t out_int32x2_t; - - out_int32x2_t = vld1_s32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1s64 (void) -{ - int64x1_t out_int64x1_t; - - out_int64x1_t = vld1_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1s8 (void) -{ - int8x8_t out_int8x8_t; - - out_int8x8_t = vld1_s8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1u16 (void) -{ - uint16x4_t out_uint16x4_t; - - out_uint16x4_t = vld1_u16 (0); -} - -/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1u32 (void) -{ - uint32x2_t out_uint32x2_t; - - out_uint32x2_t = vld1_u32 (0); -} - -/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1u64 (void) -{ - uint64x1_t out_uint64x1_t; - - out_uint64x1_t = vld1_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld1u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld1u8 (void) -{ - uint8x8_t out_uint8x8_t; - - out_uint8x8_t = vld1_u8 (0); -} - -/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_lanef32 (void) -{ - float32x4x2_t out_float32x4x2_t; - float32x4x2_t arg1_float32x4x2_t; - - out_float32x4x2_t = vld2q_lane_f32 (0, arg1_float32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_lanep16 (void) -{ - poly16x8x2_t out_poly16x8x2_t; - poly16x8x2_t arg1_poly16x8x2_t; - - out_poly16x8x2_t = vld2q_lane_p16 (0, arg1_poly16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_lanes16 (void) -{ - int16x8x2_t out_int16x8x2_t; - int16x8x2_t arg1_int16x8x2_t; - - out_int16x8x2_t = vld2q_lane_s16 (0, arg1_int16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_lanes32 (void) -{ - int32x4x2_t out_int32x4x2_t; - int32x4x2_t arg1_int32x4x2_t; - - out_int32x4x2_t = vld2q_lane_s32 (0, arg1_int32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_laneu16 (void) -{ - uint16x8x2_t out_uint16x8x2_t; - uint16x8x2_t arg1_uint16x8x2_t; - - out_uint16x8x2_t = vld2q_lane_u16 (0, arg1_uint16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Q_laneu32 (void) -{ - uint32x4x2_t out_uint32x4x2_t; - uint32x4x2_t arg1_uint32x4x2_t; - - out_uint32x4x2_t = vld2q_lane_u32 (0, arg1_uint32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qf32 (void) -{ - float32x4x2_t out_float32x4x2_t; - - out_float32x4x2_t = vld2q_f32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qp16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qp16 (void) -{ - poly16x8x2_t out_poly16x8x2_t; - - out_poly16x8x2_t = vld2q_p16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qp8 (void) -{ - poly8x16x2_t out_poly8x16x2_t; - - out_poly8x16x2_t = vld2q_p8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qs16 (void) -{ - int16x8x2_t out_int16x8x2_t; - - out_int16x8x2_t = vld2q_s16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qs32 (void) -{ - int32x4x2_t out_int32x4x2_t; - - out_int32x4x2_t = vld2q_s32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qs8 (void) -{ - int8x16x2_t out_int8x16x2_t; - - out_int8x16x2_t = vld2q_s8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qu16 (void) -{ - uint16x8x2_t out_uint16x8x2_t; - - out_uint16x8x2_t = vld2q_u16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qu32 (void) -{ - uint32x4x2_t out_uint32x4x2_t; - - out_uint32x4x2_t = vld2q_u32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2Qu8 (void) -{ - uint8x16x2_t out_uint8x16x2_t; - - out_uint8x16x2_t = vld2q_u8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupf32 (void) -{ - float32x2x2_t out_float32x2x2_t; - - out_float32x2x2_t = vld2_dup_f32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupp16 (void) -{ - poly16x4x2_t out_poly16x4x2_t; - - out_poly16x4x2_t = vld2_dup_p16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld2_dupp64 (void) -{ - poly64x1x2_t out_poly64x1x2_t; - - out_poly64x1x2_t = vld2_dup_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupp8 (void) -{ - poly8x8x2_t out_poly8x8x2_t; - - out_poly8x8x2_t = vld2_dup_p8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dups16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dups16 (void) -{ - int16x4x2_t out_int16x4x2_t; - - out_int16x4x2_t = vld2_dup_s16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dups32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dups32 (void) -{ - int32x2x2_t out_int32x2x2_t; - - out_int32x2x2_t = vld2_dup_s32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dups64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dups64 (void) -{ - int64x1x2_t out_int64x1x2_t; - - out_int64x1x2_t = vld2_dup_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dups8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dups8 (void) -{ - int8x8x2_t out_int8x8x2_t; - - out_int8x8x2_t = vld2_dup_s8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupu16 (void) -{ - uint16x4x2_t out_uint16x4x2_t; - - out_uint16x4x2_t = vld2_dup_u16 (0); -} - -/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupu32 (void) -{ - uint32x2x2_t out_uint32x2x2_t; - - out_uint32x2x2_t = vld2_dup_u32 (0); -} - -/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupu64 (void) -{ - uint64x1x2_t out_uint64x1x2_t; - - out_uint64x1x2_t = vld2_dup_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld2_dupu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld2_dupu8 (void) -{ - uint8x8x2_t out_uint8x8x2_t; - - out_uint8x8x2_t = vld2_dup_u8 (0); -} - -/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld2_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanef32 (void)
-{
-  float32x2x2_t out_float32x2x2_t;
-  float32x2x2_t arg1_float32x2x2_t;
-
-  out_float32x2x2_t = vld2_lane_f32 (0, arg1_float32x2x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanep16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanep16 (void)
-{
-  poly16x4x2_t out_poly16x4x2_t;
-  poly16x4x2_t arg1_poly16x4x2_t;
-
-  out_poly16x4x2_t = vld2_lane_p16 (0, arg1_poly16x4x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanep8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_lanep8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanep8 (void)
-{
-  poly8x8x2_t out_poly8x8x2_t;
-  poly8x8x2_t arg1_poly8x8x2_t;
-
-  out_poly8x8x2_t = vld2_lane_p8 (0, arg1_poly8x8x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanes16 (void)
-{
-  int16x4x2_t out_int16x4x2_t;
-  int16x4x2_t arg1_int16x4x2_t;
-
-  out_int16x4x2_t = vld2_lane_s16 (0, arg1_int16x4x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanes32 (void)
-{
-  int32x2x2_t out_int32x2x2_t;
-  int32x2x2_t arg1_int32x2x2_t;
-
-  out_int32x2x2_t = vld2_lane_s32 (0, arg1_int32x2x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_lanes8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_lanes8 (void)
-{
-  int8x8x2_t out_int8x8x2_t;
-  int8x8x2_t arg1_int8x8x2_t;
-
-  out_int8x8x2_t = vld2_lane_s8 (0, arg1_int8x8x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_laneu16 (void)
-{
-  uint16x4x2_t out_uint16x4x2_t;
-  uint16x4x2_t arg1_uint16x4x2_t;
-
-  out_uint16x4x2_t = vld2_lane_u16 (0, arg1_uint16x4x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_laneu32 (void)
-{
-  uint32x2x2_t out_uint32x2x2_t;
-  uint32x2x2_t arg1_uint32x2x2_t;
-
-  out_uint32x2x2_t = vld2_lane_u32 (0, arg1_uint32x2x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld2_laneu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2_laneu8 (void)
-{
-  uint8x8x2_t out_uint8x8x2_t;
-  uint8x8x2_t arg1_uint8x8x2_t;
-
-  out_uint8x8x2_t = vld2_lane_u8 (0, arg1_uint8x8x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2f32 (void)
-{
-  float32x2x2_t out_float32x2x2_t;
-
-  out_float32x2x2_t = vld2_f32 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2p16 (void)
-{
-  poly16x4x2_t out_poly16x4x2_t;
-
-  out_poly16x4x2_t = vld2_p16 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2p64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vld2p64 (void)
-{
-  poly64x1x2_t out_poly64x1x2_t;
-
-  out_poly64x1x2_t = vld2_p64 (0);
-}
-
-/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2p8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2p8 (void)
-{
-  poly8x8x2_t out_poly8x8x2_t;
-
-  out_poly8x8x2_t = vld2_p8 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2s16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2s16 (void)
-{
-  int16x4x2_t out_int16x4x2_t;
-
-  out_int16x4x2_t = vld2_s16 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2s32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2s32 (void)
-{
-  int32x2x2_t out_int32x2x2_t;
-
-  out_int32x2x2_t = vld2_s32 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2s64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2s64 (void)
-{
-  int64x1x2_t out_int64x1x2_t;
-
-  out_int64x1x2_t = vld2_s64 (0);
-}
-
-/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2s8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2s8 (void)
-{
-  int8x8x2_t out_int8x8x2_t;
-
-  out_int8x8x2_t = vld2_s8 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2u16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2u16 (void)
-{
-  uint16x4x2_t out_uint16x4x2_t;
-
-  out_uint16x4x2_t = vld2_u16 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2u32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2u32 (void)
-{
-  uint32x2x2_t out_uint32x2x2_t;
-
-  out_uint32x2x2_t = vld2_u32 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2u64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2u64 (void)
-{
-  uint64x1x2_t out_uint64x1x2_t;
-
-  out_uint64x1x2_t = vld2_u64 (0);
-}
-
-/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vld2u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld2u8 (void)
-{
-  uint8x8x2_t out_uint8x8x2_t;
-
-  out_uint8x8x2_t = vld2_u8 (0);
-}
-
-/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld3Q_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vld3Q_lanef32 (void)
-{
-  float32x4x3_t out_float32x4x3_t;
-  float32x4x3_t arg1_float32x4x3_t;
-
-  out_float32x4x3_t = vld3q_lane_f32 (0, arg1_float32x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanep16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vld3Q_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen.
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Q_lanep16 (void) -{ - poly16x8x3_t out_poly16x8x3_t; - poly16x8x3_t arg1_poly16x8x3_t; - - out_poly16x8x3_t = vld3q_lane_p16 (0, arg1_poly16x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Q_lanes16 (void) -{ - int16x8x3_t out_int16x8x3_t; - int16x8x3_t arg1_int16x8x3_t; - - out_int16x8x3_t = vld3q_lane_s16 (0, arg1_int16x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Q_lanes32 (void) -{ - int32x4x3_t out_int32x4x3_t; - int32x4x3_t arg1_int32x4x3_t; - - out_int32x4x3_t = vld3q_lane_s32 (0, arg1_int32x4x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Q_laneu16 (void) -{ - uint16x8x3_t out_uint16x8x3_t; - uint16x8x3_t arg1_uint16x8x3_t; - - out_uint16x8x3_t = vld3q_lane_u16 (0, arg1_uint16x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Q_laneu32 (void) -{ - uint32x4x3_t out_uint32x4x3_t; - uint32x4x3_t arg1_uint32x4x3_t; - - out_uint32x4x3_t = vld3q_lane_u32 (0, arg1_uint32x4x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qf32 (void) -{ - float32x4x3_t out_float32x4x3_t; - - out_float32x4x3_t = vld3q_f32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qp16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qp16 (void) -{ - poly16x8x3_t out_poly16x8x3_t; - - out_poly16x8x3_t = vld3q_p16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qp8 (void) -{ - poly8x16x3_t out_poly8x16x3_t; - - out_poly8x16x3_t = vld3q_p8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qs16' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qs16 (void) -{ - int16x8x3_t out_int16x8x3_t; - - out_int16x8x3_t = vld3q_s16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qs32 (void) -{ - int32x4x3_t out_int32x4x3_t; - - out_int32x4x3_t = vld3q_s32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qs8 (void) -{ - int8x16x3_t out_int8x16x3_t; - - out_int8x16x3_t = vld3q_s8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qu16 (void) -{ - uint16x8x3_t out_uint16x8x3_t; - - out_uint16x8x3_t = vld3q_u16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qu32 (void) -{ - uint32x4x3_t out_uint32x4x3_t; - - out_uint32x4x3_t = vld3q_u32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3Qu8 (void) -{ - uint8x16x3_t out_uint8x16x3_t; - - out_uint8x16x3_t = vld3q_u8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupf32 (void) -{ - float32x2x3_t out_float32x2x3_t; - - out_float32x2x3_t = vld3_dup_f32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupp16 (void) -{ - poly16x4x3_t out_poly16x4x3_t; - - out_poly16x4x3_t = vld3_dup_p16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld3_dupp64 (void) -{ - poly64x1x3_t out_poly64x1x3_t; - - out_poly64x1x3_t = vld3_dup_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupp8 (void) -{ - poly8x8x3_t out_poly8x8x3_t; - - out_poly8x8x3_t = vld3_dup_p8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dups16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dups16 (void) -{ - int16x4x3_t out_int16x4x3_t; - - out_int16x4x3_t = vld3_dup_s16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dups32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dups32 (void) -{ - int32x2x3_t out_int32x2x3_t; - - out_int32x2x3_t = vld3_dup_s32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dups64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dups64 (void) -{ - int64x1x3_t out_int64x1x3_t; - - out_int64x1x3_t = vld3_dup_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dups8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dups8 (void) -{ - int8x8x3_t out_int8x8x3_t; - - out_int8x8x3_t = vld3_dup_s8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupu16 (void) -{ - uint16x4x3_t out_uint16x4x3_t; - - out_uint16x4x3_t = vld3_dup_u16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupu32 (void) -{ - uint32x2x3_t out_uint32x2x3_t; - - out_uint32x2x3_t = vld3_dup_u32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupu64 (void) -{ - uint64x1x3_t out_uint64x1x3_t; - - out_uint64x1x3_t = vld3_dup_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3_dupu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_dupu8 (void) -{ - uint8x8x3_t out_uint8x8x3_t; - - out_uint8x8x3_t = vld3_dup_u8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanef32 (void) -{ - float32x2x3_t out_float32x2x3_t; - float32x2x3_t arg1_float32x2x3_t; - - out_float32x2x3_t = vld3_lane_f32 (0, arg1_float32x2x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanep16 (void) -{ - poly16x4x3_t out_poly16x4x3_t; - poly16x4x3_t arg1_poly16x4x3_t; - - out_poly16x4x3_t = vld3_lane_p16 (0, arg1_poly16x4x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanep8 (void) -{ - poly8x8x3_t out_poly8x8x3_t; - poly8x8x3_t arg1_poly8x8x3_t; - - out_poly8x8x3_t = vld3_lane_p8 (0, arg1_poly8x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanes16 (void) -{ - int16x4x3_t out_int16x4x3_t; - int16x4x3_t arg1_int16x4x3_t; - - out_int16x4x3_t = vld3_lane_s16 (0, arg1_int16x4x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanes32 (void) -{ - int32x2x3_t out_int32x2x3_t; - int32x2x3_t arg1_int32x2x3_t; - - out_int32x2x3_t = vld3_lane_s32 (0, arg1_int32x2x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_lanes8 (void) -{ - int8x8x3_t out_int8x8x3_t; - int8x8x3_t arg1_int8x8x3_t; - - out_int8x8x3_t = vld3_lane_s8 (0, arg1_int8x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_laneu16 (void) -{ - uint16x4x3_t out_uint16x4x3_t; - uint16x4x3_t arg1_uint16x4x3_t; - - out_uint16x4x3_t = vld3_lane_u16 (0, arg1_uint16x4x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_laneu32 (void) -{ - uint32x2x3_t out_uint32x2x3_t; - uint32x2x3_t arg1_uint32x2x3_t; - - out_uint32x2x3_t = vld3_lane_u32 (0, arg1_uint32x2x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld3_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3_laneu8 (void) -{ - uint8x8x3_t out_uint8x8x3_t; - uint8x8x3_t arg1_uint8x8x3_t; - - out_uint8x8x3_t = vld3_lane_u8 (0, arg1_uint8x8x3_t, 1); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3f32 (void) -{ - float32x2x3_t out_float32x2x3_t; - - out_float32x2x3_t = vld3_f32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3p16 (void) -{ - poly16x4x3_t out_poly16x4x3_t; - - out_poly16x4x3_t = vld3_p16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld3p64 (void) -{ - poly64x1x3_t out_poly64x1x3_t; - - out_poly64x1x3_t = vld3_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3p8 (void) -{ - poly8x8x3_t out_poly8x8x3_t; - - out_poly8x8x3_t = vld3_p8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3s16 (void) -{ - int16x4x3_t out_int16x4x3_t; - - out_int16x4x3_t = vld3_s16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3s32 (void) -{ - int32x2x3_t out_int32x2x3_t; - - out_int32x2x3_t = vld3_s32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3s64 (void) -{ - int64x1x3_t out_int64x1x3_t; - - out_int64x1x3_t = vld3_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3s8 (void) -{ - int8x8x3_t out_int8x8x3_t; - - out_int8x8x3_t = vld3_s8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3u16 (void) -{ - uint16x4x3_t out_uint16x4x3_t; - - out_uint16x4x3_t = vld3_u16 (0); -} - -/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3u32 (void) -{ - uint32x2x3_t out_uint32x2x3_t; - - out_uint32x2x3_t = vld3_u32 (0); -} - -/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3u64 (void) -{ - uint64x1x3_t out_uint64x1x3_t; - - out_uint64x1x3_t = vld3_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld3u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld3u8 (void) -{ - uint8x8x3_t out_uint8x8x3_t; - - out_uint8x8x3_t = vld3_u8 (0); -} - -/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_lanef32 (void) -{ - float32x4x4_t out_float32x4x4_t; - float32x4x4_t arg1_float32x4x4_t; - - out_float32x4x4_t = vld4q_lane_f32 (0, arg1_float32x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_lanep16 (void) -{ - poly16x8x4_t out_poly16x8x4_t; - poly16x8x4_t arg1_poly16x8x4_t; - - out_poly16x8x4_t = vld4q_lane_p16 (0, arg1_poly16x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_lanes16 (void) -{ - int16x8x4_t out_int16x8x4_t; - int16x8x4_t arg1_int16x8x4_t; - - out_int16x8x4_t = vld4q_lane_s16 (0, arg1_int16x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_lanes32 (void) -{ - int32x4x4_t out_int32x4x4_t; - int32x4x4_t arg1_int32x4x4_t; - - out_int32x4x4_t = vld4q_lane_s32 (0, arg1_int32x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_laneu16 (void) -{ - uint16x8x4_t out_uint16x8x4_t; - uint16x8x4_t arg1_uint16x8x4_t; - - out_uint16x8x4_t = vld4q_lane_u16 (0, arg1_uint16x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Q_laneu32 (void) -{ - uint32x4x4_t out_uint32x4x4_t; - uint32x4x4_t arg1_uint32x4x4_t; - - out_uint32x4x4_t = vld4q_lane_u32 (0, arg1_uint32x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qf32 (void) -{ - float32x4x4_t out_float32x4x4_t; - - out_float32x4x4_t = vld4q_f32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qp16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qp16 (void) -{ - poly16x8x4_t out_poly16x8x4_t; - - out_poly16x8x4_t = vld4q_p16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qp8 (void) -{ - poly8x16x4_t out_poly8x16x4_t; - - out_poly8x16x4_t = vld4q_p8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qs16 (void) -{ - int16x8x4_t out_int16x8x4_t; - - out_int16x8x4_t = vld4q_s16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qs32 (void) -{ - int32x4x4_t out_int32x4x4_t; - - out_int32x4x4_t = vld4q_s32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qs8 (void) -{ - int8x16x4_t out_int8x16x4_t; - - out_int8x16x4_t = vld4q_s8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qu16 (void) -{ - uint16x8x4_t out_uint16x8x4_t; - - out_uint16x8x4_t = vld4q_u16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qu32 (void) -{ - uint32x4x4_t out_uint32x4x4_t; - - out_uint32x4x4_t = vld4q_u32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4Qu8 (void) -{ - uint8x16x4_t out_uint8x16x4_t; - - out_uint8x16x4_t = vld4q_u8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupf32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupf32 (void) -{ - float32x2x4_t out_float32x2x4_t; - - out_float32x2x4_t = vld4_dup_f32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupp16' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupp16 (void) -{ - poly16x4x4_t out_poly16x4x4_t; - - out_poly16x4x4_t = vld4_dup_p16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld4_dupp64 (void) -{ - poly64x1x4_t out_poly64x1x4_t; - - out_poly64x1x4_t = vld4_dup_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupp8 (void) -{ - poly8x8x4_t out_poly8x8x4_t; - - out_poly8x8x4_t = vld4_dup_p8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dups16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dups16 (void) -{ - int16x4x4_t out_int16x4x4_t; - - out_int16x4x4_t = vld4_dup_s16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dups32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dups32 (void) -{ - int32x2x4_t out_int32x2x4_t; - - out_int32x2x4_t = vld4_dup_s32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dups64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dups64 (void) -{ - int64x1x4_t out_int64x1x4_t; - - out_int64x1x4_t = vld4_dup_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dups8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dups8 (void) -{ - int8x8x4_t out_int8x8x4_t; - - out_int8x8x4_t = vld4_dup_s8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupu16 (void) -{ - uint16x4x4_t out_uint16x4x4_t; - - out_uint16x4x4_t = vld4_dup_u16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupu32 (void) -{ - uint32x2x4_t out_uint32x2x4_t; - - out_uint32x2x4_t = vld4_dup_u32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupu64 (void) -{ - uint64x1x4_t out_uint64x1x4_t; - - out_uint64x1x4_t = vld4_dup_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4_dupu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_dupu8 (void) -{ - uint8x8x4_t out_uint8x8x4_t; - - out_uint8x8x4_t = vld4_dup_u8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanef32 (void) -{ - float32x2x4_t out_float32x2x4_t; - float32x2x4_t arg1_float32x2x4_t; - - out_float32x2x4_t = vld4_lane_f32 (0, arg1_float32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanep16 (void) -{ - poly16x4x4_t out_poly16x4x4_t; - poly16x4x4_t arg1_poly16x4x4_t; - - out_poly16x4x4_t = vld4_lane_p16 (0, arg1_poly16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanep8 (void) -{ - poly8x8x4_t out_poly8x8x4_t; - poly8x8x4_t arg1_poly8x8x4_t; - - out_poly8x8x4_t = vld4_lane_p8 (0, arg1_poly8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanes16 (void) -{ - int16x4x4_t out_int16x4x4_t; - int16x4x4_t arg1_int16x4x4_t; - - out_int16x4x4_t = vld4_lane_s16 (0, arg1_int16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanes32 (void) -{ - int32x2x4_t out_int32x2x4_t; - int32x2x4_t arg1_int32x2x4_t; - - out_int32x2x4_t = vld4_lane_s32 (0, arg1_int32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_lanes8 (void) -{ - int8x8x4_t out_int8x8x4_t; - int8x8x4_t arg1_int8x8x4_t; - - out_int8x8x4_t = vld4_lane_s8 (0, arg1_int8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_laneu16 (void) -{ - uint16x4x4_t out_uint16x4x4_t; - uint16x4x4_t arg1_uint16x4x4_t; - - out_uint16x4x4_t = vld4_lane_u16 (0, arg1_uint16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_laneu32 (void) -{ - uint32x2x4_t out_uint32x2x4_t; - uint32x2x4_t arg1_uint32x2x4_t; - - out_uint32x2x4_t = vld4_lane_u32 (0, arg1_uint32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vld4_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4_laneu8 (void) -{ - uint8x8x4_t out_uint8x8x4_t; - uint8x8x4_t arg1_uint8x8x4_t; - - out_uint8x8x4_t = vld4_lane_u8 (0, arg1_uint8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4f32 (void) -{ - float32x2x4_t out_float32x2x4_t; - - out_float32x2x4_t = vld4_f32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4p16 (void) -{ - poly16x4x4_t out_poly16x4x4_t; - - out_poly16x4x4_t = vld4_p16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vld4p64 (void) -{ - poly64x1x4_t out_poly64x1x4_t; - - out_poly64x1x4_t = vld4_p64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4p8 (void) -{ - poly8x8x4_t out_poly8x8x4_t; - - out_poly8x8x4_t = vld4_p8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4s16 (void) -{ - int16x4x4_t out_int16x4x4_t; - - out_int16x4x4_t = vld4_s16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4s32 (void) -{ - int32x2x4_t out_int32x2x4_t; - - out_int32x2x4_t = vld4_s32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4s64 (void) -{ - int64x1x4_t out_int64x1x4_t; - - out_int64x1x4_t = vld4_s64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4s8 (void) -{ - int8x8x4_t out_int8x8x4_t; - - out_int8x8x4_t = vld4_s8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4u16 (void) -{ - uint16x4x4_t out_uint16x4x4_t; - - out_uint16x4x4_t = vld4_u16 (0); -} - -/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4u32 (void) -{ - uint32x2x4_t out_uint32x2x4_t; - - out_uint32x2x4_t = vld4_u32 (0); -} - -/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4u64 (void) -{ - uint64x1x4_t out_uint64x1x4_t; - - out_uint64x1x4_t = vld4_u64 (0); -} - -/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vld4u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vld4u8 (void) -{ - uint8x8x4_t out_uint8x8x4_t; - - out_uint8x8x4_t = vld4_u8 (0); -} - -/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vmaxq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vmax\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vmaxq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vmax\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vmaxq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vmax\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vmaxq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vmax\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vmaxq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vmax\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vmaxq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vmax\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vmaxq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vmax\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vmax_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vmax\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxs16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vmax_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vmax\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxs32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vmax_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vmax\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxs8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vmax_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vmax\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vmax_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vmax\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vmax_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vmax\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmaxu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmaxu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vmax_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vmax\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vminq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vmin\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vminq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vmin\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vminq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vmin\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vminq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vmin\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vminq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vmin\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vminq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vmin\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vminq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vmin\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vmin_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vmin\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmins16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmins16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmins16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vmin_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vmin\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmins32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmins32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmins32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vmin_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vmin\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmins8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmins8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmins8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vmin_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vmin\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vmin_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vmin\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vmin_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vmin\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vminu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vminu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vminu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vmin_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vmin\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanef32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_lanef32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - float32x2_t arg2_float32x2_t; - - out_float32x4_t = vmlaq_lane_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanes16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_lanes16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - int16x4_t arg2_int16x4_t; - - out_int16x8_t = vmlaq_lane_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanes32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_lanes32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - int32x2_t arg2_int32x2_t; - - out_int32x4_t = vmlaq_lane_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_laneu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_laneu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - uint16x4_t arg2_uint16x4_t; - - out_uint16x8_t = vmlaq_lane_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_laneu32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_laneu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - uint32x2_t arg2_uint32x2_t; - - out_uint32x4_t = vmlaq_lane_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nf32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_nf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - float32_t arg2_float32_t; - - out_float32x4_t = vmlaq_n_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32_t); -} - -/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_ns16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - int16_t arg2_int16_t; - - out_int16x8_t = vmlaq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16_t); -} - -/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_ns32.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmlaQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - int32_t arg2_int32_t; - - out_int32x4_t = vmlaq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32_t); -} - -/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nu16.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vmlaQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQ_nu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint16x8_t = vmlaq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQ_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQ_nu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint32x4_t = vmlaq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-  float32x4_t arg2_float32x4_t;
-
-  out_float32x4_t = vmlaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-  int16x8_t arg2_int16x8_t;
-
-  out_int16x8_t = vmlaq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-  int32x4_t arg2_int32x4_t;
-
-  out_int32x4_t = vmlaq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-  int8x16_t arg2_int8x16_t;
-
-  out_int8x16_t = vmlaq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-  uint16x8_t arg2_uint16x8_t;
-
-  out_uint16x8_t = vmlaq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-  uint32x4_t arg2_uint32x4_t;
-
-  out_uint32x4_t = vmlaq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-  uint8x16_t arg2_uint8x16_t;
-
-  out_uint8x16_t = vmlaq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanef32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_lanef32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_lanef32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32x2_t arg2_float32x2_t;
-
-  out_float32x2_t = vmla_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_lanes16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int16x4_t = vmla_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_lanes32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int32x2_t = vmla_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_laneu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_laneu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_laneu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint16x4_t = vmla_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_laneu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_laneu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_laneu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint32x2_t = vmla_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_nf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_nf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32_t arg2_float32_t;
-
-  out_float32x2_t = vmla_n_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int16x4_t = vmla_n_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int32x2_t = vmla_n_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_nu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint16x4_t = vmla_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmla_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmla_nu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint32x2_t = vmla_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlaf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlaf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32x2_t arg2_float32x2_t;
-
-  out_float32x2_t = vmla_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_lanes16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vmlal_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_lanes32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vmlal_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_laneu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_laneu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_laneu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint32x4_t = vmlal_lane_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_laneu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_laneu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_laneu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint64x2_t = vmlal_lane_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_ns16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int32x4_t = vmlal_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_ns32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int64x2_t = vmlal_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_nu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_nu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint32x4_t = vmlal_n_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlal_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlal_nu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint64x2_t = vmlal_n_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlals16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlals16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vmlal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlals32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlals32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vmlal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlals8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlals8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int8x8_t arg1_int8x8_t;
-  int8x8_t arg2_int8x8_t;
-
-  out_int16x8_t = vmlal_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlalu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlalu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint32x4_t = vmlal_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlalu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlalu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint64x2_t = vmlal_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlalu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlalu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint8x8_t arg1_uint8x8_t;
-  uint8x8_t arg2_uint8x8_t;
-
-  out_uint16x8_t = vmlal_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmlal\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlas16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlas16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int16x4_t = vmla_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlas32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlas32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int32x2_t = vmla_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlas8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlas8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-  int8x8_t arg2_int8x8_t;
-
-  out_int8x8_t = vmla_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlau16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlau16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint16x4_t = vmla_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlau32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlau32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint32x2_t = vmla_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlau8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlau8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-  uint8x8_t arg2_uint8x8_t;
-
-  out_uint8x8_t = vmla_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanef32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_lanef32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_lanef32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-  float32x2_t arg2_float32x2_t;
-
-  out_float32x4_t = vmlsq_lane_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_lanes16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int16x8_t = vmlsq_lane_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_lanes32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int32x4_t = vmlsq_lane_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_laneu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_laneu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_laneu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint16x8_t = vmlsq_lane_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_laneu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_laneu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_laneu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint32x4_t = vmlsq_lane_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_nf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_nf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-  float32_t arg2_float32_t;
-
-  out_float32x4_t = vmlsq_n_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_ns16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-  int16_t arg2_int16_t;
-
-  out_int16x8_t = vmlsq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_ns32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-  int32_t arg2_int32_t;
-
-  out_int32x4_t = vmlsq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_nu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint16x8_t = vmlsq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQ_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQ_nu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint32x4_t = vmlsq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-  float32x4_t arg2_float32x4_t;
-
-  out_float32x4_t = vmlsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-  int16x8_t arg2_int16x8_t;
-
-  out_int16x8_t = vmlsq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-  int32x4_t arg2_int32x4_t;
-
-  out_int32x4_t = vmlsq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-  int8x16_t arg2_int8x16_t;
-
-  out_int8x16_t = vmlsq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-  uint16x8_t arg2_uint16x8_t;
-
-  out_uint16x8_t = vmlsq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-  uint32x4_t arg2_uint32x4_t;
-
-  out_uint32x4_t = vmlsq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-  uint8x16_t arg2_uint8x16_t;
-
-  out_uint8x16_t = vmlsq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanef32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_lanef32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_lanef32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32x2_t arg2_float32x2_t;
-
-  out_float32x2_t = vmls_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_lanes16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int16x4_t = vmls_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_lanes32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int32x2_t = vmls_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_laneu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_laneu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_laneu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint16x4_t = vmls_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_laneu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_laneu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_laneu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint32x2_t = vmls_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_nf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_nf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32_t arg2_float32_t;
-
-  out_float32x2_t = vmls_n_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int16x4_t = vmls_n_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int32x2_t = vmls_n_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_nu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint16x4_t = vmls_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmls_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmls_nu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint32x2_t = vmls_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsf32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-  float32x2_t arg2_float32x2_t;
-
-  out_float32x2_t = vmls_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_lanes16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vmlsl_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_lanes32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vmlsl_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_laneu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_laneu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_laneu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint32x4_t = vmlsl_lane_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_laneu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_laneu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_laneu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint64x2_t = vmlsl_lane_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_ns16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int32x4_t = vmlsl_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_ns32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int64x2_t = vmlsl_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_nu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_nu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16_t arg2_uint16_t;
-
-  out_uint32x4_t = vmlsl_n_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_nu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsl_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsl_nu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32_t arg2_uint32_t;
-
-  out_uint64x2_t = vmlsl_n_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsls16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vmlsl_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsls32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vmlsl_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsls8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsls8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int8x8_t arg1_int8x8_t;
-  int8x8_t arg2_int8x8_t;
-
-  out_int16x8_t = vmlsl_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlslu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlslu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint32x4_t = vmlsl_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlslu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlslu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint64x2_t = vmlsl_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlslu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlslu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint8x8_t arg1_uint8x8_t;
-  uint8x8_t arg2_uint8x8_t;
-
-  out_uint16x8_t = vmlsl_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmlsl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlss16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlss16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int16x4_t = vmls_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlss32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlss32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int32x2_t = vmls_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlss8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlss8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-  int8x8_t arg2_int8x8_t;
-
-  out_int8x8_t = vmls_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-  uint16x4_t arg2_uint16x4_t;
-
-  out_uint16x4_t = vmls_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-  uint32x2_t arg2_uint32x2_t;
-
-  out_uint32x2_t = vmls_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu8.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vmlsu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmlsu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-  uint8x8_t arg2_uint8x8_t;
-
-  out_uint8x8_t = vmls_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmovQ_nf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmovQ_nf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32_t arg0_float32_t;
-
-  out_float32x4_t = vmovq_n_f32 (arg0_float32_t);
-}
-
-/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmovQ_np16' ARM Neon intrinsic.  */
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_np16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16_t arg0_poly16_t; - - out_poly16x8_t = vmovq_n_p16 (arg0_poly16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_np8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8_t arg0_poly8_t; - - out_poly8x16_t = vmovq_n_p8 (arg0_poly8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16_t arg0_int16_t; - - out_int16x8_t = vmovq_n_s16 (arg0_int16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32_t arg0_int32_t; - - out_int32x4_t = vmovq_n_s32 (arg0_int32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vmovQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64_t arg0_int64_t; - - out_int64x2_t = vmovq_n_s64 (arg0_int64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8_t arg0_int8_t; - - out_int8x16_t = vmovq_n_s8 (arg0_int8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16_t arg0_uint16_t; - - out_uint16x8_t = vmovq_n_u16 (arg0_uint16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32_t arg0_uint32_t; - - out_uint32x4_t = vmovq_n_u32 (arg0_uint32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vmovQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64_t arg0_uint64_t; - - out_uint64x2_t = vmovq_n_u64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8_t arg0_uint8_t; - - out_uint8x16_t = vmovq_n_u8 (arg0_uint8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_nf32 (void) -{ - float32x2_t out_float32x2_t; - float32_t arg0_float32_t; - - out_float32x2_t = vmov_n_f32 (arg0_float32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_np16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_np16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16_t arg0_poly16_t; - - out_poly16x4_t = vmov_n_p16 (arg0_poly16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_np8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_np8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8_t arg0_poly8_t; - - out_poly8x8_t = vmov_n_p8 (arg0_poly8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16_t arg0_int16_t; - - out_int16x4_t = vmov_n_s16 (arg0_int16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32_t arg0_int32_t; - - out_int32x2_t = vmov_n_s32 (arg0_int32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vmov_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64_t arg0_int64_t; - - out_int64x1_t = vmov_n_s64 (arg0_int64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8_t arg0_int8_t; - - out_int8x8_t = vmov_n_s8 (arg0_int8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16_t arg0_uint16_t; - - out_uint16x4_t = vmov_n_u16 (arg0_uint16_t); -} - -/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32_t arg0_uint32_t; - - out_uint32x2_t = vmov_n_u32 (arg0_uint32_t); -} - -/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vmov_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64_t arg0_uint64_t; - - out_uint64x1_t = vmov_n_u64 (arg0_uint64_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmov_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmov_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8_t arg0_uint8_t; - - out_uint8x8_t = vmov_n_u8 (arg0_uint8_t); -} - -/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovls16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - - out_int32x4_t = vmovl_s16 (arg0_int16x4_t); -} - -/* { dg-final { scan-assembler "vmovl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovls32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - - out_int64x2_t = vmovl_s32 (arg0_int32x2_t); -} - -/* { dg-final { scan-assembler "vmovl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovls8 (void) -{ - int16x8_t out_int16x8_t; - int8x8_t arg0_int8x8_t; - - out_int16x8_t = vmovl_s8 (arg0_int8x8_t); -} - -/* { dg-final { scan-assembler "vmovl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovlu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovlu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint32x4_t = vmovl_u16 (arg0_uint16x4_t); -} - -/* { dg-final { scan-assembler "vmovl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovlu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovlu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint64x2_t = vmovl_u32 (arg0_uint32x2_t); -} - -/* { dg-final { scan-assembler "vmovl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovlu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovlu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint16x8_t = vmovl_u8 (arg0_uint8x8_t); -} - -/* { dg-final { scan-assembler "vmovl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - - out_int8x8_t = vmovn_s16 (arg0_int16x8_t); -} - -/* { dg-final { scan-assembler "vmovn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - - out_int16x4_t = vmovn_s32 (arg0_int32x4_t); -} - -/* { dg-final { scan-assembler "vmovn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - - out_int32x2_t = vmovn_s64 (arg0_int64x2_t); -} - -/* { dg-final { scan-assembler "vmovn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovnu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovnu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint8x8_t = vmovn_u16 (arg0_uint16x8_t); -} - -/* { dg-final { scan-assembler "vmovn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovnu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovnu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint16x4_t = vmovn_u32 (arg0_uint32x4_t); -} - -/* { dg-final { scan-assembler "vmovn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vmovnu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmovnu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - - out_uint32x2_t = vmovn_u64 (arg0_uint64x2_t); -} - -/* { dg-final { scan-assembler "vmovn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_lanef32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x2_t arg1_float32x2_t; - - out_float32x4_t = vmulq_lane_f32 (arg0_float32x4_t, arg1_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanes16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_lanes16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x4_t arg1_int16x4_t; - - out_int16x8_t = vmulq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanes32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_lanes32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x2_t arg1_int32x2_t; - - out_int32x4_t = vmulq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_laneu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_laneu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x8_t = vmulq_lane_u16 (arg0_uint16x8_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_laneu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_laneu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x4_t = vmulq_lane_u32 (arg0_uint32x4_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_nf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32_t arg1_float32_t; - - out_float32x4_t = vmulq_n_f32 (arg0_float32x4_t, arg1_float32_t); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16_t arg1_int16_t; - - out_int16x8_t = vmulq_n_s16 (arg0_int16x8_t, arg1_int16_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32_t arg1_int32_t; - - out_int32x4_t = vmulq_n_s32 (arg0_int32x4_t, arg1_int32_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16_t arg1_uint16_t; - - out_uint16x8_t = vmulq_n_u16 (arg0_uint16x8_t, arg1_uint16_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32_t arg1_uint32_t; - - out_uint32x4_t = vmulq_n_u32 (arg0_uint32x4_t, arg1_uint32_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vmulq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQp8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16_t = vmulq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vmul\.p8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vmulq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vmulq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vmulq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vmulq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vmulq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vmulq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanef32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_lanef32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vmul_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanes16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_lanes16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vmul_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanes32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_lanes32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vmul_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_laneu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_laneu16' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_laneu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vmul_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_laneu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_laneu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vmul_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_nf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_nf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32_t arg1_float32_t; - - out_float32x2_t = vmul_n_f32 (arg0_float32x2_t, arg1_float32_t); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16_t arg1_int16_t; - - out_int16x4_t = vmul_n_s16 (arg0_int16x4_t, arg1_int16_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32_t arg1_int32_t; - - out_int32x2_t = vmul_n_s32 (arg0_int32x2_t, arg1_int32_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16_t arg1_uint16_t; - - out_uint16x4_t = vmul_n_u16 (arg0_uint16x4_t, arg1_uint16_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmul_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmul_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32_t arg1_uint32_t; - - out_uint32x2_t = vmul_n_u32 (arg0_uint32x2_t, arg1_uint32_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vmul_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_lanes16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_lanes16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int32x4_t = vmull_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_lanes32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_lanes32' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_lanes32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int64x2_t = vmull_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_laneu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_laneu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint32x4_t = vmull_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_laneu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_laneu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint64x2_t = vmull_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_ns16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - int16_t arg1_int16_t; - - out_int32x4_t = vmull_n_s16 (arg0_int16x4_t, arg1_int16_t); -} - -/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_ns32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - int32_t arg1_int32_t; - - out_int64x2_t = vmull_n_s32 (arg0_int32x2_t, arg1_int32_t); -} - -/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_nu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - uint16_t arg1_uint16_t; - - out_uint32x4_t = vmull_n_u16 (arg0_uint16x4_t, arg1_uint16_t); -} - -/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmull_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmull_nu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - uint32_t arg1_uint32_t; - - out_uint64x2_t = vmull_n_u32 (arg0_uint32x2_t, arg1_uint32_t); -} - -/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmullp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmullp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmullp8 (void) -{ - poly16x8_t out_poly16x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly16x8_t = vmull_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vmull\.p8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulls16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int32x4_t = vmull_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulls32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int64x2_t = vmull_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulls8 (void) -{ - int16x8_t out_int16x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int16x8_t = vmull_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vmull\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmullu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmullu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint32x4_t = vmull_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmullu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmullu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint64x2_t = vmull_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmullu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmullu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint16x8_t = vmull_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vmull\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulp8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8_t = vmul_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vmul\.p8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmuls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmuls16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vmul_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmuls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmuls32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vmul_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmuls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmuls8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vmul_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vmulu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vmul_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vmulu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmulu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vmul_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vmulu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmulu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vmul_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQp8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQp8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQp8 (void)
-{
-  poly8x16_t out_poly8x16_t;
-  poly8x16_t arg0_poly8x16_t;
-
-  out_poly8x16_t = vmvnq_p8 (arg0_poly8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vmvnq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vmvnq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vmvnq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint16x8_t = vmvnq_u16 (arg0_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint32x4_t = vmvnq_u32 (arg0_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vmvnq_u8 (arg0_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnp8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnp8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnp8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_poly8x8_t = vmvn_p8 (arg0_poly8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vmvn_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vmvn_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvns8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vmvn_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint16x4_t = vmvn_u16 (arg0_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint32x2_t = vmvn_u32 (arg0_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vmvnu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vmvnu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint8x8_t = vmvn_u8 (arg0_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegQf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vnegq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vnegq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vnegq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vnegq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vneg_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vneg_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vneg_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vnegs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vnegs8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vneg_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vneg\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int16x8_t out_int16x8_t;
-int16x8_t arg0_int16x8_t;
-int16x8_t arg1_int16x8_t;
-void test_vornQs16 (void)
-{
-
-  out_int16x8_t = vornq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int32x4_t out_int32x4_t;
-int32x4_t arg0_int32x4_t;
-int32x4_t arg1_int32x4_t;
-void test_vornQs32 (void)
-{
-
-  out_int32x4_t = vornq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int64x2_t out_int64x2_t;
-int64x2_t arg0_int64x2_t;
-int64x2_t arg1_int64x2_t;
-void test_vornQs64 (void)
-{
-
-  out_int64x2_t = vornq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int8x16_t out_int8x16_t;
-int8x16_t arg0_int8x16_t;
-int8x16_t arg1_int8x16_t;
-void test_vornQs8 (void)
-{
-
-  out_int8x16_t = vornq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint16x8_t out_uint16x8_t;
-uint16x8_t arg0_uint16x8_t;
-uint16x8_t arg1_uint16x8_t;
-void test_vornQu16 (void)
-{
-
-  out_uint16x8_t = vornq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint32x4_t out_uint32x4_t;
-uint32x4_t arg0_uint32x4_t;
-uint32x4_t arg1_uint32x4_t;
-void test_vornQu32 (void)
-{
-
-  out_uint32x4_t = vornq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint64x2_t out_uint64x2_t;
-uint64x2_t arg0_uint64x2_t;
-uint64x2_t arg1_uint64x2_t;
-void test_vornQu64 (void)
-{
-
-  out_uint64x2_t = vornq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint8x16_t out_uint8x16_t;
-uint8x16_t arg0_uint8x16_t;
-uint8x16_t arg1_uint8x16_t;
-void test_vornQu8 (void)
-{
-
-  out_uint8x16_t = vornq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int16x4_t out_int16x4_t;
-int16x4_t arg0_int16x4_t;
-int16x4_t arg1_int16x4_t;
-void test_vorns16 (void)
-{
-
-  out_int16x4_t = vorn_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int32x2_t out_int32x2_t;
-int32x2_t arg0_int32x2_t;
-int32x2_t arg1_int32x2_t;
-void test_vorns32 (void)
-{
-
-  out_int32x2_t = vorn_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vorns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int64x1_t out_int64x1_t;
-int64x1_t arg0_int64x1_t;
-int64x1_t arg1_int64x1_t;
-void test_vorns64 (void)
-{
-
-  out_int64x1_t = vorn_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorns8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-int8x8_t out_int8x8_t;
-int8x8_t arg0_int8x8_t;
-int8x8_t arg1_int8x8_t;
-void test_vorns8 (void)
-{
-
-  out_int8x8_t = vorn_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint16x4_t out_uint16x4_t;
-uint16x4_t arg0_uint16x4_t;
-uint16x4_t arg1_uint16x4_t;
-void test_vornu16 (void)
-{
-
-  out_uint16x4_t = vorn_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint32x2_t out_uint32x2_t;
-uint32x2_t arg0_uint32x2_t;
-uint32x2_t arg1_uint32x2_t;
-void test_vornu32 (void)
-{
-
-  out_uint32x2_t = vorn_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vornu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint64x1_t out_uint64x1_t;
-uint64x1_t arg0_uint64x1_t;
-uint64x1_t arg1_uint64x1_t;
-void test_vornu64 (void)
-{
-
-  out_uint64x1_t = vorn_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vornu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vornu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O2" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-uint8x8_t out_uint8x8_t;
-uint8x8_t arg0_uint8x8_t;
-uint8x8_t arg1_uint8x8_t;
-void test_vornu8 (void)
-{
-
-  out_uint8x8_t = vorn_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vorrq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vorrq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vorrq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vorrq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vorrq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vorrq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vorrq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vorrq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vorr_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vorr_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vorrs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrs64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vorr_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorrs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorrs8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vorr_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorru16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorru16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorru16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vorr_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorru32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorru32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorru32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vorr_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorru64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vorru64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorru64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  uint64x1_t arg1_uint64x1_t;
-
-  out_uint64x1_t = vorr_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vorru8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vorru8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vorru8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vorr_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQs16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int32x4_t = vpadalq_s16 (arg0_int32x4_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQs32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int64x2_t = vpadalq_s32 (arg0_int64x2_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQs8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int16x8_t = vpadalq_s8 (arg0_int16x8_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint32x4_t = vpadalq_u16 (arg0_uint32x4_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint64x2_t = vpadalq_u32 (arg0_uint64x2_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalQu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint16x8_t = vpadalq_u8 (arg0_uint16x8_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadals16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadals16 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int32x2_t = vpadal_s16 (arg0_int32x2_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadals32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadals32 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int64x1_t = vpadal_s32 (arg0_int64x1_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadals8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadals8 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int16x4_t = vpadal_s8 (arg0_int16x4_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalu16 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint32x2_t = vpadal_u16 (arg0_uint32x2_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalu32 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint64x1_t = vpadal_u32 (arg0_uint64x1_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadalu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadalu8 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint16x4_t = vpadal_u8 (arg0_uint16x4_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadal\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpaddf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vpadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQs16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int32x4_t = vpaddlq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQs32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int64x2_t = vpaddlq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQs8 (void)
-{
-  int16x8_t out_int16x8_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int16x8_t = vpaddlq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQu16 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint32x4_t = vpaddlq_u16 (arg0_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQu32 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint64x2_t = vpaddlq_u32 (arg0_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlQu8 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint16x8_t = vpaddlq_u8 (arg0_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddls16 (void)
-{
-  int32x2_t out_int32x2_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int32x2_t = vpaddl_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddls32 (void)
-{
-  int64x1_t out_int64x1_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int64x1_t = vpaddl_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddls8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddls8 (void)
-{
-  int16x4_t out_int16x4_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int16x4_t = vpaddl_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlu16 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint32x2_t = vpaddl_u16 (arg0_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlu32 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint64x1_t = vpaddl_u32 (arg0_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vpaddlu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddlu8 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint16x4_t = vpaddl_u8 (arg0_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpaddl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadds16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadds16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vpadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadds32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadds32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vpadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpadds8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpadds8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vpadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpaddu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vpadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpaddu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vpadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpaddu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpaddu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vpadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vpmax_f32 (arg0_float32x2_t, arg1_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vpmax_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vpmax_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxs8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vpmax_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vpmax_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vpmax_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmaxu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmaxu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vpmax_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpmax\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpminf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpminf32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpminf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vpmin_f32 (arg0_float32x2_t, arg1_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmins16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmins16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vpmin_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmins32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmins32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vpmin_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpmins8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpmins8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vpmin_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpminu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpminu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vpmin_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpminu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpminu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vpmin_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vpminu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vpminu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vpmin_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vpmin\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQ_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQ_lanes16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x8_t = vqrdmulhq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQ_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQ_lanes32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x4_t = vqrdmulhq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_ns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQ_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQ_ns16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16_t arg1_int16_t;
-
-  out_int16x8_t = vqrdmulhq_n_s16 (arg0_int16x8_t, arg1_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_ns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQ_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQ_ns32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32_t arg1_int32_t;
-
-  out_int32x4_t = vqrdmulhq_n_s32 (arg0_int32x4_t, arg1_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqrdmulhq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqrdmulhq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulh_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulh_lanes16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqrdmulh_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulh_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulh_lanes32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqrdmulh_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_ns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulh_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulh_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16_t arg1_int16_t;
-
-  out_int16x4_t = vqrdmulh_n_s16 (arg0_int16x4_t, arg1_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_ns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulh_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulh_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32_t arg1_int32_t;
-
-  out_int32x2_t = vqrdmulh_n_s32 (arg0_int32x2_t, arg1_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqrdmulh_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRdmulhs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRdmulhs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqrdmulh_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqrshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqrshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vqrshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vqrshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_uint16x8_t = vqrshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_uint32x4_t = vqrshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_uint64x2_t = vqrshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_uint8x16_t = vqrshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshls16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqrshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshls32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqrshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshls64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshls64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vqrshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshls8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshls8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vqrshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_uint16x4_t = vqrshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_uint32x2_t = vqrshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_uint64x1_t = vqrshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqRshlu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshlu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_uint8x8_t = vqrshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqrshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_ns16 (void)
-{
-  int8x8_t out_int8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int8x8_t = vqrshrn_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_ns32 (void)
-{
-  int16x4_t out_int16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int16x4_t = vqrshrn_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_ns64 (void)
-{
-  int32x2_t out_int32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_int32x2_t = vqrshrn_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_nu16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint8x8_t = vqrshrn_n_u16 (arg0_uint16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_nu32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint16x4_t = vqrshrn_n_u32 (arg0_uint32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrn_nu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrn_nu64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint64x2_t arg0_uint64x2_t;
-
-  out_uint32x2_t = vqrshrn_n_u64 (arg0_uint64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrun_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrun_ns16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_uint8x8_t = vqrshrun_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrun_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrun_ns32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_uint16x4_t = vqrshrun_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqRshrun_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqRshrun_ns64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_uint32x2_t = vqrshrun_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqrshrun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabsQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabsQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vqabsq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabsQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabsQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vqabsq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabsQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabsQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vqabsq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabss16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabss16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vqabs_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabss32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabss32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vqabs_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqabss8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqabss8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vqabs_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqabs\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vqaddq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vqaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vqaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vqaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vqaddq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vqaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqadds16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqadds16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqadds32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqadds32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqadds64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqadds64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vqadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqadds8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqadds8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vqadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vqadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vqadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  uint64x1_t arg1_uint64x1_t;
-
-  out_uint64x1_t = vqadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqaddu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqaddu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vqadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlal_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlal_lanes16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vqdmlal_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlal_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlal_lanes32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vqdmlal_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlal_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlal_ns16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int32x4_t = vqdmlal_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlal_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlal_ns32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int64x2_t = vqdmlal_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlals16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlals16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlals16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vqdmlal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlals32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlals32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlals32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vqdmlal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_lanes16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsl_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsl_lanes16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vqdmlsl_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_lanes32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsl_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsl_lanes32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vqdmlsl_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_ns16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsl_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsl_ns16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16_t arg2_int16_t;
-
-  out_int32x4_t = vqdmlsl_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_ns32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsl_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsl_ns32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32_t arg2_int32_t;
-
-  out_int64x2_t = vqdmlsl_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsls16.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsls16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int16x4_t arg1_int16x4_t;
-  int16x4_t arg2_int16x4_t;
-
-  out_int32x4_t = vqdmlsl_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsls32.c
+++ b/src//dev/null
@@ -1,21 +0,0 @@
-/* Test the `vqdmlsls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmlsls32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int32x2_t arg1_int32x2_t;
-  int32x2_t arg2_int32x2_t;
-
-  out_int64x2_t = vqdmlsl_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQ_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQ_lanes16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x8_t = vqdmulhq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQ_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQ_lanes32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x4_t = vqdmulhq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_ns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQ_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQ_ns16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16_t arg1_int16_t;
-
-  out_int16x8_t = vqdmulhq_n_s16 (arg0_int16x8_t, arg1_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_ns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQ_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQ_ns32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32_t arg1_int32_t;
-
-  out_int32x4_t = vqdmulhq_n_s32 (arg0_int32x4_t, arg1_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqdmulhq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqdmulhq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulh_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulh_lanes16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqdmulh_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulh_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulh_lanes32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqdmulh_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_ns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulh_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulh_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16_t arg1_int16_t;
-
-  out_int16x4_t = vqdmulh_n_s16 (arg0_int16x4_t, arg1_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_ns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulh_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulh_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32_t arg1_int32_t;
-
-  out_int32x2_t = vqdmulh_n_s32 (arg0_int32x2_t, arg1_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqdmulh_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulhs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulhs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqdmulh_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmull_lanes16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmull_lanes16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int32x4_t = vqdmull_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmull_lanes32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmull_lanes32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int64x2_t = vqdmull_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_ns16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmull_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmull_ns16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16_t arg1_int16_t;
-
-  out_int32x4_t = vqdmull_n_s16 (arg0_int16x4_t, arg1_int16_t);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_ns32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmull_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmull_ns32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32_t arg1_int32_t;
-
-  out_int64x2_t = vqdmull_n_s32 (arg0_int32x2_t, arg1_int32_t);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulls16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulls16 (void)
-{
-  int32x4_t out_int32x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int32x4_t = vqdmull_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulls32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqdmulls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqdmulls32 (void)
-{
-  int64x2_t out_int64x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int64x2_t = vqdmull_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovns16 (void)
-{
-  int8x8_t out_int8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int8x8_t = vqmovn_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovns32 (void)
-{
-  int16x4_t out_int16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int16x4_t = vqmovn_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovns64 (void)
-{
-  int32x2_t out_int32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_int32x2_t = vqmovn_s64 (arg0_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovnu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovnu16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint8x8_t = vqmovn_u16 (arg0_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovnu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovnu32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint16x4_t = vqmovn_u32 (arg0_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovnu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovnu64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint64x2_t arg0_uint64x2_t;
-
-  out_uint32x2_t = vqmovn_u64 (arg0_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqmovn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovuns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovuns16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_uint8x8_t = vqmovun_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqmovun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovuns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovuns32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_uint16x4_t = vqmovun_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqmovun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqmovuns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqmovuns64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_uint32x2_t = vqmovun_s64 (arg0_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqmovun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vqnegq_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vqnegq_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vqnegq_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegs16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vqneg_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegs32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vqneg_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqnegs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqnegs8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vqneg_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqneg\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_ns16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vqshlq_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_ns32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vqshlq_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_ns64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_int64x2_t = vqshlq_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_ns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_ns8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vqshlq_n_s8 (arg0_int8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_nu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint16x8_t = vqshlq_n_u16 (arg0_uint16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_nu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint32x4_t = vqshlq_n_u32 (arg0_uint32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_nu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_nu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-
-  out_uint64x2_t = vqshlq_n_u64 (arg0_uint64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlQ_nu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQ_nu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vqshlq_n_u8 (arg0_uint8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vqshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vqshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_uint16x8_t = vqshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_uint32x4_t = vqshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_uint64x2_t = vqshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_uint8x16_t = vqshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vqshl_n_s16 (arg0_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vqshl_n_s32 (arg0_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_ns64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_int64x1_t = vqshl_n_s64 (arg0_int64x1_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_ns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_ns8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vqshl_n_s8 (arg0_int8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_nu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint16x4_t = vqshl_n_u16 (arg0_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_nu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint32x2_t = vqshl_n_u32 (arg0_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_nu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_nu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_uint64x1_t = vqshl_n_u64 (arg0_uint64x1_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshl_nu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshl_nu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint8x8_t = vqshl_n_u8 (arg0_uint8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshls16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshls16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vqshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshls32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshls32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vqshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshls64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshls64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vqshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshls8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshls8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vqshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_uint16x4_t = vqshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_uint32x2_t = vqshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_uint64x1_t = vqshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqshlu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_uint8x8_t = vqshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshluQ_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshluQ_ns16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_uint16x8_t = vqshluq_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshluQ_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshluQ_ns32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_uint32x4_t = vqshluq_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshluQ_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshluQ_ns64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_uint64x2_t = vqshluq_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshluQ_ns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshluQ_ns8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_uint8x16_t = vqshluq_n_s8 (arg0_int8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlu_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu_ns16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_uint16x4_t = vqshlu_n_s16 (arg0_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlu_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu_ns32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_uint32x2_t = vqshlu_n_s32 (arg0_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlu_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu_ns64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_uint64x1_t = vqshlu_n_s64 (arg0_int64x1_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshlu_ns8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshlu_ns8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_uint8x8_t = vqshlu_n_s8 (arg0_int8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshlu\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_ns16 (void)
-{
-  int8x8_t out_int8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int8x8_t = vqshrn_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_ns32 (void)
-{
-  int16x4_t out_int16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int16x4_t = vqshrn_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_ns64 (void)
-{
-  int32x2_t out_int32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_int32x2_t = vqshrn_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_nu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_nu16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint8x8_t = vqshrn_n_u16 (arg0_uint16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_nu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_nu32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint16x4_t = vqshrn_n_u32 (arg0_uint32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrn_nu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrn_nu64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint64x2_t arg0_uint64x2_t;
-
-  out_uint32x2_t = vqshrn_n_u64 (arg0_uint64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrun_ns16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrun_ns16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_uint8x8_t = vqshrun_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrun_ns32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrun_ns32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_uint16x4_t = vqshrun_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vqshrun_ns64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqshrun_ns64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_uint32x2_t = vqshrun_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vqshrun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vqsubq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQs32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vqsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQs64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vqsubq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vqsubq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQu16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vqsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQu32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vqsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vqsubq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubQu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vqsubQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vqsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vqsub\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vqsubs16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubs16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vqsub_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vqsub\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubs32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vqsub_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vqsub\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubs64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vqsub_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vqsub\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubs8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vqsub_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vqsub\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vqsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vqsub\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vqsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vqsub\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vqsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t); -} - -/* { dg-final { scan-assembler "vqsub\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vqsubu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vqsubu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vqsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vqsub\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeQf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vrecpeQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpeQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - - out_float32x4_t = vrecpeq_f32 (arg0_float32x4_t); -} - -/* { dg-final { scan-assembler "vrecpe\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeQu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vrecpeQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpeQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32x4_t = vrecpeq_u32 (arg0_uint32x4_t); -} - -/* { dg-final { scan-assembler "vrecpe\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vrecpef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpef32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - - out_float32x2_t = vrecpe_f32 (arg0_float32x2_t); -} - -/* { dg-final { scan-assembler "vrecpe\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vrecpeu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpeu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x2_t = vrecpe_u32 (arg0_uint32x2_t); -} - -/* { dg-final { scan-assembler "vrecpe\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpsQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vrecpsQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpsQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vrecpsq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vrecps\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpsf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vrecpsf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vrecpsf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vrecps_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vrecps\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_p128 (void) -{ - float32x4_t out_float32x4_t; - poly128_t arg0_poly128_t; - - out_float32x4_t = vreinterpretq_f32_p128 (arg0_poly128_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_p16 (void) -{ - float32x4_t out_float32x4_t; - poly16x8_t arg0_poly16x8_t; - - out_float32x4_t = vreinterpretq_f32_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_p64 (void) -{ - float32x4_t out_float32x4_t; - poly64x2_t arg0_poly64x2_t; - - out_float32x4_t = vreinterpretq_f32_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_p8 (void) -{ - float32x4_t out_float32x4_t; - poly8x16_t arg0_poly8x16_t; - - out_float32x4_t = vreinterpretq_f32_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_s16 (void) -{ - float32x4_t out_float32x4_t; - int16x8_t arg0_int16x8_t; - - out_float32x4_t = vreinterpretq_f32_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_s32 (void) -{ - float32x4_t out_float32x4_t; - int32x4_t arg0_int32x4_t; - - out_float32x4_t = vreinterpretq_f32_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_s64 (void) -{ - float32x4_t out_float32x4_t; - int64x2_t arg0_int64x2_t; - - out_float32x4_t = vreinterpretq_f32_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_s8 (void) -{ - float32x4_t out_float32x4_t; - int8x16_t arg0_int8x16_t; - - out_float32x4_t = vreinterpretq_f32_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_u16 (void) -{ - float32x4_t out_float32x4_t; - uint16x8_t arg0_uint16x8_t; - - out_float32x4_t = vreinterpretq_f32_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_u32 (void) -{ - float32x4_t out_float32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_float32x4_t = vreinterpretq_f32_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_u64 (void) -{ - float32x4_t out_float32x4_t; - uint64x2_t arg0_uint64x2_t; - - out_float32x4_t = vreinterpretq_f32_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQf32_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQf32_u8 (void) -{ - float32x4_t out_float32x4_t; - uint8x16_t arg0_uint8x16_t; - - out_float32x4_t = vreinterpretq_f32_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_f32 (void) -{ - poly128_t out_poly128_t; - float32x4_t arg0_float32x4_t; - - out_poly128_t = vreinterpretq_p128_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_p16 (void) -{ - poly128_t out_poly128_t; - poly16x8_t arg0_poly16x8_t; - - out_poly128_t = vreinterpretq_p128_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_p64 (void) -{ - poly128_t out_poly128_t; - poly64x2_t arg0_poly64x2_t; - - out_poly128_t = vreinterpretq_p128_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_p8 (void) -{ - poly128_t out_poly128_t; - poly8x16_t arg0_poly8x16_t; - - out_poly128_t = vreinterpretq_p128_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_s16 (void) -{ - poly128_t out_poly128_t; - int16x8_t arg0_int16x8_t; - - out_poly128_t = vreinterpretq_p128_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_s32 (void) -{ - poly128_t out_poly128_t; - int32x4_t arg0_int32x4_t; - - out_poly128_t = vreinterpretq_p128_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_s64 (void) -{ - poly128_t out_poly128_t; - int64x2_t arg0_int64x2_t; - - out_poly128_t = vreinterpretq_p128_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_s8 (void) -{ - poly128_t out_poly128_t; - int8x16_t arg0_int8x16_t; - - out_poly128_t = vreinterpretq_p128_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_u16 (void) -{ - poly128_t out_poly128_t; - uint16x8_t arg0_uint16x8_t; - - out_poly128_t = vreinterpretq_p128_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_u32 (void) -{ - poly128_t out_poly128_t; - uint32x4_t arg0_uint32x4_t; - - out_poly128_t = vreinterpretq_p128_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_u64 (void) -{ - poly128_t out_poly128_t; - uint64x2_t arg0_uint64x2_t; - - out_poly128_t = vreinterpretq_p128_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp128_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp128_u8 (void) -{ - poly128_t out_poly128_t; - uint8x16_t arg0_uint8x16_t; - - out_poly128_t = vreinterpretq_p128_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_f32 (void) -{ - poly16x8_t out_poly16x8_t; - float32x4_t arg0_float32x4_t; - - out_poly16x8_t = vreinterpretq_p16_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_p128 (void) -{ - poly16x8_t out_poly16x8_t; - poly128_t arg0_poly128_t; - - out_poly16x8_t = vreinterpretq_p16_p128 (arg0_poly128_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_p64 (void) -{ - poly16x8_t out_poly16x8_t; - poly64x2_t arg0_poly64x2_t; - - out_poly16x8_t = vreinterpretq_p16_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_p8 (void) -{ - poly16x8_t out_poly16x8_t; - poly8x16_t arg0_poly8x16_t; - - out_poly16x8_t = vreinterpretq_p16_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_s16 (void) -{ - poly16x8_t out_poly16x8_t; - int16x8_t arg0_int16x8_t; - - out_poly16x8_t = vreinterpretq_p16_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_s32 (void) -{ - poly16x8_t out_poly16x8_t; - int32x4_t arg0_int32x4_t; - - out_poly16x8_t = vreinterpretq_p16_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_s64 (void) -{ - poly16x8_t out_poly16x8_t; - int64x2_t arg0_int64x2_t; - - out_poly16x8_t = vreinterpretq_p16_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_s8 (void) -{ - poly16x8_t out_poly16x8_t; - int8x16_t arg0_int8x16_t; - - out_poly16x8_t = vreinterpretq_p16_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_u16 (void) -{ - poly16x8_t out_poly16x8_t; - uint16x8_t arg0_uint16x8_t; - - out_poly16x8_t = vreinterpretq_p16_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_u32 (void) -{ - poly16x8_t out_poly16x8_t; - uint32x4_t arg0_uint32x4_t; - - out_poly16x8_t = vreinterpretq_p16_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_u64 (void) -{ - poly16x8_t out_poly16x8_t; - uint64x2_t arg0_uint64x2_t; - - out_poly16x8_t = vreinterpretq_p16_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp16_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp16_u8 (void) -{ - poly16x8_t out_poly16x8_t; - uint8x16_t arg0_uint8x16_t; - - out_poly16x8_t = vreinterpretq_p16_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_f32 (void) -{ - poly64x2_t out_poly64x2_t; - float32x4_t arg0_float32x4_t; - - out_poly64x2_t = vreinterpretq_p64_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_p128 (void) -{ - poly64x2_t out_poly64x2_t; - poly128_t arg0_poly128_t; - - out_poly64x2_t = vreinterpretq_p64_p128 (arg0_poly128_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_p16 (void) -{ - poly64x2_t out_poly64x2_t; - poly16x8_t arg0_poly16x8_t; - - out_poly64x2_t = vreinterpretq_p64_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_p8 (void) -{ - poly64x2_t out_poly64x2_t; - poly8x16_t arg0_poly8x16_t; - - out_poly64x2_t = vreinterpretq_p64_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_s16 (void) -{ - poly64x2_t out_poly64x2_t; - int16x8_t arg0_int16x8_t; - - out_poly64x2_t = vreinterpretq_p64_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_s32 (void) -{ - poly64x2_t out_poly64x2_t; - int32x4_t arg0_int32x4_t; - - out_poly64x2_t = vreinterpretq_p64_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_s64 (void) -{ - poly64x2_t out_poly64x2_t; - int64x2_t arg0_int64x2_t; - - out_poly64x2_t = vreinterpretq_p64_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_s8 (void) -{ - poly64x2_t out_poly64x2_t; - int8x16_t arg0_int8x16_t; - - out_poly64x2_t = vreinterpretq_p64_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_u16 (void) -{ - poly64x2_t out_poly64x2_t; - uint16x8_t arg0_uint16x8_t; - - out_poly64x2_t = vreinterpretq_p64_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_u32 (void) -{ - poly64x2_t out_poly64x2_t; - uint32x4_t arg0_uint32x4_t; - - out_poly64x2_t = vreinterpretq_p64_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_u64 (void) -{ - poly64x2_t out_poly64x2_t; - uint64x2_t arg0_uint64x2_t; - - out_poly64x2_t = vreinterpretq_p64_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp64_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp64_u8 (void) -{ - poly64x2_t out_poly64x2_t; - uint8x16_t arg0_uint8x16_t; - - out_poly64x2_t = vreinterpretq_p64_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_f32 (void) -{ - poly8x16_t out_poly8x16_t; - float32x4_t arg0_float32x4_t; - - out_poly8x16_t = vreinterpretq_p8_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_p128 (void) -{ - poly8x16_t out_poly8x16_t; - poly128_t arg0_poly128_t; - - out_poly8x16_t = vreinterpretq_p8_p128 (arg0_poly128_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_p16 (void) -{ - poly8x16_t out_poly8x16_t; - poly16x8_t arg0_poly16x8_t; - - out_poly8x16_t = vreinterpretq_p8_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_p64 (void) -{ - poly8x16_t out_poly8x16_t; - poly64x2_t arg0_poly64x2_t; - - out_poly8x16_t = vreinterpretq_p8_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_s16 (void) -{ - poly8x16_t out_poly8x16_t; - int16x8_t arg0_int16x8_t; - - out_poly8x16_t = vreinterpretq_p8_s16 (arg0_int16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_s32 (void) -{ - poly8x16_t out_poly8x16_t; - int32x4_t arg0_int32x4_t; - - out_poly8x16_t = vreinterpretq_p8_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_s64 (void) -{ - poly8x16_t out_poly8x16_t; - int64x2_t arg0_int64x2_t; - - out_poly8x16_t = vreinterpretq_p8_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_s8 (void) -{ - poly8x16_t out_poly8x16_t; - int8x16_t arg0_int8x16_t; - - out_poly8x16_t = vreinterpretq_p8_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_u16 (void) -{ - poly8x16_t out_poly8x16_t; - uint16x8_t arg0_uint16x8_t; - - out_poly8x16_t = vreinterpretq_p8_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_u32 (void) -{ - poly8x16_t out_poly8x16_t; - uint32x4_t arg0_uint32x4_t; - - out_poly8x16_t = vreinterpretq_p8_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_u64 (void) -{ - poly8x16_t out_poly8x16_t; - uint64x2_t arg0_uint64x2_t; - - out_poly8x16_t = vreinterpretq_p8_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQp8_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQp8_u8 (void) -{ - poly8x16_t out_poly8x16_t; - uint8x16_t arg0_uint8x16_t; - - out_poly8x16_t = vreinterpretq_p8_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_f32 (void) -{ - int16x8_t out_int16x8_t; - float32x4_t arg0_float32x4_t; - - out_int16x8_t = vreinterpretq_s16_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_p128 (void) -{ - int16x8_t out_int16x8_t; - poly128_t arg0_poly128_t; - - out_int16x8_t = vreinterpretq_s16_p128 (arg0_poly128_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_p16 (void) -{ - int16x8_t out_int16x8_t; - poly16x8_t arg0_poly16x8_t; - - out_int16x8_t = vreinterpretq_s16_p16 (arg0_poly16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_p64 (void) -{ - int16x8_t out_int16x8_t; - poly64x2_t arg0_poly64x2_t; - - out_int16x8_t = vreinterpretq_s16_p64 (arg0_poly64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_p8 (void) -{ - int16x8_t out_int16x8_t; - poly8x16_t arg0_poly8x16_t; - - out_int16x8_t = vreinterpretq_s16_p8 (arg0_poly8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_s32 (void) -{ - int16x8_t out_int16x8_t; - int32x4_t arg0_int32x4_t; - - out_int16x8_t = vreinterpretq_s16_s32 (arg0_int32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_s64 (void) -{ - int16x8_t out_int16x8_t; - int64x2_t arg0_int64x2_t; - - out_int16x8_t = vreinterpretq_s16_s64 (arg0_int64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_s8 (void) -{ - int16x8_t out_int16x8_t; - int8x16_t arg0_int8x16_t; - - out_int16x8_t = vreinterpretq_s16_s8 (arg0_int8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u16.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_u16 (void) -{ - int16x8_t out_int16x8_t; - uint16x8_t arg0_uint16x8_t; - - out_int16x8_t = vreinterpretq_s16_u16 (arg0_uint16x8_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_u32 (void) -{ - int16x8_t out_int16x8_t; - uint32x4_t arg0_uint32x4_t; - - out_int16x8_t = vreinterpretq_s16_u32 (arg0_uint32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u64.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_u64 (void) -{ - int16x8_t out_int16x8_t; - uint64x2_t arg0_uint64x2_t; - - out_int16x8_t = vreinterpretq_s16_u64 (arg0_uint64x2_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u8.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs16_u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs16_u8 (void) -{ - int16x8_t out_int16x8_t; - uint8x16_t arg0_uint8x16_t; - - out_int16x8_t = vreinterpretq_s16_u8 (arg0_uint8x16_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_f32.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs32_f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vreinterpretQs32_f32 (void) -{ - int32x4_t out_int32x4_t; - float32x4_t arg0_float32x4_t; - - out_int32x4_t = vreinterpretq_s32_f32 (arg0_float32x4_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p128.c +++ b/src//dev/null @@ -1,18 +0,0 @@ -/* Test the `vreinterpretQs32_p128' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretQs32_p128 (void)
-{
-  int32x4_t out_int32x4_t;
-  poly128_t arg0_poly128_t;
-
-  out_int32x4_t = vreinterpretq_s32_p128 (arg0_poly128_t);
-}
-

Every remaining vreinterpret hunk in this patch has the same shape: each deletes a single 18-line test that neon-testgen autogenerated for one pair of vector types, so only the file name, the intrinsic called, and the argument/result types vary from hunk to hunk. Hunks whose source or destination type is p64 or p128 require arm_crypto_ok and add the arm_crypto options; all others require arm_neon_ok and add arm_neon. One representative hunk in full:

--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretQs32_p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretQs32_p16 (void)
-{
-  int32x4_t out_int32x4_t;
-  poly16x8_t arg0_poly16x8_t;
-
-  out_int32x4_t = vreinterpretq_s32_p16 (arg0_poly16x8_t);
-}
-
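For reference, the pattern the generator expanded is small enough to write as one macro. The sketch below is illustrative only and is not part of the patch: TEST_VREINTERPRETQ is a hypothetical name, while the vector types and the vreinterpretq_* intrinsics are the real ones declared in GCC's arm_neon.h. vreinterpretq reinterprets the 128-bit pattern of its argument as another vector type without converting any lanes, which is why the generated tests only need to assemble.

/* Consolidated sketch of the deleted tests (illustration only).  */
/* { dg-do assemble } */
/* { dg-require-effective-target arm_neon_ok } */
/* { dg-add-options arm_neon } */

#include "arm_neon.h"

/* Each deleted file instantiated this pattern once for one pair of
   128-bit vector types; the macro name is hypothetical.  */
#define TEST_VREINTERPRETQ(TS, TD, SUF)       \
  void test_vreinterpretq_##SUF (TS in)       \
  {                                           \
    TD out = vreinterpretq_##SUF (in);        \
    (void) out;                               \
  }

/* Three of the type pairs covered by the deleted files.  */
TEST_VREINTERPRETQ (poly16x8_t, int32x4_t, s32_p16)
TEST_VREINTERPRETQ (int16x8_t, int32x4_t, s32_s16)
TEST_VREINTERPRETQ (uint8x16_t, int64x2_t, s64_u8)

Dropping the per-file autogenerated copies is consistent with coverage of this kind living elsewhere; presumably these tests were superseded by the shared advsimd-intrinsics suite, though the patch itself does not say so.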
The same deletion is applied to each of the following files under src/gcc/testsuite/gcc.target/arm/neon/:

  vreinterpretQs32_{p64,p8,s16,s64,s8,u16,u32,u64,u8}.c
  vreinterpretQs64_{f32,p128,p16,p64,p8,s16,s32,s8,u16,u32,u64,u8}.c
  vreinterpretQs8_{f32,p128,p16,p64,p8,s16,s32,s64,u16,u32,u64,u8}.c
  vreinterpretQu16_{f32,p128,p16,p64,p8,s16,s32,s64,s8,u32,u64,u8}.c
  vreinterpretQu32_{f32,p128,p16,p64,p8,s16,s32,s64,s8,u16,u64,u8}.c
  vreinterpretQu64_{f32,p128,p16,p64,p8,s16,s32,s64,s8,u16,u32,u8}.c
  vreinterpretQu8_{f32,p128,p16,p64,p8,s16,s32,s64,s8,u16,u32,u64}.c
  vreinterpretf32_{p16,p64,p8,s16,s32,s64,s8,u16,u32,u64,u8}.c
  vreinterpretp16_{f32,p64,p8,s16,s32,s64,s8,u16,u32,u64,u8}.c
  vreinterpretp64_{f32,p16,p8,s16,s32,s64,s8,u16,u32,u64,u8}.c
  vreinterpretp8_{f32,p16,p64,s16,s32,s64,s8,u16,u32,u64}.c

--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretp8_u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen.
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretp8_u8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_poly8x8_t = vreinterpret_p8_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_f32 (void)
-{
-  int16x4_t out_int16x4_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_int16x4_t = vreinterpret_s16_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_p16 (void)
-{
-  int16x4_t out_int16x4_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_int16x4_t = vreinterpret_s16_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_p64 (void)
-{
-  int16x4_t out_int16x4_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_int16x4_t = vreinterpret_s16_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_p8 (void)
-{
-  int16x4_t out_int16x4_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_int16x4_t = vreinterpret_s16_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_s32 (void)
-{
-  int16x4_t out_int16x4_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int16x4_t = vreinterpret_s16_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_s64 (void)
-{
-  int16x4_t out_int16x4_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_int16x4_t = vreinterpret_s16_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_s8 (void)
-{
-  int16x4_t out_int16x4_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int16x4_t = vreinterpret_s16_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_u16 (void)
-{
-  int16x4_t out_int16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_int16x4_t = vreinterpret_s16_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_u32 (void)
-{
-  int16x4_t out_int16x4_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_int16x4_t = vreinterpret_s16_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_u64 (void)
-{
-  int16x4_t out_int16x4_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_int16x4_t = vreinterpret_s16_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets16_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets16_u8 (void)
-{
-  int16x4_t out_int16x4_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_int16x4_t = vreinterpret_s16_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_f32 (void)
-{
-  int32x2_t out_int32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_int32x2_t = vreinterpret_s32_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_p16 (void)
-{
-  int32x2_t out_int32x2_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_int32x2_t = vreinterpret_s32_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_p64 (void)
-{
-  int32x2_t out_int32x2_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_int32x2_t = vreinterpret_s32_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_p8 (void)
-{
-  int32x2_t out_int32x2_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_int32x2_t = vreinterpret_s32_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_s16 (void)
-{
-  int32x2_t out_int32x2_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int32x2_t = vreinterpret_s32_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_s64 (void)
-{
-  int32x2_t out_int32x2_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_int32x2_t = vreinterpret_s32_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_s8 (void)
-{
-  int32x2_t out_int32x2_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int32x2_t = vreinterpret_s32_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_u16 (void)
-{
-  int32x2_t out_int32x2_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_int32x2_t = vreinterpret_s32_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_u32 (void)
-{
-  int32x2_t out_int32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_int32x2_t = vreinterpret_s32_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_u64 (void)
-{
-  int32x2_t out_int32x2_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_int32x2_t = vreinterpret_s32_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets32_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets32_u8 (void)
-{
-  int32x2_t out_int32x2_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_int32x2_t = vreinterpret_s32_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_f32 (void)
-{
-  int64x1_t out_int64x1_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_int64x1_t = vreinterpret_s64_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_p16 (void)
-{
-  int64x1_t out_int64x1_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_int64x1_t = vreinterpret_s64_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_p64 (void)
-{
-  int64x1_t out_int64x1_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_int64x1_t = vreinterpret_s64_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_p8 (void)
-{
-  int64x1_t out_int64x1_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_int64x1_t = vreinterpret_s64_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_s16 (void)
-{
-  int64x1_t out_int64x1_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int64x1_t = vreinterpret_s64_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_s32 (void)
-{
-  int64x1_t out_int64x1_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int64x1_t = vreinterpret_s64_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_s8 (void)
-{
-  int64x1_t out_int64x1_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int64x1_t = vreinterpret_s64_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_u16 (void)
-{
-  int64x1_t out_int64x1_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_int64x1_t = vreinterpret_s64_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_u32 (void)
-{
-  int64x1_t out_int64x1_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_int64x1_t = vreinterpret_s64_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_u64 (void)
-{
-  int64x1_t out_int64x1_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_int64x1_t = vreinterpret_s64_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets64_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets64_u8 (void)
-{
-  int64x1_t out_int64x1_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_int64x1_t = vreinterpret_s64_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_f32 (void)
-{
-  int8x8_t out_int8x8_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_int8x8_t = vreinterpret_s8_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_p16 (void)
-{
-  int8x8_t out_int8x8_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_int8x8_t = vreinterpret_s8_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_p64 (void)
-{
-  int8x8_t out_int8x8_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_int8x8_t = vreinterpret_s8_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_p8 (void)
-{
-  int8x8_t out_int8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_int8x8_t = vreinterpret_s8_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_s16 (void)
-{
-  int8x8_t out_int8x8_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int8x8_t = vreinterpret_s8_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_s32 (void)
-{
-  int8x8_t out_int8x8_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int8x8_t = vreinterpret_s8_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_s64 (void)
-{
-  int8x8_t out_int8x8_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_int8x8_t = vreinterpret_s8_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_u16 (void)
-{
-  int8x8_t out_int8x8_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_int8x8_t = vreinterpret_s8_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_u32 (void)
-{
-  int8x8_t out_int8x8_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_int8x8_t = vreinterpret_s8_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_u64 (void)
-{
-  int8x8_t out_int8x8_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_int8x8_t = vreinterpret_s8_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterprets8_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterprets8_u8 (void)
-{
-  int8x8_t out_int8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_int8x8_t = vreinterpret_s8_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_f32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_uint16x4_t = vreinterpret_u16_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_p16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_uint16x4_t = vreinterpret_u16_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_p64 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_uint16x4_t = vreinterpret_u16_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_p8 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_uint16x4_t = vreinterpret_u16_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_s16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_uint16x4_t = vreinterpret_u16_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_s32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_uint16x4_t = vreinterpret_u16_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_s64 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_uint16x4_t = vreinterpret_u16_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_s8 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_uint16x4_t = vreinterpret_u16_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_u32 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint16x4_t = vreinterpret_u16_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_u64 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_uint16x4_t = vreinterpret_u16_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu16_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu16_u8 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint16x4_t = vreinterpret_u16_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_f32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_uint32x2_t = vreinterpret_u32_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_p16 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_uint32x2_t = vreinterpret_u32_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_p64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_uint32x2_t = vreinterpret_u32_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_p8 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_uint32x2_t = vreinterpret_u32_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_s16 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_uint32x2_t = vreinterpret_u32_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_s32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_uint32x2_t = vreinterpret_u32_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_s64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_uint32x2_t = vreinterpret_u32_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_s8 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_uint32x2_t = vreinterpret_u32_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_u16 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint32x2_t = vreinterpret_u32_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_u64 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_uint32x2_t = vreinterpret_u32_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu32_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu32_u8 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint32x2_t = vreinterpret_u32_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_f32 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_uint64x1_t = vreinterpret_u64_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_p16 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_uint64x1_t = vreinterpret_u64_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_p64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_uint64x1_t = vreinterpret_u64_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_p8 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_uint64x1_t = vreinterpret_u64_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_s16 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_uint64x1_t = vreinterpret_u64_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_s32 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_uint64x1_t = vreinterpret_u64_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_s64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_uint64x1_t = vreinterpret_u64_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_s8 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_uint64x1_t = vreinterpret_u64_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_u16 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint64x1_t = vreinterpret_u64_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_u32 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint64x1_t = vreinterpret_u64_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu64_u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu64_u8 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint64x1_t = vreinterpret_u64_u8 (arg0_uint8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_f32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_f32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_f32 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_uint8x8_t = vreinterpret_u8_f32 (arg0_float32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_p16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_p16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_uint8x8_t = vreinterpret_u8_p16 (arg0_poly16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_p64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_p64 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  poly64x1_t arg0_poly64x1_t;
-
-  out_uint8x8_t = vreinterpret_u8_p64 (arg0_poly64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_p8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_uint8x8_t = vreinterpret_u8_p8 (arg0_poly8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_s16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_s16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_uint8x8_t = vreinterpret_u8_s16 (arg0_int16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_s32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_s32 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_uint8x8_t = vreinterpret_u8_s32 (arg0_int32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_s64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_s64 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_uint8x8_t = vreinterpret_u8_s64 (arg0_int64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s8.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_s8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_uint8x8_t = vreinterpret_u8_s8 (arg0_int8x8_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u16.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_u16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_u16 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint8x8_t = vreinterpret_u8_u16 (arg0_uint16x4_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u32.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_u32' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_u32 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint8x8_t = vreinterpret_u8_u32 (arg0_uint32x2_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u64.c
+++ b/src//dev/null
@@ -1,18 +0,0 @@
-/* Test the `vreinterpretu8_u64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vreinterpretu8_u64 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint64x1_t arg0_uint64x1_t;
-
-  out_uint8x8_t = vreinterpret_u8_u64 (arg0_uint64x1_t);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qp8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16Qp8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16Qp8 (void)
-{
-  poly8x16_t out_poly8x16_t;
-  poly8x16_t arg0_poly8x16_t;
-
-  out_poly8x16_t = vrev16q_p8 (arg0_poly8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16Qs8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16Qs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vrev16q_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16Qu8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16Qu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vrev16q_u8 (arg0_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16p8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16p8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16p8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_poly8x8_t = vrev16_p8 (arg0_poly8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16s8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16s8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16s8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vrev16_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16u8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev16u8' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev16u8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint8x8_t = vrev16_u8 (arg0_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qp16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qp16' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qp16 (void)
-{
-  poly16x8_t out_poly16x8_t;
-  poly16x8_t arg0_poly16x8_t;
-
-  out_poly16x8_t = vrev32q_p16 (arg0_poly16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qp8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qp8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qp8 (void)
-{
-  poly8x16_t out_poly8x16_t;
-  poly8x16_t arg0_poly8x16_t;
-
-  out_poly8x16_t = vrev32q_p8 (arg0_poly8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qs16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vrev32q_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qs8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vrev32q_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint16x8_t = vrev32q_u16 (arg0_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32Qu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32Qu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vrev32q_u8 (arg0_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32p16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32p16 (void)
-{
-  poly16x4_t out_poly16x4_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_poly16x4_t = vrev32_p16 (arg0_poly16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32p8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32p8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32p8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_poly8x8_t = vrev32_p8 (arg0_poly8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32s16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32s16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32s16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vrev32_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32s8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32s8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32s8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vrev32_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32u16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32u16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32u16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint16x4_t = vrev32_u16 (arg0_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32u8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev32u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev32u8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint8x8_t = vrev32_u8 (arg0_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrev64q_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qp16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qp16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qp16 (void)
-{
-  poly16x8_t out_poly16x8_t;
-  poly16x8_t arg0_poly16x8_t;
-
-  out_poly16x8_t = vrev64q_p16 (arg0_poly16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qp8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qp8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qp8 (void)
-{
-  poly8x16_t out_poly8x16_t;
-  poly8x16_t arg0_poly8x16_t;
-
-  out_poly8x16_t = vrev64q_p8 (arg0_poly8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qs16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vrev64q_s16 (arg0_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qs32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vrev64q_s32 (arg0_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qs8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vrev64q_s8 (arg0_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint16x8_t = vrev64q_u16 (arg0_uint16x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint32x4_t = vrev64q_u32 (arg0_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64Qu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64Qu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vrev64q_u8 (arg0_uint8x16_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64f32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64f32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrev64_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64p16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64p16 (void)
-{
-  poly16x4_t out_poly16x4_t;
-  poly16x4_t arg0_poly16x4_t;
-
-  out_poly16x4_t = vrev64_p16 (arg0_poly16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64p8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64p8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64p8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  poly8x8_t arg0_poly8x8_t;
-
-  out_poly8x8_t = vrev64_p8 (arg0_poly8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64s16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64s16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vrev64_s16 (arg0_int16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64s32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64s32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vrev64_s32 (arg0_int32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64s8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64s8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vrev64_s8 (arg0_int8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64u16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64u16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint16x4_t = vrev64_u16 (arg0_uint16x4_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64u32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64u32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint32x2_t = vrev64_u32 (arg0_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrev64u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrev64u8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8x8_t arg0_uint8x8_t;
-
-  out_uint8x8_t = vrev64_u8 (arg0_uint8x8_t);
-}
-
-/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndaf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndaf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndaf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrnda_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrinta\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndaqf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndaq_f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndaqf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrndaq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrinta\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrnd_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrintz\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndmf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndmf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndmf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrndm_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrintm\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndmqf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndmq_f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndmqf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrndmq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrintm\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndnf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndnf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndnf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrndn_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrintn\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndnqf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndnq_f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndnqf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrndnq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrintn\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndpf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndpf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndpf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrndp_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrintp\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndpqf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndpq_f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndpqf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrndpq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrintp\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrndqf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrndqf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_v8_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_v8_neon } */
-
-#include "arm_neon.h"
-
-void test_vrndqf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrndq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrintz\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteQf32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrsqrteQf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrteQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-
-  out_float32x4_t = vrsqrteq_f32 (arg0_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrte\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteQu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrsqrteQu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrteQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint32x4_t = vrsqrteq_u32 (arg0_uint32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrte\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrsqrtef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrtef32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-
-  out_float32x2_t = vrsqrte_f32 (arg0_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrte\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vrsqrteu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrteu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint32x2_t = vrsqrte_u32 (arg0_uint32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrte\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtsQf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vrsqrtsQf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrtsQf32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32x4_t arg0_float32x4_t;
-  float32x4_t arg1_float32x4_t;
-
-  out_float32x4_t = vrsqrtsq_f32 (arg0_float32x4_t, arg1_float32x4_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrts\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtsf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vrsqrtsf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vrsqrtsf32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32x2_t arg0_float32x2_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vrsqrts_f32 (arg0_float32x2_t, arg1_float32x2_t);
-}
-
-/* { dg-final { scan-assembler "vrsqrts\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanef32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanef32 (void)
-{
-  float32x4_t out_float32x4_t;
-  float32_t arg0_float32_t;
-  float32x4_t arg1_float32x4_t;
-
-  out_float32x4_t = vsetq_lane_f32 (arg0_float32_t, arg1_float32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanep16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanep16 (void)
-{
-  poly16x8_t out_poly16x8_t;
-  poly16_t arg0_poly16_t;
-  poly16x8_t arg1_poly16x8_t;
-
-  out_poly16x8_t = vsetq_lane_p16 (arg0_poly16_t, arg1_poly16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanep8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanep8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanep8 (void)
-{
-  poly8x16_t out_poly8x16_t;
-  poly8_t arg0_poly8_t;
-  poly8x16_t arg1_poly8x16_t;
-
-  out_poly8x16_t = vsetq_lane_p8 (arg0_poly8_t, arg1_poly8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanes16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16_t arg0_int16_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vsetq_lane_s16 (arg0_int16_t, arg1_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanes32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32_t arg0_int32_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vsetq_lane_s32 (arg0_int32_t, arg1_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanes64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanes64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64_t arg0_int64_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vsetq_lane_s64 (arg0_int64_t, arg1_int64x2_t, 0);
-}
-
-/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_lanes8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_lanes8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8_t arg0_int8_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vsetq_lane_s8 (arg0_int8_t, arg1_int8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_laneu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16_t arg0_uint16_t;
-  uint16x8_t arg1_uint16x8_t;
-
-  out_uint16x8_t = vsetq_lane_u16 (arg0_uint16_t, arg1_uint16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_laneu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32_t arg0_uint32_t;
-  uint32x4_t arg1_uint32x4_t;
-
-  out_uint32x4_t = vsetq_lane_u32 (arg0_uint32_t, arg1_uint32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_laneu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_laneu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64_t arg0_uint64_t;
-  uint64x2_t arg1_uint64x2_t;
-
-  out_uint64x2_t = vsetq_lane_u64 (arg0_uint64_t, arg1_uint64x2_t, 0);
-}
-
-/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vsetQ_laneu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vsetQ_laneu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8_t arg0_uint8_t;
-  uint8x16_t arg1_uint8x16_t;
-
-  out_uint8x16_t = vsetq_lane_u8 (arg0_uint8_t, arg1_uint8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanef32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanef32 (void)
-{
-  float32x2_t out_float32x2_t;
-  float32_t arg0_float32_t;
-  float32x2_t arg1_float32x2_t;
-
-  out_float32x2_t = vset_lane_f32 (arg0_float32_t, arg1_float32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanep16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanep16 (void)
-{
-  poly16x4_t out_poly16x4_t;
-  poly16_t arg0_poly16_t;
-  poly16x4_t arg1_poly16x4_t;
-
-  out_poly16x4_t = vset_lane_p16 (arg0_poly16_t, arg1_poly16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanep8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanep8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanep8 (void)
-{
-  poly8x8_t out_poly8x8_t;
-  poly8_t arg0_poly8_t;
-  poly8x8_t arg1_poly8x8_t;
-
-  out_poly8x8_t = vset_lane_p8 (arg0_poly8_t, arg1_poly8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanes16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16_t arg0_int16_t;
-  int16x4_t arg1_int16x4_t;
-
-  out_int16x4_t = vset_lane_s16 (arg0_int16_t, arg1_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanes32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32_t arg0_int32_t;
-  int32x2_t arg1_int32x2_t;
-
-  out_int32x2_t = vset_lane_s32 (arg0_int32_t, arg1_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vset_lanes64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanes64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64_t arg0_int64_t;
-  int64x1_t arg1_int64x1_t;
-
-  out_int64x1_t = vset_lane_s64 (arg0_int64_t, arg1_int64x1_t, 0);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_lanes8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_lanes8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8_t arg0_int8_t;
-  int8x8_t arg1_int8x8_t;
-
-  out_int8x8_t = vset_lane_s8 (arg0_int8_t, arg1_int8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_laneu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16_t arg0_uint16_t;
-  uint16x4_t arg1_uint16x4_t;
-
-  out_uint16x4_t = vset_lane_u16 (arg0_uint16_t, arg1_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_laneu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32_t arg0_uint32_t;
-  uint32x2_t arg1_uint32x2_t;
-
-  out_uint32x2_t = vset_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vset_laneu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_laneu64 (void)
-{
-  uint64x1_t out_uint64x1_t;
-  uint64_t arg0_uint64_t;
-  uint64x1_t arg1_uint64x1_t;
-
-  out_uint64x1_t = vset_lane_u64 (arg0_uint64_t, arg1_uint64x1_t, 0);
-}
-
--- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vset_laneu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vset_laneu8 (void)
-{
-  uint8x8_t out_uint8x8_t;
-  uint8_t arg0_uint8_t;
-  uint8x8_t arg1_uint8x8_t;
-
-  out_uint8x8_t = vset_lane_u8 (arg0_uint8_t, arg1_uint8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_ns16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_ns16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-
-  out_int16x8_t = vshlq_n_s16 (arg0_int16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_ns32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_ns32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-
-  out_int32x4_t = vshlq_n_s32 (arg0_int32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_ns64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_ns64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-
-  out_int64x2_t = vshlq_n_s64 (arg0_int64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_ns8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_ns8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-
-  out_int8x16_t = vshlq_n_s8 (arg0_int8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_nu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_nu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-
-  out_uint16x8_t = vshlq_n_u16 (arg0_uint16x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_nu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_nu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-
-  out_uint32x4_t = vshlq_n_u32 (arg0_uint32x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_nu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_nu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-
-  out_uint64x2_t = vshlq_n_u64 (arg0_uint64x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshlQ_nu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQ_nu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-
-  out_uint8x16_t = vshlq_n_u8 (arg0_uint8x16_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQs16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQs16 (void)
-{
-  int16x8_t out_int16x8_t;
-  int16x8_t arg0_int16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_int16x8_t = vshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQs32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQs32 (void)
-{
-  int32x4_t out_int32x4_t;
-  int32x4_t arg0_int32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_int32x4_t = vshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQs64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQs64 (void)
-{
-  int64x2_t out_int64x2_t;
-  int64x2_t arg0_int64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_int64x2_t = vshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQs8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQs8 (void)
-{
-  int8x16_t out_int8x16_t;
-  int8x16_t arg0_int8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_int8x16_t = vshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQu16 (void)
-{
-  uint16x8_t out_uint16x8_t;
-  uint16x8_t arg0_uint16x8_t;
-  int16x8_t arg1_int16x8_t;
-
-  out_uint16x8_t = vshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQu32 (void)
-{
-  uint32x4_t out_uint32x4_t;
-  uint32x4_t arg0_uint32x4_t;
-  int32x4_t arg1_int32x4_t;
-
-  out_uint32x4_t = vshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu64.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
-  uint64x2_t arg0_uint64x2_t;
-  int64x2_t arg1_int64x2_t;
-
-  out_uint64x2_t = vshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vshlQu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshlQu8 (void)
-{
-  uint8x16_t out_uint8x16_t;
-  uint8x16_t arg0_uint8x16_t;
-  int8x16_t arg1_int8x16_t;
-
-  out_uint8x16_t = vshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
-}
-
-/* { dg-final { scan-assembler "vshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_ns16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_ns16 (void)
-{
-  int16x4_t out_int16x4_t;
-  int16x4_t arg0_int16x4_t;
-
-  out_int16x4_t = vshl_n_s16 (arg0_int16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_ns32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_ns32 (void)
-{
-  int32x2_t out_int32x2_t;
-  int32x2_t arg0_int32x2_t;
-
-  out_int32x2_t = vshl_n_s32 (arg0_int32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_ns64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_ns64 (void)
-{
-  int64x1_t out_int64x1_t;
-  int64x1_t arg0_int64x1_t;
-
-  out_int64x1_t = vshl_n_s64 (arg0_int64x1_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_ns8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_ns8 (void)
-{
-  int8x8_t out_int8x8_t;
-  int8x8_t arg0_int8x8_t;
-
-  out_int8x8_t = vshl_n_s8 (arg0_int8x8_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_nu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_nu16 (void)
-{
-  uint16x4_t out_uint16x4_t;
-  uint16x4_t arg0_uint16x4_t;
-
-  out_uint16x4_t = vshl_n_u16 (arg0_uint16x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_nu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vshl_nu32 (void)
-{
-  uint32x2_t out_uint32x2_t;
-  uint32x2_t arg0_uint32x2_t;
-
-  out_uint32x2_t = vshl_n_u32 (arg0_uint32x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vshl_nu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshl_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64x1_t = vshl_n_u64 (arg0_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshl_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshl_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vshl_n_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_ns16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - - out_int32x4_t = vshll_n_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_ns32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - - out_int64x2_t = vshll_n_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_ns8 (void) -{ - int16x8_t out_int16x8_t; - int8x8_t arg0_int8x8_t; - - out_int16x8_t = vshll_n_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_nu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint32x4_t = vshll_n_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_nu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint64x2_t = vshll_n_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshll_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshll_nu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint16x8_t = vshll_n_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshls16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vshl_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshls32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vshl_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshls64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshls64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshls64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vshl_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshls8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vshl_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshlu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshlu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vshl_u16 (arg0_uint16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshlu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshlu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vshl_u32 (arg0_uint32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshlu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshlu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - int64x1_t arg1_int64x1_t; - - out_uint64x1_t = vshl_u64 (arg0_uint64x1_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vshlu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshlu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vshl_u8 (arg0_uint8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - - out_int16x8_t = vshrq_n_s16 (arg0_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - - out_int32x4_t = vshrq_n_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - - out_int64x2_t = vshrq_n_s64 (arg0_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - - out_int8x16_t = vshrq_n_s8 (arg0_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint16x8_t = vshrq_n_u16 (arg0_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint32x4_t = vshrq_n_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - - out_uint64x2_t = vshrq_n_u64 (arg0_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - - out_uint8x16_t = vshrq_n_u8 (arg0_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - - out_int16x4_t = vshr_n_s16 (arg0_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - - out_int32x2_t = vshr_n_s32 (arg0_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - - out_int64x1_t = vshr_n_s64 (arg0_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - - out_int8x8_t = vshr_n_s8 (arg0_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - - out_uint16x4_t = vshr_n_u16 (arg0_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - - out_uint32x2_t = vshr_n_u32 (arg0_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - - out_uint64x1_t = vshr_n_u64 (arg0_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshr_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshr_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - - out_uint8x8_t = vshr_n_u8 (arg0_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vshr\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_ns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - - out_int8x8_t = vshrn_n_s16 (arg0_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_ns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - - out_int16x4_t = vshrn_n_s32 (arg0_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_ns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - - out_int32x2_t = vshrn_n_s64 (arg0_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_nu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - - out_uint8x8_t = vshrn_n_u16 (arg0_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_nu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - - out_uint16x4_t = vshrn_n_u32 (arg0_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vshrn_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vshrn_nu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - - out_uint32x2_t = vshrn_n_u64 (arg0_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_np16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8_t = vsliq_n_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vsliQ_np64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x2_t arg0_poly64x2_t; - poly64x2_t arg1_poly64x2_t; - - out_poly64x2_t = vsliq_n_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_np8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16_t = vsliq_n_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vsliq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vsliq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vsliq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vsliq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vsliq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vsliq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vsliq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsliQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsliQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vsliq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_np16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4_t = vsli_n_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vsli_np64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x1_t arg0_poly64x1_t; - poly64x1_t arg1_poly64x1_t; - - out_poly64x1_t = vsli_n_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_np8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8_t = vsli_n_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vsli_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vsli_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vsli_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vsli_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vsli_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vsli_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vsli_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsli_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsli_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vsli_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vsraq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vsraq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vsraq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vsraq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vsraq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vsraq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vsraq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsraQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsraQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vsraq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vsra_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vsra_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vsra_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vsra_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vsra_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vsra_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vsra_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsra_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsra_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vsra_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsra\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_np16 (void) -{ - poly16x8_t out_poly16x8_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8_t = vsriq_n_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vsriQ_np64 (void) -{ - poly64x2_t out_poly64x2_t; - poly64x2_t arg0_poly64x2_t; - poly64x2_t arg1_poly64x2_t; - - out_poly64x2_t = vsriq_n_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_np8 (void) -{ - poly8x16_t out_poly8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16_t = vsriq_n_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_ns16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vsriq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_ns32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vsriq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_ns64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vsriq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_ns8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vsriq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_nu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vsriq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_nu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vsriq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_nu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vsriq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsriQ_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsriQ_nu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vsriq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_np16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_np16 (void) -{ - poly16x4_t out_poly16x4_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4_t = vsri_n_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_np64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vsri_np64 (void) -{ - poly64x1_t out_poly64x1_t; - poly64x1_t arg0_poly64x1_t; - poly64x1_t arg1_poly64x1_t; - - out_poly64x1_t = vsri_n_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_np8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_np8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8_t = vsri_n_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_ns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_ns16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vsri_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_ns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_ns32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vsri_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_ns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_ns64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vsri_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_ns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_ns8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vsri_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_nu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_nu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vsri_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_nu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_nu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vsri_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_nu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_nu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vsri_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsri_nu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsri_nu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vsri_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanef32 (void) -{ - float32_t *arg0_float32_t; - float32x4_t arg1_float32x4_t; - - vst1q_lane_f32 (arg0_float32_t, arg1_float32x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanep16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x8_t arg1_poly16x8_t; - - vst1q_lane_p16 (arg0_poly16_t, arg1_poly16x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanep64' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vst1Q_lanep64 (void) -{ - poly64_t *arg0_poly64_t; - poly64x2_t arg1_poly64x2_t; - - vst1q_lane_p64 (arg0_poly64_t, arg1_poly64x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanep8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x16_t arg1_poly8x16_t; - - vst1q_lane_p8 (arg0_poly8_t, arg1_poly8x16_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanes16 (void) -{ - int16_t *arg0_int16_t; - int16x8_t arg1_int16x8_t; - - vst1q_lane_s16 (arg0_int16_t, arg1_int16x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanes32 (void) -{ - int32_t *arg0_int32_t; - int32x4_t arg1_int32x4_t; - - vst1q_lane_s32 (arg0_int32_t, arg1_int32x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanes64 (void) -{ - int64_t *arg0_int64_t; - int64x2_t arg1_int64x2_t; - - vst1q_lane_s64 (arg0_int64_t, arg1_int64x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_lanes8 (void) -{ - int8_t *arg0_int8_t; - int8x16_t arg1_int8x16_t; - - vst1q_lane_s8 (arg0_int8_t, arg1_int8x16_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_laneu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x8_t arg1_uint16x8_t; - - vst1q_lane_u16 (arg0_uint16_t, arg1_uint16x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_laneu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x4_t arg1_uint32x4_t; - - vst1q_lane_u32 (arg0_uint32_t, arg1_uint32x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu64-1.c +++ b/src//dev/null @@ -1,25 +0,0 @@ -/* Test the `vst1Q_laneu64' ARM Neon intrinsic. */ - -/* Detect ICE in the case of unaligned memory address. */ - -/* { dg-do compile } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -unsigned char dummy_store[1000]; - -void -foo (char* addr) -{ - uint8x16_t vdata = vld1q_u8 (addr); - vst1q_lane_u64 ((uint64_t*) &dummy_store, vreinterpretq_u64_u8 (vdata), 0); -} - -uint64_t -bar (uint64x2_t vdata) -{ - vdata = vld1q_lane_u64 ((uint64_t*) &dummy_store, vdata, 0); - return vgetq_lane_u64 (vdata, 0); -} --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_laneu64' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_laneu64 (void) -{ - uint64_t *arg0_uint64_t; - uint64x2_t arg1_uint64x2_t; - - vst1q_lane_u64 (arg0_uint64_t, arg1_uint64x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Q_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Q_laneu8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x16_t arg1_uint8x16_t; - - vst1q_lane_u8 (arg0_uint8_t, arg1_uint8x16_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qf32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qf32 (void) -{ - float32_t *arg0_float32_t; - float32x4_t arg1_float32x4_t; - - vst1q_f32 (arg0_float32_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qp16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x8_t arg1_poly16x8_t; - - vst1q_p16 (arg0_poly16_t, arg1_poly16x8_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qp64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vst1Qp64 (void) -{ - poly64_t *arg0_poly64_t; - poly64x2_t arg1_poly64x2_t; - - vst1q_p64 (arg0_poly64_t, arg1_poly64x2_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qp8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x16_t arg1_poly8x16_t; - - vst1q_p8 (arg0_poly8_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qs16 (void) -{ - int16_t *arg0_int16_t; - int16x8_t arg1_int16x8_t; - - vst1q_s16 (arg0_int16_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qs32 (void) -{ - int32_t *arg0_int32_t; - int32x4_t arg1_int32x4_t; - - vst1q_s32 (arg0_int32_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qs64 (void) -{ - int64_t *arg0_int64_t; - int64x2_t arg1_int64x2_t; - - vst1q_s64 (arg0_int64_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qs8 (void) -{ - int8_t *arg0_int8_t; - int8x16_t arg1_int8x16_t; - - vst1q_s8 (arg0_int8_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x8_t arg1_uint16x8_t; - - vst1q_u16 (arg0_uint16_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x4_t arg1_uint32x4_t; - - vst1q_u32 (arg0_uint32_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qu64 (void) -{ - uint64_t *arg0_uint64_t; - uint64x2_t arg1_uint64x2_t; - - vst1q_u64 (arg0_uint64_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1Qu8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x16_t arg1_uint8x16_t; - - vst1q_u8 (arg0_uint8_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanef32 (void) -{ - float32_t *arg0_float32_t; - float32x2_t arg1_float32x2_t; - - vst1_lane_f32 (arg0_float32_t, arg1_float32x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanep16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x4_t arg1_poly16x4_t; - - vst1_lane_p16 (arg0_poly16_t, arg1_poly16x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanep64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vst1_lanep64 (void) -{ - poly64_t *arg0_poly64_t; - poly64x1_t arg1_poly64x1_t; - - vst1_lane_p64 (arg0_poly64_t, arg1_poly64x1_t, 0); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanep8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x8_t arg1_poly8x8_t; - - vst1_lane_p8 (arg0_poly8_t, arg1_poly8x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanes16 (void) -{ - int16_t *arg0_int16_t; - int16x4_t arg1_int16x4_t; - - vst1_lane_s16 (arg0_int16_t, arg1_int16x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanes32 (void) -{ - int32_t *arg0_int32_t; - int32x2_t arg1_int32x2_t; - - vst1_lane_s32 (arg0_int32_t, arg1_int32x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanes64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanes64 (void) -{ - int64_t *arg0_int64_t; - int64x1_t arg1_int64x1_t; - - vst1_lane_s64 (arg0_int64_t, arg1_int64x1_t, 0); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_lanes8 (void) -{ - int8_t *arg0_int8_t; - int8x8_t arg1_int8x8_t; - - vst1_lane_s8 (arg0_int8_t, arg1_int8x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_laneu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x4_t arg1_uint16x4_t; - - vst1_lane_u16 (arg0_uint16_t, arg1_uint16x4_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_laneu32' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_laneu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x2_t arg1_uint32x2_t; - - vst1_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_laneu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_laneu64 (void) -{ - uint64_t *arg0_uint64_t; - uint64x1_t arg1_uint64x1_t; - - vst1_lane_u64 (arg0_uint64_t, arg1_uint64x1_t, 0); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1_laneu8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x8_t arg1_uint8x8_t; - - vst1_lane_u8 (arg0_uint8_t, arg1_uint8x8_t, 1); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1f32 (void) -{ - float32_t *arg0_float32_t; - float32x2_t arg1_float32x2_t; - - vst1_f32 (arg0_float32_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1p16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x4_t arg1_poly16x4_t; - - vst1_p16 (arg0_poly16_t, arg1_poly16x4_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vst1p64 (void) -{ - poly64_t *arg0_poly64_t; - poly64x1_t arg1_poly64x1_t; - - vst1_p64 (arg0_poly64_t, arg1_poly64x1_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1p8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x8_t arg1_poly8x8_t; - - vst1_p8 (arg0_poly8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1s16 (void) -{ - int16_t *arg0_int16_t; - int16x4_t arg1_int16x4_t; - - vst1_s16 (arg0_int16_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1s32 (void) -{ - int32_t *arg0_int32_t; - int32x2_t arg1_int32x2_t; - - vst1_s32 (arg0_int32_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1s64 (void) -{ - int64_t *arg0_int64_t; - int64x1_t arg1_int64x1_t; - - vst1_s64 (arg0_int64_t, arg1_int64x1_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1s8 (void) -{ - int8_t *arg0_int8_t; - int8x8_t arg1_int8x8_t; - - vst1_s8 (arg0_int8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1u16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x4_t arg1_uint16x4_t; - - vst1_u16 (arg0_uint16_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1u32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x2_t arg1_uint32x2_t; - - vst1_u32 (arg0_uint32_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1u64 (void) -{ - uint64_t *arg0_uint64_t; - uint64x1_t arg1_uint64x1_t; - - vst1_u64 (arg0_uint64_t, arg1_uint64x1_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst1u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst1u8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x8_t arg1_uint8x8_t; - - vst1_u8 (arg0_uint8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_lanef32 (void) -{ - float32_t *arg0_float32_t; - float32x4x2_t arg1_float32x4x2_t; - - vst2q_lane_f32 (arg0_float32_t, arg1_float32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_lanep16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x8x2_t arg1_poly16x8x2_t; - - vst2q_lane_p16 (arg0_poly16_t, arg1_poly16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_lanes16 (void) -{ - int16_t *arg0_int16_t; - int16x8x2_t arg1_int16x8x2_t; - - vst2q_lane_s16 (arg0_int16_t, arg1_int16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_lanes32 (void) -{ - int32_t *arg0_int32_t; - int32x4x2_t arg1_int32x4x2_t; - - vst2q_lane_s32 (arg0_int32_t, arg1_int32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_laneu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x8x2_t arg1_uint16x8x2_t; - - vst2q_lane_u16 (arg0_uint16_t, arg1_uint16x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2Q_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Q_laneu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x4x2_t arg1_uint32x4x2_t; - - vst2q_lane_u32 (arg0_uint32_t, arg1_uint32x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qf32 (void) -{ - float32_t *arg0_float32_t; - float32x4x2_t arg1_float32x4x2_t; - - vst2q_f32 (arg0_float32_t, arg1_float32x4x2_t); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qp16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x8x2_t arg1_poly16x8x2_t; - - vst2q_p16 (arg0_poly16_t, arg1_poly16x8x2_t); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qp8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x16x2_t arg1_poly8x16x2_t; - - vst2q_p8 (arg0_poly8_t, arg1_poly8x16x2_t); -} - -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qs16 (void) -{ - int16_t *arg0_int16_t; - int16x8x2_t arg1_int16x8x2_t; - - vst2q_s16 (arg0_int16_t, arg1_int16x8x2_t); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qs32 (void) -{ - int32_t *arg0_int32_t; - int32x4x2_t arg1_int32x4x2_t; - - vst2q_s32 (arg0_int32_t, arg1_int32x4x2_t); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qs8 (void) -{ - int8_t *arg0_int8_t; - int8x16x2_t arg1_int8x16x2_t; - - vst2q_s8 (arg0_int8_t, arg1_int8x16x2_t); -} - -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x8x2_t arg1_uint16x8x2_t; - - vst2q_u16 (arg0_uint16_t, arg1_uint16x8x2_t); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x4x2_t arg1_uint32x4x2_t; - - vst2q_u32 (arg0_uint32_t, arg1_uint32x4x2_t); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vst2Qu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2Qu8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x16x2_t arg1_uint8x16x2_t; - - vst2q_u8 (arg0_uint8_t, arg1_uint8x16x2_t); -} - -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanef32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2_lanef32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2_lanef32 (void) -{ - float32_t *arg0_float32_t; - float32x2x2_t arg1_float32x2x2_t; - - vst2_lane_f32 (arg0_float32_t, arg1_float32x2x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2_lanep16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x4x2_t arg1_poly16x4x2_t; - - vst2_lane_p16 (arg0_poly16_t, arg1_poly16x4x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst2_lanep8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x8x2_t arg1_poly8x8x2_t; - - vst2_lane_p8 (arg0_poly8_t, arg1_poly8x8x2_t, 1); -} - -/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst2_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_lanes16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x4x2_t arg1_int16x4x2_t;
-
-  vst2_lane_s16 (arg0_int16_t, arg1_int16x4x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_lanes32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x2x2_t arg1_int32x2x2_t;
-
-  vst2_lane_s32 (arg0_int32_t, arg1_int32x2x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2_lanes8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_lanes8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x8x2_t arg1_int8x8x2_t;
-
-  vst2_lane_s8 (arg0_int8_t, arg1_int8x8x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_laneu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x4x2_t arg1_uint16x4x2_t;
-
-  vst2_lane_u16 (arg0_uint16_t, arg1_uint16x4x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_laneu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x2x2_t arg1_uint32x2x2_t;
-
-  vst2_lane_u32 (arg0_uint32_t, arg1_uint32x2x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2_laneu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2_laneu8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x8x2_t arg1_uint8x8x2_t;
-
-  vst2_lane_u8 (arg0_uint8_t, arg1_uint8x8x2_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2f32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2f32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x2x2_t arg1_float32x2x2_t;
-
-  vst2_f32 (arg0_float32_t, arg1_float32x2x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2p16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x4x2_t arg1_poly16x4x2_t;
-
-  vst2_p16 (arg0_poly16_t, arg1_poly16x4x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2p64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vst2p64 (void)
-{
-  poly64_t *arg0_poly64_t;
-  poly64x1x2_t arg1_poly64x1x2_t;
-
-  vst2_p64 (arg0_poly64_t, arg1_poly64x1x2_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2p8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2p8 (void)
-{
-  poly8_t *arg0_poly8_t;
-  poly8x8x2_t arg1_poly8x8x2_t;
-
-  vst2_p8 (arg0_poly8_t, arg1_poly8x8x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2s16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2s16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x4x2_t arg1_int16x4x2_t;
-
-  vst2_s16 (arg0_int16_t, arg1_int16x4x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2s32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2s32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x2x2_t arg1_int32x2x2_t;
-
-  vst2_s32 (arg0_int32_t, arg1_int32x2x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2s64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2s64 (void)
-{
-  int64_t *arg0_int64_t;
-  int64x1x2_t arg1_int64x1x2_t;
-
-  vst2_s64 (arg0_int64_t, arg1_int64x1x2_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2s8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2s8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x8x2_t arg1_int8x8x2_t;
-
-  vst2_s8 (arg0_int8_t, arg1_int8x8x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2u16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2u16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x4x2_t arg1_uint16x4x2_t;
-
-  vst2_u16 (arg0_uint16_t, arg1_uint16x4x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2u32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2u32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x2x2_t arg1_uint32x2x2_t;
-
-  vst2_u32 (arg0_uint32_t, arg1_uint32x2x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2u64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2u64 (void)
-{
-  uint64_t *arg0_uint64_t;
-  uint64x1x2_t arg1_uint64x1x2_t;
-
-  vst2_u64 (arg0_uint64_t, arg1_uint64x1x2_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst2u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst2u8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x8x2_t arg1_uint8x8x2_t;
-
-  vst2_u8 (arg0_uint8_t, arg1_uint8x8x2_t);
-}
-
-/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_lanef32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x4x3_t arg1_float32x4x3_t;
-
-  vst3q_lane_f32 (arg0_float32_t, arg1_float32x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanep16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_lanep16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x8x3_t arg1_poly16x8x3_t;
-
-  vst3q_lane_p16 (arg0_poly16_t, arg1_poly16x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanes16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_lanes16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x8x3_t arg1_int16x8x3_t;
-
-  vst3q_lane_s16 (arg0_int16_t, arg1_int16x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanes32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_lanes32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x4x3_t arg1_int32x4x3_t;
-
-  vst3q_lane_s32 (arg0_int32_t, arg1_int32x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_laneu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_laneu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x8x3_t arg1_uint16x8x3_t;
-
-  vst3q_lane_u16 (arg0_uint16_t, arg1_uint16x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_laneu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3Q_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Q_laneu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x4x3_t arg1_uint32x4x3_t;
-
-  vst3q_lane_u32 (arg0_uint32_t, arg1_uint32x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qf32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x4x3_t arg1_float32x4x3_t;
-
-  vst3q_f32 (arg0_float32_t, arg1_float32x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qp16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qp16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qp16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x8x3_t arg1_poly16x8x3_t;
-
-  vst3q_p16 (arg0_poly16_t, arg1_poly16x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qp8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qp8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qp8 (void)
-{
-  poly8_t *arg0_poly8_t;
-  poly8x16x3_t arg1_poly8x16x3_t;
-
-  vst3q_p8 (arg0_poly8_t, arg1_poly8x16x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qs16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qs16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x8x3_t arg1_int16x8x3_t;
-
-  vst3q_s16 (arg0_int16_t, arg1_int16x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qs32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qs32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x4x3_t arg1_int32x4x3_t;
-
-  vst3q_s32 (arg0_int32_t, arg1_int32x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qs8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qs8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x16x3_t arg1_int8x16x3_t;
-
-  vst3q_s8 (arg0_int8_t, arg1_int8x16x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x8x3_t arg1_uint16x8x3_t;
-
-  vst3q_u16 (arg0_uint16_t, arg1_uint16x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x4x3_t arg1_uint32x4x3_t;
-
-  vst3q_u32 (arg0_uint32_t, arg1_uint32x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst3Qu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3Qu8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x16x3_t arg1_uint8x16x3_t;
-
-  vst3q_u8 (arg0_uint8_t, arg1_uint8x16x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanef32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x2x3_t arg1_float32x2x3_t;
-
-  vst3_lane_f32 (arg0_float32_t, arg1_float32x2x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanep16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanep16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x4x3_t arg1_poly16x4x3_t;
-
-  vst3_lane_p16 (arg0_poly16_t, arg1_poly16x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanep8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanep8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanep8 (void)
-{
-  poly8_t *arg0_poly8_t;
-  poly8x8x3_t arg1_poly8x8x3_t;
-
-  vst3_lane_p8 (arg0_poly8_t, arg1_poly8x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanes16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x4x3_t arg1_int16x4x3_t;
-
-  vst3_lane_s16 (arg0_int16_t, arg1_int16x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanes32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x2x3_t arg1_int32x2x3_t;
-
-  vst3_lane_s32 (arg0_int32_t, arg1_int32x2x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_lanes8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_lanes8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x8x3_t arg1_int8x8x3_t;
-
-  vst3_lane_s8 (arg0_int8_t, arg1_int8x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_laneu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x4x3_t arg1_uint16x4x3_t;
-
-  vst3_lane_u16 (arg0_uint16_t, arg1_uint16x4x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_laneu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x2x3_t arg1_uint32x2x3_t;
-
-  vst3_lane_u32 (arg0_uint32_t, arg1_uint32x2x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3_laneu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3_laneu8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x8x3_t arg1_uint8x8x3_t;
-
-  vst3_lane_u8 (arg0_uint8_t, arg1_uint8x8x3_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3f32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3f32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3f32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x2x3_t arg1_float32x2x3_t;
-
-  vst3_f32 (arg0_float32_t, arg1_float32x2x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3p16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3p16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x4x3_t arg1_poly16x4x3_t;
-
-  vst3_p16 (arg0_poly16_t, arg1_poly16x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3p64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_crypto } */
-
-#include "arm_neon.h"
-
-void test_vst3p64 (void)
-{
-  poly64_t *arg0_poly64_t;
-  poly64x1x3_t arg1_poly64x1x3_t;
-
-  vst3_p64 (arg0_poly64_t, arg1_poly64x1x3_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3p8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3p8 (void)
-{
-  poly8_t *arg0_poly8_t;
-  poly8x8x3_t arg1_poly8x8x3_t;
-
-  vst3_p8 (arg0_poly8_t, arg1_poly8x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3s16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3s16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x4x3_t arg1_int16x4x3_t;
-
-  vst3_s16 (arg0_int16_t, arg1_int16x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3s32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3s32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x2x3_t arg1_int32x2x3_t;
-
-  vst3_s32 (arg0_int32_t, arg1_int32x2x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3s64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3s64 (void)
-{
-  int64_t *arg0_int64_t;
-  int64x1x3_t arg1_int64x1x3_t;
-
-  vst3_s64 (arg0_int64_t, arg1_int64x1x3_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3s8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3s8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x8x3_t arg1_int8x8x3_t;
-
-  vst3_s8 (arg0_int8_t, arg1_int8x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3u16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3u16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x4x3_t arg1_uint16x4x3_t;
-
-  vst3_u16 (arg0_uint16_t, arg1_uint16x4x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3u32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3u32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x2x3_t arg1_uint32x2x3_t;
-
-  vst3_u32 (arg0_uint32_t, arg1_uint32x2x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u64.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3u64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3u64 (void)
-{
-  uint64_t *arg0_uint64_t;
-  uint64x1x3_t arg1_uint64x1x3_t;
-
-  vst3_u64 (arg0_uint64_t, arg1_uint64x1x3_t);
-}
-
-/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u8.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst3u8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst3u8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x8x3_t arg1_uint8x8x3_t;
-
-  vst3_u8 (arg0_uint8_t, arg1_uint8x8x3_t);
-}
-
-/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_lanef32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x4x4_t arg1_float32x4x4_t;
-
-  vst4q_lane_f32 (arg0_float32_t, arg1_float32x4x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanep16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_lanep16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_lanep16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x8x4_t arg1_poly16x8x4_t;
-
-  vst4q_lane_p16 (arg0_poly16_t, arg1_poly16x8x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanes16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_lanes16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_lanes16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x8x4_t arg1_int16x8x4_t;
-
-  vst4q_lane_s16 (arg0_int16_t, arg1_int16x8x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanes32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_lanes32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_lanes32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x4x4_t arg1_int32x4x4_t;
-
-  vst4q_lane_s32 (arg0_int32_t, arg1_int32x4x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_laneu16.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_laneu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_laneu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x8x4_t arg1_uint16x8x4_t;
-
-  vst4q_lane_u16 (arg0_uint16_t, arg1_uint16x8x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_laneu32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4Q_laneu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Q_laneu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x4x4_t arg1_uint32x4x4_t;
-
-  vst4q_lane_u32 (arg0_uint32_t, arg1_uint32x4x4_t, 1);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qf32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qf32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qf32 (void)
-{
-  float32_t *arg0_float32_t;
-  float32x4x4_t arg1_float32x4x4_t;
-
-  vst4q_f32 (arg0_float32_t, arg1_float32x4x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qp16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qp16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qp16 (void)
-{
-  poly16_t *arg0_poly16_t;
-  poly16x8x4_t arg1_poly16x8x4_t;
-
-  vst4q_p16 (arg0_poly16_t, arg1_poly16x8x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qp8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qp8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qp8 (void)
-{
-  poly8_t *arg0_poly8_t;
-  poly8x16x4_t arg1_poly8x16x4_t;
-
-  vst4q_p8 (arg0_poly8_t, arg1_poly8x16x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qs16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qs16 (void)
-{
-  int16_t *arg0_int16_t;
-  int16x8x4_t arg1_int16x8x4_t;
-
-  vst4q_s16 (arg0_int16_t, arg1_int16x8x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qs32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qs32 (void)
-{
-  int32_t *arg0_int32_t;
-  int32x4x4_t arg1_int32x4x4_t;
-
-  vst4q_s32 (arg0_int32_t, arg1_int32x4x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qs8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qs8 (void)
-{
-  int8_t *arg0_int8_t;
-  int8x16x4_t arg1_int8x16x4_t;
-
-  vst4q_s8 (arg0_int8_t, arg1_int8x16x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu16.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qu16' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qu16 (void)
-{
-  uint16_t *arg0_uint16_t;
-  uint16x8x4_t arg1_uint16x8x4_t;
-
-  vst4q_u16 (arg0_uint16_t, arg1_uint16x8x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu32.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qu32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qu32 (void)
-{
-  uint32_t *arg0_uint32_t;
-  uint32x4x4_t arg1_uint32x4x4_t;
-
-  vst4q_u32 (arg0_uint32_t, arg1_uint32x4x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu8.c
+++ b/src//dev/null
@@ -1,20 +0,0 @@
-/* Test the `vst4Qu8' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
-
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
-/* { dg-add-options arm_neon } */
-
-#include "arm_neon.h"
-
-void test_vst4Qu8 (void)
-{
-  uint8_t *arg0_uint8_t;
-  uint8x16x4_t arg1_uint8x16x4_t;
-
-  vst4q_u8 (arg0_uint8_t, arg1_uint8x16x4_t);
-}
-
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanef32.c
+++ b/src//dev/null
@@ -1,19 +0,0 @@
-/* Test the `vst4_lanef32' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanef32 (void) -{ - float32_t *arg0_float32_t; - float32x2x4_t arg1_float32x2x4_t; - - vst4_lane_f32 (arg0_float32_t, arg1_float32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanep16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_lanep16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanep16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x4x4_t arg1_poly16x4x4_t; - - vst4_lane_p16 (arg0_poly16_t, arg1_poly16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanep8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_lanep8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanep8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x8x4_t arg1_poly8x8x4_t; - - vst4_lane_p8 (arg0_poly8_t, arg1_poly8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_lanes16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanes16 (void) -{ - int16_t *arg0_int16_t; - int16x4x4_t arg1_int16x4x4_t; - - vst4_lane_s16 (arg0_int16_t, arg1_int16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_lanes32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanes32 (void) -{ - int32_t *arg0_int32_t; - int32x2x4_t arg1_int32x2x4_t; - - vst4_lane_s32 (arg0_int32_t, arg1_int32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_lanes8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_lanes8 (void) -{ - int8_t *arg0_int8_t; - int8x8x4_t arg1_int8x8x4_t; - - vst4_lane_s8 (arg0_int8_t, arg1_int8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_laneu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_laneu16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x4x4_t arg1_uint16x4x4_t; - - vst4_lane_u16 (arg0_uint16_t, arg1_uint16x4x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_laneu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_laneu32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x2x4_t arg1_uint32x2x4_t; - - vst4_lane_u32 (arg0_uint32_t, arg1_uint32x2x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4_laneu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4_laneu8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x8x4_t arg1_uint8x8x4_t; - - vst4_lane_u8 (arg0_uint8_t, arg1_uint8x8x4_t, 1); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4f32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4f32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4f32 (void) -{ - float32_t *arg0_float32_t; - float32x2x4_t arg1_float32x2x4_t; - - vst4_f32 (arg0_float32_t, arg1_float32x2x4_t); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4p16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4p16 (void) -{ - poly16_t *arg0_poly16_t; - poly16x4x4_t arg1_poly16x4x4_t; - - vst4_p16 (arg0_poly16_t, arg1_poly16x4x4_t); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4p64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_crypto_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_crypto } */ - -#include "arm_neon.h" - -void test_vst4p64 (void) -{ - poly64_t *arg0_poly64_t; - poly64x1x4_t arg1_poly64x1x4_t; - - vst4_p64 (arg0_poly64_t, arg1_poly64x1x4_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4p8 (void) -{ - poly8_t *arg0_poly8_t; - poly8x8x4_t arg1_poly8x8x4_t; - - vst4_p8 (arg0_poly8_t, arg1_poly8x8x4_t); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4s16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4s16 (void) -{ - int16_t *arg0_int16_t; - int16x4x4_t arg1_int16x4x4_t; - - vst4_s16 (arg0_int16_t, arg1_int16x4x4_t); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4s32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4s32 (void) -{ - int32_t *arg0_int32_t; - int32x2x4_t arg1_int32x2x4_t; - - vst4_s32 (arg0_int32_t, arg1_int32x2x4_t); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4s64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4s64 (void) -{ - int64_t *arg0_int64_t; - int64x1x4_t arg1_int64x1x4_t; - - vst4_s64 (arg0_int64_t, arg1_int64x1x4_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4s8 (void) -{ - int8_t *arg0_int8_t; - int8x8x4_t arg1_int8x8x4_t; - - vst4_s8 (arg0_int8_t, arg1_int8x8x4_t); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u16.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4u16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4u16 (void) -{ - uint16_t *arg0_uint16_t; - uint16x4x4_t arg1_uint16x4x4_t; - - vst4_u16 (arg0_uint16_t, arg1_uint16x4x4_t); -} - -/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u32.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4u32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4u32 (void) -{ - uint32_t *arg0_uint32_t; - uint32x2x4_t arg1_uint32x2x4_t; - - vst4_u32 (arg0_uint32_t, arg1_uint32x2x4_t); -} - -/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4u64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4u64 (void) -{ - uint64_t *arg0_uint64_t; - uint64x1x4_t arg1_uint64x1x4_t; - - vst4_u64 (arg0_uint64_t, arg1_uint64x1x4_t); -} - -/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u8.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vst4u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vst4u8 (void) -{ - uint8_t *arg0_uint8_t; - uint8x8x4_t arg1_uint8x8x4_t; - - vst4_u8 (arg0_uint8_t, arg1_uint8x8x4_t); -} - -/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQf32 (void) -{ - float32x4_t out_float32x4_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4_t = vsubq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vsub\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQs16 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8_t = vsubq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQs32 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4_t = vsubq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQs64 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int64x2_t = vsubq_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vsub\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQs8 (void) -{ - int8x16_t out_int8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16_t = vsubq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQu64 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint64x2_t = vsubq_u64 (arg0_uint64x2_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "vsub\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubf32 (void) -{ - float32x2_t out_float32x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2_t = vsub_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vsub\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhns16 (void) -{ - int8x8_t out_int8x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int8x8_t = vsubhn_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhns32 (void) -{ - int16x4_t out_int16x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int16x4_t = vsubhn_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhns64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhns64 (void) -{ - int32x2_t out_int32x2_t; - int64x2_t arg0_int64x2_t; - int64x2_t arg1_int64x2_t; - - out_int32x2_t = vsubhn_s64 (arg0_int64x2_t, arg1_int64x2_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhnu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhnu16 (void) -{ - uint8x8_t out_uint8x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint8x8_t = vsubhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhnu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhnu32 (void) -{ - uint16x4_t out_uint16x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint16x4_t = vsubhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu64.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubhnu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubhnu64 (void) -{ - uint32x2_t out_uint32x2_t; - uint64x2_t arg0_uint64x2_t; - uint64x2_t arg1_uint64x2_t; - - out_uint32x2_t = vsubhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t); -} - -/* { dg-final { scan-assembler "vsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubls16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubls16 (void) -{ - int32x4_t out_int32x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int32x4_t = vsubl_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vsubl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubls32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubls32 (void) -{ - int64x2_t out_int64x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int64x2_t = vsubl_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vsubl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubls8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubls8 (void) -{ - int16x8_t out_int16x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int16x8_t = vsubl_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vsubl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsublu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsublu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint32x4_t = vsubl_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vsubl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsublu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsublu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint64x2_t = vsubl_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vsubl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsublu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsublu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint16x8_t = vsubl_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vsubl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubs16 (void) -{ - int16x4_t out_int16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4_t = vsub_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubs32 (void) -{ - int32x2_t out_int32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2_t = vsub_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vsubs64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubs64 (void) -{ - int64x1_t out_int64x1_t; - int64x1_t arg0_int64x1_t; - int64x1_t arg1_int64x1_t; - - out_int64x1_t = vsub_s64 (arg0_int64x1_t, arg1_int64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubs8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vsub_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu64.c +++ b/src//dev/null @@ -1,19 +0,0 @@ -/* Test the `vsubu64' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubu64 (void) -{ - uint64x1_t out_uint64x1_t; - uint64x1_t arg0_uint64x1_t; - uint64x1_t arg1_uint64x1_t; - - out_uint64x1_t = vsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t); -} - --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubws16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubws16 (void) -{ - int32x4_t out_int32x4_t; - int32x4_t arg0_int32x4_t; - int16x4_t arg1_int16x4_t; - - out_int32x4_t = vsubw_s16 (arg0_int32x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vsubw\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubws32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubws32 (void) -{ - int64x2_t out_int64x2_t; - int64x2_t arg0_int64x2_t; - int32x2_t arg1_int32x2_t; - - out_int64x2_t = vsubw_s32 (arg0_int64x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vsubw\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubws8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubws8 (void) -{ - int16x8_t out_int16x8_t; - int16x8_t arg0_int16x8_t; - int8x8_t arg1_int8x8_t; - - out_int16x8_t = vsubw_s8 (arg0_int16x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vsubw\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubwu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubwu16 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint32x4_t = vsubw_u16 (arg0_uint32x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vsubw\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubwu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubwu32 (void) -{ - uint64x2_t out_uint64x2_t; - uint64x2_t arg0_uint64x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint64x2_t = vsubw_u32 (arg0_uint64x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vsubw\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vsubwu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vsubwu8 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint16x8_t = vsubw_u8 (arg0_uint16x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vsubw\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1p8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl1p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl1p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_poly8x8_t = vtbl1_p8 (arg0_poly8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1s8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl1s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl1s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vtbl1_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1u8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl1u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl1u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vtbl1_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2p8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl2p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl2p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8x2_t arg0_poly8x8x2_t; - uint8x8_t arg1_uint8x8_t; - - out_poly8x8_t = vtbl2_p8 (arg0_poly8x8x2_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2s8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl2s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl2s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8x2_t arg0_int8x8x2_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vtbl2_s8 (arg0_int8x8x2_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2u8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl2u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl2u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8x2_t arg0_uint8x8x2_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vtbl2_u8 (arg0_uint8x8x2_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3p8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl3p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl3p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8x3_t arg0_poly8x8x3_t; - uint8x8_t arg1_uint8x8_t; - - out_poly8x8_t = vtbl3_p8 (arg0_poly8x8x3_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3s8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl3s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl3s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8x3_t arg0_int8x8x3_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vtbl3_s8 (arg0_int8x8x3_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3u8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl3u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl3u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8x3_t arg0_uint8x8x3_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vtbl3_u8 (arg0_uint8x8x3_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4p8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl4p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl4p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8x4_t arg0_poly8x8x4_t; - uint8x8_t arg1_uint8x8_t; - - out_poly8x8_t = vtbl4_p8 (arg0_poly8x8x4_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4s8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl4s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl4s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8x4_t arg0_int8x8x4_t; - int8x8_t arg1_int8x8_t; - - out_int8x8_t = vtbl4_s8 (arg0_int8x8x4_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4u8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtbl4u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbl4u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8x4_t arg0_uint8x8x4_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vtbl4_u8 (arg0_uint8x8x4_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1p8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx1p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx1p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - uint8x8_t arg2_uint8x8_t; - - out_poly8x8_t = vtbx1_p8 (arg0_poly8x8_t, arg1_poly8x8_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1s8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx1s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx1s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vtbx1_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1u8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx1u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx1u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vtbx1_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2p8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx2p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx2p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8x2_t arg1_poly8x8x2_t; - uint8x8_t arg2_uint8x8_t; - - out_poly8x8_t = vtbx2_p8 (arg0_poly8x8_t, arg1_poly8x8x2_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2s8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx2s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx2s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8x2_t arg1_int8x8x2_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vtbx2_s8 (arg0_int8x8_t, arg1_int8x8x2_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2u8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx2u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx2u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8x2_t arg1_uint8x8x2_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vtbx2_u8 (arg0_uint8x8_t, arg1_uint8x8x2_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3p8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx3p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx3p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8x3_t arg1_poly8x8x3_t; - uint8x8_t arg2_uint8x8_t; - - out_poly8x8_t = vtbx3_p8 (arg0_poly8x8_t, arg1_poly8x8x3_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3s8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx3s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx3s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8x3_t arg1_int8x8x3_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vtbx3_s8 (arg0_int8x8_t, arg1_int8x8x3_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3u8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx3u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx3u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8x3_t arg1_uint8x8x3_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vtbx3_u8 (arg0_uint8x8_t, arg1_uint8x8x3_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4p8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx4p8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx4p8 (void) -{ - poly8x8_t out_poly8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8x4_t arg1_poly8x8x4_t; - uint8x8_t arg2_uint8x8_t; - - out_poly8x8_t = vtbx4_p8 (arg0_poly8x8_t, arg1_poly8x8x4_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4s8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx4s8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx4s8 (void) -{ - int8x8_t out_int8x8_t; - int8x8_t arg0_int8x8_t; - int8x8x4_t arg1_int8x8x4_t; - int8x8_t arg2_int8x8_t; - - out_int8x8_t = vtbx4_s8 (arg0_int8x8_t, arg1_int8x8x4_t, arg2_int8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4u8.c +++ b/src//dev/null @@ -1,21 +0,0 @@ -/* Test the `vtbx4u8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtbx4u8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8x4_t arg1_uint8x8x4_t; - uint8x8_t arg2_uint8x8_t; - - out_uint8x8_t = vtbx4_u8 (arg0_uint8x8_t, arg1_uint8x8x4_t, arg2_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQf32 (void) -{ - float32x4x2_t out_float32x4x2_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4x2_t = vtrnq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQp16 (void) -{ - poly16x8x2_t out_poly16x8x2_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8x2_t = vtrnq_p16 (arg0_poly16x8_t, arg1_poly16x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQp8 (void) -{ - poly8x16x2_t out_poly8x16x2_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16x2_t = vtrnq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQs16 (void) -{ - int16x8x2_t out_int16x8x2_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8x2_t = vtrnq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQs32' ARM Neon intrinsic. 
*/ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQs32 (void) -{ - int32x4x2_t out_int32x4x2_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4x2_t = vtrnq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQs8 (void) -{ - int8x16x2_t out_int8x16x2_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16x2_t = vtrnq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQu16 (void) -{ - uint16x8x2_t out_uint16x8x2_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8x2_t = vtrnq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQu32 (void) -{ - uint32x4x2_t out_uint32x4x2_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4x2_t = vtrnq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnQu8 (void) -{ - uint8x16x2_t out_uint8x16x2_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16x2_t = vtrnq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnf32 (void) -{ - float32x2x2_t out_float32x2x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2x2_t = vtrn_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnp16 (void) -{ - poly16x4x2_t out_poly16x4x2_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4x2_t = vtrn_p16 (arg0_poly16x4_t, arg1_poly16x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnp8 (void) -{ - poly8x8x2_t out_poly8x8x2_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8x2_t = vtrn_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrns16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrns16 (void) -{ - int16x4x2_t out_int16x4x2_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4x2_t = vtrn_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrns32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrns32 (void) -{ - int32x2x2_t out_int32x2x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2x2_t = vtrn_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrns8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrns8 (void) -{ - int8x8x2_t out_int8x8x2_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8x2_t = vtrn_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnu16 (void) -{ - uint16x4x2_t out_uint16x4x2_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4x2_t = vtrn_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnu32 (void) -{ - uint32x2x2_t out_uint32x2x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2x2_t = vtrn_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtrnu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtrnu8 (void) -{ - uint8x8x2_t out_uint8x8x2_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8x2_t = vtrn_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQp8 (void) -{ - uint8x16_t out_uint8x16_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_uint8x16_t = vtstq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQs16 (void) -{ - uint16x8_t out_uint16x8_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_uint16x8_t = vtstq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vtst\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQs32 (void) -{ - uint32x4_t out_uint32x4_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_uint32x4_t = vtstq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vtst\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQs8 (void) -{ - uint8x16_t out_uint8x16_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_uint8x16_t = vtstq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQu16 (void) -{ - uint16x8_t out_uint16x8_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8_t = vtstq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vtst\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQu32 (void) -{ - uint32x4_t out_uint32x4_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4_t = vtstq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vtst\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstQu8 (void) -{ - uint8x16_t out_uint8x16_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16_t = vtstq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstp8 (void) -{ - uint8x8_t out_uint8x8_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_uint8x8_t = vtst_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtsts16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtsts16 (void) -{ - uint16x4_t out_uint16x4_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_uint16x4_t = vtst_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vtst\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtsts32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtsts32 (void) -{ - uint32x2_t out_uint32x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_uint32x2_t = vtst_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vtst\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtsts8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtsts8 (void) -{ - uint8x8_t out_uint8x8_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_uint8x8_t = vtst_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstu16 (void) -{ - uint16x4_t out_uint16x4_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4_t = vtst_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vtst\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstu32 (void) -{ - uint32x2_t out_uint32x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2_t = vtst_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vtst\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vtstu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vtstu8 (void) -{ - uint8x8_t out_uint8x8_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8_t = vtst_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQf32 (void) -{ - float32x4x2_t out_float32x4x2_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4x2_t = vuzpq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQp16 (void) -{ - poly16x8x2_t out_poly16x8x2_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8x2_t = vuzpq_p16 (arg0_poly16x8_t, arg1_poly16x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQp8 (void) -{ - poly8x16x2_t out_poly8x16x2_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16x2_t = vuzpq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQs16 (void) -{ - int16x8x2_t out_int16x8x2_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8x2_t = vuzpq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQs32 (void) -{ - int32x4x2_t out_int32x4x2_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4x2_t = vuzpq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQs8 (void) -{ - int8x16x2_t out_int8x16x2_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16x2_t = vuzpq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQu16 (void) -{ - uint16x8x2_t out_uint16x8x2_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8x2_t = vuzpq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQu32 (void) -{ - uint32x4x2_t out_uint32x4x2_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4x2_t = vuzpq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpQu8 (void) -{ - uint8x16x2_t out_uint8x16x2_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16x2_t = vuzpq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpf32 (void) -{ - float32x2x2_t out_float32x2x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2x2_t = vuzp_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpp16 (void) -{ - poly16x4x2_t out_poly16x4x2_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4x2_t = vuzp_p16 (arg0_poly16x4_t, arg1_poly16x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpp8 (void) -{ - poly8x8x2_t out_poly8x8x2_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8x2_t = vuzp_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzps16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzps16 (void) -{ - int16x4x2_t out_int16x4x2_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4x2_t = vuzp_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzps32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzps32 (void) -{ - int32x2x2_t out_int32x2x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2x2_t = vuzp_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzps8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzps8 (void) -{ - int8x8x2_t out_int8x8x2_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8x2_t = vuzp_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpu16 (void) -{ - uint16x4x2_t out_uint16x4x2_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4x2_t = vuzp_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpu32 (void) -{ - uint32x2x2_t out_uint32x2x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2x2_t = vuzp_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vuzpu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vuzpu8 (void) -{ - uint8x8x2_t out_uint8x8x2_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8x2_t = vuzp_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQf32 (void) -{ - float32x4x2_t out_float32x4x2_t; - float32x4_t arg0_float32x4_t; - float32x4_t arg1_float32x4_t; - - out_float32x4x2_t = vzipq_f32 (arg0_float32x4_t, arg1_float32x4_t); -} - -/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQp16 (void) -{ - poly16x8x2_t out_poly16x8x2_t; - poly16x8_t arg0_poly16x8_t; - poly16x8_t arg1_poly16x8_t; - - out_poly16x8x2_t = vzipq_p16 (arg0_poly16x8_t, arg1_poly16x8_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQp8 (void) -{ - poly8x16x2_t out_poly8x16x2_t; - poly8x16_t arg0_poly8x16_t; - poly8x16_t arg1_poly8x16_t; - - out_poly8x16x2_t = vzipq_p8 (arg0_poly8x16_t, arg1_poly8x16_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQs16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQs16 (void) -{ - int16x8x2_t out_int16x8x2_t; - int16x8_t arg0_int16x8_t; - int16x8_t arg1_int16x8_t; - - out_int16x8x2_t = vzipq_s16 (arg0_int16x8_t, arg1_int16x8_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQs32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQs32 (void) -{ - int32x4x2_t out_int32x4x2_t; - int32x4_t arg0_int32x4_t; - int32x4_t arg1_int32x4_t; - - out_int32x4x2_t = vzipq_s32 (arg0_int32x4_t, arg1_int32x4_t); -} - -/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQs8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQs8 (void) -{ - int8x16x2_t out_int8x16x2_t; - int8x16_t arg0_int8x16_t; - int8x16_t arg1_int8x16_t; - - out_int8x16x2_t = vzipq_s8 (arg0_int8x16_t, arg1_int8x16_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQu16 (void) -{ - uint16x8x2_t out_uint16x8x2_t; - uint16x8_t arg0_uint16x8_t; - uint16x8_t arg1_uint16x8_t; - - out_uint16x8x2_t = vzipq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQu32 (void) -{ - uint32x4x2_t out_uint32x4x2_t; - uint32x4_t arg0_uint32x4_t; - uint32x4_t arg1_uint32x4_t; - - out_uint32x4x2_t = vzipq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); -} - -/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipQu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipQu8 (void) -{ - uint8x16x2_t out_uint8x16x2_t; - uint8x16_t arg0_uint8x16_t; - uint8x16_t arg1_uint8x16_t; - - out_uint8x16x2_t = vzipq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipf32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipf32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipf32 (void) -{ - float32x2x2_t out_float32x2x2_t; - float32x2_t arg0_float32x2_t; - float32x2_t arg1_float32x2_t; - - out_float32x2x2_t = vzip_f32 (arg0_float32x2_t, arg1_float32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipp16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipp16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipp16 (void) -{ - poly16x4x2_t out_poly16x4x2_t; - poly16x4_t arg0_poly16x4_t; - poly16x4_t arg1_poly16x4_t; - - out_poly16x4x2_t = vzip_p16 (arg0_poly16x4_t, arg1_poly16x4_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipp8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipp8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipp8 (void) -{ - poly8x8x2_t out_poly8x8x2_t; - poly8x8_t arg0_poly8x8_t; - poly8x8_t arg1_poly8x8_t; - - out_poly8x8x2_t = vzip_p8 (arg0_poly8x8_t, arg1_poly8x8_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzips16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzips16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzips16 (void) -{ - int16x4x2_t out_int16x4x2_t; - int16x4_t arg0_int16x4_t; - int16x4_t arg1_int16x4_t; - - out_int16x4x2_t = vzip_s16 (arg0_int16x4_t, arg1_int16x4_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzips32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzips32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzips32 (void) -{ - int32x2x2_t out_int32x2x2_t; - int32x2_t arg0_int32x2_t; - int32x2_t arg1_int32x2_t; - - out_int32x2x2_t = vzip_s32 (arg0_int32x2_t, arg1_int32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzips8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzips8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. 
*/ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzips8 (void) -{ - int8x8x2_t out_int8x8x2_t; - int8x8_t arg0_int8x8_t; - int8x8_t arg1_int8x8_t; - - out_int8x8x2_t = vzip_s8 (arg0_int8x8_t, arg1_int8x8_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu16.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipu16' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipu16 (void) -{ - uint16x4x2_t out_uint16x4x2_t; - uint16x4_t arg0_uint16x4_t; - uint16x4_t arg1_uint16x4_t; - - out_uint16x4x2_t = vzip_u16 (arg0_uint16x4_t, arg1_uint16x4_t); -} - -/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu32.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipu32' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipu32 (void) -{ - uint32x2x2_t out_uint32x2x2_t; - uint32x2_t arg0_uint32x2_t; - uint32x2_t arg1_uint32x2_t; - - out_uint32x2x2_t = vzip_u32 (arg0_uint32x2_t, arg1_uint32x2_t); -} - -/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu8.c +++ b/src//dev/null @@ -1,20 +0,0 @@ -/* Test the `vzipu8' ARM Neon intrinsic. */ -/* This file was autogenerated by neon-testgen. */ - -/* { dg-do assemble } */ -/* { dg-require-effective-target arm_neon_ok } */ -/* { dg-options "-save-temps -O0" } */ -/* { dg-add-options arm_neon } */ - -#include "arm_neon.h" - -void test_vzipu8 (void) -{ - uint8x8x2_t out_uint8x8x2_t; - uint8x8_t arg0_uint8x8_t; - uint8x8_t arg1_uint8x8_t; - - out_uint8x8x2_t = vzip_u8 (arg0_uint8x8_t, arg1_uint8x8_t); -} - -/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-1.c @@ -0,0 +1,7 @@ +/* { dg-do compile { target { ! default_mode } } } */ +/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-* } { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */ +/* { dg-options "-march=armv6-m" } */ + +/* Check that -mthumb is not needed when compiling for a Thumb-only target. */ + +int foo; --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-2.c @@ -0,0 +1,7 @@ +/* { dg-do compile { target { ! default_mode } } } */ +/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-* } { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */ +/* { dg-options "-mcpu=cortex-m4" } */ + +/* Check that -mthumb is not needed when compiling for a Thumb-only target. 
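Cortex-M4 is an ARMv7E-M, Thumb-only core, so the driver should select Thumb mode on its own without an explicit -mthumb.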
*/ + +int foo; --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-3.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_cortex_m } */ +/* { dg-skip-if "-mthumb given" { *-*-* } { "-mthumb" } } */ +/* { dg-options "-marm" } */ +/* { dg-error "target CPU does not support ARM mode" "missing error with -marm on Thumb-only targets" { target *-*-* } 0 } */ + +/* Check that -marm gives an error when compiling for a Thumb-only target. */ + +int foo; --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/polytypes.c @@ -0,0 +1,48 @@ +/* Check that NEON polynomial vector types are suitably incompatible with + integer vector types of the same layout. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-add-options arm_neon } */ + +#include <arm_neon.h> + +void s64_8 (int8x8_t a) {} +void u64_8 (uint8x8_t a) {} +void p64_8 (poly8x8_t a) {} +void s64_16 (int16x4_t a) {} +void u64_16 (uint16x4_t a) {} +void p64_16 (poly16x4_t a) {} + +void s128_8 (int8x16_t a) {} +void u128_8 (uint8x16_t a) {} +void p128_8 (poly8x16_t a) {} +void s128_16 (int16x8_t a) {} +void u128_16 (uint16x8_t a) {} +void p128_16 (poly16x8_t a) {} + +void foo () +{ + poly8x8_t v64_8; + poly16x4_t v64_16; + poly8x16_t v128_8; + poly16x8_t v128_16; + + s64_8 (v64_8); /* { dg-message "use -flax-vector-conversions" } */ + /* { dg-error "incompatible type for argument 1 of 's64_8'" "" { target *-*-* } 31 } */ + u64_8 (v64_8); /* { dg-error "incompatible type for argument 1 of 'u64_8'" } */ + p64_8 (v64_8); + + s64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 's64_16'" } */ + u64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 'u64_16'" } */ + p64_16 (v64_16); + + s128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 's128_8'" } */ + u128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 'u128_8'" } */ + p128_8 (v128_8); + + s128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 's128_16'" } */ + u128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 'u128_16'" } */ + p128_16 (v128_16); +} +/* { dg-message "note: expected '\[^'\n\]*' but argument is of type '\[^'\n\]*'" "note: expected" { target *-*-* } 0 } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/pr37780_1.c @@ -0,0 +1,48 @@ +/* Test that we can remove the conditional move due to CLZ + being defined at zero. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v6t2_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_arch_v6t2 } */ + +int +fooctz (int i) +{ + return (i == 0) ? 32 : __builtin_ctz (i); +} + +int +fooctz2 (int i) +{ + return (i != 0) ? __builtin_ctz (i) : 32; +} + +unsigned int +fooctz3 (unsigned int i) +{ + return (i > 0) ? __builtin_ctz (i) : 32; +} + +/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */ + +int +fooclz (int i) +{ + return (i == 0) ? 32 : __builtin_clz (i); +} + +int +fooclz2 (int i) +{ + return (i != 0) ? __builtin_clz (i) : 32; +} + +unsigned int +fooclz3 (unsigned int i) +{ + return (i > 0) ? __builtin_clz (i) : 32; +} + +/* { dg-final { scan-assembler-times "clz\t" 6 } } */ +/* { dg-final { scan-assembler-not "cmp\t.*0" } } */ --- a/src/gcc/testsuite/gcc.target/arm/pr42574.c +++ b/src/gcc/testsuite/gcc.target/arm/pr42574.c @@ -1,5 +1,5 @@ +/* { dg-do compile { target { arm_thumb1_ok && { ! 
arm_thumb1_movt_ok } } } } */ /* { dg-options "-mthumb -Os -fpic" } */ -/* { dg-require-effective-target arm_thumb1_ok } */ /* { dg-require-effective-target fpic } */ /* Make sure the address of glob.c is calculated only once and using a logical shift for the offset (200<<1). */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/pr51534.c @@ -0,0 +1,83 @@ +/* Test the vector comparison intrinsics when comparing to immediate zero. + */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-save-temps -mfloat-abi=hard -O3" } */ +/* { dg-add-options arm_neon } */ + +#include <arm_neon.h> + +#define GEN_TEST(T, D, C, R) \ + R test_##C##_##T (T a) { return C (a, D (0)); } + +#define GEN_DOUBLE_TESTS(S, T, C) \ + GEN_TEST (T, vdup_n_s##S, C##_s##S, u##T) \ + GEN_TEST (u##T, vdup_n_u##S, C##_u##S, u##T) + +#define GEN_QUAD_TESTS(S, T, C) \ + GEN_TEST (T, vdupq_n_s##S, C##q_s##S, u##T) \ + GEN_TEST (u##T, vdupq_n_u##S, C##q_u##S, u##T) + +#define GEN_COND_TESTS(C) \ + GEN_DOUBLE_TESTS (8, int8x8_t, C) \ + GEN_DOUBLE_TESTS (16, int16x4_t, C) \ + GEN_DOUBLE_TESTS (32, int32x2_t, C) \ + GEN_QUAD_TESTS (8, int8x16_t, C) \ + GEN_QUAD_TESTS (16, int16x8_t, C) \ + GEN_QUAD_TESTS (32, int32x4_t, C) + +GEN_COND_TESTS(vcgt) +GEN_COND_TESTS(vcge) +GEN_COND_TESTS(vclt) +GEN_COND_TESTS(vcle) +GEN_COND_TESTS(vceq) + +/* Scan for expected outputs. */ +/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, 
\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */ +/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */ +/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ +/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ +/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */ +/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ +/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ +/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */ + +/* And ensure we don't have unexpected output too. */ +/* { dg-final { scan-assembler-not "vc\[gl\]\[te\]\.u\[0-9\]+\[ \]+\[qQdD\]\[0-9\]+, \[qQdD\]\[0-9\]+, #0" } } */ + +/* Tidy up. 
*/ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/pr79145.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mcpu=*" } { "-mcpu=iwmmxt" } } */ +/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mabi=*" } { "-mabi=iwmmxt" } } */ +/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-march=*" } { "-march=iwmmxt" } } */ +/* { dg-skip-if "Test is specific to ARM mode" { arm*-*-* } { "-mthumb" } { "" } } */ +/* { dg-require-effective-target arm32 } */ +/* { dg-require-effective-target arm_iwmmxt_ok } */ +/* { dg-options "-mcpu=iwmmxt" } */ + +int +main (void) +{ + volatile long long t1; + t1 ^= 0x55; + return 0; +} --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/short-vfp-1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_vfp_ok } */ +/* { dg-options "-mfpu=vfp" } */ + +int +test_sisf (float x) +{ + return (int)x; +} + +short +test_hisf (float x) +{ + return (short)x; +} + +float +test_sfsi (int x) +{ + return (float)x; +} + +float +test_sfhi (short x) +{ + return (float)x; +} + +short +test_hisi (int x) +{ + return (short)x; +} + +int +test_sihi (short x) +{ + return (int)x; +} + +/* {dg-final { scan-assembler-times {vcvt\.s32\.f32\ts[0-9]+,s[0-9]+} 2 }} */ +/* {dg-final { scan-assembler-times {vcvt\.f32\.s32\ts[0-9]+,s[0-9]+} 2 }} */ +/* {dg-final { scan-assembler-times {vmov\tr[0-9]+,s[0-9]+} 2 }} */ +/* {dg-final { scan-assembler-times {vmov\ts[0-9]+,r[0-9]+} 2 }} */ +/* {dg-final { scan-assembler-times {sxth\tr[0-9]+,r[0-9]+} 2 }} */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnm_f32_1.c @@ -0,0 +1,159 @@ +/* Test the `vmaxnmf32' ARM Neon intrinsic. */ + +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-options "-save-temps -O3 -march=armv8-a" } */ +/* { dg-add-options arm_v8_neon } */ + +#include "arm_neon.h" + +extern void abort (); + +void __attribute__ ((noinline)) +test_vmaxnm_f32__regular_input1 () +{ + float32_t a1[] = {1,2}; + float32_t b1[] = {3,4}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != b1[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__regular_input2 () +{ + float32_t a1[] = {3,2}; + float32_t b1[] = {1,4}; + float32_t e[] = {3,4}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__quiet_NaN_one_arg () +{ + /* When given a quiet NaN, vmaxnm returns the other operand. + In this test case we have NaNs in only one operand. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {1,2}; + float32_t b1[] = {n,n}; + float32_t e[] = {1,2}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__quiet_NaN_both_args () +{ + /* When given a quiet NaN, vmaxnm returns the other operand. + In this test case we have NaNs in both operands. 
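Each lane pairs a quiet NaN with a number, so the numeric value should be selected independently per lane.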
*/ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,2}; + float32_t b1[] = {1,n}; + float32_t e[] = {1,2}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__zero_both_args () +{ + /* For 0 and -0, vmaxnm returns 0. Since 0 == -0, check sign bit. */ + float32_t a1[] = {0.0, 0.0}; + float32_t b1[] = {-0.0, -0.0}; + float32_t e[] = {0.0, 0.0}; + + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + + float32_t actual1[2]; + vst1_f32 (actual1, c); + + for (int i = 0; i < 2; ++i) + if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__inf_both_args () +{ + /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */ + float32_t inf = __builtin_huge_valf (); + float32_t a1[] = {inf, -inf}; + float32_t b1[] = {inf, -inf}; + float32_t e[] = {inf, -inf}; + + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + + float32_t actual1[2]; + vst1_f32 (actual1, c); + + for (int i = 0; i < 2; ++i) + if (actual1[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnm_f32__two_quiet_NaNs_both_args () +{ + /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything, + not even another NaN, use __builtin_isnan () to check. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,n}; + float32_t b1[] = {n,n}; + float32_t e[] = {n,n}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vmaxnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (!__builtin_isnan (actual[i])) + abort (); +} + +int +main () +{ + test_vmaxnm_f32__regular_input1 (); + test_vmaxnm_f32__regular_input2 (); + test_vmaxnm_f32__quiet_NaN_one_arg (); + test_vmaxnm_f32__quiet_NaN_both_args (); + test_vmaxnm_f32__zero_both_args (); + test_vmaxnm_f32__inf_both_args (); + test_vmaxnm_f32__two_quiet_NaNs_both_args (); + return 0; +} + +/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnmq_f32_1.c @@ -0,0 +1,160 @@ +/* Test the `vmaxnmqf32' ARM Neon intrinsic. 
*/ + +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-options "-save-temps -O3 -march=armv8-a" } */ +/* { dg-add-options arm_v8_neon } */ + +#include "arm_neon.h" + +extern void abort (); + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__regular_input1 () +{ + float32_t a1[] = {1,2,5,6}; + float32_t b1[] = {3,4,7,8}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != b1[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__regular_input2 () +{ + float32_t a1[] = {3,2,7,6}; + float32_t b1[] = {1,4,5,8}; + float32_t e[] = {3,4,7,8}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__quiet_NaN_one_arg () +{ + /* When given a quiet NaN, vmaxnmq returns the other operand. + In this test case we have NaNs in only one operand. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {1,2,3,4}; + float32_t b1[] = {n,n,n,n}; + float32_t e[] = {1,2,3,4}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__quiet_NaN_both_args () +{ + /* When given a quiet NaN, vmaxnmq returns the other operand. + In this test case we have NaNs in both operands. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,2,n,4}; + float32_t b1[] = {1,n,3,n}; + float32_t e[] = {1,2,3,4}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__zero_both_args () +{ + /* For 0 and -0, vmaxnmq returns 0. Since 0 == -0, check sign bit. */ + float32_t a1[] = {0.0, 0.0, -0.0, -0.0}; + float32_t b1[] = {-0.0, -0.0, 0.0, 0.0}; + float32_t e[] = {0.0, 0.0, 0.0, 0.0}; + + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + + float32_t actual1[4]; + vst1q_f32 (actual1, c); + + for (int i = 0; i < 4; ++i) + if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__inf_both_args () +{ + /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */ + float32_t inf = __builtin_huge_valf (); + float32_t a1[] = {inf, -inf, inf, inf}; + float32_t b1[] = {inf, -inf, -inf, -inf}; + float32_t e[] = {inf, -inf, inf, inf}; + + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + + float32_t actual1[4]; + vst1q_f32 (actual1, c); + + for (int i = 0; i < 4; ++i) + if (actual1[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vmaxnmq_f32__two_quiet_NaNs_both_args () +{ + /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything, + not even another NaN, use __builtin_isnan () to check. 
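Two NaN inputs in the same lane are the only case in which vmaxnmq produces a NaN rather than a number. 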
*/ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,n,n,n}; + float32_t b1[] = {n,n,n,n}; + float32_t e[] = {n,n}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vmaxnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (!__builtin_isnan (actual[i])) + abort (); +} + +int +main () +{ + test_vmaxnmq_f32__regular_input1 (); + test_vmaxnmq_f32__regular_input2 (); + test_vmaxnmq_f32__quiet_NaN_one_arg (); + test_vmaxnmq_f32__quiet_NaN_both_args (); + test_vmaxnmq_f32__zero_both_args (); + test_vmaxnmq_f32__inf_both_args (); + test_vmaxnmq_f32__two_quiet_NaNs_both_args (); + return 0; +} + +/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnm_f32_1.c @@ -0,0 +1,159 @@ +/* Test the `vminnmf32' ARM Neon intrinsic. */ + +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-options "-save-temps -O3 -march=armv8-a" } */ +/* { dg-add-options arm_v8_neon } */ + +#include "arm_neon.h" + +extern void abort (); + +void __attribute__ ((noinline)) +test_vminnm_f32__regular_input1 () +{ + float32_t a1[] = {1,2}; + float32_t b1[] = {3,4}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != a1[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__regular_input2 () +{ + float32_t a1[] = {3,2}; + float32_t b1[] = {1,4}; + float32_t e[] = {1,2}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__quiet_NaN_one_arg () +{ + /* When given a quiet NaN, vminnm returns the other operand. + In this test case we have NaNs in only one operand. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {1,2}; + float32_t b1[] = {n,n}; + float32_t e[] = {1,2}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__quiet_NaN_both_args () +{ + /* When given a quiet NaN, vminnm returns the other operand. + In this test case we have NaNs in both operands. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,2}; + float32_t b1[] = {1,n}; + float32_t e[] = {1,2}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__zero_both_args () +{ + /* For 0 and -0, vminnm returns -0. Since 0 == -0, check sign bit. 
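Every expected lane here is -0.0, so the check below also insists that the sign bit is set in each result lane. 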
*/ + float32_t a1[] = {0.0,0.0}; + float32_t b1[] = {-0.0, -0.0}; + float32_t e[] = {-0.0, -0.0}; + + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + + float32_t actual1[2]; + vst1_f32 (actual1, c); + + for (int i = 0; i < 2; ++i) + if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__inf_both_args () +{ + /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */ + float32_t inf = __builtin_huge_valf (); + float32_t a1[] = {inf, -inf}; + float32_t b1[] = {inf, -inf}; + float32_t e[] = {inf, -inf}; + + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + + float32_t actual1[2]; + vst1_f32 (actual1, c); + + for (int i = 0; i < 2; ++i) + if (actual1[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnm_f32__two_quiet_NaNs_both_args () +{ + /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything, + not even another NaN, use __builtin_isnan () to check. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,n}; + float32_t b1[] = {n,n}; + float32_t e[] = {n,n}; + float32x2_t a = vld1_f32 (a1); + float32x2_t b = vld1_f32 (b1); + float32x2_t c = vminnm_f32 (a, b); + float32_t actual[2]; + vst1_f32 (actual, c); + + for (int i = 0; i < 2; ++i) + if (!__builtin_isnan (actual[i])) + abort (); +} + +int +main () +{ + test_vminnm_f32__regular_input1 (); + test_vminnm_f32__regular_input2 (); + test_vminnm_f32__quiet_NaN_one_arg (); + test_vminnm_f32__quiet_NaN_both_args (); + test_vminnm_f32__zero_both_args (); + test_vminnm_f32__inf_both_args (); + test_vminnm_f32__two_quiet_NaNs_both_args (); + return 0; +} + +/* { dg-final { scan-assembler-times "vminnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnmq_f32_1.c @@ -0,0 +1,159 @@ +/* Test the `vminnmqf32' ARM Neon intrinsic. */ + +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_neon_hw } */ +/* { dg-options "-save-temps -O3 -march=armv8-a" } */ +/* { dg-add-options arm_v8_neon } */ + +#include "arm_neon.h" + +extern void abort (); + +void __attribute__ ((noinline)) +test_vminnmq_f32__regular_input1 () +{ + float32_t a1[] = {1,2,5,6}; + float32_t b1[] = {3,4,7,8}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != a1[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__regular_input2 () +{ + float32_t a1[] = {3,2,7,6}; + float32_t b1[] = {1,4,5,8}; + float32_t e[] = {1,2,5,6}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__quiet_NaN_one_arg () +{ + /* When given a quiet NaN, vminnmq returns the other operand. + In this test case we have NaNs in only one operand. 
*/ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {1,2,3,4}; + float32_t b1[] = {n,n,n,n}; + float32_t e[] = {1,2,3,4}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__quiet_NaN_both_args () +{ + /* When given a quiet NaN, vminnmq returns the other operand. + In this test case we have NaNs in both operands. */ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,2,n,4}; + float32_t b1[] = {1,n,3,n}; + float32_t e[] = {1,2,3,4}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (actual[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__zero_both_args () +{ + /* For 0 and -0, vminnmq returns -0. Since 0 == -0, check sign bit. */ + float32_t a1[] = {0.0, 0.0, -0.0, -0.0}; + float32_t b1[] = {-0.0, -0.0, 0.0, 0.0}; + float32_t e[] = {-0.0, -0.0, -0.0, -0.0}; + + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + + float32_t actual1[4]; + vst1q_f32 (actual1, c); + + for (int i = 0; i < 4; ++i) + if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__inf_both_args () +{ + /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */ + float32_t inf = __builtin_huge_valf (); + float32_t a1[] = {inf, -inf, inf, inf}; + float32_t b1[] = {inf, -inf, -inf, -inf}; + float32_t e[] = {inf, -inf, -inf, -inf}; + + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + + float32_t actual1[4]; + vst1q_f32 (actual1, c); + + for (int i = 0; i < 4; ++i) + if (actual1[i] != e[i]) + abort (); +} + +void __attribute__ ((noinline)) +test_vminnmq_f32__two_quiet_NaNs_both_args () +{ + /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything, + not even another NaN, use __builtin_isnan () to check. 
*/ + float32_t n = __builtin_nanf (""); + float32_t a1[] = {n,n,n,n}; + float32_t b1[] = {n,n,n,n}; + float32_t e[] = {n,n}; + float32x4_t a = vld1q_f32 (a1); + float32x4_t b = vld1q_f32 (b1); + float32x4_t c = vminnmq_f32 (a, b); + float32_t actual[4]; + vst1q_f32 (actual, c); + + for (int i = 0; i < 4; ++i) + if (!__builtin_isnan (actual[i])) + abort (); +} + +int +main () +{ + test_vminnmq_f32__regular_input1 (); + test_vminnmq_f32__regular_input2 (); + test_vminnmq_f32__quiet_NaN_one_arg (); + test_vminnmq_f32__quiet_NaN_both_args (); + test_vminnmq_f32__zero_both_args (); + test_vminnmq_f32__inf_both_args (); + test_vminnmq_f32__two_quiet_NaNs_both_args (); + return 0; +} + +/* { dg-final { scan-assembler-times "vminnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */ --- a/src/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c +++ b/src/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c @@ -2,13 +2,13 @@ /* { dg-require-effective-target arm_thumb2_ok } */ /* { dg-options "-O" } */ -unsigned short foo (unsigned short x) +unsigned short foo (unsigned short x, unsigned short c) { unsigned char i = 0; for (i = 0; i < 8; i++) { x >>= 1; - x &= 0x7fff; + x &= c; } return x; } --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/vect-vcvt.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -mvectorize-with-neon-double" } */ +/* { dg-add-options arm_neon } */ + +#define N 32 + +int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +float fa[N]; +int ia[N]; + +int convert() +{ + int i; + + /* int -> float */ + for (i = 0; i < N; i++) + fa[i] = (float) ib[i]; + + /* float -> int */ + for (i = 0; i < N; i++) + ia[i] = (int) fa[i]; + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/vect-vcvtq.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */ +/* { dg-add-options arm_neon } */ + +#define N 32 + +int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +float fa[N]; +int ia[N]; + +int convert() +{ + int i; + + /* int -> float */ + for (i = 0; i < N; i++) + fa[i] = (float) ib[i]; + + /* float -> int */ + for (i = 0; i < N; i++) + ia[i] = (int) fa[i]; + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/vfp-shift-a2t2.c @@ -0,0 +1,27 @@ +/* Check that NEON vector shifts support immediate values == size. 
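A shift count equal to the element width selects the special VSHLL encoding that first widens each element to twice its size, so the shift cannot lose bits. 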
*/ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-save-temps" } */ +/* { dg-add-options arm_neon } */ + +#include <arm_neon.h> + +uint16x8_t test_vshll_n_u8 (uint8x8_t a) +{ + return vshll_n_u8(a, 8); +} + +uint32x4_t test_vshll_n_u16 (uint16x4_t a) +{ + return vshll_n_u16(a, 16); +} + +uint64x2_t test_vshll_n_u32 (uint32x2_t a) +{ + return vshll_n_u32(a, 32); +} + +/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ --- /dev/null +++ b/src/gcc/testsuite/gcc.target/arm/vst1Q_laneu64-1.c @@ -0,0 +1,25 @@ +/* Test the `vst1Q_laneu64' ARM Neon intrinsic. */ + +/* Detect ICE in the case of unaligned memory address. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-add-options arm_neon } */ + +#include "arm_neon.h" + +unsigned char dummy_store[1000]; + +void +foo (unsigned char* addr) +{ + uint8x16_t vdata = vld1q_u8 (addr); + vst1q_lane_u64 ((uint64_t*) &dummy_store, vreinterpretq_u64_u8 (vdata), 0); +} + +uint64_t +bar (uint64x2_t vdata) +{ + vdata = vld1q_lane_u64 ((uint64_t*) &dummy_store, vdata, 0); + return vgetq_lane_u64 (vdata, 0); +} --- a/src/gcc/testsuite/lib/gcc-dg.exp +++ b/src/gcc/testsuite/lib/gcc-dg.exp @@ -403,6 +403,7 @@ if { [info procs ${tool}_load] != [list] \ switch [lindex $result 0] { "pass" { set status "fail" } "fail" { set status "pass" } + default { set status [lindex $result 0] } } set result [list $status [lindex $result 1]] } --- a/src/gcc/testsuite/lib/target-supports.exp +++ b/src/gcc/testsuite/lib/target-supports.exp @@ -252,6 +252,20 @@ proc check_runtime {prop args} { }] } +# Return 1 if GCC was configured with $pattern. +proc check_configured_with { pattern } { + global tool + + set gcc_output [${tool}_target_compile "-v" "" "none" ""] + if { [ regexp "Configured with: \[^\n\]*$pattern" $gcc_output ] } { + verbose "Matched: $pattern" 2 + return 1 + } + + verbose "Failed to match: $pattern" 2 + return 0 +} + ############################### # proc check_weak_available { } ############################### @@ -2936,6 +2950,28 @@ proc add_options_for_arm_v8_1a_neon { flags } { return "$flags $et_arm_v8_1a_neon_flags -march=armv8.1-a" } +# Add the options needed for ARMv8.2 with the scalar FP16 extension. +# Also adds the ARMv8 FP options for ARM and for AArch64. + +proc add_options_for_arm_v8_2a_fp16_scalar { flags } { + if { ! [check_effective_target_arm_v8_2a_fp16_scalar_ok] } { + return "$flags" + } + global et_arm_v8_2a_fp16_scalar_flags + return "$flags $et_arm_v8_2a_fp16_scalar_flags" +} + +# Add the options needed for ARMv8.2 with the FP16 extension. Also adds +# the ARMv8 NEON options for ARM and for AArch64. + +proc add_options_for_arm_v8_2a_fp16_neon { flags } { + if { ! [check_effective_target_arm_v8_2a_fp16_neon_ok] } { + return "$flags" + } + global et_arm_v8_2a_fp16_neon_flags + return "$flags $et_arm_v8_2a_fp16_neon_flags" +} + proc add_options_for_arm_crc { flags } { if { ! 
[check_effective_target_arm_crc_ok] } { return "$flags" @@ -3022,23 +3058,25 @@ proc check_effective_target_arm_crc_ok { } { proc check_effective_target_arm_neon_fp16_ok_nocache { } { global et_arm_neon_fp16_flags + global et_arm_neon_flags set et_arm_neon_fp16_flags "" - if { [check_effective_target_arm32] } { + if { [check_effective_target_arm32] + && [check_effective_target_arm_neon_ok] } { foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16" "-mfpu=neon-fp16 -mfloat-abi=softfp" "-mfp16-format=ieee" "-mfloat-abi=softfp -mfp16-format=ieee" "-mfpu=neon-fp16 -mfp16-format=ieee" "-mfpu=neon-fp16 -mfloat-abi=softfp -mfp16-format=ieee"} { - if { [check_no_compiler_messages_nocache arm_neon_fp_16_ok object { + if { [check_no_compiler_messages_nocache arm_neon_fp16_ok object { #include "arm_neon.h" float16x4_t foo (float32x4_t arg) { return vcvt_f16_f32 (arg); } - } "$flags"] } { - set et_arm_neon_fp16_flags $flags + } "$et_arm_neon_flags $flags"] } { + set et_arm_neon_fp16_flags [concat $et_arm_neon_flags $flags] return 1 } } @@ -3075,6 +3113,65 @@ proc add_options_for_arm_neon_fp16 { flags } { return "$flags $et_arm_neon_fp16_flags" } +# Return 1 if this is an ARM target supporting the FP16 alternative +# format. Some multilibs may be incompatible with the options needed. Also +# set et_arm_neon_fp16_flags to the best options to add. + +proc check_effective_target_arm_fp16_alternative_ok_nocache { } { + global et_arm_neon_fp16_flags + set et_arm_neon_fp16_flags "" + if { [check_effective_target_arm32] } { + foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16" + "-mfpu=neon-fp16 -mfloat-abi=softfp"} { + if { [check_no_compiler_messages_nocache \ + arm_fp16_alternative_ok object { + #if !defined (__ARM_FP16_FORMAT_ALTERNATIVE) + #error __ARM_FP16_FORMAT_ALTERNATIVE not defined + #endif + } "$flags -mfp16-format=alternative"] } { + set et_arm_neon_fp16_flags "$flags -mfp16-format=alternative" + return 1 + } + } + } + + return 0 +} + +proc check_effective_target_arm_fp16_alternative_ok { } { + return [check_cached_effective_target arm_fp16_alternative_ok \ + check_effective_target_arm_fp16_alternative_ok_nocache] +} + +# Return 1 if this is an ARM target that supports specifying the FP16 none +# format. Some multilibs may be incompatible with the options needed. + +proc check_effective_target_arm_fp16_none_ok_nocache { } { + if { [check_effective_target_arm32] } { + foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16" + "-mfpu=neon-fp16 -mfloat-abi=softfp"} { + if { [check_no_compiler_messages_nocache \ + arm_fp16_none_ok object { + #if defined (__ARM_FP16_FORMAT_ALTERNATIVE) + #error __ARM_FP16_FORMAT_ALTERNATIVE defined + #endif + #if defined (__ARM_FP16_FORMAT_IEEE) + #error __ARM_FP16_FORMAT_IEEE defined + #endif + } "$flags -mfp16-format=none"] } { + return 1 + } + } + } + + return 0 +} + +proc check_effective_target_arm_fp16_none_ok { } { + return [check_cached_effective_target arm_fp16_none_ok \ + check_effective_target_arm_fp16_none_ok_nocache] +} + # Return 1 if this is an ARM target supporting -mfpu=neon-fp-armv8 # -mfloat-abi=softfp or equivalent options. Some multilibs may be # incompatible with these options. 
Also set et_arm_v8_neon_flags to the @@ -3117,8 +3214,10 @@ proc check_effective_target_arm_v8_neon_ok { } { proc check_effective_target_arm_neonv2_ok_nocache { } { global et_arm_neonv2_flags + global et_arm_neon_flags set et_arm_neonv2_flags "" - if { [check_effective_target_arm32] } { + if { [check_effective_target_arm32] + && [check_effective_target_arm_neon_ok] } { foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-vfpv4" "-mfpu=neon-vfpv4 -mfloat-abi=softfp"} { if { [check_no_compiler_messages_nocache arm_neonv2_ok object { #include "arm_neon.h" @@ -3127,8 +3226,8 @@ proc check_effective_target_arm_neonv2_ok_nocache { } { { return vfma_f32 (a, b, c); } - } "$flags"] } { - set et_arm_neonv2_flags $flags + } "$et_arm_neon_flags $flags"] } { + set et_arm_neonv2_flags [concat $et_arm_neon_flags $flags] return 1 } } @@ -3142,9 +3241,9 @@ proc check_effective_target_arm_neonv2_ok { } { check_effective_target_arm_neonv2_ok_nocache] } -# Add the options needed for NEON. We need either -mfloat-abi=softfp -# or -mfloat-abi=hard, but if one is already specified by the -# multilib, use it. +# Add the options needed for VFP FP16 support. We need either +# -mfloat-abi=softfp or -mfloat-abi=hard. If one is already specified by +# the multilib, use it. proc add_options_for_arm_fp16 { flags } { if { ! [check_effective_target_arm_fp16_ok] } { @@ -3154,9 +3253,32 @@ proc add_options_for_arm_fp16 { flags } { return "$flags $et_arm_fp16_flags" } +# Add the options needed to enable support for IEEE format +# half-precision support. This is valid for ARM targets. + +proc add_options_for_arm_fp16_ieee { flags } { + if { ! [check_effective_target_arm_fp16_ok] } { + return "$flags" + } + global et_arm_fp16_flags + return "$flags $et_arm_fp16_flags -mfp16-format=ieee" +} + +# Add the options needed to enable support for ARM Alternative format +# half-precision support. This is valid for ARM targets. + +proc add_options_for_arm_fp16_alternative { flags } { + if { ! [check_effective_target_arm_fp16_ok] } { + return "$flags" + } + global et_arm_fp16_flags + return "$flags $et_arm_fp16_flags -mfp16-format=alternative" +} + # Return 1 if this is an ARM target that can support a VFP fp16 variant. # Skip multilibs that are incompatible with these options and set -# et_arm_fp16_flags to the best options to add. +# et_arm_fp16_flags to the best options to add. This test is valid for +# ARM only. proc check_effective_target_arm_fp16_ok_nocache { } { global et_arm_fp16_flags @@ -3164,7 +3286,10 @@ proc check_effective_target_arm_fp16_ok_nocache { } { if { ! [check_effective_target_arm32] } { return 0; } - if [check-flags [list "" { *-*-* } { "-mfpu=*" } { "-mfpu=*fp16*" "-mfpu=*fpv[4-9]*" "-mfpu=*fpv[1-9][0-9]*" } ]] { + if [check-flags \ + [list "" { *-*-* } { "-mfpu=*" } \ + { "-mfpu=*fp16*" "-mfpu=*fpv[4-9]*" \ + "-mfpu=*fpv[1-9][0-9]*" "-mfpu=*fp-armv8*" } ]] { # Multilib flags would override -mfpu. return 0 } @@ -3200,6 +3325,28 @@ proc check_effective_target_arm_fp16_ok { } { check_effective_target_arm_fp16_ok_nocache] } +# Return 1 if the target supports executing VFP FP16 instructions, 0 +# otherwise. This test is valid for ARM only. + +proc check_effective_target_arm_fp16_hw { } { + if {! [check_effective_target_arm_fp16_ok] } { + return 0 + } + global et_arm_fp16_flags + check_runtime_nocache arm_fp16_hw { + int + main (int argc, char **argv) + { + __fp16 a = 1.0; + float r; + asm ("vcvtb.f32.f16 %0, %1" + : "=w" (r) : "w" (a) + : /* No clobbers. */); + return (r == 1.0) ? 
0 : 1; + } + } "$et_arm_fp16_flags -mfp16-format=ieee" +} + # Creates a series of routines that return 1 if the given architecture # can be selected and a routine to give the flags to select that architecture # Note: Extra flags may be added to disable options from newer compilers @@ -3209,22 +3356,26 @@ proc check_effective_target_arm_fp16_ok { } { # Usage: /* { dg-require-effective-target arm_arch_v5_ok } */ # /* { dg-add-options arm_arch_v5 } */ # /* { dg-require-effective-target arm_arch_v5_multilib } */ -foreach { armfunc armflag armdef } { v4 "-march=armv4 -marm" __ARM_ARCH_4__ - v4t "-march=armv4t" __ARM_ARCH_4T__ - v5 "-march=armv5 -marm" __ARM_ARCH_5__ - v5t "-march=armv5t" __ARM_ARCH_5T__ - v5te "-march=armv5te" __ARM_ARCH_5TE__ - v6 "-march=armv6" __ARM_ARCH_6__ - v6k "-march=armv6k" __ARM_ARCH_6K__ - v6t2 "-march=armv6t2" __ARM_ARCH_6T2__ - v6z "-march=armv6z" __ARM_ARCH_6Z__ - v6m "-march=armv6-m -mthumb" __ARM_ARCH_6M__ - v7a "-march=armv7-a" __ARM_ARCH_7A__ - v7r "-march=armv7-r" __ARM_ARCH_7R__ - v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__ - v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__ - v8a "-march=armv8-a" __ARM_ARCH_8A__ - v8_1a "-march=armv8.1a" __ARM_ARCH_8A__ } { +foreach { armfunc armflag armdef } { + v4 "-march=armv4 -marm" __ARM_ARCH_4__ + v4t "-march=armv4t" __ARM_ARCH_4T__ + v5 "-march=armv5 -marm" __ARM_ARCH_5__ + v5t "-march=armv5t" __ARM_ARCH_5T__ + v5te "-march=armv5te" __ARM_ARCH_5TE__ + v6 "-march=armv6" __ARM_ARCH_6__ + v6k "-march=armv6k" __ARM_ARCH_6K__ + v6t2 "-march=armv6t2" __ARM_ARCH_6T2__ + v6z "-march=armv6z" __ARM_ARCH_6Z__ + v6m "-march=armv6-m -mthumb -mfloat-abi=soft" __ARM_ARCH_6M__ + v7a "-march=armv7-a" __ARM_ARCH_7A__ + v7r "-march=armv7-r" __ARM_ARCH_7R__ + v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__ + v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__ + v8a "-march=armv8-a" __ARM_ARCH_8A__ + v8_1a "-march=armv8.1a" __ARM_ARCH_8A__ + v8_2a "-march=armv8.2a" __ARM_ARCH_8A__ + v8m_base "-march=armv8-m.base -mthumb -mfloat-abi=soft" __ARM_ARCH_8M_BASE__ + v8m_main "-march=armv8-m.main -mthumb" __ARM_ARCH_8M_MAIN__ } { eval [string map [list FUNC $armfunc FLAG $armflag DEF $armdef ] { proc check_effective_target_arm_arch_FUNC_ok { } { if { [ string match "*-marm*" "FLAG" ] && @@ -3274,6 +3425,12 @@ proc add_options_for_arm_arch_v7ve { flags } { return "$flags -march=armv7ve" } +# Return 1 if GCC was configured with --with-mode= +proc check_effective_target_default_mode { } { + + return [check_configured_with "with-mode="] +} + # Return 1 if this is an ARM target where -marm causes ARM to be # used (not Thumb) @@ -3352,15 +3509,60 @@ proc check_effective_target_arm_cortex_m { } { return 0 } return [check_no_compiler_messages arm_cortex_m assembly { - #if !defined(__ARM_ARCH_7M__) \ - && !defined (__ARM_ARCH_7EM__) \ - && !defined (__ARM_ARCH_6M__) - #error !__ARM_ARCH_7M__ && !__ARM_ARCH_7EM__ && !__ARM_ARCH_6M__ + #if defined(__ARM_ARCH_ISA_ARM) + #error __ARM_ARCH_ISA_ARM is defined #endif int i; } "-mthumb"] } +# Return 1 if this is an ARM target where -mthumb causes Thumb-1 to be +# used and MOVT/MOVW instructions to be available. + +proc check_effective_target_arm_thumb1_movt_ok {} { + if [check_effective_target_arm_thumb1_ok] { + return [check_no_compiler_messages arm_movt object { + int + foo (void) + { + asm ("movt r0, #42"); + } + } "-mthumb"] + } else { + return 0 + } +} + +# Return 1 if this is an ARM target where -mthumb causes Thumb-1 to be +# used and CBZ and CBNZ instructions are available. 
+ +proc check_effective_target_arm_thumb1_cbz_ok {} { + if [check_effective_target_arm_thumb1_ok] { + return [check_no_compiler_messages arm_movt object { + int + foo (void) + { + asm ("cbz r0, 2f\n2:"); + } + } "-mthumb"] + } else { + return 0 + } +} + +# Return 1 if this is an ARM target where ARMv8-M Security Extensions is +# available. + +proc check_effective_target_arm_cmse_ok {} { + return [check_no_compiler_messages arm_cmse object { + int + foo (void) + { + asm ("bxns r0"); + } + } "-mcmse"]; +} + # Return 1 if this compilation turns on string_ops_prefer_neon on. proc check_effective_target_arm_tune_string_ops_prefer_neon { } { @@ -3436,6 +3638,76 @@ proc check_effective_target_arm_v8_1a_neon_ok { } { check_effective_target_arm_v8_1a_neon_ok_nocache] } +# Return 1 if the target supports ARMv8.2 scalar FP16 arithmetic +# instructions, 0 otherwise. The test is valid for ARM and for AArch64. +# Record the command line options needed. + +proc check_effective_target_arm_v8_2a_fp16_scalar_ok_nocache { } { + global et_arm_v8_2a_fp16_scalar_flags + set et_arm_v8_2a_fp16_scalar_flags "" + + if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { + return 0; + } + + # Iterate through sets of options to find the compiler flags that + # need to be added to the -march option. + foreach flags {"" "-mfpu=fp-armv8" "-mfloat-abi=softfp" \ + "-mfpu=fp-armv8 -mfloat-abi=softfp"} { + if { [check_no_compiler_messages_nocache \ + arm_v8_2a_fp16_scalar_ok object { + #if !defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + #error "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined" + #endif + } "$flags -march=armv8.2-a+fp16"] } { + set et_arm_v8_2a_fp16_scalar_flags "$flags -march=armv8.2-a+fp16" + return 1 + } + } + + return 0; +} + +proc check_effective_target_arm_v8_2a_fp16_scalar_ok { } { + return [check_cached_effective_target arm_v8_2a_fp16_scalar_ok \ + check_effective_target_arm_v8_2a_fp16_scalar_ok_nocache] +} + +# Return 1 if the target supports ARMv8.2 Adv.SIMD FP16 arithmetic +# instructions, 0 otherwise. The test is valid for ARM and for AArch64. +# Record the command line options needed. + +proc check_effective_target_arm_v8_2a_fp16_neon_ok_nocache { } { + global et_arm_v8_2a_fp16_neon_flags + set et_arm_v8_2a_fp16_neon_flags "" + + if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { + return 0; + } + + # Iterate through sets of options to find the compiler flags that + # need to be added to the -march option. + foreach flags {"" "-mfpu=neon-fp-armv8" "-mfloat-abi=softfp" \ + "-mfpu=neon-fp-armv8 -mfloat-abi=softfp"} { + if { [check_no_compiler_messages_nocache \ + arm_v8_2a_fp16_neon_ok object { + #if !defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #error "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined" + #endif + } "$flags -march=armv8.2-a+fp16"] } { + set et_arm_v8_2a_fp16_neon_flags "$flags -march=armv8.2-a+fp16" + return 1 + } + } + + return 0; +} + +proc check_effective_target_arm_v8_2a_fp16_neon_ok { } { + return [check_cached_effective_target arm_v8_2a_fp16_neon_ok \ + check_effective_target_arm_v8_2a_fp16_neon_ok_nocache] +} + # Return 1 if the target supports executing ARMv8 NEON instructions, 0 # otherwise. 
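The two arm_v8_2a_fp16_* checks above pair with the add_options_for_arm_v8_2a_fp16_scalar and add_options_for_arm_v8_2a_fp16_neon helpers added earlier in this file. A minimal sketch of the kind of testcase they enable (hypothetical example, not part of this patch):

/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
/* { dg-add-options arm_v8_2a_fp16_scalar } */

/* Compiled with the probed flags plus -march=armv8.2-a+fp16, so
   __ARM_FEATURE_FP16_SCALAR_ARITHMETIC is defined.  */
__fp16
scale (__fp16 x, __fp16 y)
{
  return x * y;
}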
@@ -3445,11 +3717,17 @@ proc check_effective_target_arm_v8_neon_hw { } { int main (void) { - float32x2_t a; + float32x2_t a = { 1.0f, 2.0f }; + #ifdef __ARM_ARCH_ISA_A64 + asm ("frinta %0.2s, %1.2s" + : "=w" (a) + : "w" (a)); + #else asm ("vrinta.f32 %P0, %P1" : "=w" (a) : "0" (a)); - return 0; + #endif + return a[0] == 2.0f; } } [add_options_for_arm_v8_neon ""]] } @@ -3492,6 +3770,81 @@ proc check_effective_target_arm_v8_1a_neon_hw { } { } [add_options_for_arm_v8_1a_neon ""]] } +# Return 1 if the target supports executing floating point instructions from +# ARMv8.2 with the FP16 extension, 0 otherwise. The test is valid for ARM and +# for AArch64. + +proc check_effective_target_arm_v8_2a_fp16_scalar_hw { } { + if { ![check_effective_target_arm_v8_2a_fp16_scalar_ok] } { + return 0; + } + return [check_runtime arm_v8_2a_fp16_scalar_hw_available { + int + main (void) + { + __fp16 a = 1.0; + __fp16 result; + + #ifdef __ARM_ARCH_ISA_A64 + + asm ("fabs %h0, %h1" + : "=w"(result) + : "w"(a) + : /* No clobbers. */); + + #else + + asm ("vabs.f16 %0, %1" + : "=w"(result) + : "w"(a) + : /* No clobbers. */); + + #endif + + return (result == 1.0) ? 0 : 1; + } + } [add_options_for_arm_v8_2a_fp16_scalar ""]] +} + +# Return 1 if the target supports executing Adv.SIMD instructions from ARMv8.2 +# with the FP16 extension, 0 otherwise. The test is valid for ARM and for +# AArch64. + +proc check_effective_target_arm_v8_2a_fp16_neon_hw { } { + if { ![check_effective_target_arm_v8_2a_fp16_neon_ok] } { + return 0; + } + return [check_runtime arm_v8_2a_fp16_neon_hw_available { + int + main (void) + { + #ifdef __ARM_ARCH_ISA_A64 + + __Float16x4_t a = {1.0, -1.0, 1.0, -1.0}; + __Float16x4_t result; + + asm ("fabs %0.4h, %1.4h" + : "=w"(result) + : "w"(a) + : /* No clobbers. */); + + #else + + __simd64_float16_t a = {1.0, -1.0, 1.0, -1.0}; + __simd64_float16_t result; + + asm ("vabs.f16 %P0, %P1" + : "=w"(result) + : "w"(a) + : /* No clobbers. */); + + #endif + + return (result[0] == 1.0) ? 0 : 1; + } + } [add_options_for_arm_v8_2a_fp16_neon ""]] +} + # Return 1 if this is a ARM target with NEON enabled. proc check_effective_target_arm_neon { } { @@ -3526,6 +3879,25 @@ proc check_effective_target_arm_neonv2 { } { } } +# Return 1 if this is an ARM target with load acquire and store release +# instructions for 8-, 16- and 32-bit types. + +proc check_effective_target_arm_acq_rel { } { + return [check_no_compiler_messages arm_acq_rel object { + void + load_acquire_store_release (void) + { + asm ("lda r0, [r1]\n\t" + "stl r0, [r1]\n\t" + "ldah r0, [r1]\n\t" + "stlh r0, [r1]\n\t" + "ldab r0, [r1]\n\t" + "stlb r0, [r1]" + : : : "r0", "memory"); + } + }] +} + # Return 1 if this a Loongson-2E or -2F target using an ABI that supports # the Loongson vector modes. 
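The arm_acq_rel probe above is what extends sync_int_long and sync_char_short to non-Linux ARM targets in the next hunk. A testcase gated on it could look like this (hypothetical sketch, not from the patch):

/* { dg-do run } */
/* { dg-require-effective-target arm_acq_rel } */

int flag;

int
main (void)
{
  /* Where the probe passes, these can expand to the LDA/STL family
     rather than barrier sequences.  */
  __atomic_store_n (&flag, 1, __ATOMIC_RELEASE);
  return __atomic_load_n (&flag, __ATOMIC_ACQUIRE) != 1;
}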
@@ -4380,6 +4752,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } { set et_vect_widen_sum_hi_to_si_pattern_saved 0 if { [istarget powerpc*-*-*] || [istarget aarch64*-*-*] + || ([istarget arm*-*-*] && + [check_effective_target_arm_neon_ok]) || [istarget ia64-*-*] } { set et_vect_widen_sum_hi_to_si_pattern_saved 1 } @@ -5755,6 +6129,8 @@ proc check_effective_target_sync_int_long { } { || [istarget aarch64*-*-*] || [istarget alpha*-*-*] || [istarget arm*-*-linux-*] + || ([istarget arm*-*-*] + && [check_effective_target_arm_acq_rel]) || [istarget bfin*-*linux*] || [istarget hppa*-*linux*] || [istarget s390*-*-*] @@ -5788,6 +6164,8 @@ proc check_effective_target_sync_char_short { } { || [istarget i?86-*-*] || [istarget x86_64-*-*] || [istarget alpha*-*-*] || [istarget arm*-*-linux-*] + || ([istarget arm*-*-*] + && [check_effective_target_arm_acq_rel]) || [istarget hppa*-*linux*] || [istarget s390*-*-*] || [istarget powerpc*-*-*] --- a/src/gcc/tree-inline.c +++ b/src/gcc/tree-inline.c @@ -244,6 +244,7 @@ remap_ssa_name (tree name, copy_body_data *id) /* At least IPA points-to info can be directly transferred. */ if (id->src_cfun->gimple_df && id->src_cfun->gimple_df->ipa_pta + && POINTER_TYPE_P (TREE_TYPE (name)) && (pi = SSA_NAME_PTR_INFO (name)) && !pi->pt.anything) { @@ -276,6 +277,7 @@ remap_ssa_name (tree name, copy_body_data *id) /* At least IPA points-to info can be directly transferred. */ if (id->src_cfun->gimple_df && id->src_cfun->gimple_df->ipa_pta + && POINTER_TYPE_P (TREE_TYPE (name)) && (pi = SSA_NAME_PTR_INFO (name)) && !pi->pt.anything) { --- a/src/gcc/tree-scalar-evolution.c +++ b/src/gcc/tree-scalar-evolution.c @@ -1937,6 +1937,36 @@ interpret_rhs_expr (struct loop *loop, gimple *at_stmt, res = chrec_convert (type, chrec1, at_stmt); break; + case BIT_AND_EXPR: + /* Given int variable A, handle A&0xffff as (int)(unsigned short)A. + If A is SCEV and its value is in the range of representable set + of type unsigned short, the result expression is a (no-overflow) + SCEV. */ + res = chrec_dont_know; + if (tree_fits_uhwi_p (rhs2)) + { + int precision; + unsigned HOST_WIDE_INT val = tree_to_uhwi (rhs2); + + val ++; + /* Skip if value of rhs2 wraps in unsigned HOST_WIDE_INT or + it's not the maximum value of a smaller type than rhs1. */ + if (val != 0 + && (precision = exact_log2 (val)) > 0 + && (unsigned) precision < TYPE_PRECISION (TREE_TYPE (rhs1))) + { + tree utype = build_nonstandard_integer_type (precision, 1); + + if (TYPE_PRECISION (utype) < TYPE_PRECISION (TREE_TYPE (rhs1))) + { + chrec1 = analyze_scalar_evolution (loop, rhs1); + chrec1 = chrec_convert (utype, chrec1, at_stmt); + res = chrec_convert (TREE_TYPE (rhs1), chrec1, at_stmt); + } + } + } + break; + default: res = chrec_dont_know; break; --- a/src/gcc/tree-ssa-address.c +++ b/src/gcc/tree-ssa-address.c @@ -877,6 +877,10 @@ copy_ref_info (tree new_ref, tree old_ref) && TREE_CODE (old_ref) == MEM_REF && !(TREE_CODE (new_ref) == TARGET_MEM_REF && (TMR_INDEX2 (new_ref) + /* TODO: Below conditions can be relaxed if TMR_INDEX + is an induction variable and its initial value and + step are aligned. */ + || (TMR_INDEX (new_ref) && !TMR_STEP (new_ref)) || (TMR_STEP (new_ref) && (TREE_INT_CST_LOW (TMR_STEP (new_ref)) < align))))) --- a/src/gcc/tree-ssa-ccp.c +++ b/src/gcc/tree-ssa-ccp.c @@ -229,13 +229,12 @@ debug_lattice_value (ccp_prop_value_t val) fprintf (stderr, "\n"); } -/* Extend NONZERO_BITS to a full mask, with the upper bits being set. */ +/* Extend NONZERO_BITS to a full mask, based on sgn. 
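For example, 16-bit nonzero_bits of 0x00ff with a signed sgn now yields a mask whose upper bits are clear, where the old code unconditionally set every bit above the original precision. 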
*/ static widest_int -extend_mask (const wide_int &nonzero_bits) +extend_mask (const wide_int &nonzero_bits, signop sgn) { - return (wi::mask (wi::get_precision (nonzero_bits), true) - | widest_int::from (nonzero_bits, UNSIGNED)); + return widest_int::from (nonzero_bits, sgn); } /* Compute a default value for variable VAR and store it in the @@ -284,7 +283,7 @@ get_default_value (tree var) { val.lattice_val = CONSTANT; val.value = build_zero_cst (TREE_TYPE (var)); - val.mask = extend_mask (nonzero_bits); + val.mask = extend_mask (nonzero_bits, TYPE_SIGN (TREE_TYPE (var))); } } } @@ -1939,7 +1938,7 @@ evaluate_stmt (gimple *stmt) { val.lattice_val = CONSTANT; val.value = build_zero_cst (TREE_TYPE (lhs)); - val.mask = extend_mask (nonzero_bits); + val.mask = extend_mask (nonzero_bits, TYPE_SIGN (TREE_TYPE (lhs))); is_constant = true; } else @@ -1950,7 +1949,8 @@ evaluate_stmt (gimple *stmt) if (nonzero_bits == 0) val.mask = 0; else - val.mask = val.mask & extend_mask (nonzero_bits); + val.mask = val.mask & extend_mask (nonzero_bits, + TYPE_SIGN (TREE_TYPE (lhs))); } } } --- a/src/gcc/tree-ssa-strlen.c +++ b/src/gcc/tree-ssa-strlen.c @@ -2263,7 +2263,7 @@ public: }; /* Callback for walk_dominator_tree. Attempt to optimize various - string ops by remembering string lenths pointed by pointer SSA_NAMEs. */ + string ops by remembering string lengths pointed by pointer SSA_NAMEs. */ edge strlen_dom_walker::before_dom_children (basic_block bb) --- a/src/gcc/tree-vect-data-refs.c +++ b/src/gcc/tree-vect-data-refs.c @@ -2250,6 +2250,7 @@ vect_analyze_group_access_1 (struct data_reference *dr) { GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; + GROUP_GAP (stmt_info) = groupsize - 1; if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, --- a/src/gcc/tree-vect-loop-manip.c +++ b/src/gcc/tree-vect-loop-manip.c @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "cfgloop.h" #include "tree-scalar-evolution.h" #include "tree-vectorizer.h" +#include "tree-ssa-loop-ivopts.h" /************************************************************************* Simple Loop Peeling Utilities @@ -1594,10 +1595,26 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo) } /* FORNOW: We do not transform initial conditions of IVs + which evolution functions are not invariants in the loop. */ + + if (!expr_invariant_in_loop_p (loop, evolution_part)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "evolution not invariant in loop.\n"); + return false; + } + + /* FORNOW: We do not transform initial conditions of IVs which evolution functions are a polynomial of degree >= 2. */ if (tree_is_chrec (evolution_part)) - return false; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "evolution is chrec.\n"); + return false; + } } return true; --- a/src/gcc/tree-vect-patterns.c +++ b/src/gcc/tree-vect-patterns.c @@ -2136,32 +2136,313 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts, return pattern_stmt; } -/* Detect multiplication by constant which are postive or negatives of power 2, - and convert them to shift patterns. +/* Return true iff the target has a vector optab implementing the operation + CODE on type VECTYPE. */ - Mult with constants that are postive power of two. 
- type a_t; - type b_t - S1: b_t = a_t * n +static bool +target_has_vecop_for_code (tree_code code, tree vectype) +{ + optab voptab = optab_for_tree_code (code, vectype, optab_vector); + return voptab + && optab_handler (voptab, TYPE_MODE (vectype)) != CODE_FOR_nothing; +} - or +/* Verify that the target has optabs of VECTYPE to perform all the steps + needed by the multiplication-by-immediate synthesis algorithm described by + ALG and VAR. If SYNTH_SHIFT_P is true ensure that vector addition is + present. Return true iff the target supports all the steps. */ + +static bool +target_supports_mult_synth_alg (struct algorithm *alg, mult_variant var, + tree vectype, bool synth_shift_p) +{ + if (alg->op[0] != alg_zero && alg->op[0] != alg_m) + return false; + + bool supports_vminus = target_has_vecop_for_code (MINUS_EXPR, vectype); + bool supports_vplus = target_has_vecop_for_code (PLUS_EXPR, vectype); + + if (var == negate_variant + && !target_has_vecop_for_code (NEGATE_EXPR, vectype)) + return false; + + /* If we must synthesize shifts with additions make sure that vector + addition is available. */ + if ((var == add_variant || synth_shift_p) && !supports_vplus) + return false; + + for (int i = 1; i < alg->ops; i++) + { + switch (alg->op[i]) + { + case alg_shift: + break; + case alg_add_t_m2: + case alg_add_t2_m: + case alg_add_factor: + if (!supports_vplus) + return false; + break; + case alg_sub_t_m2: + case alg_sub_t2_m: + case alg_sub_factor: + if (!supports_vminus) + return false; + break; + case alg_unknown: + case alg_m: + case alg_zero: + case alg_impossible: + return false; + default: + gcc_unreachable (); + } + } + + return true; +} + +/* Synthesize a left shift of OP by AMNT bits using a series of additions and + putting the final result in DEST. Append all statements but the last into + VINFO. Return the last statement. */ + +static gimple * +synth_lshift_by_additions (tree dest, tree op, HOST_WIDE_INT amnt, + stmt_vec_info vinfo) +{ + HOST_WIDE_INT i; + tree itype = TREE_TYPE (op); + tree prev_res = op; + gcc_assert (amnt >= 0); + for (i = 0; i < amnt; i++) + { + tree tmp_var = (i < amnt - 1) ? vect_recog_temp_ssa_var (itype, NULL) + : dest; + gimple *stmt + = gimple_build_assign (tmp_var, PLUS_EXPR, prev_res, prev_res); + prev_res = tmp_var; + if (i < amnt - 1) + append_pattern_def_seq (vinfo, stmt); + else + return stmt; + } + gcc_unreachable (); + return NULL; +} + +/* Helper for vect_synth_mult_by_constant. Apply a binary operation + CODE to operands OP1 and OP2, creating a new temporary SSA var in + the process if necessary. Append the resulting assignment statements + to the sequence in STMT_VINFO. Return the SSA variable that holds the + result of the binary operation. If SYNTH_SHIFT_P is true synthesize + left shifts using additions. 
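For instance, with SYNTH_SHIFT_P set, OP1 << 2 is emitted as t1 = OP1 + OP1; t2 = t1 + t1. 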
*/ + +static tree +apply_binop_and_append_stmt (tree_code code, tree op1, tree op2, + stmt_vec_info stmt_vinfo, bool synth_shift_p) +{ + if (integer_zerop (op2) + && (code == LSHIFT_EXPR + || code == PLUS_EXPR)) + { + gcc_assert (TREE_CODE (op1) == SSA_NAME); + return op1; + } + + gimple *stmt; + tree itype = TREE_TYPE (op1); + tree tmp_var = vect_recog_temp_ssa_var (itype, NULL); + + if (code == LSHIFT_EXPR + && synth_shift_p) + { + stmt = synth_lshift_by_additions (tmp_var, op1, TREE_INT_CST_LOW (op2), + stmt_vinfo); + append_pattern_def_seq (stmt_vinfo, stmt); + return tmp_var; + } + + stmt = gimple_build_assign (tmp_var, code, op1, op2); + append_pattern_def_seq (stmt_vinfo, stmt); + return tmp_var; +} + +/* Synthesize a multiplication of OP by an INTEGER_CST VAL using shifts + and simple arithmetic operations to be vectorized. Record the statements + produced in STMT_VINFO and return the last statement in the sequence or + NULL if it's not possible to synthesize such a multiplication. + This function mirrors the behavior of expand_mult_const in expmed.c but + works on tree-ssa form. */ + +static gimple * +vect_synth_mult_by_constant (tree op, tree val, + stmt_vec_info stmt_vinfo) +{ + tree itype = TREE_TYPE (op); + machine_mode mode = TYPE_MODE (itype); + struct algorithm alg; + mult_variant variant; + if (!tree_fits_shwi_p (val)) + return NULL; + + /* Multiplication synthesis by shifts, adds and subs can introduce + signed overflow where the original operation didn't. Perform the + operations on an unsigned type and cast back to avoid this. + In the future we may want to relax this for synthesis algorithms + that we can prove do not cause unexpected overflow. */ + bool cast_to_unsigned_p = !TYPE_OVERFLOW_WRAPS (itype); + + tree multtype = cast_to_unsigned_p ? unsigned_type_for (itype) : itype; + + /* Targets that don't support vector shifts but support vector additions + can synthesize shifts that way. */ + bool synth_shift_p = !vect_supportable_shift (LSHIFT_EXPR, multtype); + + HOST_WIDE_INT hwval = tree_to_shwi (val); + /* Use MAX_COST here as we don't want to limit the sequence on rtx costs. + The vectorizer's benefit analysis will decide whether it's beneficial + to do this. */ + bool possible = choose_mult_variant (mode, hwval, &alg, + &variant, MAX_COST); + if (!possible) + return NULL; - Mult with constants that are negative power of two. - S2: b_t = a_t * -n + tree vectype = get_vectype_for_scalar_type (multtype); + + if (!vectype + || !target_supports_mult_synth_alg (&alg, variant, + vectype, synth_shift_p)) + return NULL; + + tree accumulator; + + /* Clear out the sequence of statements so we can populate it below. 
*/ + STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) = NULL; + gimple *stmt = NULL; + + if (cast_to_unsigned_p) + { + tree tmp_op = vect_recog_temp_ssa_var (multtype, NULL); + stmt = gimple_build_assign (tmp_op, CONVERT_EXPR, op); + append_pattern_def_seq (stmt_vinfo, stmt); + op = tmp_op; + } + + if (alg.op[0] == alg_zero) + accumulator = build_int_cst (multtype, 0); + else + accumulator = op; + + bool needs_fixup = (variant == negate_variant) + || (variant == add_variant); + + for (int i = 1; i < alg.ops; i++) + { + tree shft_log = build_int_cst (multtype, alg.log[i]); + tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL); + tree tmp_var = NULL_TREE; + + switch (alg.op[i]) + { + case alg_shift: + if (synth_shift_p) + stmt + = synth_lshift_by_additions (accum_tmp, accumulator, alg.log[i], + stmt_vinfo); + else + stmt = gimple_build_assign (accum_tmp, LSHIFT_EXPR, accumulator, + shft_log); + break; + case alg_add_t_m2: + tmp_var + = apply_binop_and_append_stmt (LSHIFT_EXPR, op, shft_log, + stmt_vinfo, synth_shift_p); + stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator, + tmp_var); + break; + case alg_sub_t_m2: + tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, op, + shft_log, stmt_vinfo, + synth_shift_p); + /* In some algorithms the first step involves zeroing the + accumulator. If subtracting from such an accumulator + just emit the negation directly. */ + if (integer_zerop (accumulator)) + stmt = gimple_build_assign (accum_tmp, NEGATE_EXPR, tmp_var); + else + stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, accumulator, + tmp_var); + break; + case alg_add_t2_m: + tmp_var + = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log, + stmt_vinfo, synth_shift_p); + stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, tmp_var, op); + break; + case alg_sub_t2_m: + tmp_var + = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log, + stmt_vinfo, synth_shift_p); + stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var, op); + break; + case alg_add_factor: + tmp_var + = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log, + stmt_vinfo, synth_shift_p); + stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator, + tmp_var); + break; + case alg_sub_factor: + tmp_var + = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log, + stmt_vinfo, synth_shift_p); + stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var, + accumulator); + break; + default: + gcc_unreachable (); + } + /* We don't want to append the last stmt in the sequence to stmt_vinfo + but rather return it directly. */ + + if ((i < alg.ops - 1) || needs_fixup || cast_to_unsigned_p) + append_pattern_def_seq (stmt_vinfo, stmt); + accumulator = accum_tmp; + } + if (variant == negate_variant) + { + tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL); + stmt = gimple_build_assign (accum_tmp, NEGATE_EXPR, accumulator); + accumulator = accum_tmp; + if (cast_to_unsigned_p) + append_pattern_def_seq (stmt_vinfo, stmt); + } + else if (variant == add_variant) + { + tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL); + stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator, op); + accumulator = accum_tmp; + if (cast_to_unsigned_p) + append_pattern_def_seq (stmt_vinfo, stmt); + } + /* Move back to a signed if needed. 
*/ + if (cast_to_unsigned_p) + { + tree accum_tmp = vect_recog_temp_ssa_var (itype, NULL); + stmt = gimple_build_assign (accum_tmp, CONVERT_EXPR, accumulator); + } + + return stmt; +} + +/* Detect multiplication by constant and convert it into a sequence of + shifts and additions, subtractions, negations. We reuse the + choose_mult_variant algorithms from expmed.c Input/Output: STMTS: Contains a stmt from which the pattern search begins, - i.e. the mult stmt. Convert the mult operation to LSHIFT if - constant operand is a power of 2. - type a_t, b_t - S1': b_t = a_t << log2 (n) - - Convert the mult operation to LSHIFT and followed by a NEGATE - if constant operand is a negative power of 2. - type a_t, b_t, res_T; - S2': b_t = a_t << log2 (n) - S3': res_T = - (b_t) + i.e. the mult stmt. Output: @@ -2169,8 +2450,8 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts, * TYPE_OUT: The type of the output of this pattern. - * Return value: A new stmt that will be used to replace the multiplication - S1 or S2 stmt. */ + * Return value: A new stmt that will be used to replace + the multiplication. */ static gimple * vect_recog_mult_pattern (vec<gimple *> *stmts, @@ -2178,11 +2459,8 @@ vect_recog_mult_pattern (vec<gimple *> *stmts, { gimple *last_stmt = stmts->pop (); tree oprnd0, oprnd1, vectype, itype; - gimple *pattern_stmt, *def_stmt; - optab optab; + gimple *pattern_stmt; stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt); - int power2_val, power2_neg_val; - tree shift; if (!is_gimple_assign (last_stmt)) return NULL; @@ -2206,52 +2484,17 @@ vect_recog_mult_pattern (vec<gimple *> *stmts, /* If the target can handle vectorized multiplication natively, don't attempt to optimize this. */ - optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default); - if (optab != unknown_optab) + optab mul_optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default); + if (mul_optab != unknown_optab) { machine_mode vec_mode = TYPE_MODE (vectype); - int icode = (int) optab_handler (optab, vec_mode); + int icode = (int) optab_handler (mul_optab, vec_mode); if (icode != CODE_FOR_nothing) - return NULL; + return NULL; } - /* If target cannot handle vector left shift then we cannot - optimize and bail out. */ - optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector); - if (!optab - || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing) - return NULL; - - power2_val = wi::exact_log2 (oprnd1); - power2_neg_val = wi::exact_log2 (wi::neg (oprnd1)); - - /* Handle constant operands that are postive or negative powers of 2. */ - if (power2_val != -1) - { - shift = build_int_cst (itype, power2_val); - pattern_stmt - = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), - LSHIFT_EXPR, oprnd0, shift); - } - else if (power2_neg_val != -1) - { - /* If the target cannot handle vector NEGATE then we cannot - do the optimization. */ - optab = optab_for_tree_code (NEGATE_EXPR, vectype, optab_vector); - if (!optab - || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing) - return NULL; - - shift = build_int_cst (itype, power2_neg_val); - def_stmt - = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), - LSHIFT_EXPR, oprnd0, shift); - new_pattern_def_seq (stmt_vinfo, def_stmt); - pattern_stmt - = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), - NEGATE_EXPR, gimple_assign_lhs (def_stmt)); - } - else + pattern_stmt = vect_synth_mult_by_constant (oprnd0, oprnd1, stmt_vinfo); + if (!pattern_stmt) return NULL; /* Pattern detected. 
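For example, a multiplication by 10 can now become t1 = x << 2; t2 = t1 + x; res = t2 << 1, and on targets without vector shifts each shift is itself synthesized from additions. 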
*/ --- a/src/gcc/tree-vect-stmts.c +++ b/src/gcc/tree-vect-stmts.c @@ -6354,12 +6354,22 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)); first_stmt = GROUP_FIRST_ELEMENT (stmt_info); + group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); + + if (!slp + && !PURE_SLP_STMT (stmt_info) + && !STMT_VINFO_STRIDED_P (stmt_info)) + { + if (vect_load_lanes_supported (vectype, group_size)) + load_lanes_p = true; + else if (!vect_grouped_load_supported (vectype, group_size)) + return false; + } /* If this is single-element interleaving with an element distance that leaves unused vector loads around punt - we at least create very sub-optimal code in that case (and blow up memory, see PR65518). */ - bool force_peeling = false; if (first_stmt == stmt && !GROUP_NEXT_ELEMENT (stmt_info)) { @@ -6373,7 +6383,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } /* Single-element interleaving requires peeling for gaps. */ - force_peeling = true; + gcc_assert (GROUP_GAP (stmt_info)); } /* If there is a gap in the end of the group or the group size cannot @@ -6381,9 +6391,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, elements in the last iteration and thus need to peel that off. */ if (loop_vinfo && ! STMT_VINFO_STRIDED_P (stmt_info) - && (force_peeling - || GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0 - || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0))) + && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0 + || (!slp && !load_lanes_p && vf % group_size != 0))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6403,8 +6412,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) slp_perm = true; - group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); - /* ??? The following is overly pessimistic (as well as the loop case above) in the case we can statically determine the excess elements loaded are within the bounds of a decl that is accessed. @@ -6417,16 +6424,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } - if (!slp - && !PURE_SLP_STMT (stmt_info) - && !STMT_VINFO_STRIDED_P (stmt_info)) - { - if (vect_load_lanes_supported (vectype, group_size)) - load_lanes_p = true; - else if (!vect_grouped_load_supported (vectype, group_size)) - return false; - } - /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. */ if (!PURE_SLP_STMT (stmt_info) --- a/src/gcc/tree-vectorizer.c +++ b/src/gcc/tree-vectorizer.c @@ -794,38 +794,142 @@ make_pass_slp_vectorize (gcc::context *ctxt) This should involve global alignment analysis and in the future also array padding. */ +static unsigned get_vec_alignment_for_type (tree); +static hash_map<tree, unsigned> *type_align_map; + +/* Return alignment of array's vector type corresponding to scalar type. + 0 if no vector type exists. 
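For example, a global int[32] whose element type vectorizes to V4SI reports the vector type's alignment, provided the array is at least as large as one vector. 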
--- a/src/gcc/tree-vectorizer.c
+++ b/src/gcc/tree-vectorizer.c
@@ -794,38 +794,142 @@ make_pass_slp_vectorize (gcc::context *ctxt)
    This should involve global alignment analysis and in the future also
    array padding.  */
 
+static unsigned get_vec_alignment_for_type (tree);
+static hash_map<tree, unsigned> *type_align_map;
+
+/* Return alignment of array's vector type corresponding to scalar type.
+   0 if no vector type exists.  */
+static unsigned
+get_vec_alignment_for_array_type (tree type)
+{
+  gcc_assert (TREE_CODE (type) == ARRAY_TYPE);
+
+  tree vectype = get_vectype_for_scalar_type (strip_array_types (type));
+  if (!vectype
+      || !TYPE_SIZE (type)
+      || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST
+      || tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (vectype)))
+    return 0;
+
+  return TYPE_ALIGN (vectype);
+}
+
+/* Return alignment of field having maximum alignment of vector type
+   corresponding to its scalar type.  For now, we only consider fields whose
+   offset is a multiple of its vector alignment.
+   0 if no suitable field is found.  */
+static unsigned
+get_vec_alignment_for_record_type (tree type)
+{
+  gcc_assert (TREE_CODE (type) == RECORD_TYPE);
+
+  unsigned max_align = 0, alignment;
+  HOST_WIDE_INT offset;
+  tree offset_tree;
+
+  if (TYPE_PACKED (type))
+    return 0;
+
+  unsigned *slot = type_align_map->get (type);
+  if (slot)
+    return *slot;
+
+  for (tree field = first_field (type);
+       field != NULL_TREE;
+       field = DECL_CHAIN (field))
+    {
+      /* Skip if not FIELD_DECL or if alignment is set by user.  */
+      if (TREE_CODE (field) != FIELD_DECL
+	  || DECL_USER_ALIGN (field)
+	  || DECL_ARTIFICIAL (field))
+	continue;
+
+      /* We don't need to process the type further if offset is variable,
+	 since the offsets of remaining members will also be variable.  */
+      if (TREE_CODE (DECL_FIELD_OFFSET (field)) != INTEGER_CST
+	  || TREE_CODE (DECL_FIELD_BIT_OFFSET (field)) != INTEGER_CST)
+	break;
+
+      /* Similarly stop processing the type if offset_tree
+	 does not fit in unsigned HOST_WIDE_INT.  */
+      offset_tree = bit_position (field);
+      if (!tree_fits_uhwi_p (offset_tree))
+	break;
+
+      offset = tree_to_uhwi (offset_tree);
+      alignment = get_vec_alignment_for_type (TREE_TYPE (field));
+
+      /* Get maximum alignment of vectorized field/array among those members
+	 whose offset is multiple of the vector alignment.  */
+      if (alignment
+	  && (offset % alignment == 0)
+	  && (alignment > max_align))
+	max_align = alignment;
+    }
+
+  type_align_map->put (type, max_align);
+  return max_align;
+}
+
+/* Return alignment of vector type corresponding to decl's scalar type
+   or 0 if it doesn't exist or the vector alignment is less than
+   decl's alignment.  */
+static unsigned
+get_vec_alignment_for_type (tree type)
+{
+  if (type == NULL_TREE)
+    return 0;
+
+  gcc_assert (TYPE_P (type));
+
+  static unsigned alignment = 0;
+  switch (TREE_CODE (type))
+    {
+    case ARRAY_TYPE:
+      alignment = get_vec_alignment_for_array_type (type);
+      break;
+    case RECORD_TYPE:
+      alignment = get_vec_alignment_for_record_type (type);
+      break;
+    default:
+      alignment = 0;
+      break;
+    }
+
+  return (alignment > TYPE_ALIGN (type)) ? alignment : 0;
+}
+
+/* Entry point to increase_alignment pass.  */
 static unsigned int
 increase_alignment (void)
 {
   varpool_node *vnode;
 
   vect_location = UNKNOWN_LOCATION;
+  type_align_map = new hash_map<tree, unsigned>;
 
   /* Increase the alignment of all global arrays for vectorization.  */
   FOR_EACH_DEFINED_VARIABLE (vnode)
     {
-      tree vectype, decl = vnode->decl;
-      tree t;
+      tree decl = vnode->decl;
       unsigned int alignment;
 
-      t = TREE_TYPE (decl);
-      if (TREE_CODE (t) != ARRAY_TYPE)
-	continue;
-      vectype = get_vectype_for_scalar_type (strip_array_types (t));
-      if (!vectype)
-	continue;
-      alignment = TYPE_ALIGN (vectype);
-      if (DECL_ALIGN (decl) >= alignment)
-	continue;
-
-      if (vect_can_force_dr_alignment_p (decl, alignment))
+      if ((decl_in_symtab_p (decl)
+	   && !symtab_node::get (decl)->can_increase_alignment_p ())
+	  || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))
+	continue;
+
+      alignment = get_vec_alignment_for_type (TREE_TYPE (decl));
+      if (alignment && vect_can_force_dr_alignment_p (decl, alignment))
	{
-	  vnode->increase_alignment (TYPE_ALIGN (vectype));
+	  vnode->increase_alignment (alignment);
	  dump_printf (MSG_NOTE, "Increasing alignment of decl: ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, decl);
	  dump_printf (MSG_NOTE, "\n");
	}
     }
 
+  delete type_align_map;
   return 0;
 }
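A hedged example of what the extended pass can now catch (illustration
only): with 128-bit vectors, the loop below benefits from `s' being
16-byte aligned, and the new record walk lets the pass raise the
alignment of the whole struct because the candidate field sits at a
vector-aligned offset.

struct S
{
  float buf[16];	/* vectorizable field at offset 0 */
  int count;
};

struct S s;		/* alignment can now be raised to that of the
			   vector type, e.g. 128 bits */

void
scale (float f)
{
  for (int i = 0; i < 16; i++)
    s.buf[i] *= f;
}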
--- a/src/gcc/tree-vrp.c
+++ b/src/gcc/tree-vrp.c
@@ -3165,6 +3165,24 @@ extract_range_from_binary_expr_1 (value_range *vr,
 	  if (int_cst_range1 && tree_int_cst_sgn (vr1.min) >= 0)
 	    wmax = wi::min (wmax, vr1.max, TYPE_SIGN (expr_type));
 	  max = wide_int_to_tree (expr_type, wmax);
+	  cmp = compare_values (min, max);
+	  /* PR68217: In case of signed & sign-bit-CST should
+	     result in [-INF, 0] instead of [-INF, INF].  */
+	  if (cmp == -2 || cmp == 1)
+	    {
+	      wide_int sign_bit
+		= wi::set_bit_in_zero (TYPE_PRECISION (expr_type) - 1,
+				       TYPE_PRECISION (expr_type));
+	      if (!TYPE_UNSIGNED (expr_type)
+		  && ((value_range_constant_singleton (&vr0)
+		       && !wi::cmps (vr0.min, sign_bit))
+		      || (value_range_constant_singleton (&vr1)
+			  && !wi::cmps (vr1.min, sign_bit))))
+		{
+		  min = TYPE_MIN_VALUE (expr_type);
+		  max = build_int_cst (expr_type, 0);
+		}
+	    }
 	}
       else if (code == BIT_IOR_EXPR)
 	{
@@ -3859,7 +3877,8 @@ extract_range_basic (value_range *vr, gimple *stmt)
 	  arg = gimple_call_arg (stmt, 0);
 	  if (TREE_CODE (arg) == SSA_NAME
 	      && SSA_NAME_IS_DEFAULT_DEF (arg)
-	      && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL)
+	      && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL
+	      && cfun->after_inlining)
 	    {
 	      set_value_range_to_null (vr, type);
 	      return;
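A worked instance of the PR68217 change above (illustration only): for a
32-bit signed int, x & INT_MIN is either INT_MIN or 0, so VRP can give
the result the range [-INF, 0] and fold away comparisons against
positive values.

#include <limits.h>

int
sign_bit_only (int x)
{
  int m = x & INT_MIN;	/* range is now [-INF, 0] */
  return m > 0;		/* folded to 0 */
}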
@@ -9935,6 +9954,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
   return true;
 }
 
+/* Return true if VAR is a two-valued variable.  Set A and B with the
+   two values when it is true.  Return false otherwise.  */
+
+static bool
+two_valued_val_range_p (tree var, tree *a, tree *b)
+{
+  value_range *vr = get_value_range (var);
+  if ((vr->type != VR_RANGE
+       && vr->type != VR_ANTI_RANGE)
+      || TREE_CODE (vr->min) != INTEGER_CST
+      || TREE_CODE (vr->max) != INTEGER_CST)
+    return false;
+
+  if (vr->type == VR_RANGE
+      && wi::sub (vr->max, vr->min) == 1)
+    {
+      *a = vr->min;
+      *b = vr->max;
+      return true;
+    }
+
+  /* ~[TYPE_MIN + 1, TYPE_MAX - 1] */
+  if (vr->type == VR_ANTI_RANGE
+      && wi::sub (vr->min, vrp_val_min (TREE_TYPE (var))) == 1
+      && wi::sub (vrp_val_max (TREE_TYPE (var)), vr->max) == 1)
+    {
+      *a = vrp_val_min (TREE_TYPE (var));
+      *b = vrp_val_max (TREE_TYPE (var));
+      return true;
+    }
+
+  return false;
+}
+
 /* Simplify STMT using ranges if possible.  */
 
 static bool
@@ -9945,6 +9998,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
     {
       enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
       tree rhs1 = gimple_assign_rhs1 (stmt);
+      tree rhs2 = gimple_assign_rhs2 (stmt);
+      tree lhs = gimple_assign_lhs (stmt);
+      tree val1 = NULL_TREE, val2 = NULL_TREE;
+      use_operand_p use_p;
+      gimple *use_stmt;
+
+      /* Convert:
+	 LHS = CST BINOP VAR
+	 Where VAR is two-valued and LHS is used in GIMPLE_COND only
+	 To:
+	 LHS = VAR == VAL1 ? (CST BINOP VAL1) : (CST BINOP VAL2)
+
+	 Also handles:
+	 LHS = VAR BINOP CST
+	 Where VAR is two-valued and LHS is used in GIMPLE_COND only
+	 To:
+	 LHS = VAR == VAL1 ? (VAL1 BINOP CST) : (VAL2 BINOP CST)  */
+
+      if (TREE_CODE_CLASS (rhs_code) == tcc_binary
+	  && INTEGRAL_TYPE_P (TREE_TYPE (lhs))
+	  && ((TREE_CODE (rhs1) == INTEGER_CST
+	       && TREE_CODE (rhs2) == SSA_NAME)
+	      || (TREE_CODE (rhs2) == INTEGER_CST
+		  && TREE_CODE (rhs1) == SSA_NAME))
+	  && single_imm_use (lhs, &use_p, &use_stmt)
+	  && gimple_code (use_stmt) == GIMPLE_COND)
+
+	{
+	  tree new_rhs1 = NULL_TREE;
+	  tree new_rhs2 = NULL_TREE;
+	  tree cmp_var = NULL_TREE;
+
+	  if (TREE_CODE (rhs2) == SSA_NAME
+	      && two_valued_val_range_p (rhs2, &val1, &val2))
+	    {
+	      /* Optimize RHS1 OP [VAL1, VAL2].  */
+	      new_rhs1 = int_const_binop (rhs_code, rhs1, val1);
+	      new_rhs2 = int_const_binop (rhs_code, rhs1, val2);
+	      cmp_var = rhs2;
+	    }
+	  else if (TREE_CODE (rhs1) == SSA_NAME
+		   && two_valued_val_range_p (rhs1, &val1, &val2))
+	    {
+	      /* Optimize [VAL1, VAL2] OP RHS2.  */
+	      new_rhs1 = int_const_binop (rhs_code, val1, rhs2);
+	      new_rhs2 = int_const_binop (rhs_code, val2, rhs2);
+	      cmp_var = rhs1;
+	    }
+
+	  /* If we could not find the two values, or the optimization is
+	     invalid as in division by zero, new_rhs1 / new_rhs2 will be
+	     NULL_TREE.  */
+	  if (new_rhs1 && new_rhs2)
+	    {
+	      tree cond = build2 (EQ_EXPR, TREE_TYPE (cmp_var), cmp_var, val1);
+	      gimple_assign_set_rhs_with_ops (gsi,
+					      COND_EXPR, cond,
+					      new_rhs1,
+					      new_rhs2);
+	      update_stmt (gsi_stmt (*gsi));
+	      return true;
+	    }
+	}
 
       switch (rhs_code)
 	{
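The two-valued rewrite in this hunk fires on code like the following
(illustration only): b is known to be 0 or 1, so the addition feeding
the branch can be replaced by a comparison on b itself.

int
f (int b, int x)
{
  b = (b != 0);		/* b now has the two-valued range [0, 1] */
  int t = 40 + b;	/* becomes t = (b == 0) ? 40 : 41 */
  if (t == 41)		/* ...so the test reduces to b == 1 */
    return x;
  return 0;
}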
--- a/src/gcc/tree.h
+++ b/src/gcc/tree.h
@@ -4628,69 +4628,6 @@ extern void warn_deprecated_use (tree, tree);
 extern void cache_integer_cst (tree);
 extern const char *combined_fn_name (combined_fn);
 
-/* Return the memory model from a host integer.  */
-static inline enum memmodel
-memmodel_from_int (unsigned HOST_WIDE_INT val)
-{
-  return (enum memmodel) (val & MEMMODEL_MASK);
-}
-
-/* Return the base memory model from a host integer.  */
-static inline enum memmodel
-memmodel_base (unsigned HOST_WIDE_INT val)
-{
-  return (enum memmodel) (val & MEMMODEL_BASE_MASK);
-}
-
-/* Return TRUE if the memory model is RELAXED.  */
-static inline bool
-is_mm_relaxed (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED;
-}
-
-/* Return TRUE if the memory model is CONSUME.  */
-static inline bool
-is_mm_consume (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME;
-}
-
-/* Return TRUE if the memory model is ACQUIRE.  */
-static inline bool
-is_mm_acquire (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE;
-}
-
-/* Return TRUE if the memory model is RELEASE.  */
-static inline bool
-is_mm_release (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE;
-}
-
-/* Return TRUE if the memory model is ACQ_REL.  */
-static inline bool
-is_mm_acq_rel (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL;
-}
-
-/* Return TRUE if the memory model is SEQ_CST.  */
-static inline bool
-is_mm_seq_cst (enum memmodel model)
-{
-  return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
-}
-
-/* Return TRUE if the memory model is a SYNC variant.  */
-static inline bool
-is_mm_sync (enum memmodel model)
-{
-  return (model & MEMMODEL_SYNC);
-}
-
 /* Compare and hash for any structure which begins with a canonical
    pointer.  Assumes all pointers are interchangeable, which is sort
    of already assumed by gcc elsewhere IIRC.  */
--- a/src/gcc/tsan.c
+++ b/src/gcc/tsan.c
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "backend.h"
 #include "rtl.h"
 #include "tree.h"
+#include "memmodel.h"
 #include "gimple.h"
 #include "tree-pass.h"
 #include "ssa.h"
--- a/src/gcc/varasm.c
+++ b/src/gcc/varasm.c
@@ -6776,6 +6776,16 @@ default_use_anchors_for_symbol_p (const_rtx symbol)
	 sections that should be marked as small in the section directive.  */
       if (targetm.in_small_data_p (decl))
 	return false;
+
+      /* Don't use section anchors for decls that won't fit inside a single
+	 anchor range to reduce the number of instructions required to refer
+	 to the entire declaration.  */
+      if (DECL_SIZE_UNIT (decl) == NULL_TREE
+	  || !tree_fits_uhwi_p (DECL_SIZE_UNIT (decl))
+	  || (tree_to_uhwi (DECL_SIZE_UNIT (decl))
+	      >= (unsigned HOST_WIDE_INT) targetm.max_anchor_offset))
+	return false;
+
     }
   return true;
 }
--- a/src/libcpp/expr.c
+++ b/src/libcpp/expr.c
@@ -1073,7 +1073,7 @@ eval_token (cpp_reader *pfile, const cpp_token *token,
 	  result.low = 0;
 	  if (CPP_OPTION (pfile, warn_undef) && !pfile->state.skip_eval)
 	    cpp_warning_with_line (pfile, CPP_W_UNDEF, virtual_location, 0,
-				   "\"%s\" is not defined",
+				   "\"%s\" is not defined, evaluates to 0",
 				   NODE_NAME (token->val.node.node));
 	}
       break;
--- a/src/libcpp/lex.c
+++ b/src/libcpp/lex.c
@@ -750,6 +750,101 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
     }
 }
 
+#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
+#include "arm_neon.h"
+
+/* This doesn't have to be the exact page size, but no system may use
+   a size smaller than this.  ARMv8 requires a minimum page size of
+   4k.  The impact of being conservative here is that a small number of
+   cases will take the slightly slower entry path into the main
+   loop.  */
+
+#define AARCH64_MIN_PAGE_SIZE 4096
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
+  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
+  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
+  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
+  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
+
+#ifdef __AARCH64EB
+  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
+#else
+  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
+#endif
+
+  unsigned int found;
+  const uint8_t *p;
+  uint8x16_t data;
+  uint8x16_t t;
+  uint16x8_t m;
+  uint8x16_t u, v, w;
+
+  /* Align the source pointer.  */
+  p = (const uint8_t *)((uintptr_t)s & -16);
+
+  /* Assuming random string start positions, with a 4k page size we'll take
+     the slow path about 0.37% of the time.  */
+  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
+			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
+			< 16, 0))
+    {
+      /* Slow path: the string starts near a possible page boundary.  */
+      uint32_t misalign, mask;
+
+      misalign = (uintptr_t)s & 15;
+      mask = (-1u << misalign) & 0xffff;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      t = vandq_u8 (t, xmask);
+      m = vpaddlq_u8 (t);
+      m = vshlq_u16 (m, shift);
+      found = vaddvq_u16 (m);
+      found &= mask;
+      if (found)
+	return (const uchar*)p + __builtin_ctz (found);
+    }
+  else
+    {
+      data = vld1q_u8 ((const uint8_t *) s);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
+	goto done;
+    }
+
+  do
+    {
+      p += 16;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+    } while (!vpaddd_u64 ((uint64x2_t)t));
+
+done:
+  /* Now that we've found the terminating substring, work out precisely where
+     we need to stop.  */
+  t = vandq_u8 (t, xmask);
+  m = vpaddlq_u8 (t);
+  m = vshlq_u16 (m, shift);
+  found = vaddvq_u16 (m);
+  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
+	  + __builtin_ctz (found));
+}
+
 #elif defined (__ARM_NEON)
 #include "arm_neon.h"
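A scalar model of what this NEON routine computes may help (sketch
only): the vector code tests 16 bytes at once, using xmask and a
pairwise add to compress the per-byte compare results into one bitmask
for __builtin_ctz, but the result is the same as the loop below.

/* Return the first byte the preprocessor must look at.  */
static const unsigned char *
search_line_slow (const unsigned char *s)
{
  for (;; s++)
    if (*s == '\n' || *s == '\r' || *s == '\\' || *s == '?')
      return s;
}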
--- a/src/libgcc/Makefile.in
+++ b/src/libgcc/Makefile.in
@@ -414,8 +414,9 @@ lib2funcs = _muldi3 _negdi2 _lshrdi3 _ashldi3 _ashrdi3 _cmpdi2 _ucmpdi2 \
	    _negvsi2 _negvdi2 _ctors _ffssi2 _ffsdi2 _clz _clzsi2 _clzdi2 \
	    _ctzsi2 _ctzdi2 _popcount_tab _popcountsi2 _popcountdi2 \
	    _paritysi2 _paritydi2 _powisf2 _powidf2 _powixf2 _powitf2 \
-	    _mulsc3 _muldc3 _mulxc3 _multc3 _divsc3 _divdc3 _divxc3 \
-	    _divtc3 _bswapsi2 _bswapdi2 _clrsbsi2 _clrsbdi2
+	    _mulhc3 _mulsc3 _muldc3 _mulxc3 _multc3 _divhc3 _divsc3 \
+	    _divdc3 _divxc3 _divtc3 _bswapsi2 _bswapdi2 _clrsbsi2 \
+	    _clrsbdi2
 
 # The floating-point conversion routines that involve a single-word integer.
 # XX stands for the integer mode.
--- a/src/libgcc/config.host
+++ b/src/libgcc/config.host
@@ -1399,4 +1399,8 @@ i[34567]86-*-linux* | x86_64-*-linux*)
 	fi
 	tm_file="${tm_file} i386/value-unwind.h"
 	;;
+aarch64*-*-*)
+	# ILP32 needs an extra header for unwinding
+	tm_file="${tm_file} aarch64/value-unwind.h"
+	;;
 esac
--- /dev/null
+++ b/src/libgcc/config/aarch64/value-unwind.h
@@ -0,0 +1,25 @@
+/* Store register values as _Unwind_Word type in DWARF2 EH unwind context.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define this macro if the target stores register values as _Unwind_Word
+   type in unwind context.  Only enable it for ilp32.  */
+#if defined __aarch64__ && !defined __LP64__
+# define REG_VALUE_IN_UNWIND_CONTEXT
+#endif
--- a/src/libgcc/config/arm/bpabi-v6m.S
+++ b/src/libgcc/config/arm/bpabi-v6m.S
@@ -1,4 +1,5 @@
-/* Miscellaneous BPABI functions.  ARMv6M implementation
+/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
+   ARMv6-M and ARMv8-M Baseline like ISA variants.
 
    Copyright (C) 2006-2016 Free Software Foundation, Inc.
    Contributed by CodeSourcery.
--- /dev/null
+++ b/src/libgcc/config/arm/cmse.c
@@ -0,0 +1,108 @@
+/* ARMv8-M Security Extensions routines.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if __ARM_FEATURE_CMSE & 1
+
+#include <arm_cmse.h>
+
+/* ARM intrinsic function to perform a permission check on a given
+   address range.  See ACLE changes for ARMv8-M.  */
+
+void *
+cmse_check_address_range (void *p, size_t size, int flags)
+{
+  cmse_address_info_t permb, perme;
+  char *pb = (char *) p, *pe;
+
+  /* Check if the range wraps around.  */
+  if (UINTPTR_MAX - (uintptr_t) p < size)
+    return NULL;
+
+  /* Check if an unknown flag is present.  */
+  int known = CMSE_MPU_UNPRIV | CMSE_MPU_READWRITE | CMSE_MPU_READ;
+  int known_secure_level = CMSE_MPU_UNPRIV;
+#if __ARM_FEATURE_CMSE & 2
+  known |= CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE;
+  known_secure_level |= CMSE_MPU_NONSECURE;
+#endif
+  if (flags & (~known))
+    return NULL;
+
+  /* Execute the right variant of the TT instructions.  */
+  pe = pb + size - 1;
+  const int singleCheck = (((uintptr_t) pb ^ (uintptr_t) pe) < 32);
+  switch (flags & known_secure_level)
+    {
+    case 0:
+      permb = cmse_TT (pb);
+      perme = singleCheck ? permb : cmse_TT (pe);
+      break;
+    case CMSE_MPU_UNPRIV:
+      permb = cmse_TTT (pb);
+      perme = singleCheck ? permb : cmse_TTT (pe);
+      break;
+#if __ARM_FEATURE_CMSE & 2
+    case CMSE_MPU_NONSECURE:
+      permb = cmse_TTA (pb);
+      perme = singleCheck ? permb : cmse_TTA (pe);
+      break;
+    case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
+      permb = cmse_TTAT (pb);
+      perme = singleCheck ? permb : cmse_TTAT (pe);
+      break;
+#endif
+    default:
+      /* Invalid flag, e.g. CMSE_MPU_NONSECURE specified but
+	 __ARM_FEATURE_CMSE & 2 == 0.  */
+      return NULL;
+    }
+
+  /* Check that the range does not cross MPU, SAU, or IDAU boundaries.  */
+  if (permb.value != perme.value)
+    return NULL;
+
+  /* Check the permissions on the range.  */
+  switch (flags & (~known_secure_level))
+    {
+#if __ARM_FEATURE_CMSE & 2
+    case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
+    case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
+      return permb.flags.nonsecure_readwrite_ok ? p : NULL;
+    case CMSE_MPU_READ | CMSE_AU_NONSECURE:
+      return permb.flags.nonsecure_read_ok ? p : NULL;
+    case CMSE_AU_NONSECURE:
+      return permb.flags.secure ? NULL : p;
+#endif
+    case CMSE_MPU_READ | CMSE_MPU_READWRITE:
+    case CMSE_MPU_READWRITE:
+      return permb.flags.readwrite_ok ? p : NULL;
+    case CMSE_MPU_READ:
+      return permb.flags.read_ok ? p : NULL;
+    default:
+      return NULL;
+    }
+}
+
+
+#endif /* __ARM_FEATURE_CMSE & 1.  */
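A hypothetical caller of the new routine (assumes an ARMv8-M secure
build with -mcmse; the function and flag names come from arm_cmse.h as
used above): validate that a buffer handed in from unprivileged code is
readable and writable before touching it.

#include <arm_cmse.h>
#include <stddef.h>

int
buffer_writable (void *buf, size_t len)
{
  void *ok = cmse_check_address_range (buf, len,
				       CMSE_MPU_UNPRIV | CMSE_MPU_READWRITE);
  return ok != NULL;
}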
--- /dev/null
+++ b/src/libgcc/config/arm/cmse_nonsecure_call.S
@@ -0,0 +1,131 @@
+/* CMSE wrapper function used to save, clear and restore callee saved registers
+   for cmse_nonsecure_call's.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+.syntax unified
+.thumb
+.global __gnu_cmse_nonsecure_call
+__gnu_cmse_nonsecure_call:
+#if defined(__ARM_ARCH_8M_MAIN__)
+push	{r5-r11,lr}
+mov	r7, r4
+mov	r8, r4
+mov	r9, r4
+mov	r10, r4
+mov	r11, r4
+mov	ip, r4
+
+/* Save and clear callee-saved registers only if we are dealing with hard float
+   ABI.  The unused caller-saved registers have already been cleared by GCC
+   generated code.  */
+#ifdef __ARM_PCS_VFP
+vpush.f64	{d8-d15}
+mov	r5, #0
+vmov	d8, r5, r5
+#if __ARM_FP & 0x04
+vmov	s18, s19, r5, r5
+vmov	s20, s21, r5, r5
+vmov	s22, s23, r5, r5
+vmov	s24, s25, r5, r5
+vmov	s26, s27, r5, r5
+vmov	s28, s29, r5, r5
+vmov	s30, s31, r5, r5
+#elif __ARM_FP & 0x08
+vmov.f64	d9, d8
+vmov.f64	d10, d8
+vmov.f64	d11, d8
+vmov.f64	d12, d8
+vmov.f64	d13, d8
+vmov.f64	d14, d8
+vmov.f64	d15, d8
+#else
+#error "Half precision implementation not supported."
+#endif
+/* Clear the cumulative exception-status bits (0-4,7) and the
+   condition code bits (28-31) of the FPSCR.  */
+vmrs	r5, fpscr
+movw	r6, #65376
+movt	r6, #4095
+ands	r5, r6
+vmsr	fpscr, r5
+
+/* We are not dealing with hard float ABI, so we can safely use the vlstm and
+   vlldm instructions without needing to preserve the registers used for
+   argument passing.  */
+#else
+sub	sp, sp, #0x88	/* Reserve stack space to save all floating point
+			   registers, including FPSCR.  */
+vlstm	sp		/* Lazy store and clearance of d0-d16 and FPSCR.  */
+#endif /* __ARM_PCS_VFP */
+
+/* Make sure to clear the 'GE' bits of the APSR register if 32-bit SIMD
+   instructions are available.  */
+#if defined(__ARM_FEATURE_SIMD32)
+msr	APSR_nzcvqg, r4
+#else
+msr	APSR_nzcvq, r4
+#endif
+
+mov	r5, r4
+mov	r6, r4
+blxns	r4
+
+#ifdef __ARM_PCS_VFP
+vpop.f64	{d8-d15}
+#else
+vlldm	sp		/* Lazy restore of d0-d16 and FPSCR.  */
+add	sp, sp, #0x88	/* Free space used to save floating point registers.  */
+#endif /* __ARM_PCS_VFP */
+
+pop	{r5-r11, pc}
+
+#elif defined (__ARM_ARCH_8M_BASE__)
+push	{r5-r7, lr}
+mov	r5, r8
+mov	r6, r9
+mov	r7, r10
+push	{r5-r7}
+mov	r5, r11
+push	{r5}
+mov	r5, r4
+mov	r6, r4
+mov	r7, r4
+mov	r8, r4
+mov	r9, r4
+mov	r10, r4
+mov	r11, r4
+mov	ip, r4
+msr	APSR_nzcvq, r4
+blxns	r4
+pop	{r5}
+mov	r11, r5
+pop	{r5-r7}
+mov	r10, r7
+mov	r9, r6
+mov	r8, r5
+pop	{r5-r7, pc}
+
+#else
+#error "This should only be used for armv8-m base- and mainline."
+#endif
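For context, this veneer backs the C-level construct below (sketch,
compiled with -mcmse on an ARMv8-M target): calls through a
cmse_nonsecure_call function pointer are routed via
__gnu_cmse_nonsecure_call so that secure register state is scrubbed
before the BLXNS into non-secure code.

/* Declare a non-secure function type and call through it.  */
typedef void __attribute__ ((cmse_nonsecure_call)) nsfunc (int);

void
call_nonsecure (nsfunc *fn)
{
  fn (42);	/* emitted as a call to __gnu_cmse_nonsecure_call */
}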
--- a/src/libgcc/config/arm/ieee754-df.S
+++ b/src/libgcc/config/arm/ieee754-df.S
@@ -160,8 +160,8 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3
 	teq	r4, r5
 	beq	LSYM(Lad_d)
 
-@ CFI note: we're lucky that the branches to Lad_* that appear after this function
-@ have a CFI state that's exactly the same as the one we're in at this
+@ CFI note: we're lucky that the branches to Lad_* that appear after this
+@ function have a CFI state that's exactly the same as the one we're in at this
 @ point.  Otherwise the CFI would change to a different state after the branch,
 @ which would be disastrous for backtracing.
 LSYM(Lad_x):
@@ -507,11 +507,15 @@ ARM_FUNC_ALIAS aeabi_f2d extendsfdf2
 	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
 	RETc(ne)			@ and return it.
 
-	teq	r2, #0			@ if actually 0
-	do_it	ne, e
-	teqne	r3, #0xff000000		@ or INF or NAN
+	bics	r2, r2, #0xff000000	@ isolate mantissa
+	do_it	eq			@ if 0, that is ZERO or INF,
 	RETc(eq)			@ we are done already.
 
+	teq	r3, #0xff000000		@ check for NAN
+	do_it	eq, t
+	orreq	xh, xh, #0x00080000	@ change to quiet NAN
+	RETc(eq)			@ and return it.
+
 	@ value was denormalized.  We can normalize it now.
 	do_push	{r4, r5, lr}
 	.cfi_adjust_cfa_offset 12	@ CFA is now sp + previousOffset + 12
@@ -1158,8 +1162,8 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
 1:	str	ip, [sp, #-4]!
 	.cfi_adjust_cfa_offset 4	@ CFA is now sp + previousOffset + 4.
 	@ We're not adding CFI for ip as it's pushed into the stack
-	@ only because @ it may be popped off later as a return value
-	@ (i.e. we're not preserving @ it anyways).
+	@ only because it may be popped off later as a return value
+	@ (i.e. we're not preserving it anyways).
 
 	@ Trap any INF/NAN first.
 	mov	ip, xh, lsl #1
@@ -1169,14 +1173,14 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
 	COND(mvn,s,ne)	ip, ip, asr #21
 	beq	3f
 	.cfi_remember_state
-	@ Save the current CFI state. This is done because the branch
-	@ is conditional, @ and if we don't take it we'll issue a
-	@ .cfi_adjust_cfa_offset and return. @ If we do take it,
-	@ however, the .cfi_adjust_cfa_offset from the non-branch @ code
-	@ will affect the branch code as well. To avoid this we'll
-	@ restore @ the current state before executing the branch code.
-
-	@ Test for equality. @ Note that 0.0 is equal to -0.0.
+	@ Save the current CFI state.  This is done because the branch
+	@ is conditional, and if we don't take it we'll issue a
+	@ .cfi_adjust_cfa_offset and return.  If we do take it,
+	@ however, the .cfi_adjust_cfa_offset from the non-branch code
+	@ will affect the branch code as well.  To avoid this we'll
+	@ restore the current state before executing the branch code.
+
+	@ Test for equality.  Note that 0.0 is equal to -0.0.
 2:	add	sp, sp, #4
 	.cfi_adjust_cfa_offset -4	@ CFA is now sp + previousOffset.
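The extendsfdf2 change above makes float-to-double extension quiet any
NaN input.  A brief sketch of the bit-level effect (illustration only):
in an IEEE double the mantissa MSB lives at bit 19 of the high word,
which is exactly the 0x00080000 constant in the orreq.

#include <stdint.h>

/* Make the high word of a double NaN quiet, as the fixed code does.  */
static uint32_t
quiet_nan_high_word (uint32_t xh)
{
  return xh | 0x00080000;	/* set the quiet bit */
}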
--- a/src/libgcc/config/arm/lib1funcs.S
+++ b/src/libgcc/config/arm/lib1funcs.S
@@ -108,7 +108,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 # define __ARM_ARCH__ 7
 #endif
 
-#if defined(__ARM_ARCH_8A__)
+#if defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8M_BASE__) \
+	|| defined(__ARM_ARCH_8M_MAIN__)
 # define __ARM_ARCH__ 8
 #endif
 
@@ -124,10 +125,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 	&& !defined(__thumb2__)		\
 	&& (!defined(__THUMB_INTERWORK__)	\
 	    || defined (__OPTIMIZE_SIZE__)	\
-	    || defined(__ARM_ARCH_6M__)))
+	    || !__ARM_ARCH_ISA_ARM))
 # define __prefer_thumb__
 #endif
 
+#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1
+#define NOT_ISA_TARGET_32BIT 1
+#endif
+
 /* How to return from a function call depends on the architecture variant.  */
 
 #if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__)
@@ -305,35 +310,14 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
-#if defined(__ARM_ARCH_6M__)
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+#ifdef NOT_ISA_TARGET_32BIT
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
+	bl	SYM(__aeabi_idiv0)
 	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -478,7 +462,7 @@ _L__\name:
 
 #else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
 
-#ifdef __ARM_ARCH_6M__
+#ifdef NOT_ISA_TARGET_32BIT
 #define EQUIV .thumb_set
 #else
 .macro	ARM_FUNC_START name sp_section=
@@ -510,7 +494,7 @@ SYM (__\name):
 #endif
 .endm
 
-#ifndef __ARM_ARCH_6M__
+#ifndef NOT_ISA_TARGET_32BIT
 .macro	ARM_FUNC_ALIAS new old
 	.globl	SYM (__\new)
 	EQUIV	SYM (__\new), SYM (__\old)
@@ -945,7 +929,170 @@ LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm 
+.endm
+
+/* If performance is preferred, the following functions are provided.  */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lower than divisor.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the subtraction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar to
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
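A C model of the algorithm these macros unroll (sketch only): classic
shift-and-subtract division, with the macros handling four (or six) bit
positions per loop iteration instead of one.

/* Restoring unsigned division; assumes d != 0.  */
static unsigned
udiv_sketch (unsigned n, unsigned d)
{
  unsigned q = 0;
  for (int bit = 31; bit >= 0; bit--)
    if ((d << bit) >> bit == d	/* the shifted divisor did not overflow */
	&& (d << bit) <= n)
      {
	n -= d << bit;		/* subtract the aligned divisor... */
	q |= 1u << bit;		/* ...and set the matching result bit */
      }
  return q;
}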
@@ -955,6 +1102,7 @@ LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -972,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_uidiv for ARMv6-M.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1023,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test):
 	FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 	ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1054,7 +1219,7 @@ ARM_FUNC_START aeabi_uidivmod
 /* ------------------------------------------------------------------------ */
 #ifdef L_umodsi3
 
-#ifdef __ARM_ARCH_EXT_IDIV__
+#if defined(__ARM_ARCH_EXT_IDIV__) && __ARM_ARCH_ISA_THUMB != 1
 
 	ARM_FUNC_START umodsi3
 
@@ -1084,7 +1249,7 @@ LSYM(Lover10):
 	RET
 
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1109,8 +1274,9 @@ LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1133,7 +1299,7 @@ LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1143,6 +1309,22 @@ LSYM(Lover12):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_idiv for ARMv6-M.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START divsi3
@@ -1154,8 +1336,8 @@ LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1209,12 +1391,21 @@ LSYM(divsi3_skip_div0_test):
 	FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 	ARM_FUNC_START aeabi_idivmod
 	cmp	r1, #0
@@ -1240,7 +1431,7 @@ ARM_FUNC_START aeabi_idivmod
 /* ------------------------------------------------------------------------ */
 #ifdef L_modsi3
 
-#if defined(__ARM_ARCH_EXT_IDIV__)
+#if defined(__ARM_ARCH_EXT_IDIV__) && __ARM_ARCH_ISA_THUMB != 1
 
 	ARM_FUNC_START modsi3
 
@@ -1508,14 +1699,15 @@ LSYM(Lover12):
 
 #endif /* __symbian__ */
 
-#if ((__ARM_ARCH__ > 5) && !defined(__ARM_ARCH_6M__)) \
-	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
-	|| defined(__ARM_ARCH_5TEJ__)
+#if (__ARM_ARCH_ISA_THUMB == 2	\
+     || (__ARM_ARCH_ISA_ARM	\
+	 && (__ARM_ARCH__ > 5	\
+	     || (__ARM_ARCH__ == 5 && __ARM_ARCH_ISA_THUMB))))
 #define HAVE_ARM_CLZ 1
 #endif
 
 #ifdef L_clzsi2
-#if defined(__ARM_ARCH_6M__)
+#ifdef NOT_ISA_TARGET_32BIT
 FUNC_START clzsi2
 	mov	r1, #28
 	mov	r3, #1
@@ -1576,7 +1768,7 @@ ARM_FUNC_START clzsi2
 #ifdef L_clzdi2
 #if !defined(HAVE_ARM_CLZ)
 
-# if defined(__ARM_ARCH_6M__)
+# ifdef NOT_ISA_TARGET_32BIT
 FUNC_START clzdi2
 	push	{r4, lr}
 # else
@@ -1601,7 +1793,7 @@ ARM_FUNC_START clzdi2
 	bl	__clzsi2
 # endif
 2:
-# if defined(__ARM_ARCH_6M__)
+# ifdef NOT_ISA_TARGET_32BIT
 	pop	{r4, pc}
 # else
 	RETLDM	r4
@@ -1623,7 +1815,7 @@ ARM_FUNC_START clzdi2
 #endif /* L_clzdi2 */
 
 #ifdef L_ctzsi2
-#if defined(__ARM_ARCH_6M__)
+#ifdef NOT_ISA_TARGET_32BIT
 FUNC_START ctzsi2
 	neg	r1, r0
 	and	r0, r0, r1
@@ -1738,7 +1930,7 @@ ARM_FUNC_START ctzsi2
 /* Don't bother with the old interworking routines for Thumb-2.  */
 /* ??? Maybe only omit these on "m" variants.  */
 
-#if !defined(__thumb2__) && !defined(__ARM_ARCH_6M__)
+#if !defined(__thumb2__) && __ARM_ARCH_ISA_ARM
 
 #if defined L_interwork_call_via_rX
 
@@ -1983,11 +2175,12 @@ LSYM(Lchange_\register):
 .endm
 
 #ifndef __symbian__
-#ifndef __ARM_ARCH_6M__
+/* The condition here must match the one in gcc/config/arm/elf.h.  */
+#ifndef NOT_ISA_TARGET_32BIT
 #include "ieee754-df.S"
 #include "ieee754-sf.S"
 #include "bpabi.S"
-#else /* __ARM_ARCH_6M__ */
+#else /* NOT_ISA_TARGET_32BIT */
 #include "bpabi-v6m.S"
-#endif /* __ARM_ARCH_6M__ */
+#endif /* NOT_ISA_TARGET_32BIT */
 #endif /* !__symbian__ */
--- a/src/libgcc/config/arm/libunwind.S
+++ b/src/libgcc/config/arm/libunwind.S
@@ -58,7 +58,7 @@
 #endif
 #endif
 
-#ifdef __ARM_ARCH_6M__
+#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1
 
 /* r0 points to a 16-word block.  Upload these values to the actual core
    state.  */
@@ -169,7 +169,7 @@ FUNC_START gnu_Unwind_Save_WMMXC
 	UNPREFIX \name
 .endm
 
-#else /* !__ARM_ARCH_6M__ */
+#else /* __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1 */
 
 /* r0 points to a 16-word block.  Upload these values to the actual core
    state.  */
@@ -351,7 +351,7 @@ ARM_FUNC_START gnu_Unwind_Save_WMMXC
 	UNPREFIX \name
 .endm
 
-#endif /* !__ARM_ARCH_6M__ */
+#endif /* __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1 */
 
 UNWIND_WRAPPER _Unwind_RaiseException 1
 UNWIND_WRAPPER _Unwind_Resume 1
--- a/src/libgcc/config/arm/t-arm
+++ b/src/libgcc/config/arm/t-arm
@@ -1,3 +1,17 @@
 LIB1ASMSRC = arm/lib1funcs.S
 LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \
 	_thumb1_case_uhi _thumb1_case_si
+
+HAVE_CMSE:=$(findstring __ARM_FEATURE_CMSE,$(shell $(gcc_compile_bare) -dM -E - </dev/null))
+ifneq ($(shell $(gcc_compile_bare) -E -mcmse - </dev/null 2>/dev/null),)
+CMSE_OPTS:=-mcmse
+endif
+
+ifdef HAVE_CMSE
+libgcc-objects += cmse.o cmse_nonsecure_call.o
+
+cmse.o: $(srcdir)/config/arm/cmse.c
+	$(gcc_compile) -c $(CMSE_OPTS) $<
+cmse_nonsecure_call.o: $(srcdir)/config/arm/cmse_nonsecure_call.S
+	$(gcc_compile) -c $<
+endif
--- a/src/libgcc/config/arm/t-softfp
+++ b/src/libgcc/config/arm/t-softfp
@@ -1,2 +1,2 @@
-softfp_wrap_start := '\#ifdef __ARM_ARCH_6M__'
+softfp_wrap_start := '\#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1'
 softfp_wrap_end := '\#endif'
--- a/src/libgcc/libgcc2.c
+++ b/src/libgcc/libgcc2.c
@@ -1852,7 +1852,8 @@ NAME (TYPE x, int m)
 #endif
 
-#if ((defined(L_mulsc3) || defined(L_divsc3)) && LIBGCC2_HAS_SF_MODE) \
+#if((defined(L_mulhc3) || defined(L_divhc3)) && LIBGCC2_HAS_HF_MODE) \
+    || ((defined(L_mulsc3) || defined(L_divsc3)) && LIBGCC2_HAS_SF_MODE) \
     || ((defined(L_muldc3) || defined(L_divdc3)) && LIBGCC2_HAS_DF_MODE) \
     || ((defined(L_mulxc3) || defined(L_divxc3)) && LIBGCC2_HAS_XF_MODE) \
     || ((defined(L_multc3) || defined(L_divtc3)) && LIBGCC2_HAS_TF_MODE)
@@ -1861,7 +1862,13 @@ NAME (TYPE x, int m)
 #undef double
 #undef long
 
-#if defined(L_mulsc3) || defined(L_divsc3)
+#if defined(L_mulhc3) || defined(L_divhc3)
+# define MTYPE	HFtype
+# define CTYPE	HCtype
+# define MODE	hc
+# define CEXT	__LIBGCC_HF_FUNC_EXT__
+# define NOTRUNC (!__LIBGCC_HF_EXCESS_PRECISION__)
+#elif defined(L_mulsc3) || defined(L_divsc3)
 # define MTYPE	SFtype
 # define CTYPE	SCtype
 # define MODE	sc
@@ -1922,7 +1929,7 @@ extern void *compile_type_assert[sizeof(INFINITY) == sizeof(MTYPE) ? 1 : -1];
 # define TRUNC(x)	__asm__ ("" : "=m"(x) : "m"(x))
 #endif
 
-#if defined(L_mulsc3) || defined(L_muldc3) \
+#if defined(L_mulhc3) || defined(L_mulsc3) || defined(L_muldc3) \
     || defined(L_mulxc3) || defined(L_multc3)
 
 CTYPE
@@ -1992,7 +1999,7 @@ CONCAT3(__mul,MODE,3) (MTYPE a, MTYPE b, MTYPE c, MTYPE d)
 }
 #endif /* complex multiply */
 
-#if defined(L_divsc3) || defined(L_divdc3) \
+#if defined(L_divhc3) || defined(L_divsc3) || defined(L_divdc3) \
     || defined(L_divxc3) || defined(L_divtc3)
 
 CTYPE
--- a/src/libgcc/libgcc2.h
+++ b/src/libgcc/libgcc2.h
@@ -34,6 +34,12 @@ extern void __clear_cache (char *, char *);
 extern void __eprintf (const char *, const char *, unsigned int, const char *)
   __attribute__ ((__noreturn__));
 
+#ifdef __LIBGCC_HAS_HF_MODE__
+#define LIBGCC2_HAS_HF_MODE 1
+#else
+#define LIBGCC2_HAS_HF_MODE 0
+#endif
+
 #ifdef __LIBGCC_HAS_SF_MODE__
 #define LIBGCC2_HAS_SF_MODE 1
 #else
@@ -133,6 +139,10 @@ typedef unsigned int UTItype	__attribute__ ((mode (TI)));
 #endif
 #endif
 
+#if LIBGCC2_HAS_HF_MODE
+typedef		float HFtype	__attribute__ ((mode (HF)));
+typedef _Complex float HCtype	__attribute__ ((mode (HC)));
+#endif
 #if LIBGCC2_HAS_SF_MODE
 typedef 	float SFtype	__attribute__ ((mode (SF)));
 typedef _Complex float SCtype	__attribute__ ((mode (SC)));
@@ -424,6 +434,10 @@ extern SItype __negvsi2 (SItype);
 #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
 
 #undef int
+#if LIBGCC2_HAS_HF_MODE
+extern HCtype __divhc3 (HFtype, HFtype, HFtype, HFtype);
+extern HCtype __mulhc3 (HFtype, HFtype, HFtype, HFtype);
+#endif
 #if LIBGCC2_HAS_SF_MODE
 extern DWtype __fixsfdi (SFtype);
 extern SFtype __floatdisf (DWtype);
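The new _mulhc3/_divhc3 entries give complex arithmetic on the
half-precision type a runtime fallback.  A hedged illustration, using
the same mode attribute the header above uses (assumes a target and
compiler where the HF/HC modes exist, e.g. ARM/AArch64 with
half-precision support):

typedef _Complex float hcomplex __attribute__ ((mode (HC)));

hcomplex
hc_mul (hcomplex a, hcomplex b)
{
  return a * b;	/* may be emitted as a call to __mulhc3 */
}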
--- a/src/libstdc++-v3/acinclude.m4
+++ b/src/libstdc++-v3/acinclude.m4
@@ -632,10 +632,10 @@ dnl  baseline_dir
 dnl  baseline_subdir_switch
 dnl
 AC_DEFUN([GLIBCXX_CONFIGURE_TESTSUITE], [
-  if $GLIBCXX_IS_NATIVE ; then
-    # Do checks for resource limit functions.
-    GLIBCXX_CHECK_SETRLIMIT
+  # Do checks for resource limit functions.
+  GLIBCXX_CHECK_SETRLIMIT
 
+  if $GLIBCXX_IS_NATIVE ; then
     # Look for setenv, so that extended locale tests can be performed.
     GLIBCXX_CHECK_STDLIB_DECL_AND_LINKAGE_3(setenv)
   fi
--- a/src/libstdc++-v3/configure
+++ b/src/libstdc++-v3/configure
@@ -79519,8 +79519,7 @@ $as_echo "$ac_cv_x86_rdrand" >&6; }
 
   # This depends on GLIBCXX_ENABLE_SYMVERS and GLIBCXX_IS_NATIVE.
 
-  if $GLIBCXX_IS_NATIVE ; then
-    # Do checks for resource limit functions.
+  # Do checks for resource limit functions.
 
   setrlimit_have_headers=yes
   for ac_header in unistd.h sys/time.h sys/resource.h
@@ -79749,6 +79748,7 @@ $as_echo "#define _GLIBCXX_RES_LIMITS 1" >>confdefs.h
 $as_echo "$ac_res_limits" >&6; }
 
 
+  if $GLIBCXX_IS_NATIVE ; then
     # Look for setenv, so that extended locale tests can be performed.
     { $as_echo "$as_me:${as_lineno-$LINENO}: checking for setenv declaration" >&5
--- a/src/libstdc++-v3/testsuite/29_atomics/atomic/65913.cc
+++ b/src/libstdc++-v3/testsuite/29_atomics/atomic/65913.cc
@@ -15,7 +15,8 @@
 // with this library; see the file COPYING3.  If not see
 // <http://www.gnu.org/licenses/>.
 
-// { dg-do run { target x86_64-*-linux* powerpc*-*-linux* } }
+// { dg-do run }
+// { dg-require-atomic-builtins "" }
 // { dg-options "-std=gnu++11 -O0" }
 
 #include