From d5de327da0b174c5d44f776f827c1ea8bfe99095 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <reid@kleckner.net>
Date: Tue, 28 Oct 2014 01:29:26 +0000
Subject: [PATCH] X86: Implement the vectorcall calling convention

This is a Microsoft calling convention that supports both x86 and x86_64
subtargets. It passes vector and floating point arguments in XMM0-XMM5,
and passes them indirectly once those registers are exhausted.

Homogeneous vector aggregates of up to four elements can be passed in
sequential vector registers, but this part is not implemented in LLVM
and will be handled in Clang.

On 32-bit x86, it is similar to fastcall in that it uses ecx:edx as
integer register parameters and is callee cleanup. On x86_64, it
delegates to the normal win64 calling convention.

Reviewers: majnemer

Differential Revision: http://reviews.llvm.org/D5943

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220745 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/CallingConv.h    |  6 ++-
 lib/AsmParser/LLLexer.cpp        |  1 +
 lib/AsmParser/LLParser.cpp       |  2 +
 lib/AsmParser/LLToken.h          |  2 +-
 lib/IR/AsmWriter.cpp             |  1 +
 lib/IR/Mangler.cpp               | 81 ++++++++++++++++------------
 lib/Target/X86/X86CallingConv.h  | 13 +++++
 lib/Target/X86/X86CallingConv.td | 64 ++++++++++++++++++++++
 test/CodeGen/X86/vectorcall.ll   | 93 ++++++++++++++++++++++++++++++++
 9 files changed, 227 insertions(+), 36 deletions(-)
 create mode 100644 test/CodeGen/X86/vectorcall.ll

diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
index 702c35f58fa..9872e6ec794 100644
--- a/include/llvm/IR/CallingConv.h
+++ b/include/llvm/IR/CallingConv.h
@@ -140,7 +140,11 @@ namespace CallingConv {
     /// convention differs from the more common \c X86_64_SysV convention
     /// in a number of ways, most notably in that XMM registers used to pass
     /// arguments are shadowed by GPRs, and vice versa.
-    X86_64_Win64 = 79
+    X86_64_Win64 = 79,
+
+    /// \brief MSVC calling convention that passes vectors and vector aggregates
+    /// in SSE registers.
+    X86_VectorCall = 80
   };
 } // End CallingConv namespace
 
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index e205a7abe4e..6523bcee060 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -580,6 +580,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(x86_stdcallcc);
   KEYWORD(x86_fastcallcc);
   KEYWORD(x86_thiscallcc);
+  KEYWORD(x86_vectorcallcc);
   KEYWORD(arm_apcscc);
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index cfdb1d4742f..b7818bbf527 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -1448,6 +1448,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
 ///   ::= 'x86_thiscallcc'
+///   ::= 'x86_vectorcallcc'
 ///   ::= 'arm_apcscc'
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
@@ -1473,6 +1474,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
   case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
   case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
+  case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
   case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 2b39ebd2b57..f9821f76efe 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -87,7 +87,7 @@ namespace lltok {
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
     kw_intel_ocl_bicc,
-    kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
+    kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_vectorcallcc,
     kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
     kw_msp430_intrcc,
     kw_ptx_kernel, kw_ptx_device,
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 31d960e96cb..449225ae8c3 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -285,6 +285,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
+  case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index bfed3e39f4e..c7eb666ee0f 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
 
 static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
                               Mangler::ManglerPrefixTy PrefixTy,
-                              const DataLayout &DL, bool UseAt) {
+                              const DataLayout &DL, char Prefix) {
   SmallString<256> TmpData;
   StringRef Name = GVName.toStringRef(TmpData);
   assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
@@ -39,13 +39,8 @@ static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
   else if (PrefixTy == Mangler::LinkerPrivate)
     OS << DL.getLinkerPrivateGlobalPrefix();
 
-  if (UseAt) {
-    OS << '@';
-  } else {
-    char Prefix = DL.getGlobalPrefix();
-    if (Prefix != '\0')
-      OS << Prefix;
-  }
+  if (Prefix != '\0')
+    OS << Prefix;
 
   // If this is a simple string that doesn't need escaping, just append it.
   OS << Name;
@@ -53,7 +48,8 @@ static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
 
 void Mangler::getNameWithPrefix(raw_ostream &OS, const Twine &GVName,
                                 ManglerPrefixTy PrefixTy) const {
-  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, false);
+  char Prefix = DL->getGlobalPrefix();
+  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, Prefix);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
@@ -63,11 +59,21 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   return getNameWithPrefix(OS, GVName, PrefixTy);
 }
 
-/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
-/// a suffix on their name indicating the number of words of arguments they
-/// take.
-static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
-                                     const DataLayout &TD) {
+static bool hasByteCountSuffix(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_VectorCall:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Microsoft fastcall, stdcall, and vectorcall functions require a suffix on
+/// their name indicating the number of bytes of arguments they take.
+static void addByteCountSuffix(raw_ostream &OS, const Function *F,
+                               const DataLayout &TD) {
   // Calculate arguments size total.
   unsigned ArgWords = 0;
   for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
@@ -76,8 +82,9 @@ static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
     // 'Dereference' type in case of byval or inalloca parameter attribute.
     if (AI->hasByValOrInAllocaAttr())
       Ty = cast<PointerType>(Ty)->getElementType();
-    // Size should be aligned to DWORD boundary
-    ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+    // Size should be aligned to pointer size.
+    unsigned PtrSize = TD.getPointerSize();
+    ArgWords += RoundUpToAlignment(TD.getTypeAllocSize(Ty), PtrSize);
   }
 
   OS << '@' << ArgWords;
@@ -106,34 +113,40 @@ void Mangler::getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV,
   }
 
   StringRef Name = GV->getName();
-
-  bool UseAt = false;
-  const Function *MSFunc = nullptr;
-  CallingConv::ID CC;
-  if (Name[0] != '\1' && DL->hasMicrosoftFastStdCallMangling()) {
-    if ((MSFunc = dyn_cast<Function>(GV))) {
-      CC = MSFunc->getCallingConv();
-      // fastcall functions need to start with @ instead of _.
-      if (CC == CallingConv::X86_FastCall)
-        UseAt = true;
-    }
+  char Prefix = DL->getGlobalPrefix();
+
+  // Mangle functions with Microsoft calling conventions specially.  Only do
+  // this mangling for x86_64 vectorcall and 32-bit x86.
+  const Function *MSFunc = dyn_cast<Function>(GV);
+  if (Name.startswith("\01"))
+    MSFunc = nullptr; // Don't mangle when \01 is present.
+  CallingConv::ID CC = MSFunc ? MSFunc->getCallingConv() : CallingConv::C;
+  if (!DL->hasMicrosoftFastStdCallMangling() &&
+      CC != CallingConv::X86_VectorCall)
+    MSFunc = nullptr;
+  if (MSFunc) {
+    if (CC == CallingConv::X86_FastCall)
+      Prefix = '@'; // fastcall functions have an @ prefix instead of _.
+    else if (CC == CallingConv::X86_VectorCall)
+      Prefix = '\0'; // vectorcall functions have no prefix.
   }
 
-  getNameWithPrefixx(OS, Name, PrefixTy, *DL, UseAt);
+  getNameWithPrefixx(OS, Name, PrefixTy, *DL, Prefix);
 
   if (!MSFunc)
     return;
 
-  // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
-  // add it.
-  // fastcall and stdcall functions usually need @42 at the end to specify
-  // the argument info.
+  // If we are supposed to add a microsoft-style suffix for stdcall, fastcall,
+  // or vectorcall, add it.  These functions have a suffix of @N where N is the
+  // cumulative byte size of all of the parameters to the function in decimal.
+  if (CC == CallingConv::X86_VectorCall)
+    OS << '@'; // vectorcall functions use a double @ suffix.
   FunctionType *FT = MSFunc->getFunctionType();
-  if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
+  if (hasByteCountSuffix(CC) &&
       // "Pure" variadic functions do not receive @0 suffix.
       (!FT->isVarArg() || FT->getNumParams() == 0 ||
        (FT->getNumParams() == 1 && MSFunc->hasStructRetAttr())))
-    AddFastCallStdCallSuffix(OS, MSFunc, *DL);
+    addByteCountSuffix(OS, MSFunc, *DL);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index 15a455ae29b..0eb2494f1d6 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -20,6 +20,19 @@
 
 namespace llvm {
 
+inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State) {
+  // Similar to CCPassIndirect, with the addition of inreg.
+  LocVT = MVT::i32;
+  LocInfo = CCValAssign::Indirect;
+  ArgFlags.setInReg();
+  return false; // Continue the search, but now for i32.
+}
+
+
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
                                 CCState &) {
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index dec73eac606..75a2ec00468 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -124,6 +124,24 @@ def RetCC_X86_32_HiPE : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
+// X86-32 vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+  // FP and 128-bit vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors are returned in YMM0-YMM3.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit vectors are returned in ZMM0-ZMM3.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // Return integers in the standard way.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
 // X86-64 C return-value convention.
 def RetCC_X86_64_C : CallingConv<[
   // The X86-64 calling convention always returns FP values in XMM0.
@@ -179,6 +197,7 @@ def RetCC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
   // If HiPE, use RetCC_X86_32_HiPE.
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
 
   // Otherwise, use RetCC_X86_32_C.
   CCDelegateTo<RetCC_X86_32_C>
@@ -330,6 +349,25 @@ def CC_X86_Win64_C : CallingConv<[
   CCIfType<[f80], CCAssignToStack<0, 0>>
 ]>;
 
+def CC_X86_Win64_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Delegate to the Win64 C convention for integers and anything left over.
+  CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
 def CC_X86_64_GHC : CallingConv<[
   // Promote i8/i16/i32 arguments to i64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -463,6 +501,30 @@ def CC_X86_32_FastCall : CallingConv<[
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
+def CC_X86_32_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Otherwise, pass it indirectly.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
+            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
+            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+
+  // Delegate to fastcall to handle integer types.
+  CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
 def CC_X86_32_ThisCall_Common : CallingConv<[
   // The first integer argument is passed in ECX
   CCIfType<[i32], CCAssignToReg<[ECX]>>,
@@ -576,6 +638,7 @@ def CC_Intel_OCL_BI : CallingConv<[
 // This is the root argument convention for the X86-32 backend.
 def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
@@ -593,6 +656,7 @@ def CC_X86_64 : CallingConv<[
   CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
 
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll
new file mode 100644
index 00000000000..1e52654e99f
--- /dev/null
+++ b/test/CodeGen/X86/vectorcall.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+; Test integer arguments.
+
+define x86_vectorcallcc i32 @test_int_1() {
+  ret i32 0
+}
+
+; CHECK-LABEL: {{^}}test_int_1@@0:
+; CHECK: xorl %eax, %eax
+
+define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
+  ret i32 %a
+}
+
+; X86-LABEL: {{^}}test_int_2@@4:
+; X64-LABEL: {{^}}test_int_2@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
+  %at = trunc i64 %a to i32
+  ret i32 %at
+}
+
+; X86-LABEL: {{^}}test_int_3@@8:
+; X64-LABEL: {{^}}test_int_3@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
+  %s = add i32 %a, %b
+  ret i32 %s
+}
+
+; X86-LABEL: {{^}}test_int_4@@8:
+; X86: leal (%ecx,%edx), %eax
+
+; X64-LABEL: {{^}}test_int_4@@16:
+; X64: leal (%rcx,%rdx), %eax
+
+define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
+  ret i32 0
+}
+; CHECK-LABEL: {{^}}test_int_5:
+
+define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
+  ret double %b
+}
+; CHECK-LABEL: {{^}}test_fp_1@@16:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc double @test_fp_2(
+    double, double, double, double, double, double, double %r) {
+  ret double %r
+}
+; CHECK-LABEL: {{^}}test_fp_2@@56:
+; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
+
+define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
+  ret {double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_3@@0:
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+; FIXME: Returning via x87 isn't compatible, but it's hard to structure the
+; tablegen any other way.
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
+  ret {double, double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_4@@0:
+; CHECK: fldz
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
+  ret <16 x i8> %b
+}
+; CHECK-LABEL: {{^}}test_vec_1@@32:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc <16 x i8> @test_vec_2(
+    double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
+  ret <16 x i8> %r
+}
+; CHECK-LABEL: {{^}}test_vec_2@@104:
+; CHECK: movaps (%{{[re]}}cx), %xmm0
-- 
2.34.1