From 35eab1db2f21aee9678fe946a5d983a67285e7e4 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Thu, 22 Aug 2013 15:29:11 +0000
Subject: [PATCH] [ARMv8] Add CodeGen support for VSEL.

This uses the ARMcmov pattern that Tim cleaned up in r188995.

Thanks to Simon Tatham for his floating point help!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189024 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp |  94 ++++++++-
 lib/Target/ARM/ARMInstrVFP.td      |  20 +-
 test/CodeGen/ARM/vsel.ll           | 309 +++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/ARM/vsel.ll

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index ebfa1b118eb..7021941076f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -3178,6 +3178,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                          SelectTrue, SelectFalse, ISD::SETNE);
 }
 
+static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
+  if (CC == ISD::SETNE)
+    return ISD::SETEQ;
+  return ISD::getSetCCSwappedOperands(CC);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  // Start by selecting the GE condition code for opcodes that return true for
+  // 'equality'
+  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+      CC == ISD::SETULE)
+    CondCode = ARMCC::GE;
+
+  // and GT for opcodes that return false for 'equality'.
+  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+           CC == ISD::SETULT)
+    CondCode = ARMCC::GT;
+
+  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+  // to swap the compare operands.
+  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+      CC == ISD::SETULT)
+    swpCmpOps = true;
+
+  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+  // If we have an unordered opcode, we need to swap the operands to the VSEL
+  // instruction (effectively negating the condition).
+  //
+  // This also has the effect of swapping which one of 'less' or 'greater'
+  // returns true, so we also swap the compare operands. It also switches
+  // whether we return true for 'equality', so we compensate by picking the
+  // opposite condition code to our original choice.
+  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+      CC == ISD::SETUGT) {
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+  }
+
+  // 'ordered' is 'anything but unordered', so use the VS condition code and
+  // swap the VSEL operands.
+  if (CC == ISD::SETO) {
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+  }
+
+  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+  // code and swap the VSEL operands.
+  if (CC == ISD::SETUNE) {
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+  }
+}
+
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
@@ -3188,15 +3243,52 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   if (LHS.getValueType() == MVT::i32) {
+    // Try to generate VSEL on ARMv8.
+    // The VSEL instruction can't use all the usual ARM condition
+    // codes: it only has two bits to select the condition code, so it's
+    // constrained to use only GE, GT, VS and EQ.
+    //
+    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+    // swap the operands of the previous compare instruction (effectively
+    // inverting the compare condition, swapping 'less' and 'greater') and
+    // sometimes need to swap the operands to the VSEL (which inverts the
+    // condition in the sense of firing whenever the previous condition didn't)
+    if (getSubtarget()->hasV8FP() && (TrueVal.getValueType() == MVT::f32 ||
+                                      TrueVal.getValueType() == MVT::f64)) {
+      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+        CC = getInverseCCForVSEL(CC);
+        std::swap(TrueVal, FalseVal);
+      }
+    }
+
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
 
+  // Try to generate VSEL on ARMv8.
+  if (getSubtarget()->hasV8FP() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
+    bool swpCmpOps = false;
+    bool swpVselOps = false;
+    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+      if (swpCmpOps)
+        std::swap(LHS, RHS);
+      if (swpVselOps)
+        std::swap(TrueVal, FalseVal);
+    }
+  }
+
   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index c6b8bc3f929..b4df4d787e3 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -333,24 +333,28 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
   let D = VFPNeonA8Domain;
 }
 
-multiclass vsel_inst<string op, bits<2> opc> {
-  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+      Uses = [CPSR], AddedComplexity = 4 in {
     def S : ASbInp<0b11100, opc, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+                   Requires<[HasV8FP]>;
 
     def D : ADbInp<0b11100, opc, 0,
                    (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                    NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+                   Requires<[HasV8FP]>;
   }
 }
 
-defm VSELGT : vsel_inst<"gt", 0b11>;
-defm VSELGE : vsel_inst<"ge", 0b10>;
-defm VSELEQ : vsel_inst<"eq", 0b00>;
-defm VSELVS : vsel_inst<"vs", 0b01>;
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
 
 multiclass vmaxmin_inst {
   let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
diff --git a/test/CodeGen/ARM/vsel.ll b/test/CodeGen/ARM/vsel.ll
new file mode 100644
index 00000000000..f4ee800f6fd
--- /dev/null
+++ b/test/CodeGen/ARM/vsel.ll
@@ -0,0 +1,309 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+v8fp -float-abi=hard | FileCheck %s
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+define void @test_vsel32sgt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sgt
+  %tst1 = icmp sgt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64sgt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sgt
+  %tst1 = icmp sgt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32sge(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sge
+  %tst1 = icmp sge i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64sge(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sge
+  %tst1 = icmp sge i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32eq(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32eq
+  %tst1 = icmp eq i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64eq(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64eq
+  %tst1 = icmp eq i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32slt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32slt
+  %tst1 = icmp slt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s1, s0
+  ret void
+}
+define void @test_vsel64slt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64slt
+  %tst1 = icmp slt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d1, d0
+  ret void
+}
+define void @test_vsel32sle(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sle
+  %tst1 = icmp sle i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s1, s0
+  ret void
+}
+define void @test_vsel64sle(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sle
+  %tst1 = icmp sle i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d1, d0
+  ret void
+}
+define void @test_vsel32ogt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ogt
+  %tst1 = fcmp ogt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64ogt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ogt
+  %tst1 = fcmp ogt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32oge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oge
+  %tst1 = fcmp oge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64oge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oge
+  %tst1 = fcmp oge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32oeq(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oeq
+  %tst1 = fcmp oeq float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64oeq(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oeq
+  %tst1 = fcmp oeq float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ugt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ugt
+  %tst1 = fcmp ugt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ugt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ugt
+  %tst1 = fcmp ugt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32uge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uge
+  %tst1 = fcmp uge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64uge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uge
+  %tst1 = fcmp uge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32olt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32olt
+  %tst1 = fcmp olt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64olt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64olt
+  %tst1 = fcmp olt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ult(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ult
+  %tst1 = fcmp ult float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ult(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ult
+  %tst1 = fcmp ult float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32ole(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ole
+  %tst1 = fcmp ole float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64ole(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ole
+  %tst1 = fcmp ole float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ule(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ule
+  %tst1 = fcmp ule float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ule(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ule
+  %tst1 = fcmp ule float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32ord(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ord
+  %tst1 = fcmp ord float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ord(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ord
+  %tst1 = fcmp ord float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32une(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32une
+  %tst1 = fcmp une float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64une(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64une
+  %tst1 = fcmp une float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32uno(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uno
+  %tst1 = fcmp uno float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64uno(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uno
+  %tst1 = fcmp uno float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d1, d2
+  ret void
+}
-- 
2.34.1
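
For illustration only, not part of the commit: a worked example of the constraint
solving in checkVSELConstraints for the 'ule' predicate. SETULE first picks GE
(it returns true for equality) and requests a compare-operand swap (it contains
'less'); the unordered adjustment then undoes that swap, flips GE to GT and swaps
the VSEL operands instead. The function and global names below are hypothetical;
the RUN line and the expected instructions are taken from the test_vsel32ule case
in the test above.

; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+v8fp -float-abi=hard | FileCheck %s
@var = global float 0.0

define void @example_vsel32ule(float %lhs, float %rhs, float %a, float %b) {
; CHECK: example_vsel32ule
  %tst = fcmp ule float %lhs, %rhs
  %val = select i1 %tst, float %a, float %b
  store float %val, float* @var
; The two compare-operand swaps cancel out, so the comparison keeps its order:
; CHECK: vcmpe.f32 s0, s1
; The select operands are swapped and GT is used in place of GE:
; CHECK: vselgt.f32 s0, s3, s2
  ret void
}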