[AArch64] Add support to spill/fill D tuples such as DPair/DTriple/DQuad. There is...

author Hao Liu <Hao.Liu@arm.com>

Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)

committer Hao Liu <Hao.Liu@arm.com>

Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)
author Hao Liu <Hao.Liu@arm.com>
Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)
committer Hao Liu <Hao.Liu@arm.com>
Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp

index f4d13932a519a926bd2e0175e36ba9d745838d2e..b0b0a8716b602400ceca0b819ede335c9c760e92 100644 (file)
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -477,12 +477,18 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
      default:
        llvm_unreachable("Unknown size for regclass");
      }
-  } else { // The spill of D tuples is implemented by Q tuples
-    if (RC == &AArch64::QPairRegClass)
+  } else { // For a super register class that has more than one sub-register
+    if (AArch64::DPairRegClass.hasSubClassEq(RC))
+      StoreOp = AArch64::ST1x2_8B;
+    else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
+      StoreOp = AArch64::ST1x3_8B;
+    else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
+      StoreOp = AArch64::ST1x4_8B;
+    else if (AArch64::QPairRegClass.hasSubClassEq(RC))
        StoreOp = AArch64::ST1x2_16B;
-    else if (RC == &AArch64::QTripleRegClass)
+    else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
        StoreOp = AArch64::ST1x3_16B;
-    else if (RC == &AArch64::QQuadRegClass)
+    else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
        StoreOp = AArch64::ST1x4_16B;
      else
        llvm_unreachable("Unknown reg class");
@@ -537,12 +543,18 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
      default:
        llvm_unreachable("Unknown size for regclass");
      }
-  } else { // The spill of D tuples is implemented by Q tuples
-    if (RC == &AArch64::QPairRegClass)
+  } else { // For a super register class that has more than one sub-register
+    if (AArch64::DPairRegClass.hasSubClassEq(RC))
+      LoadOp = AArch64::LD1x2_8B;
+    else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
+      LoadOp = AArch64::LD1x3_8B;
+    else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
+      LoadOp = AArch64::LD1x4_8B;
+    else if (AArch64::QPairRegClass.hasSubClassEq(RC))
        LoadOp = AArch64::LD1x2_16B;
-    else if (RC == &AArch64::QTripleRegClass)
+    else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
        LoadOp = AArch64::LD1x3_16B;
-    else if (RC == &AArch64::QQuadRegClass)
+    else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
        LoadOp = AArch64::LD1x4_16B;
      else
        llvm_unreachable("Unknown reg class");
@@ -649,6 +661,17 @@ void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
      MinOffset = -0x40 * AccessScale;
      MaxOffset = 0x3f * AccessScale;
      return;
+  case AArch64::LD1x2_8B: case AArch64::ST1x2_8B:
+    AccessScale = 16;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LD1x3_8B: case AArch64::ST1x3_8B:
+    AccessScale = 24;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LD1x4_8B: case AArch64::ST1x4_8B:
    case AArch64::LD1x2_16B: case AArch64::ST1x2_16B:
      AccessScale = 32;
      MinOffset = 0;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp

index 618f6fb9289b041b18c2af4f85ecf74636815845..973faf7363a5b4d7816265cd62569a163dee711f 100644 (file)
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -77,7 +77,10 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  }
  
  static bool hasFrameOffset(int opcode) {
-  return opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
+  return opcode != AArch64::LD1x2_8B  && opcode != AArch64::LD1x3_8B  &&
+         opcode != AArch64::LD1x4_8B  && opcode != AArch64::ST1x2_8B  &&
+         opcode != AArch64::ST1x3_8B  && opcode != AArch64::ST1x4_8B  &&
+         opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
           opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
           opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
  }
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td

index 8b1a9cb907401d2fa1de189020773e2e940c1c15..cfc0c953bd22312dd1d6e3a69b30bd37e543bbfe 100644 (file)
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -30,7 +30,6 @@ def dsub_0 : SubRegIndex<64>;
  def dsub_1 : SubRegIndex<64, 64>;
  def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
  def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
  }
  
  // Registers are identified with 5-bit ID numbers.
@@ -206,7 +205,7 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
  //===----------------------------------------------------------------------===//
  //  Consecutive vector registers
  //===----------------------------------------------------------------------===//
-// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
+// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0
  def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
                                [(rotl FPR64, 0), (rotl FPR64, 1)]>;
                                
diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll

index 9ac2c05ebd0f6f4ed484d9aef94be5c7d1db05aa..3ab69c4a02af7d8b46485e61836b64f8cf209c7f 100644 (file)
--- a/test/CodeGen/AArch64/neon-vector-list-spill.ll
+++ b/test/CodeGen/AArch64/neon-vector-list-spill.ll
@@ -132,3 +132,44 @@ declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*,
  declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
  
  declare void @foo()
+
+; FIXME: We should not generate ld/st for such register spill/fill, because the
+; test case is very simple and the register pressure is not high. If the
+; spill/fill algorithm is improved, these spills may no longer be triggered,
+; and the test cases can then be deleted.
+; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+\ No newline at end of file
author	Hao Liu <Hao.Liu@arm.com>
	Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)
committer	Hao Liu <Hao.Liu@arm.com>
	Tue, 7 Jan 2014 10:50:43 +0000 (10:50 +0000)
lib/Target/AArch64/AArch64InstrInfo.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64RegisterInfo.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64RegisterInfo.td		patch \| blob \| history
test/CodeGen/AArch64/neon-vector-list-spill.ll		patch \| blob \| history