From bab06ba696694e7f62f964af7ee5290a13f78340 Mon Sep 17 00:00:00 2001
From: JF Bastien <jfb@google.com>
Date: Fri, 17 May 2013 23:49:01 +0000
Subject: [PATCH] Support unaligned load/store on more ARM targets

This patch matches GCC behavior: the code used to only allow unaligned
load/store on ARM for v6+ Darwin, it will now allow unaligned load/store
for v6+ Darwin as well as for v7+ on Linux and NaCl.

The distinction is made because v6 doesn't guarantee support (but LLVM
assumes that Apple controls hardware+kernel and therefore have
conformant v6 CPUs), whereas v7 does provide this guarantee (and
Linux/NaCl behave sanely).

The patch keeps the -arm-strict-align command line option, and adds
-arm-no-strict-align. They behave similarly to GCC's -mstrict-align and
-mnostrict-align.

I originally encountered this discrepancy in FastIsel tests which expect
unaligned load/store generation. Overall this should slightly improve
performance in most cases because of reduced I$ pressure.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182175 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMSubtarget.cpp     |  51 ++++++++--
 lib/Target/ARM/ARMSubtarget.h       |   5 +-
 test/CodeGen/ARM/fast-isel-align.ll | 144 ++++++++++++++++++++++++++++
 test/CodeGen/ARM/fast-isel.ll       | 133 +------------------------
 4 files changed, 193 insertions(+), 140 deletions(-)
 create mode 100644 test/CodeGen/ARM/fast-isel-align.ll
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 8653c462f06..c7d97437906 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -38,9 +38,24 @@ static cl::opt<bool>
 UseFusedMulOps("arm-use-mulops",
                cl::init(true), cl::Hidden);
 
-static cl::opt<bool>
-StrictAlign("arm-strict-align", cl::Hidden,
-            cl::desc("Disallow all unaligned memory accesses"));
+enum AlignMode {
+  DefaultAlign,
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(DefaultAlign),
+      cl::values(
+          clEnumValN(DefaultAlign,  "arm-default-align",
+                     "Generate unaligned accesses only on hardware/OS "
+                     "combinations that are known to support them"),
+          clEnumValN(StrictAlign,   "arm-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "arm-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, const TargetOptions &Options)
@@ -162,10 +177,32 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (!isThumb() || hasThumb2())
     PostRAScheduler = true;
 
-  // v6+ may or may not support unaligned mem access depending on the system
-  // configuration.
-  if (!StrictAlign && hasV6Ops() && isTargetDarwin())
-    AllowsUnalignedMem = true;
+  switch (Align) {
+    case DefaultAlign:
+      // Assume pre-ARMv6 doesn't support unaligned accesses.
+      //
+      // ARMv6 may or may not support unaligned accesses depending on the
+      // SCTLR.U bit, which is architecture-specific. We assume ARMv6
+      // Darwin targets support unaligned accesses, and others don't.
+      //
+      // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
+      // which raises an alignment fault on unaligned accesses. Linux
+      // defaults this bit to 0 and handles it as a system-wide (not
+      // per-process) setting. It is therefore safe to assume that ARMv7+
+      // Linux targets support unaligned accesses. The same goes for NaCl.
+      //
+      // The above behavior is consistent with GCC.
+      AllowsUnalignedMem = (
+          (hasV7Ops() && (isTargetLinux() || isTargetNaCl())) ||
+          (hasV6Ops() && isTargetDarwin()));
+      break;
+    case StrictAlign:
+      AllowsUnalignedMem = false;
+      break;
+    case NoStrictAlign:
+      AllowsUnalignedMem = true;
+      break;
+  }
 
   // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
   uint64_t Bits = getFeatureBits();
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 038eb76ae1d..d01316511c4 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -270,9 +270,8 @@ public:
 
   bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
-  bool isTargetNaCl() const {
-    return TargetTriple.getOS() == Triple::NaCl;
-  }
+  bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NaCl; }
+  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
   bool isTargetELF() const { return !isTargetDarwin(); }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
new file mode 100644
index 00000000000..4e28a10cd15
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+
+; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; Check unaligned stores
+%struct.anon = type <{ float }>
+
+@a = common global %struct.anon* null, align 4
+
+define void @unaligned_store(float %x, float %y) nounwind {
+entry:
+; ARM: @unaligned_store
+; ARM: vmov r1, s0
+; ARM: str r1, [r0]
+
+; THUMB: @unaligned_store
+; THUMB: vmov r1, s0
+; THUMB: str r1, [r0]
+
+  %add = fadd float %x, %y
+  %0 = load %struct.anon** @a, align 4
+  %x1 = getelementptr inbounds %struct.anon* %0, i32 0, i32 0
+  store float %add, float* %x1, align 1
+  ret void
+}
+
+; Doublewords require only word-alignment.
+; rdar://10528060
+%struct.anon.0 = type { double }
+
+@foo_unpacked = common global %struct.anon.0 zeroinitializer, align 4
+
+define void @word_aligned_f64_store(double %a, double %b) nounwind {
+entry:
+; ARM: @word_aligned_f64_store
+; THUMB: @word_aligned_f64_store
+  %add = fadd double %a, %b
+  store double %add, double* getelementptr inbounds (%struct.anon.0* @foo_unpacked, i32 0, i32 0), align 4
+; ARM: vstr d16, [r0]
+; THUMB: vstr d16, [r0]
+  ret void
+}
+
+; Check unaligned loads of floats
+%class.TAlignTest = type <{ i16, float }>
+
+define zeroext i1 @unaligned_f32_load(%class.TAlignTest* %this) nounwind align 2 {
+entry:
+; ARM: @unaligned_f32_load
+; THUMB: @unaligned_f32_load
+  %0 = alloca %class.TAlignTest*, align 4
+  store %class.TAlignTest* %this, %class.TAlignTest** %0, align 4
+  %1 = load %class.TAlignTest** %0
+  %2 = getelementptr inbounds %class.TAlignTest* %1, i32 0, i32 1
+  %3 = load float* %2, align 1
+  %4 = fcmp une float %3, 0.000000e+00
+; ARM: ldr r[[R:[0-9]+]], [r0, #2]
+; ARM: vmov s0, r[[R]]
+; ARM: vcmpe.f32 s0, #0
+; THUMB: ldr.w r[[R:[0-9]+]], [r0, #2]
+; THUMB: vmov s0, r[[R]]
+; THUMB: vcmpe.f32 s0, #0
+  ret i1 %4
+}
+
+define void @unaligned_i16_store(i16 %x, i16* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+  store i16 %x, i16* %y, align 1
+  ret void
+}
+
+define i16 @unaligned_i16_load(i16* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+  %0 = load i16* %x, align 1
+  ret i16 %0
+}
+
+define void @unaligned_i32_store(i32 %x, i32* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+  store i32 %x, i32* %y, align 1
+  ret void
+}
+
+define i32 @unaligned_i32_load(i32* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+  %0 = load i32* %x, align 1
+  ret i32 %0
+}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 39ffcac2922..c4274c5eb5e 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,7 +1,5 @@
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
 
 ; Very basic fast-isel functionality.
 define i32 @add(i32 %a, i32 %b) nounwind {
@@ -163,67 +161,6 @@ define void @test4() {
 ; ARM: str r1, [r0]
 }
 
-; Check unaligned stores
-%struct.anon = type <{ float }>
-
-@a = common global %struct.anon* null, align 4
-
-define void @unaligned_store(float %x, float %y) nounwind {
-entry:
-; ARM: @unaligned_store
-; ARM: vmov r1, s0
-; ARM: str r1, [r0]
-
-; THUMB: @unaligned_store
-; THUMB: vmov r1, s0
-; THUMB: str r1, [r0]
-
-  %add = fadd float %x, %y
-  %0 = load %struct.anon** @a, align 4
-  %x1 = getelementptr inbounds %struct.anon* %0, i32 0, i32 0
-  store float %add, float* %x1, align 1
-  ret void
-}
-
-; Doublewords require only word-alignment.
-; rdar://10528060
-%struct.anon.0 = type { double }
-
-@foo_unpacked = common global %struct.anon.0 zeroinitializer, align 4
-
-define void @test5(double %a, double %b) nounwind {
-entry:
-; ARM: @test5
-; THUMB: @test5
-  %add = fadd double %a, %b
-  store double %add, double* getelementptr inbounds (%struct.anon.0* @foo_unpacked, i32 0, i32 0), align 4
-; ARM: vstr d16, [r0]
-; THUMB: vstr d16, [r0]
-  ret void
-}
-
-; Check unaligned loads of floats
-%class.TAlignTest = type <{ i16, float }>
-
-define zeroext i1 @test6(%class.TAlignTest* %this) nounwind align 2 {
-entry:
-; ARM: @test6
-; THUMB: @test6
-  %0 = alloca %class.TAlignTest*, align 4
-  store %class.TAlignTest* %this, %class.TAlignTest** %0, align 4
-  %1 = load %class.TAlignTest** %0
-  %2 = getelementptr inbounds %class.TAlignTest* %1, i32 0, i32 1
-  %3 = load float* %2, align 1
-  %4 = fcmp une float %3, 0.000000e+00
-; ARM: ldr r0, [r0, #2]
-; ARM: vmov s0, r0
-; ARM: vcmpe.f32 s0, #0
-; THUMB: ldr.w r0, [r0, #2]
-; THUMB: vmov s0, r0
-; THUMB: vcmpe.f32 s0, #0
-  ret i1 %4
-}
-
 ; ARM: @urem_fold
 ; THUMB: @urem_fold
 ; ARM: and r0, r0, #31
@@ -233,10 +170,10 @@ define i32 @urem_fold(i32 %a) nounwind {
   ret i32 %rem
 }
 
-define i32 @test7() noreturn nounwind  {
+define i32 @trap_intrinsic() noreturn nounwind  {
 entry:
-; ARM: @test7
-; THUMB: @test7
+; ARM: @trap_intrinsic
+; THUMB: @trap_intrinsic
 ; ARM: trap
 ; THUMB: trap
   tail call void @llvm.trap( )
@@ -244,67 +181,3 @@ entry:
 }
 
 declare void @llvm.trap() nounwind
-
-define void @unaligned_i16_store(i16 %x, i16* %y) nounwind {
-entry:
-; ARM-STRICT-ALIGN: @unaligned_i16_store
-; ARM-STRICT-ALIGN: strb
-; ARM-STRICT-ALIGN: strb
-
-; THUMB-STRICT-ALIGN: @unaligned_i16_store
-; THUMB-STRICT-ALIGN: strb
-; THUMB-STRICT-ALIGN: strb
-
-  store i16 %x, i16* %y, align 1
-  ret void
-}
-
-define i16 @unaligned_i16_load(i16* %x) nounwind {
-entry:
-; ARM-STRICT-ALIGN: @unaligned_i16_load
-; ARM-STRICT-ALIGN: ldrb
-; ARM-STRICT-ALIGN: ldrb
-
-; THUMB-STRICT-ALIGN: @unaligned_i16_load
-; THUMB-STRICT-ALIGN: ldrb
-; THUMB-STRICT-ALIGN: ldrb
-
-  %0 = load i16* %x, align 1
-  ret i16 %0
-}
-
-define void @unaligned_i32_store(i32 %x, i32* %y) nounwind {
-entry:
-; ARM-STRICT-ALIGN: @unaligned_i32_store
-; ARM-STRICT-ALIGN: strb
-; ARM-STRICT-ALIGN: strb
-; ARM-STRICT-ALIGN: strb
-; ARM-STRICT-ALIGN: strb
-
-; THUMB-STRICT-ALIGN: @unaligned_i32_store
-; THUMB-STRICT-ALIGN: strb
-; THUMB-STRICT-ALIGN: strb
-; THUMB-STRICT-ALIGN: strb
-; THUMB-STRICT-ALIGN: strb
-
-  store i32 %x, i32* %y, align 1
-  ret void
-}
-
-define i32 @unaligned_i32_load(i32* %x) nounwind {
-entry:
-; ARM-STRICT-ALIGN: @unaligned_i32_load
-; ARM-STRICT-ALIGN: ldrb
-; ARM-STRICT-ALIGN: ldrb
-; ARM-STRICT-ALIGN: ldrb
-; ARM-STRICT-ALIGN: ldrb
-
-; THUMB-STRICT-ALIGN: @unaligned_i32_load
-; THUMB-STRICT-ALIGN: ldrb
-; THUMB-STRICT-ALIGN: ldrb
-; THUMB-STRICT-ALIGN: ldrb
-; THUMB-STRICT-ALIGN: ldrb
-
-  %0 = load i32* %x, align 1
-  ret i32 %0
-}
-- 
2.34.1