let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
}
+// Pattern fragment matching a plain `load` whose LoadSDNode reports an
+// alignment of 8 or more (unit presumably bytes, i.e. doubleword-aligned or
+// better -- NOTE(review): confirm against LoadSDNode::getAlignment()).
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+// Pattern fragment matching a plain `store` whose StoreSDNode reports an
+// alignment of 8 or more (unit presumably bytes, i.e. doubleword-aligned or
+// better -- NOTE(review): confirm against StoreSDNode::getAlignment()).
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+// Pattern fragment matching a plain `load` whose reported alignment is
+// exactly 4. The test is an equality, so loads with a larger alignment do not
+// match this fragment (dword_alignedload uses `>= 8` for those).
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+// Pattern fragment matching a plain `store` whose reported alignment is
+// exactly 4. The test is an equality, so stores with a larger alignment do
+// not match this fragment (dword_alignedstore uses `>= 8` for those).
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() == 2;
}]>;
def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
(VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+// Use vld1/vst1 for Q and QQ. Also use them for under-aligned v2f64
+// loads and stores when that is legal. Each pattern below selects the
+// VLD1q*/VST1q* variant that corresponds to the alignment fragment matched.
+
+// Alignment >= 8 (dword_aligned*): VLD1q64 / VST1q64.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+ (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q64 addrmode6:$addr, QPR:$value)>;
+// Alignment == 4 (word_aligned*): VLD1q32 / VST1q32.
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+ (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q32 addrmode6:$addr, QPR:$value)>;
+// Alignment == 2 (hword_aligned*): VLD1q16 / VST1q16, little-endian only.
+// NOTE(review): hword_alignedstore is defined outside this view.
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+ (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+// byte_aligned* (defined outside this view; presumably alignment 1):
+// VLD1q8 / VST1q8, little-endian only.
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+ (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
//===----------------------------------------------------------------------===//
// NEON pattern fragments
//===----------------------------------------------------------------------===//
; CHECK: movw r1, :lower16:{{.*}}
; CHECK: movt r1, :upper16:{{.*}}
-; CHECK: vldmia r1
+; CHECK: vld1.64 {{.*}}, [r1, :128]
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64 {{.*}}
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia [[reg0]], {{.*}}
+; CHECK: vld1.64 {{.*}}, :128
; CHECK: vmul.f32 {{.*}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}floorf
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}floorf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
--- /dev/null
+;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s
+
+; Copies a <8 x i8> vector (64 bits) from %in to %out with both accesses marked
+; align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.8
+ %v1 = load <8 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i8> %v1, <8 x i8>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <4 x i16> vector (64 bits) from %in to %out with both accesses marked
+; align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.8
+ %v1 = load <4 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i16> %v1, <4 x i16>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <2 x i32> vector (64 bits) from %in to %out with both accesses marked
+; align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.8
+ %v1 = load <2 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i32> %v1, <2 x i32>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <2 x float> vector (64 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.8
+ %v1 = load <2 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x float> %v1, <2 x float>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <16 x i8> vector (128 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.8
+ %v1 = load <16 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <16 x i8> %v1, <16 x i8>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <8 x i16> vector (128 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.8
+ %v1 = load <8 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i16> %v1, <8 x i16>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <4 x i32> vector (128 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.8
+ %v1 = load <4 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i32> %v1, <4 x i32>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <2 x i64> vector (128 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.8
+ %v1 = load <2 x i64>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i64> %v1, <2 x i64>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <4 x float> vector (128 bits) from %in to %out with both accesses
+; marked align 1; FileCheck expects vld1.8 for the load and vst1.8 for the store.
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.8
+ %v1 = load <4 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x float> %v1, <4 x float>* %vo, align 1
+ ret void
+}
+
+
+; Copies a <8 x i8> vector (64 bits) from %in to %out with both accesses marked
+; align 2; FileCheck expects vld1.16 for the load and vst1.16 for the store.
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.16
+ %v1 = load <8 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i8> %v1, <8 x i8>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <4 x i16> vector (64 bits) from %in to %out with both accesses marked
+; align 2; FileCheck expects vld1.16 for the load and vst1.16 for the store.
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.16
+ %v1 = load <4 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i16> %v1, <4 x i16>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <2 x i32> vector (64 bits) from %in to %out with both accesses marked
+; align 2; FileCheck expects vld1.16 for the load and vst1.16 for the store.
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.16
+ %v1 = load <2 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i32> %v1, <2 x i32>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <2 x float> vector (64 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.16
+ %v1 = load <2 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x float> %v1, <2 x float>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <16 x i8> vector (128 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.16
+ %v1 = load <16 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <16 x i8> %v1, <16 x i8>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <8 x i16> vector (128 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.16
+ %v1 = load <8 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i16> %v1, <8 x i16>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <4 x i32> vector (128 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.16
+ %v1 = load <4 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i32> %v1, <4 x i32>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <2 x i64> vector (128 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.16
+ %v1 = load <2 x i64>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i64> %v1, <2 x i64>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <4 x float> vector (128 bits) from %in to %out with both accesses
+; marked align 2; FileCheck expects vld1.16 for the load and vst1.16 for the
+; store.
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.16
+ %v1 = load <4 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x float> %v1, <4 x float>* %vo, align 2
+ ret void
+}
+
+
+; Copies a <8 x i8> vector (64 bits) from %in to %out with both accesses marked
+; align 4; FileCheck expects vldr for the load and vstr for the store (not the
+; vld1/vst1 forms expected for the 64-bit align 1 and align 2 cases).
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vldr
+ %v1 = load <8 x i8>* %vi, align 4
+;CHECK: vstr
+ store <8 x i8> %v1, <8 x i8>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <4 x i16> vector (64 bits) from %in to %out with both accesses marked
+; align 4; FileCheck expects vldr for the load and vstr for the store (not the
+; vld1/vst1 forms expected for the 64-bit align 1 and align 2 cases).
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vldr
+ %v1 = load <4 x i16>* %vi, align 4
+;CHECK: vstr
+ store <4 x i16> %v1, <4 x i16>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <2 x i32> vector (64 bits) from %in to %out with both accesses marked
+; align 4; FileCheck expects vldr for the load and vstr for the store (not the
+; vld1/vst1 forms expected for the 64-bit align 1 and align 2 cases).
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vldr
+ %v1 = load <2 x i32>* %vi, align 4
+;CHECK: vstr
+ store <2 x i32> %v1, <2 x i32>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <2 x float> vector (64 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vldr for the load and vstr for the store
+; (not the vld1/vst1 forms expected for the 64-bit align 1 and align 2 cases).
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vldr
+ %v1 = load <2 x float>* %vi, align 4
+;CHECK: vstr
+ store <2 x float> %v1, <2 x float>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <16 x i8> vector (128 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vld1.32 for the load and vst1.32 for the
+; store.
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.32
+ %v1 = load <16 x i8>* %vi, align 4
+;CHECK: vst1.32
+ store <16 x i8> %v1, <16 x i8>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <8 x i16> vector (128 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vld1.32 for the load and vst1.32 for the
+; store.
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.32
+ %v1 = load <8 x i16>* %vi, align 4
+;CHECK: vst1.32
+ store <8 x i16> %v1, <8 x i16>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <4 x i32> vector (128 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vld1.32 for the load and vst1.32 for the
+; store.
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.32
+ %v1 = load <4 x i32>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x i32> %v1, <4 x i32>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <2 x i64> vector (128 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vld1.32 for the load and vst1.32 for the
+; store.
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.32
+ %v1 = load <2 x i64>* %vi, align 4
+;CHECK: vst1.32
+ store <2 x i64> %v1, <2 x i64>* %vo, align 4
+ ret void
+}
+
+
+; Copies a <4 x float> vector (128 bits) from %in to %out with both accesses
+; marked align 4; FileCheck expects vld1.32 for the load and vst1.32 for the
+; store.
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.32
+ %v1 = load <4 x float>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x float> %v1, <4 x float>* %vo, align 4
+ ret void
+}
+