[AArch64] Add additional extract-extend patterns for smov

[oota-llvm.git] / test / CodeGen / NVPTX / access-non-generic.ll
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll

index c225abf0fd85a5fa10ae5393c70ceb94bf603409..c1327274a9cf6711933bd790a3328ab92b386010 100644 (file)
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -18,7 +18,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
  ; IR-NOT: addrspacecast
  ; PTX-LABEL: ld_st_shared_f32(
    ; load cast
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
  ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
    ; store cast
    store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
@@ -29,7 +29,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
  
    ; cast; load
    %2 = addrspacecast float addrspace(3)* @scalar to float*
-  %3 = load float* %2, align 4
+  %3 = load float, float* %2, align 4
  ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
    ; cast; store
    store float %v, float* %2, align 4
@@ -38,17 +38,17 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
  ; PTX: bar.sync 0;
  
    ; load gep cast
-  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  %4 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
  ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
    ; store gep cast
-  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
  ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
    call void @llvm.cuda.syncthreads()
  ; PTX: bar.sync 0;
  
    ; gep cast; load
-  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
-  %6 = load float* %5, align 4
+  %5 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+  %6 = load float, float* %5, align 4
  ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
    ; gep cast; store
    store float %v, float* %5, align 4
@@ -58,8 +58,8 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
  
    ; cast; gep; load
    %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
-  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
-  %9 = load float* %8, align 4
+  %8 = getelementptr inbounds [10 x float], [10 x float]* %7, i32 0, i32 %i
+  %9 = load float, float* %8, align 4
  ; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
    ; cast; gep; store
    store float %v, float* %8, align 4
@@ -78,13 +78,51 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
  ; addrspacecast with a bitcast.
  define i32 @ld_int_from_float() {
  ; IR-LABEL: @ld_int_from_float
-; IR: load i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
+; IR: load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
  ; PTX-LABEL: ld_int_from_float(
  ; PTX: ld.shared.u{{(32|64)}}
-  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  %1 = load i32, i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4 ; i32 load through a cast of the float global @scalar; the checks above expect a bitcast that stays in addrspace(3) and an ld.shared
    ret i32 %1
  }
  
+define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) { ; chain of cast, gep, gep, bitcast and load rooted at an addrspace(1) argument
+; IR-LABEL: @ld_int_from_global_float(
+; PTX-LABEL: ld_int_from_global_float(
+  %1 = addrspacecast float addrspace(1)* %input to float* ; the checks that follow expect every later step to use addrspace(1) directly
+  %2 = getelementptr float, float* %1, i32 %i
+; IR-NEXT: getelementptr float, float addrspace(1)* %input, i32 %i
+  %3 = getelementptr float, float* %2, i32 %j
+; IR-NEXT: getelementptr float, float addrspace(1)* {{%[^,]+}}, i32 %j
+  %4 = bitcast float* %3 to i32*
+; IR-NEXT: bitcast float addrspace(1)* {{%[^ ]+}} to i32 addrspace(1)*
+  %5 = load i32, i32* %4 ; the instruction check below is a plain match; the function label above already delimits this block
+; IR-NEXT: load i32, i32 addrspace(1)* {{%.+}}
+; PTX: ld.global
+  ret i32 %5
+}
+
+define void @nested_const_expr() {
+; PTX-LABEL: nested_const_expr(
+  ; store 1 to bitcast(gep(addrspacecast(array), 0, 1)); the register is captured by pattern so the test does not depend on register numbering
+  store i32 1, i32* bitcast (float* getelementptr ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i64 0, i64 1) to i32*), align 4
+; PTX: mov.u32 [[REG:%r[0-9]+]], 1;
+; PTX-NEXT: st.shared.u32 [array+4], [[REG]];
+  ret void
+}
+
+define void @rauw(float addrspace(1)* %input) { ; the checks after the body expect gep, load and store on addrspace(1) with no cast left in between; name presumably refers to replace-all-uses-with (TODO confirm)
+  %generic_input = addrspacecast float addrspace(1)* %input to float* ; generic-pointer view of the addrspace(1) argument
+  %addr = getelementptr float, float* %generic_input, i64 10 ; single address used by both the load and the store below
+  %v = load float, float* %addr
+  store float %v, float* %addr
+  ret void
+; IR-LABEL: @rauw(
+; IR-NEXT: %1 = getelementptr float, float addrspace(1)* %input, i64 10
+; IR-NEXT: %v = load float, float addrspace(1)* %1
+; IR-NEXT: store float %v, float addrspace(1)* %1
+; IR-NEXT: ret void
+}
+
  declare void @llvm.cuda.syncthreads() #3
  
  attributes #3 = { noduplicate nounwind }