test/Transforms/SLPVectorizer/AArch64/commute.ll

   1 ; RUN: opt -S -slp-vectorizer %s | FileCheck %s
     ; Both functions below contain two scalar fsub/fmul chains over the two
     ; adjacent floats of %structA whose products feed a single fadd. The
     ; expected output has those chains rewritten as <2 x float> operations.
     ; @test1 and @test2 differ only in the operand order of that final fadd.
   2 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   3 target triple = "aarch64--linux-gnu"
   4
   5 %structA = type { [2 x float] }
   6
   7 define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
     ; Baseline case: the fadd takes the element-0 product first and the
     ; element-1 product second. The expected output loads both floats with one
     ; <2 x float> load, performs one vector fsub and one vector fmul, then
     ; extracts lanes 0 and 1 and adds them in that order.
   8 ; CHECK-LABEL: test1
   9 ; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  10 ; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  11 ; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
  12 ; CHECK: %4 = load <2 x float>* %3, align 4
  13 ; CHECK: %5 = fsub fast <2 x float> %2, %4
  14 ; CHECK: %6 = fmul fast <2 x float> %5, %5
  15 ; CHECK: %7 = extractelement <2 x float> %6, i32 0
  16 ; CHECK: %8 = extractelement <2 x float> %6, i32 1
  17 ; CHECK: %add = fadd fast float %7, %8
  18 ; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00
  19
  20 entry:
  21   br label %for.body3.lr.ph
  22
  23 for.body3.lr.ph:
       ; Scalar inputs: %conv (from %xmin) pairs with array element 0,
       ; %conv5 (from %ymin) pairs with array element 1.
  24   %conv5 = sitofp i32 %ymin to float
  25   %conv = sitofp i32 %xmin to float
       ; Element 0 chain: %sub = %conv - J.field0[0]
  26   %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  27   %0 = load float* %arrayidx4, align 4
  28   %sub = fsub fast float %conv, %0
       ; Element 1 chain: %sub10 = %conv5 - J.field0[1]
  29   %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  30   %1 = load float* %arrayidx9, align 4
  31   %sub10 = fsub fast float %conv5, %1
       ; Square each difference, then sum the squares (element 0 first).
  32   %mul11 = fmul fast float %sub, %sub
  33   %mul12 = fmul fast float %sub10, %sub10
  34   %add = fadd fast float %mul11, %mul12
       ; Branch back to this block when the sum compares equal to 0.0,
       ; otherwise fall out to the return block.
  35   %cmp = fcmp oeq float %add, 0.000000e+00
  36   br i1 %cmp, label %for.body3.lr.ph, label %for.end27
  37
  38 for.end27:
  39   ret void
  40 }
  41
  42 define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
     ; Same computation as @test1 except that the final fadd lists the
     ; element-1 product before the element-0 product. The expected output still
     ; uses one <2 x float> load, fsub and fmul; only the order of the extracted
     ; lanes in the fadd is swapped (%8, %7).
  43 ; CHECK-LABEL: test2
  44 ; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  45 ; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  46 ; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
  47 ; CHECK: %4 = load <2 x float>* %3, align 4
  48 ; CHECK: %5 = fsub fast <2 x float> %2, %4
  49 ; CHECK: %6 = fmul fast <2 x float> %5, %5
  50 ; CHECK: %7 = extractelement <2 x float> %6, i32 0
  51 ; CHECK: %8 = extractelement <2 x float> %6, i32 1
  52 ; CHECK: %add = fadd fast float %8, %7
  53 ; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00
  54
  55 entry:
  56   br label %for.body3.lr.ph
  57
  58 for.body3.lr.ph:
       ; Scalar inputs: %conv (from %xmin) pairs with array element 0,
       ; %conv5 (from %ymin) pairs with array element 1.
  59   %conv5 = sitofp i32 %ymin to float
  60   %conv = sitofp i32 %xmin to float
       ; Element 0 chain: %sub = %conv - J.field0[0]
  61   %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  62   %0 = load float* %arrayidx4, align 4
  63   %sub = fsub fast float %conv, %0
       ; Element 1 chain: %sub10 = %conv5 - J.field0[1]
  64   %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  65   %1 = load float* %arrayidx9, align 4
  66   %sub10 = fsub fast float %conv5, %1
       ; Square each difference, then sum the squares (element 1 first here).
  67   %mul11 = fmul fast float %sub, %sub
  68   %mul12 = fmul fast float %sub10, %sub10
  69   %add = fadd fast float %mul12, %mul11         ; operands commuted relative to @test1
       ; Branch back to this block when the sum compares equal to 0.0,
       ; otherwise fall out to the return block.
  70   %cmp = fcmp oeq float %add, 0.000000e+00
  71   br i1 %cmp, label %for.body3.lr.ph, label %for.end27
  72
  73 for.end27:
  74   ret void
  75 }