I have a piece of code that I want to unroll by various unroll factors and then look at the resulting bitcode. To do so I'm doing the following:
1) I first compile the code using clang:
clang -O0 -S -emit-llvm trainingCode.cpp -o trainingCode.ll
2) I then run a couple of passes on the resulting bitcode (as recommended by an earlier answer on preparing IR for loop transformations):
opt -mem2reg -simplifycfg -loops -lcssa -loop-simplify -loop-rotate -inline -inline-threshold=1000000 trainingCode.ll -o trainingCode.bc > /dev/null
3) Finally, I run the unrolling pass:
opt -loop-unroll -unroll-count=2 -unroll-allow-partial trainingCode.bc -o unrolledTrainingCode.bc > /dev/null
I then repeat this with various unroll factors from 1 to 4.
For a simple piece of code like the following, this works exactly like I need it to:
#include <math.h>
int main() {
// volatile keeps the accumulation observable, so the optimizer cannot
// delete the loop body entirely
volatile float checksum = 0.0;
for (int i = 0; i < 10; i++) {
float fff = 0.112345; // double literal truncated to float on assignment
fff *= fff;
fff += 1.13;
checksum += fff/10000;
}
}
But I get really weird behavior when I increase the complexity/size of the body to this for instance:
#include <math.h>
int main() {
// volatile keeps the accumulation observable, so the optimizer cannot
// delete the loop body entirely
volatile float checksum = 0.0;
for (int i = 0; i < 10; i++) {
// Repeated sqrt/multiply/add chain: each statement depends on the previous
// one, so the body is large but a single straight dependency chain
float fff = 0.112345;
fff *= sqrt(fff) + fff;
fff += 1.13;
fff *= sqrt(fff) + fff;
fff += 17.16;
fff *= sqrt(fff) + fff;
fff += 15.13;
fff *= sqrt(fff) + fff;
fff += 21.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 91.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
checksum += (fff + i)/10000; // i is folded in so the iterations differ
}
}
For unroll factors of 1-2, everything works fine, but if I try to unroll using a factor greater than 2, LLVM completely unrolls the loop. This happens for any loop with a sufficiently large body. For instance, this is (an excerpt of) the resulting bitcode when using any unroll factor greater than 2 on the code directly above:
; ModuleID = 'unrolledtrainingCode3.bc'
source_filename = "p1HighComplexity.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse uwtable
define i32 @main() #0 {
entry:
%checksum = alloca float, align 4
store volatile float 0.000000e+00, float* %checksum, align 4
br label %for.body
for.body: ; preds = %entry
%call.i = call float @sqrtf(float 0x3FBCC2A460000000) #2
%add = fadd float %call.i, 0x3FBCC2A460000000
%mul = fmul float 0x3FBCC2A460000000, %add
%conv = fpext float %mul to double
%add1 = fadd double %conv, 1.130000e+00
%conv2 = fptrunc double %add1 to float
%call.i2 = call float #sqrtf(float %conv2) #2
%add4 = fadd float %call.i2, %conv2
%mul5 = fmul float %conv2, %add4
%conv6 = fpext float %mul5 to double
%add7 = fadd double %conv6, 1.716000e+01
%conv8 = fptrunc double %add7 to float
%call.i3 = call float #sqrtf(float %conv8) #2
%add10 = fadd float %call.i3, %conv8
%mul11 = fmul float %conv8, %add10
%conv12 = fpext float %mul11 to double
%add13 = fadd double %conv12, 1.513000e+01
%conv14 = fptrunc double %add13 to float
%call.i4 = call float #sqrtf(float %conv14) #2
%add16 = fadd float %call.i4, %conv14
%mul17 = fmul float %conv14, %add16
%conv18 = fpext float %mul17 to double
%add19 = fadd double %conv18, 2.113000e+01
%conv20 = fptrunc double %add19 to float
%call.i5 = call float #sqrtf(float %conv20) #2
%add22 = fadd float %call.i5, %conv20
%mul23 = fmul float %conv20, %add22
%conv24 = fpext float %mul23 to double
%add25 = fadd double %conv24, 0x40544851EB851EB8
%conv26 = fptrunc double %add25 to float
%call.i6 = call float #sqrtf(float %conv26) #2
%add28 = fadd float %call.i6, %conv26
%mul29 = fmul float %conv26, %add28
%conv30 = fpext float %mul29 to double
%add31 = fadd double %conv30, 1.113000e+01
%conv32 = fptrunc double %add31 to float
%call.i7 = call float #sqrtf(float %conv32) #2
%add34 = fadd float %call.i7, %conv32
%mul35 = fmul float %conv32, %add34
%conv36 = fpext float %mul35 to double
%add37 = fadd double %conv36, 0x40544851EB851EB8
%conv38 = fptrunc double %add37 to float
%call.i8 = call float #sqrtf(float %conv38) #2
%add40 = fadd float %call.i8, %conv38
%mul41 = fmul float %conv38, %add40
%conv42 = fpext float %mul41 to double
%add43 = fadd double %conv42, 1.113000e+01
%conv44 = fptrunc double %add43 to float
%call.i9 = call float #sqrtf(float %conv44) #2
%add46 = fadd float %call.i9, %conv44
%mul47 = fmul float %conv44, %add46
%conv48 = fpext float %mul47 to double
%add49 = fadd double %conv48, 0x4056C851EB851EB8
%conv50 = fptrunc double %add49 to float
%call.i10 = call float #sqrtf(float %conv50) #2
%add52 = fadd float %call.i10, %conv50
%mul53 = fmul float %conv50, %add52
%conv54 = fpext float %mul53 to double
%add55 = fadd double %conv54, 1.113000e+01
%conv56 = fptrunc double %add55 to float
%div = fdiv float %conv56, 1.000000e+04
%0 = load volatile float, float* %checksum, align 4
%add57 = fadd float %0, %div
store volatile float %add57, float* %checksum, align 4
%call.i.1 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.1 = fadd float %call.i.1, 0x3FBCC2A460000000
%mul.1 = fmul float 0x3FBCC2A460000000, %add.1
%conv.1 = fpext float %mul.1 to double
%add1.1 = fadd double %conv.1, 1.130000e+00
%conv2.1 = fptrunc double %add1.1 to float
%call.i2.1 = call float #sqrtf(float %conv2.1) #2
%add4.1 = fadd float %call.i2.1, %conv2.1
%mul5.1 = fmul float %conv2.1, %add4.1
%conv6.1 = fpext float %mul5.1 to double
%add7.1 = fadd double %conv6.1, 1.716000e+01
%conv8.1 = fptrunc double %add7.1 to float
%call.i3.1 = call float #sqrtf(float %conv8.1) #2
%add10.1 = fadd float %call.i3.1, %conv8.1
%mul11.1 = fmul float %conv8.1, %add10.1
%conv12.1 = fpext float %mul11.1 to double
%add13.1 = fadd double %conv12.1, 1.513000e+01
%conv14.1 = fptrunc double %add13.1 to float
%call.i4.1 = call float #sqrtf(float %conv14.1) #2
%add16.1 = fadd float %call.i4.1, %conv14.1
%mul17.1 = fmul float %conv14.1, %add16.1
%conv18.1 = fpext float %mul17.1 to double
%add19.1 = fadd double %conv18.1, 2.113000e+01
%conv20.1 = fptrunc double %add19.1 to float
%call.i5.1 = call float #sqrtf(float %conv20.1) #2
%add22.1 = fadd float %call.i5.1, %conv20.1
%mul23.1 = fmul float %conv20.1, %add22.1
%conv24.1 = fpext float %mul23.1 to double
%add25.1 = fadd double %conv24.1, 0x40544851EB851EB8
%conv26.1 = fptrunc double %add25.1 to float
%call.i6.1 = call float #sqrtf(float %conv26.1) #2
%add28.1 = fadd float %call.i6.1, %conv26.1
%mul29.1 = fmul float %conv26.1, %add28.1
%conv30.1 = fpext float %mul29.1 to double
%add31.1 = fadd double %conv30.1, 1.113000e+01
%conv32.1 = fptrunc double %add31.1 to float
%call.i7.1 = call float #sqrtf(float %conv32.1) #2
%add34.1 = fadd float %call.i7.1, %conv32.1
%mul35.1 = fmul float %conv32.1, %add34.1
%conv36.1 = fpext float %mul35.1 to double
%add37.1 = fadd double %conv36.1, 0x40544851EB851EB8
%conv38.1 = fptrunc double %add37.1 to float
%call.i8.1 = call float #sqrtf(float %conv38.1) #2
%add40.1 = fadd float %call.i8.1, %conv38.1
%mul41.1 = fmul float %conv38.1, %add40.1
%conv42.1 = fpext float %mul41.1 to double
%add43.1 = fadd double %conv42.1, 1.113000e+01
%conv44.1 = fptrunc double %add43.1 to float
%call.i9.1 = call float #sqrtf(float %conv44.1) #2
%add46.1 = fadd float %call.i9.1, %conv44.1
%mul47.1 = fmul float %conv44.1, %add46.1
%conv48.1 = fpext float %mul47.1 to double
%add49.1 = fadd double %conv48.1, 0x4056C851EB851EB8
%conv50.1 = fptrunc double %add49.1 to float
%call.i10.1 = call float #sqrtf(float %conv50.1) #2
%add52.1 = fadd float %call.i10.1, %conv50.1
%mul53.1 = fmul float %conv50.1, %add52.1
%conv54.1 = fpext float %mul53.1 to double
%add55.1 = fadd double %conv54.1, 1.113000e+01
%conv56.1 = fptrunc double %add55.1 to float
%div.1 = fdiv float %conv56.1, 1.000000e+04
%1 = load volatile float, float* %checksum, align 4
%add57.1 = fadd float %1, %div.1
store volatile float %add57.1, float* %checksum, align 4
%call.i.2 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.2 = fadd float %call.i.2, 0x3FBCC2A460000000
%mul.2 = fmul float 0x3FBCC2A460000000, %add.2
%conv.2 = fpext float %mul.2 to double
%add1.2 = fadd double %conv.2, 1.130000e+00
%conv2.2 = fptrunc double %add1.2 to float
%call.i2.2 = call float #sqrtf(float %conv2.2) #2
%add4.2 = fadd float %call.i2.2, %conv2.2
%mul5.2 = fmul float %conv2.2, %add4.2
%conv6.2 = fpext float %mul5.2 to double
%add7.2 = fadd double %conv6.2, 1.716000e+01
%conv8.2 = fptrunc double %add7.2 to float
%call.i3.2 = call float #sqrtf(float %conv8.2) #2
%add10.2 = fadd float %call.i3.2, %conv8.2
%mul11.2 = fmul float %conv8.2, %add10.2
%conv12.2 = fpext float %mul11.2 to double
%add13.2 = fadd double %conv12.2, 1.513000e+01
%conv14.2 = fptrunc double %add13.2 to float
%call.i4.2 = call float #sqrtf(float %conv14.2) #2
%add16.2 = fadd float %call.i4.2, %conv14.2
%mul17.2 = fmul float %conv14.2, %add16.2
%conv18.2 = fpext float %mul17.2 to double
%add19.2 = fadd double %conv18.2, 2.113000e+01
%conv20.2 = fptrunc double %add19.2 to float
%call.i5.2 = call float #sqrtf(float %conv20.2) #2
%add22.2 = fadd float %call.i5.2, %conv20.2
%mul23.2 = fmul float %conv20.2, %add22.2
%conv24.2 = fpext float %mul23.2 to double
%add25.2 = fadd double %conv24.2, 0x40544851EB851EB8
%conv26.2 = fptrunc double %add25.2 to float
%call.i6.2 = call float #sqrtf(float %conv26.2) #2
%add28.2 = fadd float %call.i6.2, %conv26.2
%mul29.2 = fmul float %conv26.2, %add28.2
%conv30.2 = fpext float %mul29.2 to double
%add31.2 = fadd double %conv30.2, 1.113000e+01
%conv32.2 = fptrunc double %add31.2 to float
%call.i7.2 = call float #sqrtf(float %conv32.2) #2
%add34.2 = fadd float %call.i7.2, %conv32.2
%mul35.2 = fmul float %conv32.2, %add34.2
%conv36.2 = fpext float %mul35.2 to double
%add37.2 = fadd double %conv36.2, 0x40544851EB851EB8
%conv38.2 = fptrunc double %add37.2 to float
%call.i8.2 = call float #sqrtf(float %conv38.2) #2
%add40.2 = fadd float %call.i8.2, %conv38.2
%mul41.2 = fmul float %conv38.2, %add40.2
%conv42.2 = fpext float %mul41.2 to double
%add43.2 = fadd double %conv42.2, 1.113000e+01
%conv44.2 = fptrunc double %add43.2 to float
%call.i9.2 = call float #sqrtf(float %conv44.2) #2
%add46.2 = fadd float %call.i9.2, %conv44.2
%mul47.2 = fmul float %conv44.2, %add46.2
%conv48.2 = fpext float %mul47.2 to double
%add49.2 = fadd double %conv48.2, 0x4056C851EB851EB8
%conv50.2 = fptrunc double %add49.2 to float
%call.i10.2 = call float #sqrtf(float %conv50.2) #2
%add52.2 = fadd float %call.i10.2, %conv50.2
%mul53.2 = fmul float %conv50.2, %add52.2
%conv54.2 = fpext float %mul53.2 to double
%add55.2 = fadd double %conv54.2, 1.113000e+01
%conv56.2 = fptrunc double %add55.2 to float
%div.2 = fdiv float %conv56.2, 1.000000e+04
%2 = load volatile float, float* %checksum, align 4
%add57.2 = fadd float %2, %div.2
store volatile float %add57.2, float* %checksum, align 4
%call.i.3 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.3 = fadd float %call.i.3, 0x3FBCC2A460000000
%mul.3 = fmul float 0x3FBCC2A460000000, %add.3
%conv.3 = fpext float %mul.3 to double
%add1.3 = fadd double %conv.3, 1.130000e+00
%conv2.3 = fptrunc double %add1.3 to float
%call.i2.3 = call float #sqrtf(float %conv2.3) #2
%add4.3 = fadd float %call.i2.3, %conv2.3
%mul5.3 = fmul float %conv2.3, %add4.3
%conv6.3 = fpext float %mul5.3 to double
%add7.3 = fadd double %conv6.3, 1.716000e+01
%conv8.3 = fptrunc double %add7.3 to float
%call.i3.3 = call float #sqrtf(float %conv8.3) #2
%add10.3 = fadd float %call.i3.3, %conv8.3
%mul11.3 = fmul float %conv8.3, %add10.3
%conv12.3 = fpext float %mul11.3 to double
%add13.3 = fadd double %conv12.3, 1.513000e+01
%conv14.3 = fptrunc double %add13.3 to float
%call.i4.3 = call float #sqrtf(float %conv14.3) #2
%add16.3 = fadd float %call.i4.3, %conv14.3
%mul17.3 = fmul float %conv14.3, %add16.3
%conv18.3 = fpext float %mul17.3 to double
%add19.3 = fadd double %conv18.3, 2.113000e+01
%conv20.3 = fptrunc double %add19.3 to float
%call.i5.3 = call float #sqrtf(float %conv20.3) #2
%add22.3 = fadd float %call.i5.3, %conv20.3
%mul23.3 = fmul float %conv20.3, %add22.3
%conv24.3 = fpext float %mul23.3 to double
%add25.3 = fadd double %conv24.3, 0x40544851EB851EB8
%conv26.3 = fptrunc double %add25.3 to float
%call.i6.3 = call float #sqrtf(float %conv26.3) #2
%add28.3 = fadd float %call.i6.3, %conv26.3
%mul29.3 = fmul float %conv26.3, %add28.3
%conv30.3 = fpext float %mul29.3 to double
%add31.3 = fadd double %conv30.3, 1.113000e+01
%conv32.3 = fptrunc double %add31.3 to float
%call.i7.3 = call float #sqrtf(float %conv32.3) #2
%add34.3 = fadd float %call.i7.3, %conv32.3
%mul35.3 = fmul float %conv32.3, %add34.3
%conv36.3 = fpext float %mul35.3 to double
%add37.3 = fadd double %conv36.3, 0x40544851EB851EB8
%conv38.3 = fptrunc double %add37.3 to float
%call.i8.3 = call float #sqrtf(float %conv38.3) #2
%add40.3 = fadd float %call.i8.3, %conv38.3
%mul41.3 = fmul float %conv38.3, %add40.3
%conv42.3 = fpext float %mul41.3 to double
%add43.3 = fadd double %conv42.3, 1.113000e+01
%conv44.3 = fptrunc double %add43.3 to float
%call.i9.3 = call float #sqrtf(float %conv44.3) #2
%add46.3 = fadd float %call.i9.3, %conv44.3
%mul47.3 = fmul float %conv44.3, %add46.3
%conv48.3 = fpext float %mul47.3 to double
%add49.3 = fadd double %conv48.3, 0x4056C851EB851EB8
%conv50.3 = fptrunc double %add49.3 to float
%call.i10.3 = call float #sqrtf(float %conv50.3) #2
%add52.3 = fadd float %call.i10.3, %conv50.3
%mul53.3 = fmul float %conv50.3, %add52.3
%conv54.3 = fpext float %mul53.3 to double
%add55.3 = fadd double %conv54.3, 1.113000e+01
%conv56.3 = fptrunc double %add55.3 to float
%div.3 = fdiv float %conv56.3, 1.000000e+04
%3 = load volatile float, float* %checksum, align 4
%add57.3 = fadd float %3, %div.3
store volatile float %add57.3, float* %checksum, align 4
%call.i.4 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.4 = fadd float %call.i.4, 0x3FBCC2A460000000
%mul.4 = fmul float 0x3FBCC2A460000000, %add.4
%conv.4 = fpext float %mul.4 to double
%add1.4 = fadd double %conv.4, 1.130000e+00
%conv2.4 = fptrunc double %add1.4 to float
%call.i2.4 = call float #sqrtf(float %conv2.4) #2
%add4.4 = fadd float %call.i2.4, %conv2.4
%mul5.4 = fmul float %conv2.4, %add4.4
%conv6.4 = fpext float %mul5.4 to double
%add7.4 = fadd double %conv6.4, 1.716000e+01
%conv8.4 = fptrunc double %add7.4 to float
%call.i3.4 = call float #sqrtf(float %conv8.4) #2
%add10.4 = fadd float %call.i3.4, %conv8.4
%mul11.4 = fmul float %conv8.4, %add10.4
%conv12.4 = fpext float %mul11.4 to double
%add13.4 = fadd double %conv12.4, 1.513000e+01
%conv14.4 = fptrunc double %add13.4 to float
%call.i4.4 = call float #sqrtf(float %conv14.4) #2
%add16.4 = fadd float %call.i4.4, %conv14.4
%mul17.4 = fmul float %conv14.4, %add16.4
%conv18.4 = fpext float %mul17.4 to double
%add19.4 = fadd double %conv18.4, 2.113000e+01
%conv20.4 = fptrunc double %add19.4 to float
%call.i5.4 = call float #sqrtf(float %conv20.4) #2
%add22.4 = fadd float %call.i5.4, %conv20.4
%mul23.4 = fmul float %conv20.4, %add22.4
%conv24.4 = fpext float %mul23.4 to double
%add25.4 = fadd double %conv24.4, 0x40544851EB851EB8
%conv26.4 = fptrunc double %add25.4 to float
%call.i6.4 = call float #sqrtf(float %conv26.4) #2
%add28.4 = fadd float %call.i6.4, %conv26.4
%mul29.4 = fmul float %conv26.4, %add28.4
%conv30.4 = fpext float %mul29.4 to double
%add31.4 = fadd double %conv30.4, 1.113000e+01
%conv32.4 = fptrunc double %add31.4 to float
%call.i7.4 = call float #sqrtf(float %conv32.4) #2
%add34.4 = fadd float %call.i7.4, %conv32.4
%mul35.4 = fmul float %conv32.4, %add34.4
%conv36.4 = fpext float %mul35.4 to double
%add37.4 = fadd double %conv36.4, 0x40544851EB851EB8
%conv38.4 = fptrunc double %add37.4 to float
%call.i8.4 = call float #sqrtf(float %conv38.4) #2
%add40.4 = fadd float %call.i8.4, %conv38.4
%mul41.4 = fmul float %conv38.4, %add40.4
%conv42.4 = fpext float %mul41.4 to double
%add43.4 = fadd double %conv42.4, 1.113000e+01
%conv44.4 = fptrunc double %add43.4 to float
%call.i9.4 = call float #sqrtf(float %conv44.4) #2
%add46.4 = fadd float %call.i9.4, %conv44.4
%mul47.4 = fmul float %conv44.4, %add46.4
%conv48.4 = fpext float %mul47.4 to double
%add49.4 = fadd double %conv48.4, 0x4056C851EB851EB8
%conv50.4 = fptrunc double %add49.4 to float
%call.i10.4 = call float #sqrtf(float %conv50.4) #2
%add52.4 = fadd float %call.i10.4, %conv50.4
%mul53.4 = fmul float %conv50.4, %add52.4
%conv54.4 = fpext float %mul53.4 to double
%add55.4 = fadd double %conv54.4, 1.113000e+01
%conv56.4 = fptrunc double %add55.4 to float
%div.4 = fdiv float %conv56.4, 1.000000e+04
%4 = load volatile float, float* %checksum, align 4
%add57.4 = fadd float %4, %div.4
store volatile float %add57.4, float* %checksum, align 4
%call.i.5 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.5 = fadd float %call.i.5, 0x3FBCC2A460000000
%mul.5 = fmul float 0x3FBCC2A460000000, %add.5
%conv.5 = fpext float %mul.5 to double
%add1.5 = fadd double %conv.5, 1.130000e+00
%conv2.5 = fptrunc double %add1.5 to float
%call.i2.5 = call float #sqrtf(float %conv2.5) #2
%add4.5 = fadd float %call.i2.5, %conv2.5
%mul5.5 = fmul float %conv2.5, %add4.5
%conv6.5 = fpext float %mul5.5 to double
%add7.5 = fadd double %conv6.5, 1.716000e+01
%conv8.5 = fptrunc double %add7.5 to float
%call.i3.5 = call float #sqrtf(float %conv8.5) #2
%add10.5 = fadd float %call.i3.5, %conv8.5
%mul11.5 = fmul float %conv8.5, %add10.5
%conv12.5 = fpext float %mul11.5 to double
%add13.5 = fadd double %conv12.5, 1.513000e+01
%conv14.5 = fptrunc double %add13.5 to float
%call.i4.5 = call float #sqrtf(float %conv14.5) #2
%add16.5 = fadd float %call.i4.5, %conv14.5
%mul17.5 = fmul float %conv14.5, %add16.5
%conv18.5 = fpext float %mul17.5 to double
%add19.5 = fadd double %conv18.5, 2.113000e+01
%conv20.5 = fptrunc double %add19.5 to float
%call.i5.5 = call float #sqrtf(float %conv20.5) #2
%add22.5 = fadd float %call.i5.5, %conv20.5
%mul23.5 = fmul float %conv20.5, %add22.5
%conv24.5 = fpext float %mul23.5 to double
%add25.5 = fadd double %conv24.5, 0x40544851EB851EB8
%conv26.5 = fptrunc double %add25.5 to float
%call.i6.5 = call float #sqrtf(float %conv26.5) #2
%add28.5 = fadd float %call.i6.5, %conv26.5
%mul29.5 = fmul float %conv26.5, %add28.5
%conv30.5 = fpext float %mul29.5 to double
%add31.5 = fadd double %conv30.5, 1.113000e+01
%conv32.5 = fptrunc double %add31.5 to float
%call.i7.5 = call float #sqrtf(float %conv32.5) #2
%add34.5 = fadd float %call.i7.5, %conv32.5
%mul35.5 = fmul float %conv32.5, %add34.5
%conv36.5 = fpext float %mul35.5 to double
%add37.5 = fadd double %conv36.5, 0x40544851EB851EB8
%conv38.5 = fptrunc double %add37.5 to float
%call.i8.5 = call float #sqrtf(float %conv38.5) #2
%add40.5 = fadd float %call.i8.5, %conv38.5
%mul41.5 = fmul float %conv38.5, %add40.5
%conv42.5 = fpext float %mul41.5 to double
%add43.5 = fadd double %conv42.5, 1.113000e+01
%conv44.5 = fptrunc double %add43.5 to float
%call.i9.5 = call float #sqrtf(float %conv44.5) #2
%add46.5 = fadd float %call.i9.5, %conv44.5
%mul47.5 = fmul float %conv44.5, %add46.5
%conv48.5 = fpext float %mul47.5 to double
%add49.5 = fadd double %conv48.5, 0x4056C851EB851EB8
%conv50.5 = fptrunc double %add49.5 to float
%call.i10.5 = call float #sqrtf(float %conv50.5) #2
%add52.5 = fadd float %call.i10.5, %conv50.5
%mul53.5 = fmul float %conv50.5, %add52.5
%conv54.5 = fpext float %mul53.5 to double
%add55.5 = fadd double %conv54.5, 1.113000e+01
%conv56.5 = fptrunc double %add55.5 to float
%div.5 = fdiv float %conv56.5, 1.000000e+04
%5 = load volatile float, float* %checksum, align 4
%add57.5 = fadd float %5, %div.5
store volatile float %add57.5, float* %checksum, align 4
%call.i.6 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.6 = fadd float %call.i.6, 0x3FBCC2A460000000
%mul.6 = fmul float 0x3FBCC2A460000000, %add.6
%conv.6 = fpext float %mul.6 to double
%add1.6 = fadd double %conv.6, 1.130000e+00
%conv2.6 = fptrunc double %add1.6 to float
%call.i2.6 = call float #sqrtf(float %conv2.6) #2
%add4.6 = fadd float %call.i2.6, %conv2.6
%mul5.6 = fmul float %conv2.6, %add4.6
%conv6.6 = fpext float %mul5.6 to double
%add7.6 = fadd double %conv6.6, 1.716000e+01
%conv8.6 = fptrunc double %add7.6 to float
%call.i3.6 = call float #sqrtf(float %conv8.6) #2
%add10.6 = fadd float %call.i3.6, %conv8.6
%mul11.6 = fmul float %conv8.6, %add10.6
%conv12.6 = fpext float %mul11.6 to double
%add13.6 = fadd double %conv12.6, 1.513000e+01
%conv14.6 = fptrunc double %add13.6 to float
%call.i4.6 = call float #sqrtf(float %conv14.6) #2
%add16.6 = fadd float %call.i4.6, %conv14.6
%mul17.6 = fmul float %conv14.6, %add16.6
%conv18.6 = fpext float %mul17.6 to double
%add19.6 = fadd double %conv18.6, 2.113000e+01
%conv20.6 = fptrunc double %add19.6 to float
%call.i5.6 = call float #sqrtf(float %conv20.6) #2
%add22.6 = fadd float %call.i5.6, %conv20.6
%mul23.6 = fmul float %conv20.6, %add22.6
%conv24.6 = fpext float %mul23.6 to double
%add25.6 = fadd double %conv24.6, 0x40544851EB851EB8
%conv26.6 = fptrunc double %add25.6 to float
%call.i6.6 = call float #sqrtf(float %conv26.6) #2
%add28.6 = fadd float %call.i6.6, %conv26.6
%mul29.6 = fmul float %conv26.6, %add28.6
%conv30.6 = fpext float %mul29.6 to double
%add31.6 = fadd double %conv30.6, 1.113000e+01
%conv32.6 = fptrunc double %add31.6 to float
%call.i7.6 = call float #sqrtf(float %conv32.6) #2
%add34.6 = fadd float %call.i7.6, %conv32.6
%mul35.6 = fmul float %conv32.6, %add34.6
%conv36.6 = fpext float %mul35.6 to double
%add37.6 = fadd double %conv36.6, 0x40544851EB851EB8
%conv38.6 = fptrunc double %add37.6 to float
%call.i8.6 = call float #sqrtf(float %conv38.6) #2
%add40.6 = fadd float %call.i8.6, %conv38.6
%mul41.6 = fmul float %conv38.6, %add40.6
%conv42.6 = fpext float %mul41.6 to double
%add43.6 = fadd double %conv42.6, 1.113000e+01
%conv44.6 = fptrunc double %add43.6 to float
%call.i9.6 = call float #sqrtf(float %conv44.6) #2
%add46.6 = fadd float %call.i9.6, %conv44.6
%mul47.6 = fmul float %conv44.6, %add46.6
%conv48.6 = fpext float %mul47.6 to double
%add49.6 = fadd double %conv48.6, 0x4056C851EB851EB8
%conv50.6 = fptrunc double %add49.6 to float
%call.i10.6 = call float #sqrtf(float %conv50.6) #2
%add52.6 = fadd float %call.i10.6, %conv50.6
%mul53.6 = fmul float %conv50.6, %add52.6
%conv54.6 = fpext float %mul53.6 to double
%add55.6 = fadd double %conv54.6, 1.113000e+01
%conv56.6 = fptrunc double %add55.6 to float
%div.6 = fdiv float %conv56.6, 1.000000e+04
%6 = load volatile float, float* %checksum, align 4
%add57.6 = fadd float %6, %div.6
As you can see, the loop was fully unrolled, despite me specifying an unroll factor of 3. I was hoping someone would have some insight into why this was happening...
LLVM has different heuristics for determining whether to unroll and whether to unroll fully, because unrolling a loop fully is often more beneficial than unrolling it partially, particularly if the trip count is fixed and known at compile time, because then all the checks and branches can be eliminated.
My quick search only turned up the source code and not the documentation,
but I think full unrolling is limited by a different setting: -unroll-full-max-count.
Basically the problem is related to x86 assembler, where you have a number that you want to set to either zero or the number itself using an AND. If you AND that number with negative one (all bits set), you get back the number itself, but if you AND it with zero, you get zero.
Now the problem I'm having with SSE intrinsics is that floats aren't the same in binary as doubles (or maybe I'm mistaken). Anyway, here's the code; I've tried using all kinds of floats to mask the second and third numbers (127.0f and 99.0f respectively), but no luck.
#include <xmmintrin.h>
#include <stdio.h>
/* Print the four float lanes of an SSE vector, highest lane first.
 * (Despite the name, it prints four floats, not bits.)
 *
 * label: prefix string for the output line.
 * var:   the __m128 whose lanes are printed as val[3]..val[0].
 *
 * The vector is extracted with _mm_storeu_ps into a plain float array
 * instead of casting &var to float*; the cast form type-puns through an
 * incompatible pointer and relies on compiler-specific aliasing leniency,
 * while the store intrinsic is the documented, portable way out of a
 * vector register.
 */
void print_4_bit_num(const char * label, __m128 var)
{
float val[4];
_mm_storeu_ps(val, var); /* unaligned store: val need not be 16-byte aligned */
printf("%s: %f %f %f %f\n",
label, val[3], val[2], val[1], val[0]);
}
int main()
{
__m128 v1 = _mm_set_ps(1.0f, 127.0f, 99.0f, 1.0f);
__m128 v2 = _mm_set_ps(1.0f, 65535.0f, 127.0f, 0.0f);
// _mm_and_ps ANDs the raw IEEE-754 bit patterns of each lane, not the
// numeric values, so and-ing two arbitrary floats generally yields a
// meaningless float rather than "the number or zero"
__m128 v = _mm_and_ps(v1, v2);
print_4_bit_num("v1", v1);
print_4_bit_num("v2", v2);
print_4_bit_num("v ", v);
return 0;
}
You need to use a bitwise (integer) mask when you AND, so to e.g. clear alternate values in a vector you might do something like this:
__m128 v1 = _mm_set_ps(1.0f, 127.0f, 99.0f, 1.0f);
// Build the mask as integers: -1 is all bits set (keeps the lane),
// 0 clears the lane; the cast only reinterprets bits, it emits no code
__m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
__m128 v = _mm_and_ps(v1, v2); // => v = { 0.0f, 127.0f, 0.0f, 1.0f }
You can cast any SSE vector to any SSE vector type of the same size (128 bit, or 256 bit), and you will get the exact same bits as before; there won't be any actual code. Obviously if you cast 4 float to 2 double you get nonsense, but for your case you cast float to some integer type, do the and, cast the result back.
If you have SSE4.1 (which I bet you do), you should consider _mm_blendv_ps(a, b, mask). This only uses the sign bit of each lane of its mask argument and essentially implements the vectorised select `mask < 0 ? b : a`.