Why is LLVM completely unrolling my loops? - c++

I have a piece of code that I want to unroll by various unroll factors and then look at the resulting bitcode. To do so I'm doing the following:
1) I first compile the code using clang:
clang -O0 -S -emit-llvm trainingCode.cpp -o trainingCode.ll
2) I then run a couple of passes on the resulting bitcode (as recommended by this):
opt -mem2reg -simplifycfg -loops -lcssa -loop-simplify -loop-rotate -inline -inline-threshold=1000000 trainingCode.ll -o trainingCode.bc > /dev/null
3) Finally, I run the unrolling pass:
opt -loop-unroll -unroll-count=2 -unroll-allow-partial trainingCode.bc -o unrolledTrainingCode.bc > /dev/null
I then repeat this with various unroll factors from 1 to 4.
For a simple piece of code like the following, this works exactly like I need it to:
#include <math.h>
// Minimal unrolling test case: a loop with a constant trip count and a tiny
// floating-point body.
int main() {
// volatile: every load and store of checksum must actually be performed, so
// the accumulation below is kept by the optimizer.
volatile float checksum = 0.0;
// The trip count (10) is a compile-time constant.
for (int i = 0; i < 10; i++) {
float fff = 0.112345;
fff *= fff;
// 1.13 is a double literal: the addition is done in double and the result is
// converted back to float.
fff += 1.13;
checksum += fff/10000;
}
}
But I get really weird behavior when I increase the complexity/size of the body to this for instance:
#include <math.h>
// Larger unrolling test case: same loop shape as the small example (constant
// trip count of 10), but with a much bigger body made of ten
// "multiply by sqrt(fff) + fff, then add a constant" steps.
int main() {
// volatile: every load and store of checksum must actually be performed, so
// the accumulation at the end of the body is kept by the optimizer.
volatile float checksum = 0.0;
for (int i = 0; i < 10; i++) {
float fff = 0.112345;
// Each "+=" below uses a double literal, so that addition is carried out in
// double and the result is converted back to float.
fff *= sqrt(fff) + fff;
fff += 1.13;
fff *= sqrt(fff) + fff;
fff += 17.16;
fff *= sqrt(fff) + fff;
fff += 15.13;
fff *= sqrt(fff) + fff;
fff += 21.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 91.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
// Unlike the small example, the loop counter i feeds into the result here.
checksum += (fff + i)/10000;
}
}
For unroll factors of 1-2, everything works fine, but if I try to unroll using a factor greater than 2, LLVM completely unrolls the loop. This happens for any loop with a sufficiently large body. For instance, this is an excerpt of the bitcode that results from using any unroll factor greater than 2 on the code directly above:
; ModuleID = 'unrolledtrainingCode3.bc'
source_filename = "p1HighComplexity.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse uwtable
define i32 #main() #0 {
entry:
%checksum = alloca float, align 4
store volatile float 0.000000e+00, float* %checksum, align 4
br label %for.body
for.body: ; preds = %entry
%call.i = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add = fadd float %call.i, 0x3FBCC2A460000000
%mul = fmul float 0x3FBCC2A460000000, %add
%conv = fpext float %mul to double
%add1 = fadd double %conv, 1.130000e+00
%conv2 = fptrunc double %add1 to float
%call.i2 = call float #sqrtf(float %conv2) #2
%add4 = fadd float %call.i2, %conv2
%mul5 = fmul float %conv2, %add4
%conv6 = fpext float %mul5 to double
%add7 = fadd double %conv6, 1.716000e+01
%conv8 = fptrunc double %add7 to float
%call.i3 = call float #sqrtf(float %conv8) #2
%add10 = fadd float %call.i3, %conv8
%mul11 = fmul float %conv8, %add10
%conv12 = fpext float %mul11 to double
%add13 = fadd double %conv12, 1.513000e+01
%conv14 = fptrunc double %add13 to float
%call.i4 = call float #sqrtf(float %conv14) #2
%add16 = fadd float %call.i4, %conv14
%mul17 = fmul float %conv14, %add16
%conv18 = fpext float %mul17 to double
%add19 = fadd double %conv18, 2.113000e+01
%conv20 = fptrunc double %add19 to float
%call.i5 = call float #sqrtf(float %conv20) #2
%add22 = fadd float %call.i5, %conv20
%mul23 = fmul float %conv20, %add22
%conv24 = fpext float %mul23 to double
%add25 = fadd double %conv24, 0x40544851EB851EB8
%conv26 = fptrunc double %add25 to float
%call.i6 = call float #sqrtf(float %conv26) #2
%add28 = fadd float %call.i6, %conv26
%mul29 = fmul float %conv26, %add28
%conv30 = fpext float %mul29 to double
%add31 = fadd double %conv30, 1.113000e+01
%conv32 = fptrunc double %add31 to float
%call.i7 = call float #sqrtf(float %conv32) #2
%add34 = fadd float %call.i7, %conv32
%mul35 = fmul float %conv32, %add34
%conv36 = fpext float %mul35 to double
%add37 = fadd double %conv36, 0x40544851EB851EB8
%conv38 = fptrunc double %add37 to float
%call.i8 = call float #sqrtf(float %conv38) #2
%add40 = fadd float %call.i8, %conv38
%mul41 = fmul float %conv38, %add40
%conv42 = fpext float %mul41 to double
%add43 = fadd double %conv42, 1.113000e+01
%conv44 = fptrunc double %add43 to float
%call.i9 = call float #sqrtf(float %conv44) #2
%add46 = fadd float %call.i9, %conv44
%mul47 = fmul float %conv44, %add46
%conv48 = fpext float %mul47 to double
%add49 = fadd double %conv48, 0x4056C851EB851EB8
%conv50 = fptrunc double %add49 to float
%call.i10 = call float #sqrtf(float %conv50) #2
%add52 = fadd float %call.i10, %conv50
%mul53 = fmul float %conv50, %add52
%conv54 = fpext float %mul53 to double
%add55 = fadd double %conv54, 1.113000e+01
%conv56 = fptrunc double %add55 to float
%div = fdiv float %conv56, 1.000000e+04
%0 = load volatile float, float* %checksum, align 4
%add57 = fadd float %0, %div
store volatile float %add57, float* %checksum, align 4
%call.i.1 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.1 = fadd float %call.i.1, 0x3FBCC2A460000000
%mul.1 = fmul float 0x3FBCC2A460000000, %add.1
%conv.1 = fpext float %mul.1 to double
%add1.1 = fadd double %conv.1, 1.130000e+00
%conv2.1 = fptrunc double %add1.1 to float
%call.i2.1 = call float #sqrtf(float %conv2.1) #2
%add4.1 = fadd float %call.i2.1, %conv2.1
%mul5.1 = fmul float %conv2.1, %add4.1
%conv6.1 = fpext float %mul5.1 to double
%add7.1 = fadd double %conv6.1, 1.716000e+01
%conv8.1 = fptrunc double %add7.1 to float
%call.i3.1 = call float #sqrtf(float %conv8.1) #2
%add10.1 = fadd float %call.i3.1, %conv8.1
%mul11.1 = fmul float %conv8.1, %add10.1
%conv12.1 = fpext float %mul11.1 to double
%add13.1 = fadd double %conv12.1, 1.513000e+01
%conv14.1 = fptrunc double %add13.1 to float
%call.i4.1 = call float #sqrtf(float %conv14.1) #2
%add16.1 = fadd float %call.i4.1, %conv14.1
%mul17.1 = fmul float %conv14.1, %add16.1
%conv18.1 = fpext float %mul17.1 to double
%add19.1 = fadd double %conv18.1, 2.113000e+01
%conv20.1 = fptrunc double %add19.1 to float
%call.i5.1 = call float #sqrtf(float %conv20.1) #2
%add22.1 = fadd float %call.i5.1, %conv20.1
%mul23.1 = fmul float %conv20.1, %add22.1
%conv24.1 = fpext float %mul23.1 to double
%add25.1 = fadd double %conv24.1, 0x40544851EB851EB8
%conv26.1 = fptrunc double %add25.1 to float
%call.i6.1 = call float #sqrtf(float %conv26.1) #2
%add28.1 = fadd float %call.i6.1, %conv26.1
%mul29.1 = fmul float %conv26.1, %add28.1
%conv30.1 = fpext float %mul29.1 to double
%add31.1 = fadd double %conv30.1, 1.113000e+01
%conv32.1 = fptrunc double %add31.1 to float
%call.i7.1 = call float #sqrtf(float %conv32.1) #2
%add34.1 = fadd float %call.i7.1, %conv32.1
%mul35.1 = fmul float %conv32.1, %add34.1
%conv36.1 = fpext float %mul35.1 to double
%add37.1 = fadd double %conv36.1, 0x40544851EB851EB8
%conv38.1 = fptrunc double %add37.1 to float
%call.i8.1 = call float #sqrtf(float %conv38.1) #2
%add40.1 = fadd float %call.i8.1, %conv38.1
%mul41.1 = fmul float %conv38.1, %add40.1
%conv42.1 = fpext float %mul41.1 to double
%add43.1 = fadd double %conv42.1, 1.113000e+01
%conv44.1 = fptrunc double %add43.1 to float
%call.i9.1 = call float #sqrtf(float %conv44.1) #2
%add46.1 = fadd float %call.i9.1, %conv44.1
%mul47.1 = fmul float %conv44.1, %add46.1
%conv48.1 = fpext float %mul47.1 to double
%add49.1 = fadd double %conv48.1, 0x4056C851EB851EB8
%conv50.1 = fptrunc double %add49.1 to float
%call.i10.1 = call float #sqrtf(float %conv50.1) #2
%add52.1 = fadd float %call.i10.1, %conv50.1
%mul53.1 = fmul float %conv50.1, %add52.1
%conv54.1 = fpext float %mul53.1 to double
%add55.1 = fadd double %conv54.1, 1.113000e+01
%conv56.1 = fptrunc double %add55.1 to float
%div.1 = fdiv float %conv56.1, 1.000000e+04
%1 = load volatile float, float* %checksum, align 4
%add57.1 = fadd float %1, %div.1
store volatile float %add57.1, float* %checksum, align 4
%call.i.2 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.2 = fadd float %call.i.2, 0x3FBCC2A460000000
%mul.2 = fmul float 0x3FBCC2A460000000, %add.2
%conv.2 = fpext float %mul.2 to double
%add1.2 = fadd double %conv.2, 1.130000e+00
%conv2.2 = fptrunc double %add1.2 to float
%call.i2.2 = call float #sqrtf(float %conv2.2) #2
%add4.2 = fadd float %call.i2.2, %conv2.2
%mul5.2 = fmul float %conv2.2, %add4.2
%conv6.2 = fpext float %mul5.2 to double
%add7.2 = fadd double %conv6.2, 1.716000e+01
%conv8.2 = fptrunc double %add7.2 to float
%call.i3.2 = call float #sqrtf(float %conv8.2) #2
%add10.2 = fadd float %call.i3.2, %conv8.2
%mul11.2 = fmul float %conv8.2, %add10.2
%conv12.2 = fpext float %mul11.2 to double
%add13.2 = fadd double %conv12.2, 1.513000e+01
%conv14.2 = fptrunc double %add13.2 to float
%call.i4.2 = call float #sqrtf(float %conv14.2) #2
%add16.2 = fadd float %call.i4.2, %conv14.2
%mul17.2 = fmul float %conv14.2, %add16.2
%conv18.2 = fpext float %mul17.2 to double
%add19.2 = fadd double %conv18.2, 2.113000e+01
%conv20.2 = fptrunc double %add19.2 to float
%call.i5.2 = call float #sqrtf(float %conv20.2) #2
%add22.2 = fadd float %call.i5.2, %conv20.2
%mul23.2 = fmul float %conv20.2, %add22.2
%conv24.2 = fpext float %mul23.2 to double
%add25.2 = fadd double %conv24.2, 0x40544851EB851EB8
%conv26.2 = fptrunc double %add25.2 to float
%call.i6.2 = call float #sqrtf(float %conv26.2) #2
%add28.2 = fadd float %call.i6.2, %conv26.2
%mul29.2 = fmul float %conv26.2, %add28.2
%conv30.2 = fpext float %mul29.2 to double
%add31.2 = fadd double %conv30.2, 1.113000e+01
%conv32.2 = fptrunc double %add31.2 to float
%call.i7.2 = call float #sqrtf(float %conv32.2) #2
%add34.2 = fadd float %call.i7.2, %conv32.2
%mul35.2 = fmul float %conv32.2, %add34.2
%conv36.2 = fpext float %mul35.2 to double
%add37.2 = fadd double %conv36.2, 0x40544851EB851EB8
%conv38.2 = fptrunc double %add37.2 to float
%call.i8.2 = call float #sqrtf(float %conv38.2) #2
%add40.2 = fadd float %call.i8.2, %conv38.2
%mul41.2 = fmul float %conv38.2, %add40.2
%conv42.2 = fpext float %mul41.2 to double
%add43.2 = fadd double %conv42.2, 1.113000e+01
%conv44.2 = fptrunc double %add43.2 to float
%call.i9.2 = call float #sqrtf(float %conv44.2) #2
%add46.2 = fadd float %call.i9.2, %conv44.2
%mul47.2 = fmul float %conv44.2, %add46.2
%conv48.2 = fpext float %mul47.2 to double
%add49.2 = fadd double %conv48.2, 0x4056C851EB851EB8
%conv50.2 = fptrunc double %add49.2 to float
%call.i10.2 = call float #sqrtf(float %conv50.2) #2
%add52.2 = fadd float %call.i10.2, %conv50.2
%mul53.2 = fmul float %conv50.2, %add52.2
%conv54.2 = fpext float %mul53.2 to double
%add55.2 = fadd double %conv54.2, 1.113000e+01
%conv56.2 = fptrunc double %add55.2 to float
%div.2 = fdiv float %conv56.2, 1.000000e+04
%2 = load volatile float, float* %checksum, align 4
%add57.2 = fadd float %2, %div.2
store volatile float %add57.2, float* %checksum, align 4
%call.i.3 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.3 = fadd float %call.i.3, 0x3FBCC2A460000000
%mul.3 = fmul float 0x3FBCC2A460000000, %add.3
%conv.3 = fpext float %mul.3 to double
%add1.3 = fadd double %conv.3, 1.130000e+00
%conv2.3 = fptrunc double %add1.3 to float
%call.i2.3 = call float #sqrtf(float %conv2.3) #2
%add4.3 = fadd float %call.i2.3, %conv2.3
%mul5.3 = fmul float %conv2.3, %add4.3
%conv6.3 = fpext float %mul5.3 to double
%add7.3 = fadd double %conv6.3, 1.716000e+01
%conv8.3 = fptrunc double %add7.3 to float
%call.i3.3 = call float #sqrtf(float %conv8.3) #2
%add10.3 = fadd float %call.i3.3, %conv8.3
%mul11.3 = fmul float %conv8.3, %add10.3
%conv12.3 = fpext float %mul11.3 to double
%add13.3 = fadd double %conv12.3, 1.513000e+01
%conv14.3 = fptrunc double %add13.3 to float
%call.i4.3 = call float #sqrtf(float %conv14.3) #2
%add16.3 = fadd float %call.i4.3, %conv14.3
%mul17.3 = fmul float %conv14.3, %add16.3
%conv18.3 = fpext float %mul17.3 to double
%add19.3 = fadd double %conv18.3, 2.113000e+01
%conv20.3 = fptrunc double %add19.3 to float
%call.i5.3 = call float #sqrtf(float %conv20.3) #2
%add22.3 = fadd float %call.i5.3, %conv20.3
%mul23.3 = fmul float %conv20.3, %add22.3
%conv24.3 = fpext float %mul23.3 to double
%add25.3 = fadd double %conv24.3, 0x40544851EB851EB8
%conv26.3 = fptrunc double %add25.3 to float
%call.i6.3 = call float #sqrtf(float %conv26.3) #2
%add28.3 = fadd float %call.i6.3, %conv26.3
%mul29.3 = fmul float %conv26.3, %add28.3
%conv30.3 = fpext float %mul29.3 to double
%add31.3 = fadd double %conv30.3, 1.113000e+01
%conv32.3 = fptrunc double %add31.3 to float
%call.i7.3 = call float #sqrtf(float %conv32.3) #2
%add34.3 = fadd float %call.i7.3, %conv32.3
%mul35.3 = fmul float %conv32.3, %add34.3
%conv36.3 = fpext float %mul35.3 to double
%add37.3 = fadd double %conv36.3, 0x40544851EB851EB8
%conv38.3 = fptrunc double %add37.3 to float
%call.i8.3 = call float #sqrtf(float %conv38.3) #2
%add40.3 = fadd float %call.i8.3, %conv38.3
%mul41.3 = fmul float %conv38.3, %add40.3
%conv42.3 = fpext float %mul41.3 to double
%add43.3 = fadd double %conv42.3, 1.113000e+01
%conv44.3 = fptrunc double %add43.3 to float
%call.i9.3 = call float #sqrtf(float %conv44.3) #2
%add46.3 = fadd float %call.i9.3, %conv44.3
%mul47.3 = fmul float %conv44.3, %add46.3
%conv48.3 = fpext float %mul47.3 to double
%add49.3 = fadd double %conv48.3, 0x4056C851EB851EB8
%conv50.3 = fptrunc double %add49.3 to float
%call.i10.3 = call float #sqrtf(float %conv50.3) #2
%add52.3 = fadd float %call.i10.3, %conv50.3
%mul53.3 = fmul float %conv50.3, %add52.3
%conv54.3 = fpext float %mul53.3 to double
%add55.3 = fadd double %conv54.3, 1.113000e+01
%conv56.3 = fptrunc double %add55.3 to float
%div.3 = fdiv float %conv56.3, 1.000000e+04
%3 = load volatile float, float* %checksum, align 4
%add57.3 = fadd float %3, %div.3
store volatile float %add57.3, float* %checksum, align 4
%call.i.4 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.4 = fadd float %call.i.4, 0x3FBCC2A460000000
%mul.4 = fmul float 0x3FBCC2A460000000, %add.4
%conv.4 = fpext float %mul.4 to double
%add1.4 = fadd double %conv.4, 1.130000e+00
%conv2.4 = fptrunc double %add1.4 to float
%call.i2.4 = call float #sqrtf(float %conv2.4) #2
%add4.4 = fadd float %call.i2.4, %conv2.4
%mul5.4 = fmul float %conv2.4, %add4.4
%conv6.4 = fpext float %mul5.4 to double
%add7.4 = fadd double %conv6.4, 1.716000e+01
%conv8.4 = fptrunc double %add7.4 to float
%call.i3.4 = call float #sqrtf(float %conv8.4) #2
%add10.4 = fadd float %call.i3.4, %conv8.4
%mul11.4 = fmul float %conv8.4, %add10.4
%conv12.4 = fpext float %mul11.4 to double
%add13.4 = fadd double %conv12.4, 1.513000e+01
%conv14.4 = fptrunc double %add13.4 to float
%call.i4.4 = call float #sqrtf(float %conv14.4) #2
%add16.4 = fadd float %call.i4.4, %conv14.4
%mul17.4 = fmul float %conv14.4, %add16.4
%conv18.4 = fpext float %mul17.4 to double
%add19.4 = fadd double %conv18.4, 2.113000e+01
%conv20.4 = fptrunc double %add19.4 to float
%call.i5.4 = call float #sqrtf(float %conv20.4) #2
%add22.4 = fadd float %call.i5.4, %conv20.4
%mul23.4 = fmul float %conv20.4, %add22.4
%conv24.4 = fpext float %mul23.4 to double
%add25.4 = fadd double %conv24.4, 0x40544851EB851EB8
%conv26.4 = fptrunc double %add25.4 to float
%call.i6.4 = call float #sqrtf(float %conv26.4) #2
%add28.4 = fadd float %call.i6.4, %conv26.4
%mul29.4 = fmul float %conv26.4, %add28.4
%conv30.4 = fpext float %mul29.4 to double
%add31.4 = fadd double %conv30.4, 1.113000e+01
%conv32.4 = fptrunc double %add31.4 to float
%call.i7.4 = call float #sqrtf(float %conv32.4) #2
%add34.4 = fadd float %call.i7.4, %conv32.4
%mul35.4 = fmul float %conv32.4, %add34.4
%conv36.4 = fpext float %mul35.4 to double
%add37.4 = fadd double %conv36.4, 0x40544851EB851EB8
%conv38.4 = fptrunc double %add37.4 to float
%call.i8.4 = call float #sqrtf(float %conv38.4) #2
%add40.4 = fadd float %call.i8.4, %conv38.4
%mul41.4 = fmul float %conv38.4, %add40.4
%conv42.4 = fpext float %mul41.4 to double
%add43.4 = fadd double %conv42.4, 1.113000e+01
%conv44.4 = fptrunc double %add43.4 to float
%call.i9.4 = call float #sqrtf(float %conv44.4) #2
%add46.4 = fadd float %call.i9.4, %conv44.4
%mul47.4 = fmul float %conv44.4, %add46.4
%conv48.4 = fpext float %mul47.4 to double
%add49.4 = fadd double %conv48.4, 0x4056C851EB851EB8
%conv50.4 = fptrunc double %add49.4 to float
%call.i10.4 = call float #sqrtf(float %conv50.4) #2
%add52.4 = fadd float %call.i10.4, %conv50.4
%mul53.4 = fmul float %conv50.4, %add52.4
%conv54.4 = fpext float %mul53.4 to double
%add55.4 = fadd double %conv54.4, 1.113000e+01
%conv56.4 = fptrunc double %add55.4 to float
%div.4 = fdiv float %conv56.4, 1.000000e+04
%4 = load volatile float, float* %checksum, align 4
%add57.4 = fadd float %4, %div.4
store volatile float %add57.4, float* %checksum, align 4
%call.i.5 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.5 = fadd float %call.i.5, 0x3FBCC2A460000000
%mul.5 = fmul float 0x3FBCC2A460000000, %add.5
%conv.5 = fpext float %mul.5 to double
%add1.5 = fadd double %conv.5, 1.130000e+00
%conv2.5 = fptrunc double %add1.5 to float
%call.i2.5 = call float #sqrtf(float %conv2.5) #2
%add4.5 = fadd float %call.i2.5, %conv2.5
%mul5.5 = fmul float %conv2.5, %add4.5
%conv6.5 = fpext float %mul5.5 to double
%add7.5 = fadd double %conv6.5, 1.716000e+01
%conv8.5 = fptrunc double %add7.5 to float
%call.i3.5 = call float #sqrtf(float %conv8.5) #2
%add10.5 = fadd float %call.i3.5, %conv8.5
%mul11.5 = fmul float %conv8.5, %add10.5
%conv12.5 = fpext float %mul11.5 to double
%add13.5 = fadd double %conv12.5, 1.513000e+01
%conv14.5 = fptrunc double %add13.5 to float
%call.i4.5 = call float #sqrtf(float %conv14.5) #2
%add16.5 = fadd float %call.i4.5, %conv14.5
%mul17.5 = fmul float %conv14.5, %add16.5
%conv18.5 = fpext float %mul17.5 to double
%add19.5 = fadd double %conv18.5, 2.113000e+01
%conv20.5 = fptrunc double %add19.5 to float
%call.i5.5 = call float #sqrtf(float %conv20.5) #2
%add22.5 = fadd float %call.i5.5, %conv20.5
%mul23.5 = fmul float %conv20.5, %add22.5
%conv24.5 = fpext float %mul23.5 to double
%add25.5 = fadd double %conv24.5, 0x40544851EB851EB8
%conv26.5 = fptrunc double %add25.5 to float
%call.i6.5 = call float #sqrtf(float %conv26.5) #2
%add28.5 = fadd float %call.i6.5, %conv26.5
%mul29.5 = fmul float %conv26.5, %add28.5
%conv30.5 = fpext float %mul29.5 to double
%add31.5 = fadd double %conv30.5, 1.113000e+01
%conv32.5 = fptrunc double %add31.5 to float
%call.i7.5 = call float #sqrtf(float %conv32.5) #2
%add34.5 = fadd float %call.i7.5, %conv32.5
%mul35.5 = fmul float %conv32.5, %add34.5
%conv36.5 = fpext float %mul35.5 to double
%add37.5 = fadd double %conv36.5, 0x40544851EB851EB8
%conv38.5 = fptrunc double %add37.5 to float
%call.i8.5 = call float #sqrtf(float %conv38.5) #2
%add40.5 = fadd float %call.i8.5, %conv38.5
%mul41.5 = fmul float %conv38.5, %add40.5
%conv42.5 = fpext float %mul41.5 to double
%add43.5 = fadd double %conv42.5, 1.113000e+01
%conv44.5 = fptrunc double %add43.5 to float
%call.i9.5 = call float #sqrtf(float %conv44.5) #2
%add46.5 = fadd float %call.i9.5, %conv44.5
%mul47.5 = fmul float %conv44.5, %add46.5
%conv48.5 = fpext float %mul47.5 to double
%add49.5 = fadd double %conv48.5, 0x4056C851EB851EB8
%conv50.5 = fptrunc double %add49.5 to float
%call.i10.5 = call float #sqrtf(float %conv50.5) #2
%add52.5 = fadd float %call.i10.5, %conv50.5
%mul53.5 = fmul float %conv50.5, %add52.5
%conv54.5 = fpext float %mul53.5 to double
%add55.5 = fadd double %conv54.5, 1.113000e+01
%conv56.5 = fptrunc double %add55.5 to float
%div.5 = fdiv float %conv56.5, 1.000000e+04
%5 = load volatile float, float* %checksum, align 4
%add57.5 = fadd float %5, %div.5
store volatile float %add57.5, float* %checksum, align 4
%call.i.6 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.6 = fadd float %call.i.6, 0x3FBCC2A460000000
%mul.6 = fmul float 0x3FBCC2A460000000, %add.6
%conv.6 = fpext float %mul.6 to double
%add1.6 = fadd double %conv.6, 1.130000e+00
%conv2.6 = fptrunc double %add1.6 to float
%call.i2.6 = call float #sqrtf(float %conv2.6) #2
%add4.6 = fadd float %call.i2.6, %conv2.6
%mul5.6 = fmul float %conv2.6, %add4.6
%conv6.6 = fpext float %mul5.6 to double
%add7.6 = fadd double %conv6.6, 1.716000e+01
%conv8.6 = fptrunc double %add7.6 to float
%call.i3.6 = call float #sqrtf(float %conv8.6) #2
%add10.6 = fadd float %call.i3.6, %conv8.6
%mul11.6 = fmul float %conv8.6, %add10.6
%conv12.6 = fpext float %mul11.6 to double
%add13.6 = fadd double %conv12.6, 1.513000e+01
%conv14.6 = fptrunc double %add13.6 to float
%call.i4.6 = call float #sqrtf(float %conv14.6) #2
%add16.6 = fadd float %call.i4.6, %conv14.6
%mul17.6 = fmul float %conv14.6, %add16.6
%conv18.6 = fpext float %mul17.6 to double
%add19.6 = fadd double %conv18.6, 2.113000e+01
%conv20.6 = fptrunc double %add19.6 to float
%call.i5.6 = call float #sqrtf(float %conv20.6) #2
%add22.6 = fadd float %call.i5.6, %conv20.6
%mul23.6 = fmul float %conv20.6, %add22.6
%conv24.6 = fpext float %mul23.6 to double
%add25.6 = fadd double %conv24.6, 0x40544851EB851EB8
%conv26.6 = fptrunc double %add25.6 to float
%call.i6.6 = call float #sqrtf(float %conv26.6) #2
%add28.6 = fadd float %call.i6.6, %conv26.6
%mul29.6 = fmul float %conv26.6, %add28.6
%conv30.6 = fpext float %mul29.6 to double
%add31.6 = fadd double %conv30.6, 1.113000e+01
%conv32.6 = fptrunc double %add31.6 to float
%call.i7.6 = call float #sqrtf(float %conv32.6) #2
%add34.6 = fadd float %call.i7.6, %conv32.6
%mul35.6 = fmul float %conv32.6, %add34.6
%conv36.6 = fpext float %mul35.6 to double
%add37.6 = fadd double %conv36.6, 0x40544851EB851EB8
%conv38.6 = fptrunc double %add37.6 to float
%call.i8.6 = call float #sqrtf(float %conv38.6) #2
%add40.6 = fadd float %call.i8.6, %conv38.6
%mul41.6 = fmul float %conv38.6, %add40.6
%conv42.6 = fpext float %mul41.6 to double
%add43.6 = fadd double %conv42.6, 1.113000e+01
%conv44.6 = fptrunc double %add43.6 to float
%call.i9.6 = call float #sqrtf(float %conv44.6) #2
%add46.6 = fadd float %call.i9.6, %conv44.6
%mul47.6 = fmul float %conv44.6, %add46.6
%conv48.6 = fpext float %mul47.6 to double
%add49.6 = fadd double %conv48.6, 0x4056C851EB851EB8
%conv50.6 = fptrunc double %add49.6 to float
%call.i10.6 = call float #sqrtf(float %conv50.6) #2
%add52.6 = fadd float %call.i10.6, %conv50.6
%mul53.6 = fmul float %conv50.6, %add52.6
%conv54.6 = fpext float %mul53.6 to double
%add55.6 = fadd double %conv54.6, 1.113000e+01
%conv56.6 = fptrunc double %add55.6 to float
%div.6 = fdiv float %conv56.6, 1.000000e+04
%6 = load volatile float, float* %checksum, align 4
%add57.6 = fadd float %6, %div.6
As you can see, the loop was fully unrolled, despite me specifying an unroll factor of 3. I was hoping someone would have some insight into why this was happening...

LLVM has different heuristics for determining whether to unroll and whether to unroll fully, because unrolling a loop fully is often more beneficial than unrolling it partially, particularly if the trip count is fixed and known at compile time, because then all the checks and branches can be eliminated.
My quick search only turned up the source code and not the documentation,
but I think full unrolling is limited by a different setting: -unroll-full-max-count.

Related

How to linearize the product of two float variables

I would like to linearize the product of two float variables. Suppose that a model contains the product x * y, where x and y are floats with 0 <= x <= 1 and 0 <= y <= 1. How can I linearize this product?
I gave an example in OPL/CPLEX here
What you can do is remember that
4*x*y=(x+y)*(x+y)-(x-y)*(x-y)
So if you do a variable change X=x+y and Y=x-y
x*y
becomes
1/4*(X*X-Y*Y)
which is separable.
And then you are able to interpolate the function x*x by piecewise linear function:
// y=x*x interpolation
// Builds a piecewise-linear approximation f of the square function on [s, e],
// measures its error on a fine sample grid, and then uses f in a small model
// where the product a*b is rewritten as 1/4*((a+b)^2-(a-b)^2).

// Fine grid of sampleSize+1 evenly spaced points on [s, e], used only to
// measure the approximation error.
int sampleSize=10000;
float s=0;
float e=100;
float x[i in 0..sampleSize]=s+(e-s)*i/sampleSize;
// nbSegments+1 evenly spaced breakpoints on [s, e] and the exact squares there.
int nbSegments=20;
float x2[i in 0..nbSegments]=(s)+(e-s)*i/nbSegments;
float y2[i in 0..nbSegments]=x2[i]*x2[i];
// Slopes used before the first breakpoint and after the last one.
float firstSlope=0;
float lastSlope=0;
tuple breakpoint // y=f(x)
{
key float x;
float y;
}
sorted { breakpoint } breakpoints={<x2[i],y2[i]> | i in 0..nbSegments};
// Slope of the segment ending at each breakpoint: firstSlope for the first
// breakpoint, otherwise the secant slope from the previous breakpoint.
float slopesBeforeBreakpoint[b in breakpoints]=
(b.x==first(breakpoints).x)
?firstSlope
:(b.y-prev(breakpoints,b).y)/(b.x-prev(breakpoints,b).x);
// Piecewise-linear function defined by those slopes and breakpoints, with
// lastSlope after the last breakpoint, anchored at the first breakpoint.
pwlFunction f=piecewise(b in breakpoints)
{ slopesBeforeBreakpoint[b]->b.x; lastSlope } (first(breakpoints).x, first(breakpoints).y);
// Sanity check: f reproduces the exact square at every breakpoint.
assert forall(b in breakpoints) f(b.x)==b.y;
// Largest and mean absolute error of f against x*x over the fine grid.
float maxError=max (i in 0..sampleSize) abs(x[i]*x[i]-f(x[i]));
float averageError=1/(sampleSize+1)*sum (i in 0..sampleSize) abs(x[i]*x[i]-f(x[i]));
execute
{
writeln("maxError = ",maxError);
writeln("averageError = ",averageError);
}
// Example model: ab stands for the product a*b, expressed through the two
// approximated squares.
dvar float a in 0..10;
dvar float b in 0..10;
dvar float squareaplusb;
dvar float squareaminusb;
maximize a+b;
dvar float ab;
subject to
{
ab<=10;
ab==1/4*(squareaplusb-squareaminusb);
squareaplusb==f(a+b);
// NOTE(review): a-b can be negative (a and b both range over 0..10), but the
// breakpoints start at s=0 with firstSlope=0, so f presumably stays at 0 for
// negative arguments instead of returning the square -- confirm, and consider
// a symmetric breakpoint range if that is the case.
squareaminusb==f(a-b);
}

How does this lighting calculation work?

I have that piece of code that is responsible for lighting a pyramid.
// Computes the brightness of the triangle (vert1, vert2, vert3) under a fixed
// directional light.
//
// The three arguments are indices into the tabX/tabY/tabZ coordinate arrays.
// The result is |cos(theta)|, where theta is the angle between the triangle's
// normal N and the light vector L:
//
//     cos(theta) = (N . L) / (|N| * |L|)
//
// Dividing the dot product by both lengths normalises it, so the value is
// nominally in [0, 1]: 1 when the face is perpendicular to the light
// direction, 0 when it is edge-on. The absolute value makes the result
// independent of the triangle's winding order.
//
// Returns 0 for a degenerate triangle (coincident or collinear vertices),
// which has no normal; without that guard the division would be 0/0 = NaN.
float Geometric3D::calculateLight(int vert1, int vert2, int vert3) {
    // Two edge vectors of the triangle, both starting at vert1.
    const float ax = tabX[vert2] - tabX[vert1];
    const float ay = tabY[vert2] - tabY[vert1];
    const float az = tabZ[vert2] - tabZ[vert1];
    const float bx = tabX[vert3] - tabX[vert1];
    const float by = tabY[vert3] - tabY[vert1];
    const float bz = tabZ[vert3] - tabZ[vert1];

    // Face normal: N = a x b (cross product of the two edges).
    const float Nx = (ay * bz) - (az * by);
    const float Ny = (az * bx) - (ax * bz);
    const float Nz = (ax * by) - (ay * bx);

    // Fixed light vector.
    const float Lx = -300.0f;
    const float Ly = -300.0f;
    const float Lz = -1000.0f;

    const float lenN = sqrtf((Nx * Nx) + (Ny * Ny) + (Nz * Nz));
    const float lenL = sqrtf((Lx * Lx) + (Ly * Ly) + (Lz * Lz));

    // Degenerate triangle: the normal has zero length, so there is no angle
    // to measure. Report "unlit" instead of dividing by zero.
    if (lenN == 0.0f)
        return 0.0f;

    // Cosine of the angle between N and L.
    float res = ((Nx * Lx) + (Ny * Ly) + (Nz * Lz)) / (lenN * lenL);
    if (res < 0.0f)
        res = -res;
    return res;
}
I cannot understand the calculations at the end. Can someone explain the maths behind them? I know that the program first calculates two vectors lying in the plane in order to compute its normal (vector N). Vector L stands for the light, but what happens next? Why do we calculate the lengths of the normal and of the light vector, and then divide by the product of those lengths?

Optimizing circle-circle collision response

With SFML, I made an algorithm that calculates the trajectories of two balls after a collision; it works fine, but if I try with more than 30 balls, it freezes instantly or after 10-20 seconds.
I tried to avoid doing the same calculations multiple times, but it doesn't work.
Any suggestions? (I have a high-end PC, so the problem is not there.)
Phi is the collision angle, dis is distance;
// Applies an elastic collision response to two circles.
//
// Both velocities are rotated into a frame whose x axis is the line through
// the two centres (angle phi). Along that axis the speeds are exchanged with
// the one-dimensional elastic-collision formulas; the perpendicular
// components are left unchanged. The results are rotated back and written to
// a.speed and b.speed.
//
// Afterwards both circles are stepped with Move() until they no longer
// overlap. That loop is capped: if the circles never separate (for example
// because both new velocities are zero or identical), an unbounded loop would
// spin forever and freeze the program.
void collisionResponse(Circle &a, Circle &b)
{
    const float mass1 = a.getMass();
    const float mass2 = b.getMass();
    const float totalMass = mass1 + mass2;

    // Angle of the line through the two centres.
    const float disX = a.pos.x - b.pos.x;
    const float disY = a.pos.y - b.pos.y;
    const float phi = atan2(disY, disX);

    const float speed1 = a.getSpeed();
    const float speed2 = b.getSpeed();
    const float angle1 = a.getAngle();
    const float angle2 = b.getAngle();

    // Velocities expressed in the rotated (collision) frame.
    const float v1x = speed1*cos((angle1 - phi));
    const float v1y = speed1*sin((angle1 - phi));
    const float v2x = speed2*cos((angle2 - phi));
    const float v2y = speed2*sin((angle2 - phi));

    // 1-D elastic collision along the collision axis; the perpendicular
    // components are unaffected.
    const float f1x = ((mass1 - mass2)*v1x + (mass2 + mass2)*v2x) / totalMass;
    const float f2x = ((mass1 + mass1)*v1x + (mass2 - mass1)*v2x) / totalMass;
    const float f1y = v1y;
    const float f2y = v2y;

    // Rotate the new velocities back into world coordinates.
    const float cosphi = cos(phi);
    const float sinphi = sin(phi);
    const float cosphiPI = cos(phi + PI / 2);
    const float sinphiPI = sin(phi + PI / 2);
    a.speed.x = cosphi*f1x + cosphiPI*f1y;
    a.speed.y = sinphi*f1x + sinphiPI*f1y;
    b.speed.x = cosphi*f2x + cosphiPI*f2y;
    b.speed.y = sinphi*f2x + sinphiPI*f2y;

    // Step both circles until they stop overlapping, but never more than
    // maxSeparationSteps times.
    // NOTE(review): presumably Move() advances pos by speed; the cap is a
    // safety limit, tune it to the simulation's step size.
    const int maxSeparationSteps = 1000;
    int stepsLeft = maxSeparationSteps;
    while (stepsLeft-- > 0 &&
           sqr(a.pos.x - b.pos.x) + sqr(a.pos.y - b.pos.y) <= sqr(a.getRadius()+ b.getRadius()))
    {
        a.Move();
        b.Move();
    }
}

C fmod function: Floating point error and optimization

I'm trying to calculate the true course from one point to another on the surface of the earth in as few CPU cycles as possible. The result should be a double 0 <= tc < 360, however in a few special cases I get the result 360 (which should be reported as 0). I realize that this is due to machine precision when working with fmod and floating-point numbers, but what is the most efficient workaround for the problem?
#include <stdio.h>
#include <math.h>
#define EPS 1e-15 // EPS a small number ~ machine precision
#define R2D 57.295779513082320876798154814105 //multiply radian with R2D to get degrees
#define D2R 0.01745329251994329576923690768489 //multiply degrees with D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 //2*Pi
/*----------------------------------------------------------------------------
* Course between points
* We obtain the initial course, tc1, (at point 1) from point 1 to point 2
* by the following. The formula fails if the initial point is a pole. We can
* special case this with as IF statement
----------------------------------------------------------------------------
Implementation
Argument 1: INPUT - Pointer to double containing Latitude of point 1 in degrees
Argument 2: INPUT - Pointer to double containing Longitude of point 1 in degrees
Argument 3: INPUT - Pointer to double containing Latitude of point 2 in degrees
Argument 4: INPUT - Pointer to double containing Longitude of point 2 in degrees
RETURN: Double containing initial course in degrees from point1 to point 2
--------------------------------------------------------------------------*/
double _stdcall CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
double radLat1 = D2R * *lat1;
double radLon1 = D2R * *lon1;
double radLat2 = D2R * *lat2;
double radLon2 = D2R * *lon2;
double tc = 0;
if (cos(radLat1) < EPS) { // EPS a small number ~ machine precision
if (radLat1 > 0) {
tc = 180; // starting from N pole
} else {
tc = 0; // starting from S pole
}
} else {
// Calculate true course [180, 540]
tc = R2D * atan2(sin(radLon2-radLon1),
cos(radLat1) * tan(radLat2) - sin(radLat1) * cos(radLon2-radLon1)
) + 360;
}
//Return 0 <= true course < 360
return fmod(tc, 360);
}
/* Demo driver: prints the initial course for one sample pair of points
 * (coordinates in degrees). */
int main(void)
{
    double startLat = 89;
    double startLon = 17;
    double endLat = 68;
    double endLon = -163;

    const double course = CourseInitial(&startLat, &startLon, &endLat, &endLon);

    printf("The course from point 1 to 2 is: %.5f", course);
    return 0;
}
Output:
The course from point 1 to 2 is: 360.00000
A comparison of the given value of the constant D2R and the closest 64 bit and 80 bit floating point number.
80 bit 0x3FF98EFA351294E9C8AF = 1.745329251994329577083321213687439055206596094649285078048706054e-2
D2R = 1.745329251994329576923690768489e-2
80 bit 0x3FF98EFA351294E9C8AE = 1.74532925199432957691391462423657898739293159451335668563842773e-2
64 bit 0x3F91DF46A2529D3A = 1.745329251994329894381863255148346070200204849243164e-2
D2R = 1.745329251994329576923690768489e-2
64 bit 0x3F91DF46A2529D39 = 1.7453292519943295474371680597869271878153085708618164e-2
These are the values chosen by my compiler:
0x3FF98EFA351294E9C8AE = 0 011111111111001 1000111011111010001101010001001010010100111010011100100010101110
0x3F91DF46A2529D39 = 0 01111111001 0001110111110100011010100010010100101001110100111001
The conversion was performed with tools and information on the web pages listed below:
http://www.exploringbinary.com/binary-converter
http://apfloat.appspot.com
http://en.wikipedia.org/wiki/Extended_precision
http://en.wikipedia.org/wiki/Double-precision_floating-point_format
Location of problem
The difference in the result between the two different levels of optimization occurs
when calculating radLon2-radLon1. The result of this calculation is shown here.
-O0 radLon2-radLon1 = 0xC00921FB54442D19
-O1 radLon2-radLon1 = 0xC00921FB54442D18
The difference is in the least significant bit bringing the -O0 result past the real value of pi.
-O0 radLon2-radLon1 = -3.14159265358979356008717331861
Pi with 50 decimals = 3.14159265358979323846264338327950288419716939937510
-O1 radLon2-radLon1 = -3.14159265358979311599796346854
-O0 calculation loads one value on the Floating Point Stack and subtracts the other with
the FSUB command (line 004014a8:). The -O1 calculation loads both values onto the Floating Point Stack
and subtracts them with the FSUBP command (line 00401480:)
Disassembly after compilation with no optimization -O0
(...)
004014a5: fld QWORD PTR [ebp-0x30] // ST(0) = radLon2 (from [ebp-0x30])
004014a8: fsub QWORD PTR [ebp-0x20] // ST(0) = ST(0) - radLon1 (from [ebp-0x20])
004014ab: fstp QWORD PTR [esp] // [esp] = (radLon2-radLon1) = 0xC00921FB54442D19
004014ae: call 0x4080d0 <sin> // ST(0) = sin(radLon2-radLon1)
(...)
-------------------------------------------------------------------------------------
Significant values:
radLon2 = 0xC006C253F2B53437 (-2.84488668075075734620327239099 )
radLon1 = 0x3FD2FD3B0C77C70D ( 0.296705972839036047350447233839 )
radLon2-radLon1 = 0xC00921FB54442D19 (-3.14159265358979356008717331861 )
sin(radLon2-radLon1) = 0x3CB72D0000000000 ( 3.21628574467824890348310873378e-16 )
Later atan2(y, x) is calculated with these values
x = 0x3FF0B04ED1755590 ( 1.04304391688978981278523860965 )
y = 0x3CB72D0000000000 ( 3.21628574467824890348310873378e-16 )
atan2(y, x) = 0x3CB63828CAA39659 ( 3.08355735803412799607014393888e-16 )
Disassembly after compilation with optimization -O1
(...)
29 double radLon2 = D2R * *lon2;
0040146c: fld QWORD PTR ds:0x40a0c0 // ST(0) = D2R (from [ds:0x40a0c0])
00401472: fmul QWORD PTR [esp+0x30] // ST(0) = ST(0) * lon2 (from [ESP+0x30])
// ST(0) = -2.8448866807507573
27 double radLon1 = D2R * *lon1;
00401476: fld QWORD PTR ds:0x40a0c0 // ST(0) = D2R (from [ds:0x40a0c0])
0040147c: fmul QWORD PTR [esp+0x20] // ST(0) = ST(0) * lon1 (from [ESP+0x20])
// ST(0) = 0.29670597283903605 (radLon1)
// ST(1) = -2.8448866807507573 (radLon2)
00401480: fsubp st(1),st // ST(1) = ST(1)-ST(0) then POP stack
00401482: fst QWORD PTR [esp+0x20] // [esp+0x20] = (radLon2-radLon1) = 0xC00921FB54442D18
(...)
40 tc = R2D * atan2(sin(radLon2-radLon1),
00401492: fld QWORD PTR [esp+0x20] // ST(0)=(radLon2-radLon1)
00401496: fstp QWORD PTR [esp] // [esp]=(radLon2-radLon1)
00401499: call 0x4080e0 <sin> // ST(0)=sin(radLon2-radLon1)
(...)
-------------------------------------------------------------------------------------
Significant values
radLon2 = 0xC006C253F2B53437 (-2.84488668075075734620327239099 )
radLon1 = 0x3FD2FD3B0C77C70D ( 0.296705972839036047350447233839 )
radLon2-radLon1 = 0xC00921FB54442D18 (-3.14159265358979311599796346854 )
sin(radLon2-radLon1) = 0xBCA1A60000000000 (-1.22460635382237725821141793858e-16 )
Later atan2(y, x) is calculated with these values
x = 0x3FF0B04ED1755590 ( 1.04304391688978981278523860965 )
y = 0xBCA1A60000000000 (-1.22460635382237725821141793858e-16 )
atan2(y, x) = 0xBCA0EB8D90F27437 (-1.1740697913027295863036855646E-16 )
=====================================================================================
Attempted solution
In CourseInitial() radLon1 and radLon2 are not used independently. So I tried the following.
double radDeltaLon = D2R * (*lon2-*lon1);
(...)
tc = R2D * atan2(sin(radDeltaLon),
cos(radLat1) * tan(radLat2) - sin(radLat1) * cos(radDeltaLon)
) + 360;
This did not work. Debugging showed that the problematic value close to Pi showed up
in another place in the code and the end result was the same.
One solution
At the end of each of the defined constants I added an L, which converts them to long doubles (80-bit floating-point numbers). This is the same precision that the CPU has in its floating-point registers, and it solved the problem in some cases.
#define R2D 57.295779513082320876798154814105L //multiply radian with R2D to get degrees
#define D2R 0.01745329251994329576923690768489L //multiply degrees with D2R to get radians
#define TWO_PI 6.283185307179586476925286766559L //2*Pi
Final solution
// Calculate true course [-180, 180)
tc = atan2(sin(radDeltaLon),
cos(radLat1) * tan(radLat2) - sin(radLat1) * cos(radDeltaLon)
);
if (fabs(tc) < EPS) {
tc = 0; //Prevents fmod(tc, 360) from returning 360 due to rounding error
} else {
tc *= R2D; //Convert to degrees after tc has been checked for machine precision
tc += 360; //tc [180, 540)
}
return fmod(tc, 360); // returns tc [0, 360)

SSE Bilinear interpolation

I'm implementing bilinear interpolation in a tight loop and trying to optimize it with SSE, but I get zero speed-up from it.
Here is the code. The non-SIMD version uses a simple vector structure, which could be defined as struct Vec3f { float x, y, z; }, with multiplication and addition operators implemented:
#ifdef USE_SIMD
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};
__asm {
movaps xmm0, mc11
movaps xmm1, mc12
movaps xmm2, mc22
movaps xmm3, mc21
movaps xmm4, ms11
movaps xmm5, ms12
movaps xmm6, ms22
movaps xmm7, ms21
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm3
movaps mc11, xmm0
}
#else
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
const Vec3f colour =
c11*(x2-x)*(y2-y) +
c21*(x-x1)*(y2-y) +
c12*(x2-x)*(y-y1) +
c22*(x-x1)*(y-y1);
#endif
Rearranging the asm code to reuse the registers (I ended up with just three xmm registers) had no effect. I've also tried using intrinsics:
// perform bilinear interpolation
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);
mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);
mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);
Vec3f colour;
_mm_storeu_ps(colour.array, mc11);
And to no avail. Am I missing something, or is it impossible to gain any extra speed here?
Why floating point?
Given packed pixel argb for a, b, c, d, and xerr, yerr in the range 0-256, a simple example is:
// =================================================================================================================
// xs_Bilerp
//
// Blends four packed 32-bit pixels using integer arithmetic only.
//
// The pixel bytes are handled as two pairs: the bytes selected by 0x00ff00ff
// (the "rb" pair) and the bytes selected by 0xff00ff00 (the "ag" pair).
// Each pair goes through the same two stages:
//   1. a is moved toward b, and c toward d, by the weight xerr;
//   2. the first stage-1 result is moved toward the second by the weight yerr.
//
// Parameters:
//   a, b, c, d  packed pixels; a/b are blended with each other first, as are c/d.
//   xerr, yerr  blend weights; the accompanying description gives their range
//               as 0-256 (not checked here).
//
// Returns the blended packed pixel. If all four inputs are equal, a is
// returned unchanged.
//
// Note: xs_rbmask and xs_agmask are #defined inside the body and never
// #undef'd, so they stay defined for the rest of the translation unit.
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
    #define xs_rbmask 0x00ff00ff
    #define xs_agmask 0xff00ff00

    // All four inputs identical: nothing to blend.
    if (a == b && c == d && a == d) {
        return a;
    }

    // Split the two "start" pixels (a and c) into their byte pairs.
    const uint32 a_rb = a & xs_rbmask;
    const uint32 a_ag = a & xs_agmask;
    const uint32 c_rb = c & xs_rbmask;
    const uint32 c_ag = c & xs_agmask;

    // Per-pair deltas a->b and c->d. The ag pair is shifted down by 8 before
    // subtracting, so its weighted product is later added back without a
    // further shift, whereas the rb product is shifted right by 8.
    const uint32 ab_rb_delta = (b & xs_rbmask) - a_rb;
    const uint32 cd_rb_delta = (d & xs_rbmask) - c_rb;
    const uint32 ab_ag_delta = ((b & xs_agmask) >> 8) - (a_ag >> 8);
    const uint32 cd_ag_delta = ((d & xs_agmask) >> 8) - (c_ag >> 8);

    // Stage 1: blend by xerr within each of the two pixel pairs.
    const uint32 ab_rb = (a_rb + ((ab_rb_delta * xerr) >> 8)) & xs_rbmask;
    const uint32 ab_ag = (a_ag + (ab_ag_delta * xerr)) & xs_agmask;
    const uint32 cd_rb = (c_rb + ((cd_rb_delta * xerr) >> 8)) & xs_rbmask;
    const uint32 cd_ag = (c_ag + (cd_ag_delta * xerr)) & xs_agmask;

    // Stage 2: blend the two stage-1 results by yerr.
    const uint32 y_rb_delta = cd_rb - ab_rb;
    const uint32 y_ag_delta = (cd_ag >> 8) - (ab_ag >> 8);
    const uint32 out_rb = (ab_rb + ((y_rb_delta * yerr) >> 8)) & xs_rbmask;
    const uint32 out_ag = (ab_ag + (y_ag_delta * yerr)) & xs_agmask;

    // The masks are disjoint, so OR-ing recombines the two pairs.
    return out_ag | out_rb;
}