There is a bool variable named "Enable". When "Enable" is false, I want to create the following function:
void test_false()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
}
And when "Enable" is true, I want to create the following function:
void test_true()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n" //Only here is different from test_false()
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
}
But I don't want to keep two copies of the code, because most of it is the same. I want to use "C++ template + conditional compilation" to solve my problem. The code is as follows, but it doesn't work: whether Enable is true or false, the compiler creates the same code as test_true().
template<bool Enable>
void test_tmp()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
if (Enable)
{
#define FUSE_
}
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
#ifdef FUSE_
"vadd.f32 q0, q0, q1 \n"
#endif
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
#undef FUSE_
}
template void test_tmp<true>();
template void test_tmp<false>();
It doesn't seem possible to write code like function test_tmp(). Does anyone know how to solve my problem? Thanks a lot.
If you use C temporaries and output operands for all live registers in the first half that line up with input constraints for the 2nd half, you should be able to split up your inline asm without any performance loss, especially if you use specific memory input/output constraints instead of a catch-all "memory" clobber. But it will get a lot more complicated.
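For reference, here is a minimal sketch of that splitting idea (my own code, not from the comment): the vector values live in float32x4_t temporaries between the asm statements, using the "w" register constraint, and I'm assuming the "%q" operand modifier prints the q-register name for a 128-bit "w" operand the way GCC/Clang do for 32-bit ARM. The loads and stores use intrinsics just to keep the sketch short.
#include <arm_neon.h>
template <bool Enable>
void test_split(float dst[4], const float src[4])
{
    float32x4_t s = vld1q_f32(src);
    float32x4_t d = vld1q_f32(dst);
    float32x4_t sum;
    asm("vadd.f32 %q[r], %q[a], %q[b] \n\t"
        "vadd.f32 %q[r], %q[r], %q[b] \n\t"
        : [r] "=&w" (sum)              // early-clobber: written before inputs are dead
        : [a] "w" (s), [b] "w" (d));
    if (Enable) {                      // plain C++ decides whether this asm statement exists
        asm("vadd.f32 %q[r], %q[r], %q[b] \n\t"
            : [r] "+w" (sum)
            : [b] "w" (d));
    }
    vst1q_f32(dst, sum);
}
template void test_split<true>(float *, const float *);
template void test_split<false>(float *, const float *);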
The #define inside if() obviously doesn't work, because the C preprocessor runs before the C++ compiler even looks at if() statements.
if (Enable) {
#define FUSE_ // always defined, regardless of Enable
}
But the GNU assembler has its own macro / conditional-assembly directives like .if which operate on the asm the compiler emits after making text substitutions into the asm() template, including actual numeric values for immediate input operands.
Use the bool as an input operand for an assembler .if directive
Use an "i" (Enable) input constraint. Normally the %0 or %[enable] expansion of that would be #0 or #1, because that's how to print an ARM immediate. But GCC has a %c0 / %c[enable] modifier that will print a constant without punctuation. (It's documented for x86, but works the same way for ARM and presumably all other architectures. Documentation for ARM / AArch64 operand modifiers is being worked on; I've been sitting on an email about that...)
".if %c[enable] \n\t" for [enable] "i" (c_var) will substitute as .if 0 or .if 1 into the inline-asm template, exactly what we need to make .if / .endif work at assemble time.
Full example:
template<bool Enable>
void test_tmp(float dst[4])
{
//float dst[4] = {1.0, 1.0, 1.0, 1.0};
// static const // non-static-const so we can see the memory clobber vs. dummy src stop this from optimizing away init of src[] on the stack
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
const float * src_addr = src;
asm (
"vld1.32 {q1}, [%[dst]] # dummy dst = %[dummy_memdst]\n" // hopefully they pick the same regs?
"vld1.32 {q0}, [%[src]] # dummy src = %[dummy_memsrc]\n"
"vadd.f32 q0, q0, q1 \n" // TODO: optimize to q1+q1 first, without a dep on src
"vadd.f32 q0, q0, q1 \n" // allowing q0+=q1 and q1+=q1 in parallel if we need q0 += 3*q1
// #ifdef FUSE_
".if %c[enable]\n" // %c modifier: print constant without punctuation, same as documented for x86
"vadd.f32 q0, q0, q1 \n"
".endif \n"
// #endif
"vst1.32 {q0}, [%[dst]] \n"
: [dummy_memdst] "+m" (*(float(*)[4])dst_addr)
: [src]"r"(src_addr),
[dst]"r"(dst_addr),
[enable]"i"(Enable)
, [dummy_memsrc] "m" (*(const float(*)[4])src_addr)
: "q0", "q1", "q2", "q3" //, "memory"
);
/*
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
*/
}
float dst[4] = {1.0, 1.0, 1.0, 1.0};
template void test_tmp<true>(float *);
template void test_tmp<false>(float *);
It compiles with GCC and Clang on the Godbolt compiler explorer.
With gcc, you only get the compiler's .s output, so you have to turn off some of the usual compiler-explorer filters and look through the directives. All 3 vadd.f32 instructions are there in the false version, but one of them is surrounded by .if 0 / .endif.
But clang's built-in assembler processes assembler directives internally before turning things back into asm if that output is requested. (Normally clang/LLVM goes straight to machine code, unlike gcc which always runs a separate assembler).
Just to be clear, this works with gcc and clang, but it's just easier to see it on Godbolt with clang. (Because Godbolt doesn't have a "binary" mode that actually assembles and then disassembles, except for x86). Clang output for the false version
...
vld1.32 {d2, d3}, [r0] # dummy dst = [r0]
vld1.32 {d0, d1}, [r1] # dummy src = [r1]
vadd.f32 q0, q0, q1
vadd.f32 q0, q0, q1
vst1.32 {d0, d1}, [r0]
...
Notice that clang picked the same GP register for the raw pointers as it used for the memory operand. (gcc seems to choose [sp] for src_mem, but a different reg for the pointer input that you use manually inside an addressing mode). If you hadn't forced it to have the pointers in registers, it could have used an SP-relative addressing mode with an offset for the vector loads, potentially taking advantage of ARM addressing modes.
If you're really not going to modify the pointers inside the asm (e.g. with post-increment addressing modes), then "r" input-only operands make the most sense. If we'd left in the printf loop, the compiler would have needed dst again after the asm, so it would benefit from having it still in a register. A "+r"(dst_addr) input forces the compiler to assume that that register is no longer usable as a copy of dst. Anyway, gcc always copies the registers, even when it doesn't need them later, whether I make it "r" or "+r", so that's weird.
Using (dummy) memory inputs / outputs means we can drop the volatile, so the compiler can optimize it normally as a pure function of its inputs. (And optimize it away if the result is unused.)
Hopefully this isn't worse code-gen than with the "memory" clobber. But it would probably be better if you just used the "=m" and "m" memory operands, and didn't ask for pointers in registers at all. (That doesn't help if you're going to loop over the array with inline asm, though.)
See also Looping over arrays with inline assembly
I haven't been doing ARM assembly for a few years, and I never really bothered to learn GCC inline assembly properly, but I think your code can be rewritten like this, using intrinsics:
#include <cstdio>
#include <arm_neon.h>
template<bool Enable>
void test_tmp()
{
const float32x4_t src = {1.0, 2.0, 3.0, 4.0};
const float32x4_t src2 = {1.0, 1.0, 1.0, 1.0};
float32x4_t z;
z = vaddq_f32(src, src2);
z = vaddq_f32(z, src2);
if (Enable) z = vaddq_f32(z, src2);
float result[4];
vst1q_f32(result, z);
for (int i = 0; i < 4; i++)
{
printf("%f, ", result[i]);//0.0 0.0 0.0 0.0
}
}
template void test_tmp<true>();
template void test_tmp<false>();
You can see resulting machine code + toy around live at: https://godbolt.org/z/Fg7Tci
Compiled with ARM gcc8.2 and command line options "-O3 -mfloat-abi=softfp -mfpu=neon" the "true" variant is:
void test_tmp<true>():
vmov.f32 q9, #1.0e+0 # v4sf
vldr d16, .L6
vldr d17, .L6+8
# and the FALSE variant has one less vadd.f32 in this part
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
push {r4, r5, r6, lr}
sub sp, sp, #16
vst1.32 {d16-d17}, [sp:64]
mov r4, sp
ldr r5, .L6+16
add r6, sp, #16
.L2:
vldmia.32 r4!, {s15}
vcvt.f64.f32 d16, s15
mov r0, r5
vmov r2, r3, d16
bl printf
cmp r4, r6
bne .L2
add sp, sp, #16
pop {r4, r5, r6, pc}
.L6:
.word 1065353216
.word 1073741824
.word 1077936128
.word 1082130432
.word .LC0
.LC0:
.ascii "%f, \000"
This still leaves me profoundly confused about why gcc doesn't simply compute the final output string at compile time, since the inputs are constant. Maybe it's some rule about floating-point precision preventing it from doing that at compile time, as the result could differ slightly from the actual target HW platform's FPU? I.e. with some fast-math switch it would probably drop that code completely and just produce one output string...
But I guess your code is not actually a proper "MCVE" of what you are doing, and the test values would be fed into some real function you are testing, or something like that.
Anyway, if you are working on performance optimizations, you should probably avoid inline assembly completely and use intrinsics instead, as that allows the compiler to better allocate registers and optimize the code around the calculations (I didn't track it precisely, but I think the last version of this experiment on Godbolt was 2-4 instructions shorter/simpler than the original using inline assembly).
Plus you will avoid incorrect asm constraints like the ones in your example code; those are always tricky to get right and a pure PITA to maintain if you keep modifying the inlined code often.
I used method:
val = 0.299 * R + 0.587 * G + 0.114 * B;
image.setRGBA(val, val, val, 0);
to convert bitmap (24-bit RGB888 and 32-bit RGBA8888) to grayscale successfully.
Example: BMP 24-bit: 0E 0F EF (BGR) --> 51 51 51 (Grayscale) // using above method
But it can't be applied to a 16-bit RGBA4444 bitmap.
Example: BMP 16-bit: 'A7 36' (BGRA) --> 'CE 39' (Grayscale) // ???
Anyone know how?
Are you sure you need the RGBA4444 format? Maybe you need an older format where the green channel gets 6 bits while red and blue get 5 bits each (16 bits total).
In case of the 5-6-5 format, the answer is simple.
Just do
R = (R>>3); G = (G>>2); B = (B>>3); to reduce the 24 bits to 16. Now just combine them using the | operator.
Here is a sample code in C
// Combine RGB into 16bits 565 representation. Assuming all inputs are in range of 0..255
static INT16 make565(int red, int green, int blue){
return (INT16)( ((red << 8) & 0xf800)|
((green << 3) & 0x07e0)|
((blue >> 3) & 0x001f));
}
The method above uses roughly the same structure as the regular ARGB construction method but squeezes the colors to 16 bits instead of 32 like in the following example:
// Combine RGB into 32bit ARGB representation. Assuming all inputs are in range of 0..255
static INT32 makeARGB(int red, int green, int blue){
return (INT32)((red)|(green << 8)|(blue<<16)|(0xFF000000)); // Alpha is always FF
}
If you do need RGBA4444 then the method would be a combination of the two above
// Combine RGBA into 16bit 4444 representation. Assuming all inputs are in range of 0..255
static INT16 make4444(int red, int green, int blue, int alpha){
return (INT16)((red>>4)|(green&0xF0)|((blue&0xF0)<<4)|((alpha&0xF0)<<8));
}
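For the grayscale part of the question: with 5-6-5 data you can unpack the pixel, expand each channel back to 8 bits, apply the same 0.299/0.587/0.114 formula from the question, and repack with make565 above. A hedged sketch (the helper name and the bit-replication step are my own):
// Grayscale one RGB565 pixel using the question's luma weights (integer form).
static INT16 gray565(INT16 pixel)
{
    unsigned int p  = (unsigned short)pixel;   // avoid sign issues on >>
    unsigned int r5 = (p >> 11) & 0x1F;
    unsigned int g6 = (p >> 5)  & 0x3F;
    unsigned int b5 =  p        & 0x1F;
    // expand back to 8 bits by replicating the top bits (0x1F -> 0xFF, 0x3F -> 0xFF)
    unsigned int r  = (r5 << 3) | (r5 >> 2);
    unsigned int g  = (g6 << 2) | (g6 >> 4);
    unsigned int b  = (b5 << 3) | (b5 >> 2);
    unsigned int val = (299 * r + 587 * g + 114 * b) / 1000;  // 0.299R + 0.587G + 0.114B
    return make565(val, val, val);             // pack the gray back into 565
}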
I'm trying to write a better version of OpenCV's cv::resize(), and I came across the code here: https://github.com/rmaz/NEON-Image-Downscaling/blob/master/ImageResize/BDPViewController.m
The code downsamples an image by 2, but I cannot follow the algorithm. I would like to first convert the algorithm to C, then try to modify it for learning purposes. Is it also easy to convert it to downsample by any size?
The function is:
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
const uint32_t * rowB = src + pixelsPerRow;
// force the number of pixels per row to a multiple of 8
pixelsPerRow = 8 * (pixelsPerRow / 8);
__asm__ volatile("Lresizeloop: \n" // start loop
"vld1.32 {d0-d3}, [%1]! \n" // load 8 pixels from the top row
"vld1.32 {d4-d7}, [%2]! \n" // load 8 pixels from the bottom row
"vhadd.u8 q0, q0, q2 \n" // average the pixels vertically
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 q0, q2 \n" // transpose to put the horizontally adjacent pixels in different registers
"vtrn.32 q1, q3 \n"
"vhadd.u8 q0, q0, q2 \n" // average the pixels horizontally
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 d0, d1 \n" // fill the registers with pixels
"vtrn.32 d2, d3 \n"
"vswp d1, d2 \n"
"vst1.64 {d0-d1}, [%0]! \n" // store the result
"subs %3, %3, #8 \n" // subtract 8 from the pixel count
"bne Lresizeloop \n" // repeat until the row is complete
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3", "cc"
);
}
To call it:
// downscale the image in place
for (size_t rowIndex = 0; rowIndex < height; rowIndex+=2)
{
void *sourceRow = (uint8_t *)buffer + rowIndex * bytesPerRow;
void *destRow = (uint8_t *)buffer + (rowIndex / 2) * bytesPerRow;
resizeRow(destRow, sourceRow, width);
}
The algorithm is pretty straightforward. It reads 8 pixels from the current line and 8 from the line below. It then uses the vhadd (halving-add) instruction to average the 8 pixels vertically. It then transposes the position of the pixels so that the horizontally adjacent pixel pairs are now in separate registers (arranged vertically). It then does another set of halving-adds to average those together. The results are then rearranged again to put the pixels back in their original order and written to the destination. This algorithm could be rewritten to handle different integral sizes of scaling, but as written it can only do 2x2 to 1 reduction with averaging. Here's the C code equivalent:
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
uint8_t * pSrc8 = (uint8_t *)src;
uint8_t * pDest8 = (uint8_t *)dst;
int stride = pixelsPerRow * sizeof(uint32_t);
int x;
int r, g, b, a;
for (x = 0; x < (int)(pixelsPerRow / 2); x++) // each output pixel consumes 2 source pixels
{
r = pSrc8[0] + pSrc8[4] + pSrc8[stride+0] + pSrc8[stride+4];
g = pSrc8[1] + pSrc8[5] + pSrc8[stride+1] + pSrc8[stride+5];
b = pSrc8[2] + pSrc8[6] + pSrc8[stride+2] + pSrc8[stride+6];
a = pSrc8[3] + pSrc8[7] + pSrc8[stride+3] + pSrc8[stride+7];
pDest8[0] = (uint8_t)((r + 2)/4); // average with rounding
pDest8[1] = (uint8_t)((g + 2)/4);
pDest8[2] = (uint8_t)((b + 2)/4);
pDest8[3] = (uint8_t)((a + 2)/4);
pSrc8 += 8; // skip forward 2 source pixels
pDest8 += 4; // skip forward 1 destination pixel
}
}
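As for the "downsample by any size" part: the same idea generalizes to an N x N box average. Here is a plain-C, non-NEON sketch (my own function, assuming 32-bit RGBA pixels and source dimensions divisible by N):
#include <stdint.h>
static void downscaleByN(uint32_t *dst, const uint32_t *src,
                         int srcWidth, int srcHeight, int N)
{
    int dstWidth  = srcWidth  / N;
    int dstHeight = srcHeight / N;
    for (int dy = 0; dy < dstHeight; dy++) {
        for (int dx = 0; dx < dstWidth; dx++) {
            int sum[4] = {0, 0, 0, 0};                 // per-channel accumulators
            for (int sy = 0; sy < N; sy++) {
                const uint8_t *p =
                    (const uint8_t *)(src + (dy * N + sy) * srcWidth + dx * N);
                for (int sx = 0; sx < N; sx++)
                    for (int c = 0; c < 4; c++)
                        sum[c] += p[sx * 4 + c];
            }
            uint8_t *q = (uint8_t *)(dst + dy * dstWidth + dx);
            for (int c = 0; c < 4; c++)
                q[c] = (uint8_t)((sum[c] + N * N / 2) / (N * N));  // rounded mean
        }
    }
}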
I'm optimizing an element wise multiplication of two single dimensional arrays for a dual Cortex-A9 processor. Linux is running on the board and I'm using the GCC 4.5.2 compiler.
So the following is my C++ inline assembler function. src1, src2 and dst are 16 byte aligned.
Update: Testable Code:
void Multiply(
const float* __restrict__ src1,
const float* __restrict__ src2,
float* __restrict__ dst,
const unsigned int width,
const unsigned int height)
{
int loopBound = (width * height) / 4;
asm volatile(
".loop: \n\t"
"vld1.32 {q1}, [%[src1]:128]! \n\t"
"vld1.32 {q2}, [%[src2]:128]! \n\t"
"vmul.f32 q0, q1, q2 \n\t"
"vst1.32 {q0}, [%[dst]:128]! \n\t"
"subs %[lBound], %[lBound], $1 \n\t"
"bge .loop \n\t"
:
:[dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2),
[lBound] "r" (loopBound)
:"memory", "d0", "d1", "d2", "d3", "d4", "d5"
);
}
//The following function describes how to test the element wise multiplication
void Test()
{
const unsigned int width = 1024, height = 1024;
float* src1 __attribute__((aligned(16))) = new float[width * height];
float* src2 __attribute__((aligned(16))) = new float[width * height];
float* dst __attribute__((aligned(16))) = new float[width * height];
for(unsigned int i = 0; i < (width * height); i++)
{
src1[i] = (float)rand();
src2[i] = (float)rand();
}
Multiply(src1, src2, dst, width, height);
std::cout << dst[0] << std::endl;
}
The calculation of 1024*1024 values takes ~0.016 s. (Two threads - each thread calculates a half of the array). Naively interpreted, the calculation of one iteration takes 122 cycles. This seems to be a bit slow. But where is the bottleneck?
I even tried the pld command for preloading elements into the L2 cache, "unrolling" the loop by calculating up to 20 values per iteration, and reordering the instructions to make sure the processor is not waiting for memory. I didn't get much speedup (at most 0.001 s faster).
Do you have any suggestions for speeding up the calculation?
I don't really know that much about the NEON. However, I think that you have data dependencies that cause performance issues. I would suggest you prime the loop with some loads and then place them between the multiply and store. I think the store is probably blocking until the multiply is done.
asm volatile(
"vld1.32 {q1}, [%[src1]:128]! \n\t"
"vld1.32 {q2}, [%[src2]:128]! \n\t"
".loop: \n\t"
"vmul.f32 q0, q1, q2 \n\t"
"vld1.32 {q1}, [%[src1]:128]! \n\t"
"vld1.32 {q2}, [%[src2]:128]! \n\t"
"vst1.32 {q0}, [%[dst]:128]! \n\t"
"subs %[lBound], %[lBound], $1 \n\t"
"bge .loop \n\t"
:
:[dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2),
[lBound] "r" (loopBound)
:"memory", "d0", "d1", "d2", "d3", "d4", "d5"
);
This way you should be able to overlap the loads with the multiply. You will need to over-allocate the source arrays or change the loop index and do a final multiply and store. If the NEON ops are not affecting the condition codes, you can reorder the subs as well and place it earlier.
Edit: In fact, the Cortex-A9 Media Processing Engine document recommends interleaving ARM and NEON instructions, as they can execute in parallel. Also, the NEON instructions seem to set FPSCR and not the ARM CPSR, so reordering the subs would decrease the execution time. You may also cache-align the loop.
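For example, here is a hedged sketch of those suggestions applied to the loop above (my rewrite, untested, reusing the question's variable names): the subs is issued right after the prefetches so the ARM-side flag update can overlap the NEON multiply, and a numeric local label avoids duplicate-label problems if the code gets inlined more than once. As with the version above, the loads run one iteration ahead, so the source arrays must be over-allocated by one vector, and the prefetch distance is just a guess to be tuned.
asm volatile(
    "vld1.32  {q1}, [%[src1]:128]!     \n\t"   // prime the first iteration
    "vld1.32  {q2}, [%[src2]:128]!     \n\t"
    "1:                                \n\t"
    "pld      [%[src1], #128]          \n\t"   // prefetch ahead
    "pld      [%[src2], #128]          \n\t"
    "subs     %[lBound], %[lBound], #1 \n\t"   // set flags early; NEON doesn't touch CPSR
    "vmul.f32 q0, q1, q2               \n\t"
    "vld1.32  {q1}, [%[src1]:128]!     \n\t"   // load next iteration's inputs
    "vld1.32  {q2}, [%[src2]:128]!     \n\t"
    "vst1.32  {q0}, [%[dst]:128]!      \n\t"
    "bgt      1b                       \n\t"
    : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
      [lBound] "+r" (loopBound)                // "+r": the asm modifies these
    :
    : "memory", "q0", "q1", "q2", "cc");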
I am using c++ , I want to do alpha blend using the following code.
#define CLAMPTOBYTE(color) \
if ((color) & (~255)) { \
color = (BYTE)((-(color)) >> 31); \
} else { \
color = (BYTE)(color); \
}
#define GET_BYTE(accessPixel, x, y, scanline, bpp) \
((BYTE*)((accessPixel) + (y) * (scanline) + (x) * (bpp)))
for (int y = top ; y < bottom; ++y)
{
BYTE* resultByte = GET_BYTE(resultBits, left, y, stride, bytepp);
BYTE* srcByte = GET_BYTE(srcBits, left, y, stride, bytepp);
BYTE* srcByteTop = GET_BYTE(srcBitsTop, left, y, stride, bytepp);
BYTE* maskCurrent = GET_GREY(maskSrc, left, y, width);
int alpha = 0;
int red = 0;
int green = 0;
int blue = 0;
for (int x = left; x < right; ++x)
{
alpha = *maskCurrent;
red = (srcByteTop[R] * alpha + srcByte[R] * (255 - alpha)) / 255;
green = (srcByteTop[G] * alpha + srcByte[G] * (255 - alpha)) / 255;
blue = (srcByteTop[B] * alpha + srcByte[B] * (255 - alpha)) / 255;
CLAMPTOBYTE(red);
CLAMPTOBYTE(green);
CLAMPTOBYTE(blue);
resultByte[R] = red;
resultByte[G] = green;
resultByte[B] = blue;
srcByte += bytepp;
srcByteTop += bytepp;
resultByte += bytepp;
++maskCurrent;
}
}
However, I find it is still slow: it takes about 40-60 ms to compose two 600 × 600 images.
Is there any method to improve the speed to less than 16 ms?
Can anybody help me speed up this code? Many thanks!
Use SSE - start around page 131.
The basic workflow:
1. Load 4 pixels from src (16 one-byte values): RGBA RGBA RGBA RGBA (streaming load).
2. Load 4 more which you want to blend with srcbytetop: RGBx RGBx RGBx RGBx.
3. Do some swizzling so that the A term from step 1 fills every slot, i.e. xxxA xxxB xxxC xxxD -> AAAA BBBB CCCC DDDD. In my solution below I opted instead to re-use your existing "maskcurrent" array, but having alpha integrated into the "A" field of step 1 would require fewer loads from memory and thus be faster. Swizzling in this case would probably be: AND with a mask to select A, B, C, D; shift right 8; OR with the original; shift right 16; OR again.
4. Subtract the above from a vector of all 255s, giving 255 - alpha in every slot.
5. Multiply step 1 by step 4 (source with 255 - alpha) and step 2 by step 3 (top with alpha). You should be able to use the "multiply and discard bottom 8 bits" SSE2 instruction for this.
6. Add those two products (from step 5) together.
7. Store the result somewhere else (if possible) or on top of your destination (if you must).
Here is a starting point for you:
#include <emmintrin.h> // SSE2 intrinsics (_mm_load_si128, _mm_set_epi8, ...)
//Define your image with __declspec(align(16)) i.e char __declspec(align(16)) image[640*480]
// so the first byte is aligned correctly for SIMD.
// Stride must be a multiple of 16.
for (int y = top ; y < bottom; ++y)
{
BYTE* resultByte = GET_BYTE(resultBits, left, y, stride, bytepp);
BYTE* srcByte = GET_BYTE(srcBits, left, y, stride, bytepp);
BYTE* srcByteTop = GET_BYTE(srcBitsTop, left, y, stride, bytepp);
BYTE* maskCurrent = GET_GREY(maskSrc, left, y, width);
for (int x = left; x < right; x += 4)
{
//If you can't align, use _mm_loadu_si128()
// Step 1
__m128i src = _mm_load_si128(reinterpret_cast<__m128i*>(srcByte));
// Step 2
__m128i srcTop = _mm_load_si128(reinterpret_cast<__m128i*>(srcByteTop));
// Step 3
// Fill the 4 positions for the first pixel with maskCurrent[0], etc
// Could do better with shifts and so on, but this is clear
__m128i mask = _mm_set_epi8(maskCurrent[0],maskCurrent[0],maskCurrent[0],maskCurrent[0],
maskCurrent[1],maskCurrent[1],maskCurrent[1],maskCurrent[1],
maskCurrent[2],maskCurrent[2],maskCurrent[2],maskCurrent[2],
maskCurrent[3],maskCurrent[3],maskCurrent[3],maskCurrent[3]);
// step 4
__m128i maskInv = _mm_subs_epu8(_mm_set1_epi8((char)255), mask);
//Todo : Multiply, with saturate - find correct instructions for 4..6
//note you can use Multiply and add _mm_madd_epi16
alpha = *maskCurrent;
red = (srcByteTop[R] * alpha + srcByte[R] * (255 - alpha)) / 255;
green = (srcByteTop[G] * alpha + srcByte[G] * (255 - alpha)) / 255;
blue = (srcByteTop[B] * alpha + srcByte[B] * (255 - alpha)) / 255;
CLAMPTOBYTE(red);
CLAMPTOBYTE(green);
CLAMPTOBYTE(blue);
resultByte[R] = red;
resultByte[G] = green;
resultByte[B] = blue;
//----
// Step 7 - store result.
//Store aligned if output is aligned on 16 byte boundrary
_mm_store_si128(reinterpret_cast<__m128i*>(resultByte), result);
//Slow version if you can't guarantee alignment
//_mm_storeu_si128(reinterpret_cast<__m128i*>(resultByte), result);
//Move pointers forward 4 places
srcByte += bytepp * 4;
srcByteTop += bytepp * 4;
resultByte += bytepp * 4;
maskCurrent += 4;
}
}
To find out which AMD processors will run this code (currently it is using SSE2 instructions), see Wikipedia's List of AMD Turion microprocessors. You could also look at other lists of processors on Wikipedia, but my research shows that AMD CPUs from around 4 years ago all support at least SSE2.
You should expect a good SSE2 implementation to run around 8-16 times faster than your current code. That is because we eliminate branches in the loop, process 4 pixels (or 12 channels) at once, and improve cache performance by using streaming instructions. As an alternative to SSE, you could probably make your existing code run much faster by eliminating the if checks you are using for saturation. Beyond that I would need to run a profiler on your workload.
Of course, the best solution is to use hardware support (i.e code your problem up in DirectX) and have it done on the video card.
You can always apply the alpha to red and blue at the same time. You can also use this trick with the SIMD implementation mentioned before.
unsigned int blendPreMulAlpha(unsigned int colora, unsigned int colorb, unsigned int alpha)
{
unsigned int rb = (colora & 0xFF00FF) + ( (alpha * (colorb & 0xFF00FF)) >> 8 );
unsigned int g = (colora & 0x00FF00) + ( (alpha * (colorb & 0x00FF00)) >> 8 );
return (rb & 0xFF00FF) + (g & 0x00FF00);
}
unsigned int blendAlpha(unsigned int colora, unsigned int colorb, unsigned int alpha)
{
unsigned int rb1 = ((0x100 - alpha) * (colora & 0xFF00FF)) >> 8;
unsigned int rb2 = (alpha * (colorb & 0xFF00FF)) >> 8;
unsigned int g1 = ((0x100 - alpha) * (colora & 0x00FF00)) >> 8;
unsigned int g2 = (alpha * (colorb & 0x00FF00)) >> 8;
return ((rb1 | rb2) & 0xFF00FF) + ((g1 | g2) & 0x00FF00);
}
0 <= alpha <= 0x100
For people who want to divide by 255, I found a perfect formula:
pt->r = (r+1 + (r >> 8)) >> 8; // fast way to divide by 255
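Wrapped in a helper for clarity (the wrapper is mine, the expression is the same):
// Exact x/255 for 0 <= x <= 65534, which covers any product of two 8-bit
// values (x <= 255*255) as used in the blends above.
static inline unsigned int div255(unsigned int x)
{
    return (x + 1 + (x >> 8)) >> 8;
}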
Here are some pointers.
Consider using pre-multiplied foreground images as described by Porter and Duff. As well as potentially being faster, you avoid a lot of potential colour-fringing effects.
The compositing equation changes from
r = kA + (1-k)B
... to ...
r = A + (1-k)B
Alternatively, you can rework the standard equation to remove one multiply.
r = kA + (1-k)B
== kA + B - kB
== k(A-B) + B
I may be wrong, but I think you shouldn't need the clamping either...
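As a per-channel sketch of that last form (the helper name is mine), which also shows why the clamp isn't needed: the result always lands between A and B.
// r = k*(A - B) + B: one multiply per channel; k is the alpha in 0..255.
static inline unsigned char blend_one(unsigned char A, unsigned char B, int k)
{
    return (unsigned char)(B + (k * (A - B)) / 255);   // A - B may be negative
}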
I can't comment because I don't have enough reputation, but I want to say that Jasper's version will not overflow for valid input.
Masking the multiplication result is necessary because otherwise the red+blue multiplication would leave bits in the green channel (this would also be true if you multiplied red and blue separately, you'd still need to mask out bits in the blue channel) and the green multiplication would leave bits in the blue channel.
These are bits that are lost to right shift if you separate the components out, as is often the case with alpha blending.
So they're not overflow, or underflow. They're just useless bits that need to be masked out to achieve expected results.
That said, Jasper's version is incorrect. It should be 0xFF-alpha (255-alpha), not 0x100-alpha (256-alpha). This would probably not produce a visible error.
I've found an adaptation of Jasper's code to be faster than my old alpha blending code, which was already decent, and am currently using it in my software renderer project. I work with 32-bit ARGB pixels:
Pixel AlphaBlendPixels(Pixel p1, Pixel p2)
{
static const int AMASK = 0xFF000000;
static const int RBMASK = 0x00FF00FF;
static const int GMASK = 0x0000FF00;
static const int AGMASK = AMASK | GMASK;
static const int ONEALPHA = 0x01000000;
unsigned int a = (p2 & AMASK) >> 24;
unsigned int na = 255 - a;
unsigned int rb = ((na * (p1 & RBMASK)) + (a * (p2 & RBMASK))) >> 8;
unsigned int ag = (na * ((p1 & AGMASK) >> 8)) + (a * (ONEALPHA | ((p2 & GMASK) >> 8)));
return ((rb & RBMASK) | (ag & AGMASK));
}
Not exactly answering the question, but...
One thing is to do it fast, the other is to do it right.
Alpha compositing is a dangerous beast: it looks straightforward and intuitive, but common errors have been widespread for decades without (almost) anybody noticing!
The most famous and common mistake is NOT using premultiplied alpha. I highly recommend this: Alpha Blending for Leaves
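Concretely, "premultiplied" just means each color channel has already been scaled by its own alpha once, up front, so the blend itself needs one multiply less per channel. A one-time conversion helper as a sketch (mine, assuming a 32-bit ARGB layout):
static inline unsigned int premultiply(unsigned int argb)
{
    unsigned int a = (argb >> 24) & 0xFF;
    unsigned int r = (((argb >> 16) & 0xFF) * a) / 255;
    unsigned int g = (((argb >> 8)  & 0xFF) * a) / 255;
    unsigned int b = (( argb        & 0xFF) * a) / 255;
    return (a << 24) | (r << 16) | (g << 8) | b;
}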
You can use 4 bytes per pixel in both images (for memory alignment), and then use SSE instructions to process all channels together. Search "visual studio sse intrinsics".
First of all, let's use the proper formula for each color component.
You start with this:
v = ( 1-t ) * v0 + t * v1
where
t=interpolation parameter [0..1]
v0=source color value
v1=transfer color value
v=output value
Reshuffling the terms, we can reduce the number of operations:
v = v0 + t * (v1 - v0)
You would need to perform this calculation once per color channel (3 times for RGB).
For 8-bit unsigned color components, you need to use correct fixed point math:
i = i0 + ( t * ( i1 - i0 ) + 127 ) / 255
where
t = interpolation parameter [0..255]
i0= source color value [0..255]
i1= transfer color value [0..255]
i = output color
If you leave out the +127 then your colors will be biased towards the darker end. Very often, people use /256 or >> 8 for speed. This is not correct! If you divide by 256, you will never be able to reach pure white (255,255,255) because 255/256 is slightly less than one.
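Written out as code (the helper name is mine; I also sign-match the 127 bias so negative differences round the same way as positive ones):
// i = i0 + (t*(i1 - i0) + 127) / 255, with t in 0..255 and i0, i1 in 0..255.
static inline int lerp_u8(int i0, int i1, int t)
{
    int num = t * (i1 - i0);
    return i0 + (num + (num >= 0 ? 127 : -127)) / 255;
}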
I hope this helps.
I think hardware support will help you. Try to move the logic from software to hardware if feasible.
I've done similar code in unsafe C#. Is there any reason you aren't looping through each pixel directly? Why use all the BYTE* and GET_BYTE() calls? That is probably part of the speed issue.
What does GET_GRAY look like?
More importantly, are you sure your platform doesn't expose alpha blending capabilities? What platform are you targeting? Wiki informs me that the following support it out of the box:
Mac OS X
Windows 2000, XP, Server 2003, Windows CE, Vista and Windows 7
The XRender extension to the X Window System (this includes modern Linux systems)
RISC OS Adjust
QNX Neutrino
Plan 9
Inferno
AmigaOS 4.1
BeOS, Zeta and Haiku
Syllable
MorphOS
The main problem will be the poor loop construct, possibly made worse by a compiler failing to eliminate CSEs. Move the truly common bits outside the loops. int red isn't common, though - that should be inside the inner loop.
Furthermore, red, green and blue are independent. If you calculate them in turn, you don't need to keep interim red results in registers when you are calculating green results. This is especially important on CPUs with limited registers like x86.
There will be only a limited number of values allowed for bytepp. Make it a template parameter, and then call the right instantiation from a switch. This will produce multiple copies of your function, but each can be optimized a lot better.
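For instance, a minimal sketch of that dispatch pattern (the names and the simplified loop body are mine; it assumes the three color channels come first in each pixel):
typedef unsigned char BYTE;   // or use the existing Windows BYTE
template <int BytePP>
static void blendRow(BYTE* dst, const BYTE* src, const BYTE* top,
                     const BYTE* mask, int pixels)
{
    for (int x = 0; x < pixels; ++x) {
        int alpha = mask[x];
        for (int c = 0; c < 3 && c < BytePP; ++c)            // R, G, B only
            dst[c] = (BYTE)((top[c] * alpha + src[c] * (255 - alpha)) / 255);
        dst += BytePP; src += BytePP; top += BytePP;          // constant stride
    }
}
static void blendRowDispatch(int bytepp, BYTE* dst, const BYTE* src,
                             const BYTE* top, const BYTE* mask, int pixels)
{
    switch (bytepp) {                                         // one optimized copy per bpp
        case 3: blendRow<3>(dst, src, top, mask, pixels); break;
        case 4: blendRow<4>(dst, src, top, mask, pixels); break;
        default: break;                                       // unsupported bpp
    }
}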
As noted, clamping is not needed. In alphablending, you're creating a linear combination of two images a[x][y] and b[x][y]. Since 0<=alpha<=255, you know that each output is bound by max(255*a[x][y], 255*b[x][y]). And since your output range is the same as both input ranges (0-255), this is OK.
With a small loss of precision, you could calculate (a[x][y]*alpha + b[x][y]*(256-alpha))>>8. Bitshifts are often faster than division.
Depending on the target architecture, you could try either vectorize or parallellize the function.
Other than that, try to linearize the whole method (i.e. no loop-in-loop) and work with a quadruple of bytes at once, that would lose the overhead of working with single bytes plus make it easier for the compiler to optimize the code.
Move it to the GPU.
I am assuming that you want to do this in a completely portable way, without the help of a GPU or a proprietary Intel SIMD library (which may not work as efficiently on AMD processors).
Put the following in place of your calculation for RGB:
R = TopR + ((SourceR * alpha) >> 8);
G = TopG + ((SourceG * alpha) >> 8);
B = TopB + ((SourceB * alpha) >> 8);
It is a more efficient calculation.
Also, use a shift-left instruction in your get-pixel macro instead of multiplying by the BPP.
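For example (the macro name is mine, and note that a modern compiler will usually do this strength reduction itself once bpp is a compile-time constant):
// Variant of the question's GET_BYTE for the common bytepp == 4 case:
// (x) * bpp becomes (x) << 2.
#define GET_BYTE4(accessPixel, x, y, scanline) \
    ((BYTE*)((accessPixel) + (y) * (scanline) + ((x) << 2)))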
This one works when the first color (colora, the destination) also has an alpha channel (blending two transparent ARGB colors).
The alpha is in the second color's alpha (colorb, the source)
This adds the two alphas (0 = transparent, 255 = fully opaque)
It is a modified version of Jasper Bekkers' answer.
I use it to blend transparent pixel art on to a transparent screen.
Uint32 alphaBlend(unsigned int colora, unsigned int colorb) {
unsigned int a2 = (colorb & 0xFF000000) >> 24;
unsigned int alpha = a2;
if (alpha == 0) return colora;
if (alpha == 255) return colorb;
unsigned int a1 = (colora & 0xFF000000) >> 24;
unsigned int nalpha = 0x100 - alpha;
unsigned int rb1 = (nalpha * (colora & 0xFF00FF)) >> 8;
unsigned int rb2 = (alpha * (colorb & 0xFF00FF)) >> 8;
unsigned int g1 = (nalpha * (colora & 0x00FF00)) >> 8;
unsigned int g2 = (alpha * (colorb & 0x00FF00)) >> 8;
unsigned int anew = a1 + a2;
if (anew > 255) {anew = 255;}
return ((rb1 + rb2) & 0xFF00FF) + ((g1 + g2) & 0x00FF00) + (anew << 24);
}
Here's my adaptation of a software alpha blend that works well for two unsigned integers.
My code differs a bit as the code above is basically always assuming the destination alpha is 255.
With a decent optimizing compiler, most calculations should stay in registers, as the scope of most variables is very short. I also opted to progressively shift the result << 8 incrementally to avoid << 24, << 16 when putting the ARGB back together. I know it was a long time ago... but I remember that on the 286, the cycle count for a shift was (1 + 1 per bit shifted), so I assume there is still some sort of penalty for larger shifts.
Also... instead of "/ 255" I opted for ">> 8" which can be changed as desired.
/*
alpha blend source and destination, either may have an alpha!!!!
Src AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB
Dest AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB
res AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB
NOTE - α = αsrc + αdest(1.0-αsrc) where α = 0.0 - 1.0
ALSO - DWORD is unsigned int so (F8000000 >> 24) = F8 not FFFFFFF8 as it would with int (signed)
*/
inline DWORD raw_blend(const DWORD src, const DWORD dest)
{
// setup and calculate α
DWORD src_a = src >> 24;
DWORD src_a_neg = 255 - src_a;
DWORD dest_a = dest >> 24;
DWORD res = src_a + ((dest_a * src_a_neg) >> 8);
// setup and calculate R
DWORD src_r = (src >> 16) & 255;
DWORD dest_r = (dest >> 16) & 255;
res = (res << 8) | (((src_r * src_a) + (dest_r * src_a_neg)) >> 8);
// setup and calculate G
DWORD src_g = (src >> 8) & 255;
DWORD dest_g = (dest >> 8) & 255;
res = (res << 8) | (((src_g * src_a) + (dest_g * src_a_neg)) >> 8);
// setup and calculate B
DWORD src_b = src & 255;
DWORD dest_b = dest & 255;
return (res << 8) | (((src_b * src_a) + (dest_b * src_a_neg)) >> 8);
}
; In\ EAX = background color (ZRBG) 32bit (Z means zero; it is always zero)
; In\ EDX = foreground color (RBGA) 32bit
; Out\ EAX = new color
; free registers (R10, RDI, RSI, RSP, RBP)
abg2:
mov r15b, dl ; av
movzx ecx, dl
not ecx ; faster than 255 - dl
mov r14b, cl ; rem
shr edx, 8
and edx, 0x00FFFFFF
mov r12d, edx
mov r13d, eax ; RBGA ---> ZRGB
; s: eax
; d: edx
;=============================red = ((s >> 16) * rem + (d >> 16) * av) >> 8;
mov edx, r12d
shr edx, 0x10
movzx eax, r14b
imul edx, eax
mov ecx, r13d
shr ecx, 0x10
movzx eax, r15b
imul eax, ecx
lea eax, [eax + edx] ; faster than add eax, edx
shr eax, 0x8
mov r9b, al
shl r9d, 8
;=============================green = (((s >> 8) & 0x0000ff) * rem + ((d >> 8) & 0x0000ff) * av) >> 8;
mov eax, r12d
shr eax, 0x8
movzx edx, al
movzx eax, r14b
imul edx, eax
mov eax, r13d
shr eax, 0x8
movzx ecx, al
movzx eax, r15b
imul eax, ecx
lea eax, [eax + edx] ; faster than add eax, edx
shr eax, 0x8
mov r9b, al
shl r9d, 8
;=============================blue = ((s & 0x0000ff) * rem + (d & 0x0000ff) * av) >> 8;
movzx edx, r12b
movzx eax, r14b
imul edx, eax
movzx ecx, r13b
movzx eax, r15b
imul eax, ecx
lea eax, [eax + edx] ; faster than add eax, edx
shr eax, 0x8
mov r9b, al
mov eax, r9d
ret