I have 2 pyrDown implementation with SSE2 and AVX instructions set. They are differ and AVX implementation get wrong image result. Also AVX implementation is slower that SSE2 impl. It's strange. Whats wrong with AVX implementation and how it make faster?
// SSE2 implementation
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
__m128i v0 = _mm_load_si128((const __m128i *)src);
__m128i v1 = _mm_load_si128((const __m128i *)&src[srcStep]);
return _mm_avg_epu8(v0, v1);
}
// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
void average2Rows(const uint8_t* __restrict__ src,
uint8_t*__restrict__ dst,
size_t srcStep,
size_t size)
{
const __m128i vk1 = _mm_set1_epi8(1);
const __m128i add2 = _mm_set1_epi16(2);
size_t dstsize = size/2;
for (size_t i = 0; i < dstsize - 15; i += 16)
{
const size_t ii = i*2;
// based on https://stackoverflow.com/a/45564565/820795
__m128i left = average2RowsSingle(src+ii, srcStep);
__m128i right = average2RowsSingle(src+ii+16, srcStep);
__m128i w0 = _mm_maddubs_epi16(left, vk1); // unpack and horizontal add
__m128i w1 = _mm_maddubs_epi16(right, vk1);
w0 = _mm_srli_epi16(w0, 1); // divide by 2
w1 = _mm_srli_epi16(w1, 1);
w0 = _mm_packus_epi16(w0, w1); // pack
_mm_storeu_si128((__m128i *)&dst[i], w0);
}
}
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
auto v0 = _mm256_load_si256((const __m256i*)src);
auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
return _mm256_avg_epu8(v0, v1);
}
void average2Rows(const uint8_t* __restrict__ src,
uint8_t*__restrict__ dst,
size_t srcStep,
size_t size) {
const __m128i vk1 = _mm_set1_epi8(1);
size_t dstsize = size/2;
const signed char o = -1; // make shuffle zero
const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
for (size_t i = 0; i < dstsize - 31; i += 32)
{
const size_t ii = i * 2;
auto left = average2RowsSingle(src + ii, srcStep);
auto right = average2RowsSingle(src + ii + 32, srcStep);
auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
w0 = _mm256_shuffle_epi8(right, vec_r_i16);
w1 = _mm256_shuffle_epi8(right, vec_l_i16);
right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
left = _mm256_packus_epi16(left, right);
_mm256_storeu_si256((__m256i *) &dst[i], left);
}
}
Wrong result after AVX implementation:
With help of #chtz I come up to this code:
inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
auto v0 = _mm256_loadu_si256((const __m256i *)src);
auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
return _mm256_avg_epu8(v0, v1);
}
void average2Rows(const uint8_t* __restrict__ src,
uint8_t*__restrict__ dst,
size_t srcStep,
size_t size) {
const auto vk1 = _mm256_set1_epi8(1);
const size_t dstSize = size/2;
for (size_t i = 0; i < dstSize - 31; i += 32)
{
const size_t ii = i * 2;
// based on https://stackoverflow.com/a/45564565/820795
auto left = average2RowsSingle(src + ii, srcStep);
auto right = average2RowsSingle(src + ii + 32, srcStep);
auto w0 = _mm256_maddubs_epi16(left, vk1); // unpack and horizontal add
auto w1 = _mm256_maddubs_epi16(right, vk1);
w0 = _mm256_srli_epi16(w0, 1); // divide by 2
w1 = _mm256_srli_epi16(w1, 1);
w0 = _mm256_packus_epi16(w0, w1); // pack
w0 = _mm256_permute4x64_epi64(w0, 0xd8); // shuffle to get correct order
_mm256_storeu_si256((__m256i *)&dst[i], w0);
}
}
Result image:
Related
I'd like to implement a fast correlation coefficient computation using SSE/AVX2. The operands are two unsigned char vectors. The function should be mostly equivalent to this:
float correlate_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum1 = 0;
int sum2 = 0;
int sum11 = 0;
int sum22 = 0;
int sum12 = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2) {
sum1 += *vec1;
sum2 += *vec2;
sum11 += *vec1 * *vec1;
sum22 += *vec2 * *vec2;
sum12 += *vec1 * *vec2;
}
double mean1 = double(sum1) / double(length);
double mean2 = double(sum2) / double(length);
double mean11 = double(sum11) / double(length);
double mean22 = double(sum22) / double(length);
double mean12 = double(sum12) / double(length);
double b = (mean11 - mean1 * mean1) * (mean22 - mean2 * mean2);
if (b <= 0.0)
return 0.0f;
double a = (mean12 - mean1 * mean2);
return float(a / sqrt(b));
}
The parameter length ranges from 1 to less than 1000.
In order to do this, I researched on how to implement an inner product of two unsigned byte arrays. However, I could not come up with a solution that does not involve converting all the unsigned 8 bit values to signed 16 bit values.
The intrinsic _mm256_maddubs_epi16(a, b) expects b to be a signed byte. This would not be a problem in this case since subtracting some constant (here: 127) from b does not change the correlation coefficient. Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
// vec: const unsigned char*
auto x = _mm256_load_si256((const __m256i*) vec);
auto v = _mm256_set1_epi8(127);
// wrong if vec[i] is less than 127:
auto x_centered = _mm256_sub_epi8 (x, v);
What would be the best approach here to compute inner products (and finally a correlation coefficient)?
Addendum:
Below is my current implementation of a pure inner product. I decided to convert to 16 bit integer to avoid overflow errors.
Update: Changed from reading 128 bits to 256 bits at a time.
int accumulate_i32(__m256i x)
{
auto tmp1 = _mm256_srli_si256(x, 8);
x = _mm256_add_epi32(x, tmp1);
auto tmp2 = _mm256_extractf128_si256(x, 1);
tmp2 = _mm_add_epi32(tmp2, _mm256_castsi256_si128(x));
return _mm_cvtsi128_si32(tmp2) + _mm_extract_epi32(tmp2, 1);
}
int inner_product_avx(const unsigned char* vec1, const unsigned char* vec2, unsigned int length)
{
constexpr unsigned int memoryAlignmentBytes = 32;
constexpr unsigned int bytesPerPack = 256 / 8;
assert((reinterpret_cast<std::uintptr_t>(vec1) % memoryAlignmentBytes) == 0);
assert((reinterpret_cast<std::uintptr_t>(vec2) % memoryAlignmentBytes) == 0);
// compute middle part via AVX2
unsigned int packCount = length / bytesPerPack;
const __m256i zeros = _mm256_setzero_si256();
auto sumlo = _mm256_setzero_si256();
auto sumhi = _mm256_setzero_si256();
for (unsigned int packIdx = 0; packIdx < packCount; ++packIdx) {
auto x1 = _mm256_load_si256((const __m256i*)vec1);
auto x2 = _mm256_load_si256((const __m256i*)vec2);
auto x1lo = _mm256_unpacklo_epi8(x1, zeros);
auto x1hi = _mm256_unpackhi_epi8(x1, zeros);
auto x2lo = _mm256_unpacklo_epi8(x2, zeros);
auto x2hi = _mm256_unpackhi_epi8(x2, zeros);
auto tmplo = _mm256_madd_epi16(x1lo, x2lo);
auto tmphi = _mm256_madd_epi16(x1hi, x2hi);
sumlo = _mm256_add_epi32(sumlo, tmplo);
sumhi = _mm256_add_epi32(sumhi, tmphi);
vec1 += bytesPerPack;
vec2 += bytesPerPack;
}
int sum = accumulate_i32(sumlo) + accumulate_i32(sumhi);
// compute remaining part that cannot be represented as a
// whole packed integer
unsigned int packRestCount = length % bytesPerPack;
for (size_t i = packRestCount; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
This takes roughly 20 % of the time of the simple C++ implementation (see below). Considering the fact that the AVX code works on 16 16-bit integers simultaneously, I would have expected a higher gain. - Is this reasonable or did I miss something?
Unrolling the last loop in the AVX code did not descrease computation time.
int inner_product_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
I would start from something like that. It uses 32-bit accumulators just like your current code. Untested.
namespace
{
// Compute sum of the 64-bit lanes, convert to double
inline double hadd_epi64( const __m256i i64 )
{
__m128i res = _mm256_castsi256_si128( i64 );
res = _mm_add_epi64( res, _mm256_extractf128_si256( i64, 1 ) );
res = _mm_add_epi64( res, _mm_unpackhi_epi64( res, res ) );
return (double)_mm_cvtsi128_si64( res );
}
// Convert 32-bit lanes into 64-bit, compute sum of the 8, convert to double
inline double hadd_epu32( __m256i x )
{
const __m256i zero = _mm256_setzero_si256();
__m256i i64 = _mm256_unpacklo_epi32( x, zero );
i64 = _mm256_add_epi64( i64, _mm256_unpackhi_epi32( x, zero ) );
return hadd_epi64( i64 );
};
}
class InnerProduct
{
// These fields are interpreted as 64-bit integers
__m256i a, b;
// These fields are interpreted as 32-bit integers
__m256i aa, bb, ab;
// Accumulate products of 16-bit numbers, 16 of them at once
inline void add16( __m256i x, __m256i y )
{
const __m256i x2 = _mm256_madd_epi16( x, x );
const __m256i y2 = _mm256_madd_epi16( y, y );
const __m256i prod = _mm256_madd_epi16( x, y );
aa = _mm256_add_epi32( aa, x2 );
bb = _mm256_add_epi32( bb, y2 );
ab = _mm256_add_epi32( ab, prod );
}
public:
InnerProduct()
{
a = b = aa = bb = ab = _mm256_setzero_si256();
}
// Handle 32 bytes
inline void addBytes( __m256i x, __m256i y )
{
// Accumulate values
const __m256i zero = _mm256_setzero_si256();
a = _mm256_add_epi64( a, _mm256_sad_epu8( x, zero ) );
b = _mm256_add_epi64( b, _mm256_sad_epu8( y, zero ) );
// Split the vectors into 2 sets of 16-bit numbers, accumulate products
const __m256i z = _mm256_unpacklo_epi8( x, zero );
const __m256i w = _mm256_unpacklo_epi8( y, zero );
add16( z, w );
x = _mm256_unpackhi_epi8( x, zero );
y = _mm256_unpackhi_epi8( y, zero );
add16( x, y );
}
// Compute the result
float compute( size_t count ) const
{
const double div = (double)count;
const double mean1 = hadd_epi64( a ) / div;
const double mean2 = hadd_epi64( b ) / div;
const double mean11 = hadd_epu32( aa ) / div;
const double mean22 = hadd_epu32( bb ) / div;
const double mean12 = hadd_epu32( ab ) / div;
const double b = ( mean11 - mean1 * mean1 ) * ( mean22 - mean2 * mean2 );
if( b <= 0 )
return 0;
const double a = ( mean12 - mean1 * mean2 );
return float( a / sqrt( b ) );
}
};
// Load 1-31 bytes into AVX register, zero out unused higher bytes
inline __m256i loadPartial( const uint8_t* p, size_t length )
{
alignas( 32 ) std::array<uint8_t, 32> arr{};
memcpy( arr.data(), p, length );
return _mm256_load_si256( ( const __m256i* )arr.data() );
}
float correlate_simple( const uint8_t* vec1, const uint8_t* vec2, const size_t length )
{
InnerProduct ip;
const uint8_t* const vec1End = vec1 + ( ( length / 32 ) * 32 );
for( ; vec1 < vec1End; vec1 += 32, vec2 += 32 )
{
const __m256i x = _mm256_loadu_si256( ( const __m256i * )vec1 );
const __m256i y = _mm256_loadu_si256( ( const __m256i * )vec2 );
ip.addBytes( x, y );
}
const size_t remainder = length % 32;
if( remainder > 0 )
{
const __m256i x = loadPartial( vec1, remainder );
const __m256i y = loadPartial( vec2, remainder );
ip.addBytes( x, y );
}
return ip.compute( length );
}
Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
Intel x86 CPUs use 2's compliment representation of signed numbers, this is why there are no separate versions of SIMD intrinsics for signed/unsigned packed integers. Intel SIMD intrinsics are outside the scope of C++ standard and have a specific well-defined behaviour.
I have a function:
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
for (size_t i = 0; i < size; ++i)
c[i] = (a[i]*b[i])%p;
}
This function performs many modulo multiplication for arrays of integer.
All integers are positive.
And I need to improve its performance.
I thought about SSE and AVX. But they don't have an operation to vectorize modulo multiplication.
Or maybe I'm wrong?
Maybe anybody know any posibility to solve this problem?
At first I want to note that modulo operation can be realized with using of float point numbers:
d % p = d - int(float(d)/float(p))*p.
Although the amount of operation in right part is greater then in left one this part is preferable because it can be vectorized with using of SSE/AVX.
An implementation with SSE4.1 for 32x32 => 32-bit integer multiplication. Note that conversion from FP back to integer is done with round-to-nearest; use truncation toward zero (cvttps_epi32) if you want semantics like C float->integer conversions.
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
__m128 _k = _mm_set1_ps(1.0f / p);
__m128i _p = _mm_set1_epi32(p);
for (size_t i = 0; i < size; i += 4)
{
__m128i _a = _mm_loadu_si128((__m128i*)(a + i));
__m128i _b = _mm_loadu_si128((__m128i*)(b + i));
__m128i _d = _mm_mullo_epi32(_a, _b);
__m128i _e = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p));
__m128i _c = _mm_sub_epi32(_d, _mm_mullo_epi32(_e, _p));
_mm_storeu_si128((__m128i*)(c + i), _c);
}
}
An implementation with using of AVX :
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
__m256 _k = _mm256_set1_ps(1.0f / p);
__m256i _p = _mm256_set1_epi32(p);
for (size_t i = 0; i < size; i += 8)
{
__m256i _a = _mm256_loadu_si128((__m256i*)(a + i));
__m256i _b = _mm256_loadu_si128((__m256i*)(b + i));
__m256i _d = _mm256_mullo_epi32(_a, _b);
__m256i _e = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p));
__m256i _c = _mm256_sub_epi32(_d, _mm256_mullo_epi32(_e, _p));
_mm256_storeu_si128((__m256i*)(c + i), _c);
}
}
Actually there is an instrinsic that is performing this operation:
_mm256_irem_epi32
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_irem_epi32
I try to port a SSE function which get absolute difference of two 8-bit unsigned integer arrays.
It looks like:
uint64_t AbsDiffSum(const uint8_t * a, const uint8_t * b, size_t size)
{
assert(size%16 == 0);
__m128i _sum = _mm_setzero_si128();
for(size_t i = 0; i < size; i += 16)
{
const __m128i _a = _mm_loadu_si128((__m128i*)(a + i));
const __m128i _b = _mm_loadu_si128((__m128i*)(b + i));
_sum = _mm_add_epi64(_sum, _mm_sad_epu8(_a, _b));
}
return _mm_cvtsi128_si64(_mm_add_epi64(_sum, _mm_srli_si128(_sum, 8)));
}
Main work is performed by intrinsic function _mm_sad_epu8().
Is there an analogue for Altivec?
Unfortunately, there is no direct analogue of intrinsic function _mm_sad_epu8 for Altivec.
But there is a possibility to emulate it:
typedef __vector uint8_t uint8x16_t;
typedef __vector uint32_t uint32x4_t;
const uint8_t K8_01 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
uint64_t AbsDiffSum(const uint8_t * a, const uint8_t * b, size_t size)
{
uint32x4_t _sum = {0, 0, 0, 0};
for(size_t i = 0; i < size; i += 16)
{
// Aligned loading of 128-bit vector
uint8x16_t _a = vec_ld(a + i);
// Aligned loading of 128-bit vector
uint8x16_t _b = vec_ld(b + i);
// Find absolute difference of two 8-bit unsigned
uint8x16_t absDifference = vec_sub(vec_max(a, b), vec_min(a, b));
// Sum result with using of vec_msum
_sum = vec_msum(absDifference, K8_01, _sum);
}
return vec_extract(_sum, 0) + vec_extract(_sum, 1) +
vec_extract(_sum, 2) + vec_extract(_sum, 3);
}
How to use load/store to do aligned int16_t byte swapping correctly?
void byte_swapping(uint16_t* dest, const uint16_t* src,
size_t count) {
__m128i _s, _d;
for (uint16_t const * end(dest + count); dest != end; dest += 8, src += 8)
{
_s = _mm_load_si128((__m128i*)src);
_d = _mm_or_si128(_mm_slli_epi16(_s, 8), _mm_srli_epi16(_s, 8));
_mm_store_si128((__m128i*) dest, _d);
}
}
Your code will fail when count is not a multiple of 8, or when either src or dest is not 16 byte aligned.
Here is a fixed (and tested) version of your code:
void byte_swapping(uint16_t* dest, const uint16_t* src, size_t count)
{
size_t i;
for (i = 0; i + 8 <= count; i += 8)
{
__m128i s = _mm_loadu_si128((__m128i*)&src[i]);
__m128i d = _mm_or_si128(_mm_slli_epi16(s, 8), _mm_srli_epi16(s, 8));
_mm_storeu_si128((__m128i*)&dest[i], d);
}
for ( ; i < count; ++i) // handle residual elements
{
uint16_t w = src[i];
w = (w >> 8) | (w << 8);
dest[i] = w;
}
}
Context
I've made a directshow filter to change contrast and brightness of my video. I want to speed it up.
Working filter without SSE
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
...
BYTE *pData; // Pointer to the actual image buffer
pMediaSample->GetPointer(&pData);
int numPixels = cxImage * cyImage;
...
prgb = (RGBTRIPLE*) pData;
for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
RGBTRIPLE *ppixel = prgb + iPixel;
ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
ppixel->rgbtBlue = ppixel->rgbtBlue * _contrastPower + _brightnessPower;
ppixel->rgbtRed = ppixel->rgbtRed * _contrastPower + _brightnessPower;
if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
if(ppixel->rgbtBlue>255) ppixel->rgbtBlue = 255;
if(ppixel->rgbtRed>255) ppixel->rgbtRed = 255;
}
...
}
Not working filter with SEE
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
BYTE *pData; // Pointer to the actual image buffer
long lDataLen; // Holds length of any given sample
int iPixel; // Used to loop through the image pixels
RGBTRIPLE *prgb; // Holds a pointer to the current pixel
AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;
ASSERT(pvi);
CheckPointer(pMediaSample,E_POINTER);
pMediaSample->GetPointer(&pData);
lDataLen = pMediaSample->GetSize();
// Get the image properties from the BITMAPINFOHEADER
int cxImage = pvi->bmiHeader.biWidth;
int cyImage = pvi->bmiHeader.biHeight;
int numPixels = cxImage * cyImage;
prgb = (RGBTRIPLE*) pData;
double dcontrast = 0.7;
__m128d cStore = _mm_set1_pd(dcontrast);
BYTE *pDataOutput = new BYTE[lDataLen];
for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {
//unpack to 32 bits
__m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
__m128d image = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));
//vector operations
__m128d result = _mm_mul_pd(cStore, image);
//pack back to 8 bits
__m128i pack_32 = _mm_cvtpd_epi32 (result);
__m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32);
__m128i pack_8 = _mm_packus_epi16(pack_16, pack_16);
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
//also tryed to store the result in the original array
//_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
}
//assign the original pointer to point at the start of the new data array
pData = pDataOutput;
return NOERROR;
}
Problems
This code does nothing to the original stream:
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;
This code blacks out the whole video:
_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);
Questions
Am I using the SSE instructions correctly ?
How do I assign the modified data to the original media sample pointer ?
Maybe this example will be useful to you:
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness, uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
__m128i _contrast16 = _mm_set1_epi16(int16_t(contrast*(1 << shift)));
__m128i _brightness16 = _mm_set1_epi16(int16_t(brightness*(1 << shift)));
for(size_t i = 0; i < size; i += sizeof(__m128i))
{
__m128i _src8 = _mm_load_si128((__m128i*)(src + i));
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcHi16)), shift);
_mm_store_si128((__m128i*)(dst + i), _mm_packus_epi16(_dstLo16, _dstHi16));
}
}
If use the individual coefficients for each channel:
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi,
const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
__m128i _src8 = _mm_load_si128((__m128i*)src);
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(brightnessLo, _mm_mullo_epi16(contrastLo, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(brightnessHi, _mm_mullo_epi16(contrastHi, _srcHi16)), shift);
_mm_store_si128((__m128i*)dst, _mm_packus_epi16(_dstLo16, _dstHi16));
}
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3], uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
const int16_t
c0 = int16_t(contrast[0]*(1 << shift)),
c1 = int16_t(contrast[1]*(1 << shift)),
c2 = int16_t(contrast[2]*(1 << shift));
const int16_t
b0 = int16_t(brightness[0]*(1 << shift)),
b1 = int16_t(brightness[1]*(1 << shift)),
b2 = int16_t(brightness[2]*(1 << shift));
__m128i _contrast[3], _brightness[3];
_contrast[0] = _mm_setr_epi16(c0, c1, c2, c0, c1, c2, c0, c1);
_contrast[1] = _mm_setr_epi16(c2, c0, c1, c2, c0, c1, c2, c0);
_contrast[2] = _mm_setr_epi16(c1, c2, c0, c1, c2, c0, c1, c2);
_brightness[0] = _mm_setr_epi16(b0, b1, b2, b0, b1, b2, b0, b1);
_brightness[1] = _mm_setr_epi16(b2, b0, b1, b2, b0, b1, b2, b0);
_brightness[2] = _mm_setr_epi16(b1, b2, b0, b1, b2, b0, b1, b2);
for(size_t i = 0; i < size;)
{
Filter(src + i, _contrast[0], _contrast[1], _brightness[0], _brightness[1], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[2], _contrast[0], _brightness[2], _brightness[0], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[1], _contrast[2], _brightness[1], _brightness[2], shift, dst + i);
i += sizeof(__m128i);
}
}