Related
I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.
Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
{
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
const __m128i v = _mm_loadu_si128((__m128i *)(src + offset));
vh = _mm_unpacklo_epi8(v, _mm_setzero_si128());
vl = _mm_unpackhi_epi8(v, _mm_setzero_si128());
}
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
__m128i v = _mm_packus_epi16(vh, vl);
_mm_storeu_si128((__m128i *)(dest + offset), v);
}
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
vsuml_1 = _mm_set1_epi16(0);
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
const int channels = image_in.channels();
switch (channels)
{
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
break;
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
break;
default:
throw("Unsupported format.");
break;
}
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased mantainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.
In relation to my previous question BitMap_blur efect, i have succeeded to make the bit map blurred but the problem is the colors of the blurred picture has been changed:
Original photo: https://ibb.co/eFHg8G
Blurred photo: https://ibb.co/mQDShb
The code of the blurring algorytm is the same as in my previous question:
for (xx = 0; xx < bitmapInfoHeader.biWidth; xx++)
{
for (yy = 0; yy <bitmapInfoHeader.biHeight; yy++)
{
avgB = avgG = avgR = 0;
Counter = 0;
for (x = xx; x < bitmapInfoHeader.biWidth && x < xx + blurSize; x++)
{
for (y = yy; y < bitmapInfoHeader.biHeight && y < yy + blurSize; y++)
{
avgB += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 0]; //bitmapimage[x][y];
avgG += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 1];
avgR += bitmapImage[x *3 + y*bitmapInfoHeader.biWidth * 3 + 2];
Counter++;
}
}
avgB = avgB / Counter;
avgG = avgG / Counter;
avgR = avgR / Counter;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 0] = avgB;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 1] = avgG;
bitmapImage[xx * 3 + yy*bitmapInfoHeader.biWidth * 3 + 2] = avgR;
}
}
So what am doing wrong here?
It actually looks like size of each line is padded to be multiple of 4 bytes. To get correct byte offset of each line you will need to replace
* bitmapInfoHeader.biWidth * 3
with
* (bitmapInfoHeader.biWidth * 3 + padding_bytes_count)
where
padding_bytes_count =
(
(
bitmapFileHeader.bfSize - bitmapFileHeader.bfOffBits
-
bitmapInfoHeader.biWidth * bitmapInfoHeader.biHeight * 3
)
/
bitmapInfoHeader.biHeight
);
For your tiger image padding_bytes_count should be 2.
Here, I create a semi-portable bitmap reader/writer.. Works on Windows, Linux Mint, MacOS High Sierra. I didn't test other platforms.. but it should work.
It has:
Portability
Load 24-bit bitmaps.
Load 32-bit bitmaps.
Write 24-bit bitmaps.
Write 32-bit bitmaps.
Convert between 24-bit and 32-bit bitmaps.
Convert between 32-bit and 24-bit bitmaps.
It doesn't have:
Support for Alpha Transparency. Alpha transparency has special fields and flags required to be set in the header. I don't feel like writing them in so it won't support it.
Only part of it that doesn't seem very portable would be the #pragma pack..
#include <iostream>
#include <fstream>
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#endif
typedef struct
{
uint8_t r, g, b, a;
} rgb32;
#if !defined(_WIN32) && !defined(_WIN64)
#pragma pack(2)
typedef struct
{
uint16_t bfType;
uint32_t bfSize;
uint16_t bfReserved1;
uint16_t bfReserved2;
uint32_t bfOffBits;
} BITMAPFILEHEADER;
#pragma pack()
#pragma pack(2)
typedef struct
{
uint32_t biSize;
int32_t biWidth;
int32_t biHeight;
uint16_t biPlanes;
uint16_t biBitCount;
uint32_t biCompression;
uint32_t biSizeImage;
int16_t biXPelsPerMeter;
int16_t biYPelsPerMeter;
uint32_t biClrUsed;
uint32_t biClrImportant;
} BITMAPINFOHEADER;
#pragma pack()
#endif
#pragma pack(2)
typedef struct
{
BITMAPFILEHEADER bfh;
BITMAPINFOHEADER bih;
} BMPINFO;
#pragma pack()
class bitmap
{
private:
BMPINFO bmpInfo;
uint8_t* pixels;
public:
bitmap(const char* path);
~bitmap();
void save(const char* path, uint16_t bit_count = 24);
rgb32* getPixel(uint32_t x, uint32_t y) const;
void setPixel(rgb32* pixel, uint32_t x, uint32_t y);
uint32_t getWidth() const;
uint32_t getHeight() const;
uint16_t bitCount() const;
};
bitmap::bitmap(const char* path) : bmpInfo(), pixels(nullptr)
{
std::ifstream file(path, std::ios::in | std::ios::binary);
if (file)
{
file.read(reinterpret_cast<char*>(&bmpInfo.bfh), sizeof(bmpInfo.bfh));
if (bmpInfo.bfh.bfType != 0x4d42)
{
throw std::runtime_error("Invalid format. Only bitmaps are supported.");
}
file.read(reinterpret_cast<char*>(&bmpInfo.bih), sizeof(bmpInfo.bih));
if (bmpInfo.bih.biCompression != 0)
{
std::cerr<<bmpInfo.bih.biCompression<<"\n";
throw std::runtime_error("Invalid bitmap. Only uncompressed bitmaps are supported.");
}
if (bmpInfo.bih.biBitCount != 24 && bmpInfo.bih.biBitCount != 32)
{
throw std::runtime_error("Invalid bitmap. Only 24bit and 32bit bitmaps are supported.");
}
file.seekg(bmpInfo.bfh.bfOffBits, std::ios::beg);
pixels = new uint8_t[bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits];
file.read(reinterpret_cast<char*>(&pixels[0]), bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits);
uint8_t* temp = new uint8_t[bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * sizeof(rgb32)];
uint8_t* in = pixels;
rgb32* out = reinterpret_cast<rgb32*>(temp);
int padding = bmpInfo.bih.biBitCount == 24 ? ((bmpInfo.bih.biSizeImage - bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * 3) / bmpInfo.bih.biHeight) : 0;
for (int i = 0; i < bmpInfo.bih.biHeight; ++i, in += padding)
{
for (int j = 0; j < bmpInfo.bih.biWidth; ++j)
{
out->b = *(in++);
out->g = *(in++);
out->r = *(in++);
out->a = bmpInfo.bih.biBitCount == 32 ? *(in++) : 0xFF;
++out;
}
}
delete[] pixels;
pixels = temp;
}
}
bitmap::~bitmap()
{
delete[] pixels;
}
void bitmap::save(const char* path, uint16_t bit_count)
{
std::ofstream file(path, std::ios::out | std::ios::binary);
if (file)
{
bmpInfo.bih.biBitCount = bit_count;
uint32_t size = ((bmpInfo.bih.biWidth * bmpInfo.bih.biBitCount + 31) / 32) * 4 * bmpInfo.bih.biHeight;
bmpInfo.bfh.bfSize = bmpInfo.bfh.bfOffBits + size;
file.write(reinterpret_cast<char*>(&bmpInfo.bfh), sizeof(bmpInfo.bfh));
file.write(reinterpret_cast<char*>(&bmpInfo.bih), sizeof(bmpInfo.bih));
file.seekp(bmpInfo.bfh.bfOffBits, std::ios::beg);
uint8_t* out = NULL;
rgb32* in = reinterpret_cast<rgb32*>(pixels);
uint8_t* temp = out = new uint8_t[bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * sizeof(rgb32)];
int padding = bmpInfo.bih.biBitCount == 24 ? ((bmpInfo.bih.biSizeImage - bmpInfo.bih.biWidth * bmpInfo.bih.biHeight * 3) / bmpInfo.bih.biHeight) : 0;
for (int i = 0; i < bmpInfo.bih.biHeight; ++i, out += padding)
{
for (int j = 0; j < bmpInfo.bih.biWidth; ++j)
{
*(out++) = in->b;
*(out++) = in->g;
*(out++) = in->r;
if (bmpInfo.bih.biBitCount == 32)
{
*(out++) = in->a;
}
++in;
}
}
file.write(reinterpret_cast<char*>(&temp[0]), size); //bmpInfo.bfh.bfSize - bmpInfo.bfh.bfOffBits
delete[] temp;
}
}
rgb32* bitmap::getPixel(uint32_t x, uint32_t y) const
{
rgb32* temp = reinterpret_cast<rgb32*>(pixels);
return &temp[(bmpInfo.bih.biHeight - 1 - y) * bmpInfo.bih.biWidth + x];
}
void bitmap::setPixel(rgb32* pixel, uint32_t x, uint32_t y)
{
rgb32* temp = reinterpret_cast<rgb32*>(pixels);
memcpy(&temp[(bmpInfo.bih.biHeight - 1 - y) * bmpInfo.bih.biWidth + x], pixel, sizeof(rgb32));
};
uint32_t bitmap::getWidth() const
{
return bmpInfo.bih.biWidth;
}
uint32_t bitmap::getHeight() const
{
return bmpInfo.bih.biHeight;
}
uint16_t bitmap::bitCount() const
{
return bmpInfo.bih.biBitCount;
}
void apply_blur(int x, int y, bitmap* bmp, int blurRadius)
{
double blurValue = 0.111;
int r = 0;
int g = 0 ;
int b = 0;
for (int k = y - blurRadius; k <= blurRadius; ++k)
{
for (int l = x - blurRadius; l <= blurRadius; ++l)
{
rgb32* pixel = bmp->getPixel(l, k);
r += blurValue * pixel->r;
g += blurValue * pixel->g;
b += blurValue * pixel->b;
}
}
rgb32 pixel = *bmp->getPixel(x, y);
pixel.r = r;
pixel.g = g;
pixel.b = b;
bmp->setPixel(&pixel, x, y);
}
int main(int argc, const char * argv[])
{
bitmap bmp{"/Users/brandon/Desktop/tiger.bmp"};
bmp.save("/Users/brandon/Desktop/blurred-tiger-24.bmp");
bmp.save("/Users/brandon/Desktop/blurred-tiger-32.bmp", 32);
return 0;
}
Now all you have to do is add your blur algorithm.. I tried it, but couldn't figure out the blurring part.. I ended up porting an algorithm found here: http://blog.ivank.net/fastest-gaussian-blur.html
void blur(bitmap* bmp, int radius)
{
float rs = ceil(radius * 2.57);
for (int i = 0; i < bmp->getHeight(); ++i)
{
for (int j = 0; j < bmp->getWidth(); ++j)
{
double r = 0, g = 0, b = 0;
double count = 0;
for (int iy = i - rs; iy < i + rs + 1; ++iy)
{
for (int ix = j - rs; ix < j + rs + 1; ++ix)
{
auto x = std::min(static_cast<int>(bmp->getWidth()) - 1, std::max(0, ix));
auto y = std::min(static_cast<int>(bmp->getHeight()) - 1, std::max(0, iy));
auto dsq = ((ix - j) * (ix - j)) + ((iy - i) * (iy - i));
auto wght = std::exp(-dsq / (2.0 * radius * radius)) / (M_PI * 2.0 * radius * radius);
rgb32* pixel = bmp->getPixel(x, y);
r += pixel->r * wght;
g += pixel->g * wght;
b += pixel->b * wght;
count += wght;
}
}
rgb32* pixel = bmp->getPixel(j, i);
pixel->r = std::round(r / count);
pixel->g = std::round(g / count);
pixel->b = std::round(b / count);
}
}
}
int main(int argc, const char * argv[])
{
bitmap bmp{"/Users/brandon/Desktop/tiger.bmp"};
blur(&bmp, 5);
bmp.save("/Users/brandon/Desktop/blurred-tiger.bmp");
return 0;
}
The result becomes:
Since iam only applying the blur effect on 24-bitmaps, I add the padding thing and modified my 3th and 4th loop:
for (x = xx; x < bitmapInfoHeader.biWidth && x < xx + blurSize; **x+=3**)
{
for (y = yy; y < bitmapInfoHeader.biHeight && y < yy + blurSize; **y+=3**)
And it works! the photo still have a weard thin line on the left but i think this is a read/write bitmap problem and i can handle it myself :)
The blurred photo: https://ibb.co/iGp9Cb and another blurred picture: https://ibb.co/jFXUCb
Thank you guys for your answers! it helped alot
I need to compare a big amount of similar images of small size (up to 200x200).
So I try to implement SSIM (Structural similarity see https://en.wikipedia.org/wiki/Structural_similarity ) algorithm.
SSIM requires calculation of covariance of two 8-bit gray images.
A trivial implementation look like:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
for(size_t i = 0; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
But it has poor performance.
So I hope to improve it with using SIMD or CUDA (I heard that it can be done).
Unfortunately I have no experience to do this.
How it will look? And where I have to go?
I have another nice solution!
At first I want to mention some mathematical formulas:
averageX = Sum(x[i])/size;
averageY = Sum(y[i])/size;
And therefore:
Sum((x[i] - averageX)*(y[i] - averageY))/size =
Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size -
Sum(averageX*y[i])/size + Sum(averageX*averageY)/size =
Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size -
averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size =
Sum(x[i]*y[i])/size - averageY*averageX -
averageX*averageY + averageX*averageY =
Sum(x[i]*y[i])/size - averageY*averageX;
It allows to modify our algorithm:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0; // If images will have size greater then 256x256 than you have to use uint64_t.
for(size_t i = 0; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}
And only after that we can use SIMD (I used SSE2):
#include <emmintrin.h>
inline __m128i SigmaXY(__m128i x, __m128i y)
{
__m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
__m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(y, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
return _mm_add_epi32(lo, hi);
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128i sums = _mm_setzero_si128();
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
}
uint32_t _sums[4];
_mm_storeu_si128(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}
There is a SIMD implementation of the algorithm (I used SSE4.1):
#include <smmintrin.h>
template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, __m128 & averageX, __m128 & averageY)
{
__m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
__m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY))
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128 sums = _mm_setzero_ps();
__m128 avgX = _mm_set1_ps(averageX);
__m128 avgY = _mm_set1_ps(averageY);
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY);
}
float _sums[4];
_mm_storeu_ps(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
I hope that it will useful for you.
I need to find the largest element in 1d matrix and its column and row indexes.
I use 1d matrix, so just finding the max element's index is needed first and then it is easy to get row and column.
My problem is that I cannot get that index.
I have a working function that finds largest element and uses SSE, here it is:
float find_largest_element_in_matrix_SSE(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m128 max_el = _mm_loadu_ps(m);
__m128 curr;
for (i = 4; i < dims * dims; i += 4)
{
curr = _mm_loadu_ps(m + i);
max_el = _mm_max_ps(max_el, curr);
}
__declspec(align(16))float max_v[4] = { 0 };
_mm_store_ps(max_v, max_el);
return max(max(max(max_v[0], max_v[1]), max_v[2]), max_v[3]);
}
and also I have a non-working function that uses AVX:
float find_largest_element_in_matrix_AVX(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m256 max_el = _mm256_loadu_ps(m);
__m256 curr;
for (i = 8; i < dims * dims; i += 8)
{
curr = _mm256_loadu_ps(m + i);
max_el = _mm256_max_ps(max_el, curr);
}
__declspec(align(32))float max_v[8] = { 0 };
_mm256_store_ps(max_v, max_el);
__m256 y = _mm256_permute2f128_ps(max_el, max_el, 1);
__m256 m1 = _mm256_max_ps(max_el, y);m1[1] = max(max_el[1], max_el[3])
__m256 m2 = _mm256_permute_ps(m1, 5);
__m256 m_res = _mm256_max_ps(m1, m2);
return m[0];
}
Could anyone help me with actually finding the index of the max element and make my AVX version work?
Here's a working SSE (SSE 4) implementation that returns the max val and corresponding index, along with a scalar reference implementation and test harness:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h> // SSE 4.1
float find_largest_element_in_matrix_ref(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
int i;
*maxIndex = 0;
for (i = 1; i < dims * dims; ++i)
{
if (m[i] > maxVal)
{
maxVal = m[i];
*maxIndex = i;
}
}
return maxVal;
}
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
float aMaxVal[4];
int32_t aMaxIndex[4];
int i;
*maxIndex = 0;
const __m128i vIndexInc = _mm_set1_epi32(4);
__m128i vMaxIndex = _mm_setr_epi32(0, 1, 2, 3);
__m128i vIndex = vMaxIndex;
__m128 vMaxVal = _mm_loadu_ps(m);
for (i = 4; i < dims * dims; i += 4)
{
__m128 v = _mm_loadu_ps(&m[i]);
__m128 vcmp = _mm_cmpgt_ps(v, vMaxVal);
vIndex = _mm_add_epi32(vIndex, vIndexInc);
vMaxVal = _mm_max_ps(vMaxVal, v);
vMaxIndex = _mm_blendv_epi8(vMaxIndex, vIndex, _mm_castps_si128(vcmp));
}
_mm_storeu_ps(aMaxVal, vMaxVal);
_mm_storeu_si128((__m128i *)aMaxIndex, vMaxIndex);
maxVal = aMaxVal[0];
*maxIndex = aMaxIndex[0];
for (i = 1; i < 4; ++i)
{
if (aMaxVal[i] > maxVal)
{
maxVal = aMaxVal[i];
*maxIndex = aMaxIndex[i];
}
}
return maxVal;
}
int main()
{
const int dims = 1024;
float m[dims * dims];
float maxVal_ref, maxVal_SSE;
int maxIndex_ref = -1, maxIndex_SSE = -1;
int i;
srand(time(NULL));
for (i = 0; i < dims * dims; ++i)
{
m[i] = (float)rand() / RAND_MAX;
}
maxVal_ref = find_largest_element_in_matrix_ref(m, dims, &maxIndex_ref);
maxVal_SSE = find_largest_element_in_matrix_SSE(m, dims, &maxIndex_SSE);
if (maxVal_ref == maxVal_SSE && maxIndex_ref == maxIndex_SSE)
{
printf("PASS: maxVal = %f, maxIndex = %d\n",
maxVal_ref, maxIndex_ref);
}
else
{
printf("FAIL: maxVal_ref = %f, maxVal_SSE = %f, maxIndex_ref = %d, maxIndex_SSE = %d\n",
maxVal_ref, maxVal_SSE, maxIndex_ref, maxIndex_SSE);
}
return 0;
}
Compile and run:
$ gcc -Wall -msse4 Yakovenko.c && ./a.out
PASS: maxVal = 0.999999, maxIndex = 120409
Obviously you can get the row and column indices if needed:
int rowIndex = maxIndex / dims;
int colIndex = maxIndex % dims;
From here it should be fairly straightforward to write an AVX2 implementation.
One approach would be to calculate maximum in the first pass, and find the index by linear search in the second pass. Here is a sample implementation in SSE2:
#define anybit __builtin_ctz //or lookup table with 16 entries...
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex) {
//first pass: calculate maximum as usual
__m128 vMaxVal = _mm_loadu_ps(m);
for (int i = 4; i < dims * dims; i += 4)
vMaxVal = _mm_max_ps(vMaxVal, _mm_loadu_ps(&m[i]));
//perform in-register reduction
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
//second pass: search for maximal value
for (int i = 0; i < dims * dims; i += 4) {
__m128 vIsMax = _mm_cmpeq_ps(vMaxVal, _mm_loadu_ps(&m[i]));
if (int mask = _mm_movemask_ps(vIsMax)) {
*maxIndex = i + anybit(mask);
return _mm_cvtss_f32(vMaxVal);
}
}
}
Note that the branch in the second loop should be almost perfectly predicted unless your input data is very small.
The solution suffers from several problems, notably:
It may work incorrectly in presence of weird floating point values, e.g. with NaNs.
If your matrix does not fit into CPU cache, then the code would read the matrix twice from the main memory, so it would be two times slower than the single-pass approach. This can be solved for large matrices by block-wise processing.
In the first loop each iteration depends on the previous one (vMaxVal is both modified and read) so it would be slowed down by latency of _mm_max_ps. Perhaps it would be great to unroll the first loop a bit (2x or 4x), while having 4 independent registers for vMaxVal (actually, the second loop would also benefit from unrolling).
Porting to AVX should be pretty straight-forward, except for the in-register reduction:
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_permute2f128_ps(vMaxVal, vMaxVal, 1));
yet another approach:
void find_largest_element_in_matrix_SSE(float * matrix, size_t n, int * row, int * column, float * v){
__m128 indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 update = _mm_setr_ps(4, 4, 4, 4);
__m128 max_indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 max = _mm_load_ps(matrix);
for (int i = 4; i < n * n; i+=4){
indecies = _mm_add_ps(indecies, update);
__m128 pm2 = _mm_load_ps(&matrix[i]);
__m128 mask = _mm_cmpge_ps(max, pm2);
max = _mm_max_ps(max, pm2);
max_indecies = _mm_or_ps(_mm_and_ps(max_indecies, mask), _mm_andnot_ps(mask, indecies));
}
__declspec (align(16)) int max_ind[4];
__m128i maxi = _mm_cvtps_epi32(max_indecies);
_mm_store_si128((__m128i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 4; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
void find_largest_element_in_matrix_AVX(float * matrix, size_t n, int * row, int * column, float * v){
__m256 indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 update = _mm256_setr_ps(8, 8, 8, 8, 8, 8, 8, 8);
__m256 max_indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 max = _mm256_load_ps(matrix);
for (int i = 8; i < n * n; i += 8){
indecies = _mm256_add_ps(indecies, update);
__m256 pm2 = _mm256_load_ps(&matrix[i]);
__m256 mask = _mm256_cmp_ps(max, pm2, _CMP_GE_OQ);
max = _mm256_max_ps(max, pm2);
max_indecies = _mm256_or_ps(_mm256_and_ps(max_indecies, mask), _mm256_andnot_ps(mask, indecies));
}
__declspec (align(32)) int max_ind[8];
__m256i maxi = _mm256_cvtps_epi32(max_indecies);
_mm256_store_si256((__m256i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 8; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
I have a mask (8-bit gray image) and I need calculate center of region with given index of the mask.
To do this I need calculate moments of first order along axes X and Y for this mask.
Currently I'm using next code:
void GetCenter(const uint8_t * mask, size_t stride, size_t width, size_t height,
uint8_t index, double * centerX, double * centerY)
{
uint64_t sum = 0, sumX = 0, sumY = 0;
for(size_t y = 0; y < height; ++y)
{
for(size_t x = 0; x < width; ++x)
{
if(mask[x] == index)
{
sum++;
sumX += x;
sumY += y;
}
}
mask += stride;
}
*centerX = sum ? (double)sumX/sum : 0;
*centerY = sum ? (double)sumY/sum : 0;
}
And I have a question: Is there any way to improve performance of this algorithm?
There is a way to greatly (more then ten times) improve performance of this algorithm.
To do it you need use SIMD instructions of CPU such as (SSE2, AVX2, Altivec, NEON etc.).
I wrote an example with using of SSE2 instructions (AVX2 code will be similar to it):
const __m128i K_0 = _mm_setzero_si128();
const __m128i K8_1 = _mm_set1_epi8(1);
const __m128i K16_1 = _mm_set1_epi16(1);
const __m128i K16_8 = _mm_set1_epi16(8);
const __m128i K16_I = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
inline void AddMoments(const __m128i & mask, const __m128i & x, const __m128i & y,
__m128i & sumX, __m128i & sumY)
{
sumX = _mm_add_epi32(sumX, _mm_madd_epi16(_mm_and_si128(mask, x), K16_1));
sumY = _mm_add_epi32(sumY, _mm_madd_epi16(_mm_and_si128(mask, y), K16_1));
}
inline int ExtractSum(__m128i a)
{
return _mm_cvtsi128_si32(a) + _mm_cvtsi128_si32(_mm_srli_si128(a, 4)) +
_mm_cvtsi128_si32(_mm_srli_si128(a, 8)) + _mm_cvtsi128_si32(_mm_srli_si128(a, 12));
}
void GetCenter(const uint8_t * mask, size_t stride, size_t width, size_t height,
uint8_t index, double * centerX, double * centerY)
{
size_t alignedWidth = width & ~(sizeof(__m128i) - 1);
const __m128i _index = _mm_set1_epi8(index);
uint64_t sum = 0, sumX = 0, sumY = 0;
for(size_t y = 0; y < height; ++y)
{
size_t x = 0;
__m128i _x = K16_I;
__m128i _y = _mm_set1_epi16((short)y);
__m128i _sum = K_0;
__m128i _sumX = K_0;
__m128i _sumY = K_0;
for(; x < alignedWidth; x += sizeof(__m128i))
{
__m128i _mask = _mm_and_si128(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(mask + x)), _index), K8_1);
_sum = _mm_add_epi64(_sum, _mm_sad_epu8(_mask, K_0));
AddMoments(_mm_cmpeq_epi16(_mm_unpacklo_epi8(_mask, K_0), K16_1), _x, _y, _sumX, _sumY);
_x = _mm_add_epi16(_x, K16_8);
AddMoments(_mm_cmpeq_epi16(_mm_unpackhi_epi8(_mask, K_0), K16_1), _x, _y, _sumX, _sumY);
_x = _mm_add_epi16(_x, K16_8);
}
sum += ExtractSum(_sum);
sumX += ExtractSum(_sumX);
sumY += ExtractSum(_sumY);
for(; x < width; ++x)
{
if(mask[x] == index)
{
sum++;
sumX += x;
sumY += y;
}
}
mask += stride;
}
*centerX = sum ? (double)sumX/sum : 0;
*centerY = sum ? (double)sumY/sum : 0;
}
P.S. There is a more simple and cross platform way to improve performance with using of external library (http://simd.sourceforge.net/):
void GetCenter(const uint8_t * mask, size_t stride, size_t width, size_t height,
uint8_t index, double * centerX, double * centerY)
{
uint64_t sum, sumX, sumY, sumXX, sumXY, sumYY;
::SimdGetMoments(mask, stride, width, height, index,
&sum, &sumX, &sumY, &sumXX, &sumXY, &sumYY);
*centerX = sum ? (double)sumX/sum : 0;
*centerY = sum ? (double)sumY/sum : 0;
}
An implementation with using of _mm_movemask_epi8 and 8-bit lookup tables:
uint8_t g_sum[1 << 8], g_sumX[1 << 8];
bool Init()
{
for(int i = 0, n = 1 << 8; i < n; ++i)
{
g_sum[i] = 0;
g_sumX[i] = 0;
for(int j = 0; j < 8; ++j)
{
g_sum[i] += (i >> j) & 1;
g_sumX[i] += ((i >> j) & 1)*j;
}
}
return true;
}
bool g_inited = Init();
inline void AddMoments(uint8_t mask, size_t x, size_t y,
uint64_t & sum, uint64_t & sumX, uint64_t & sumY)
{
int value = g_sum[mask];
sum += value;
sumX += x * value + g_sumX[mask];
sumY += y * value;
}
void GetCenter(const uint8_t * mask, size_t stride, size_t width, size_t height,
uint8_t index, double * centerX, double * centerY)
{
size_t alignedWidth = width & ~(sizeof(__m128i) - 1);
const __m128i _index = _mm_set1_epi8(index);
union PackedValue
{
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32;
} _mask;
uint64_t sum = 0, sumX = 0, sumY = 0;
for(size_t y = 0; y < height; ++y)
{
size_t x = 0;
for(; x < alignedWidth; x += sizeof(__m128i))
{
_mask.u32 = _mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128((__m128i*)(mask + x)), _index));
AddMoments(_mask.u8[0], x, y, sum, sumX, sumY);
AddMoments(_mask.u8[1], x + 8, y, sum, sumX, sumY);
}
for(; x < width; ++x)
{
if(mask[x] == index)
{
sum++;
sumX += x;
sumY += y;
}
}
mask += stride;
}
*centerX = sum ? (double)sumX/sum : 0;
*centerY = sum ? (double)sumY/sum : 0;
}
An implementation with using of _mm_movemask_epi8 and 16-bit lookup tables:
uint16_t g_sum[1 << 16], g_sumX[1 << 16];
bool Init()
{
for(int i = 0, n = 1 << 16; i < n; ++i)
{
g_sum[i] = 0;
g_sumX[i] = 0;
for(int j = 0; j < 16; ++j)
{
g_sum[i] += (i >> j) & 1;
g_sumX[i] += ((i >> j) & 1)*j;
}
}
return true;
}
bool g_inited = Init();
inline void AddMoments(uint16_t mask, size_t x, size_t y,
uint64_t & sum, uint64_t & sumX, uint64_t & sumY)
{
int value = g_sum[mask];
sum += value;
sumX += x * value + g_sumX[mask];
sumY += y * value;
}
void GetCenter(const uint8_t * mask, size_t stride, size_t width, size_t height,
uint8_t index, double * centerX, double * centerY)
{
size_t alignedWidth = width & ~(sizeof(__m128i) - 1);
const __m128i _index = _mm_set1_epi8(index);
union PackedValue
{
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32;
} _mask;
uint64_t sum = 0, sumX = 0, sumY = 0;
for(size_t y = 0; y < height; ++y)
{
size_t x = 0;
for(; x < alignedWidth; x += sizeof(__m128i))
{
_mask.u32 = _mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128((__m128i*)(mask + x)), _index));
AddMoments(_mask.u16[0], x, y, sum, sumX, sumY);
}
for(; x < width; ++x)
{
if(mask[x] == index)
{
sum++;
sumX += x;
sumY += y;
}
}
mask += stride;
}
*centerX = sum ? (double)sumX/sum : 0;
*centerY = sum ? (double)sumY/sum : 0;
}
Performance comparison for 1920x1080 image:
Base version: 8.261 ms;
1-st optimization:0.363 ms (in 22 times faster);
2-nd optimization:0.280 ms (in 29 times faster);
3-rd optimization:0.299 ms (in 27 times faster);
4-th optimization:0.325 ms (in 25 times faster);
As you can see above the code with using of 8-bit lookup tables has better performance then the code with using of 16-bit lookup tables. But anyway external library is better though it performs additional calculations of the second order moments.
Another acceleration technique is by run-length coding.
You can decompose the rows in horizontal runs where the mask is active. You can detect the runs on the fly, or precompute them and store the image in that form, if that makes sense.
Then a run can be accumulated as a whole. Let a run start from (X, Y) and have length L, then use
Sum+= L;
SumX+= (2 * X + L + 1) * L;
SumY+= Y * L;
In the end, divide SumX by 2.
The longer the runs, the more effective the trick is.
Using SSE2 or later, you try with the instruction PMOVMSKB—Move Byte Mask.
First compare 16 mask pixels to the (replicated) index value to get 16 comparison results. Then pack these to a 16 bits number, using the magical instruction.
Then, using two precomputed lookup tables, perform the accumulations in scalar mode.
One lookup table gives you the count of active mask pixels, and the other gives you the sum of active mask pixels weighted by their abscissa, i.e the X moment.
Something like
int PackedValue= _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)&Mask[X]), ReplicatedIndex));
Sum+= Count[PackedValue];
SumX+= X * Count[PackedValue] + MomentX[PackedValue];
SumY+= Y * Count[PackedValue];
Depending on the amount of memory you agree to spend, the lookup tables can have byte indexes (256 entries, use the table twice) or word indexes (65536 entries). In both cases, the count and moment values fit in a single byte (1 to 8/16 and 0 to 28/120 respectively).
AVX implementation is also possible, packing 32 pixels at a time. Lookup tables with doubleword indexes seem unreasonable, though. :-)