OpenCV mean function for a gray image: implementation via NEON - C++

// Computes the mean of the 8-bit single-channel image `gray` with NEON and
// prints it next to cv::mean for comparison.
// Each row is processed 32 bytes at a time using widening pairwise adds
// (u8 -> u16 -> u32 -> u64); the remaining (w % 32) bytes are summed in scalar code.
// Assumes `gray` is a non-empty CV_8UC1 cv::Mat — TODO confirm at the call site.
uint64_t sum = 0;
// NOTE(review): the operand of this cast was missing in the source text;
// gray.data (start of the first row) is presumed.
const uint8_t* src = (const uint8_t*)gray.data;
int i, k, w = gray.cols, h = gray.rows, pitch = gray.step.p[0], w32 = w >> 5 << 5;
for(i = 0; i < h; ++ i)
{
    const uint8_t* line = src;
    for(k = 0; k < w32; k += 32)
    {
        uint8x16_t a16 = vld1q_u8(line); line += 16;
        uint8x16_t b16 = vld1q_u8(line); line += 16;
        uint16x8_t a8 = vpaddlq_u8(a16);
        uint16x8_t b8 = vpaddlq_u8(b16);
        uint32x4_t a4 = vpaddlq_u16(a8);
        uint32x4_t b4 = vpaddlq_u16(b8);
        uint64x2_t a2 = vpaddlq_u32(a4);
        // Pairwise-add b4 and accumulate it into the two 64-bit lanes of a2.
        a2 = vpadalq_u32(a2, b4);
        sum += vgetq_lane_u64(a2, 0) + vgetq_lane_u64(a2, 1);
    }
    // Scalar tail: src still points at the start of the row, so index by k.
    for( ; k < w; ++ k)
        sum += src[k];
    // Advance by the row stride (may be larger than w when rows are padded).
    src += pitch;
}
// cv::mean returns a cv::Scalar; element [0] is the mean of the single channel.
// Passing the Scalar object itself to "%f" is undefined behaviour.
// The pixel count is formed in double so that w * h cannot overflow int.
printf("%f\t%f", (double)sum / ((double)w * (double)h), cv::mean(gray)[0]);
When I test with a binary image, the two results are equal.
When the image is grayscale, the results are not equal, but I can't find the problem in my code.
Edit: the code above is now correct. The benchmark (1000 iterations on an 8-bit gray image) is:
cv::mean 0.000328
myMean 0.000091


SSE mean filter in c++ and OpenCV

I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{<Vec3b>(i, 0) =<Vec3b>(i, 0);<Vec3b>(i, var1.cols - 1) =<Vec3b>(i, var1.cols - 1);
for (int i = 0; i < var1.cols; i++)
{<Vec3b>(0, i) =<Vec3b>(0, i);<Vec3b>(var1.rows - 1, i) =<Vec3b>(var1.rows - 1, i);
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
c = 0;
for (int m = i; m < var1.rows; m++, c++)
if (c < 3)
d = 0;
for (int n = j; n < var1.cols; n++, d++)
if (d < 3)
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{<Vec3b>(i + 1, j + 1)[0] +=<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;<Vec3b>(i + 1, j + 1)[1] +=<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;<Vec3b>(i + 1, j + 1)[2] +=<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
imshow("window1", var1);
imshow("window2", var2);
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.
Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
for (int y = 1; y < image_in.rows - 1; ++y)
for (int x = 1; x < image_in.cols - 1; ++x)
for (int c = 0; c < 3; ++c)
{<Vec3b>(y, x)[c] = (<Vec3b>(y - 1, x - 1)[c] +<Vec3b>(y - 1, x )[c] +<Vec3b>(y - 1, x + 1)[c] +<Vec3b>(y , x - 1)[c] +<Vec3b>(y , x )[c] +<Vec3b>(y , x + 1)[c] +<Vec3b>(y + 1, x - 1)[c] +<Vec3b>(y + 1, x )[c] +<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
for (int y = 1; y < image_in.rows - 1; ++y)
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
r_1 +=<Vec3b>(yy, 0)[0];
g_1 +=<Vec3b>(yy, 0)[1];
b_1 +=<Vec3b>(yy, 0)[2];
r0 +=<Vec3b>(yy, 1)[0];
g0 +=<Vec3b>(yy, 1)[1];
b0 +=<Vec3b>(yy, 1)[2];
for (int x = 1; x < image_in.cols - 1; ++x)
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
r1 +=<Vec3b>(yy, x + 1)[0];
g1 +=<Vec3b>(yy, x + 1)[1];
b1 +=<Vec3b>(yy, x + 1)[2];
}<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
for (int y = 1; y < image_in.rows - 1; ++y)
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
blur(image_in, image_out, Size(3, 3));
5 - Mean_3_3_SSE - SSE implementation
This is a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
// Loads 16 unaligned bytes from src + offset and zero-extends them to 16 bits.
//   offset - byte offset from src
//   src    - base pointer of the row
//   vh     - receives bytes 0..7 (the LOW half of the loaded vector) as 8 x u16
//   vl     - receives bytes 8..15 (the HIGH half) as 8 x u16
// Note that, despite the names, vh holds the first eight bytes in memory order.
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
    const __m128i zero = _mm_setzero_si128();
    // Keep the pointer const-qualified; the load does not modify the source.
    const __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + offset));
    vh = _mm_unpacklo_epi8(v, zero);
    vl = _mm_unpackhi_epi8(v, zero);
}
// Packs two vectors of 8 x 16-bit values into 16 bytes and stores them unaligned
// at dest + offset.
//   offset - byte offset from dest
//   dest   - base pointer of the output row
//   vh     - values for bytes 0..7 of the stored block
//   vl     - values for bytes 8..15 of the stored block
// The pack treats the inputs as signed 16-bit and saturates to 0..255.
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
    const __m128i packed = _mm_packus_epi16(vh, vl);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dest + offset), packed);
}
// Treats v1:v0 as one 16-element sequence of 16-bit values (v0 first) and returns
// the 8-element window that starts SHIFT elements into v0, i.e.
// { v0[SHIFT..7], v1[0..SHIFT-1] }. The byte count passed to _mm_alignr_epi8 is
// SHIFT * sizeof(short) and must be a compile-time constant, hence the template.
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
// Treats v1:v0 as one 16-element sequence of 16-bit values (v0 first) and returns
// the 8-element window that starts SHIFT elements before v1, i.e.
// { v0[8-SHIFT..7], v1[0..7-SHIFT] }. The byte count passed to _mm_alignr_epi8 is
// 16 - SHIFT * sizeof(short) and must be a compile-time constant.
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
// SSE/SSSE3 3x3 mean filter over interleaved 8-bit pixels with CHANNELS bytes each.
//   image_in  - source image; rows y-1, y, y+1 are read through ptr()
//   image_out - destination image; row y is written through ptr()
// Per row, the three input rows are added vertically into 16-bit "column sums",
// then each column sum is added to its neighbours CHANNELS elements to the left
// and right (same channel of the adjacent pixels) and scaled by ~1/9.
// NOTE(review): the loop runs to nx0 (row width rounded up to 16 bytes) and loads
// the block at x + 16, so it appears to read and write beyond the nominal row
// width; the accompanying text says the code is for benchmarking only — confirm
// buffer padding before reuse.
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
// Fixed-point reciprocal of kScale in Q15 (rounded); consumed by _mm_mulhrs_epi16 below.
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
// Interior rows only: the first and last ky rows are not written.
for (y = ky; y < ny - ky; ++y)
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
// vsuml_1       : second half of the previous 16-byte block's column sums
// vsumh0/vsuml0 : first/second half of the current block's column sums
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
// There is no block to the left of the first one, so its sums are taken as zero.
vsuml_1 = _mm_set1_epi16(0);
// Prime the column sums for the first 16 bytes of the row (three rows added).
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
__m128i vsumh1, vsuml1, vsumh, vsuml;
// Column sums for the NEXT 16-byte block (needed for the right-hand neighbours).
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
// Centre column + column CHANNELS elements to the left (previous pixel, same channel)...
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
// ...+ column CHANNELS elements to the right (next pixel, same channel).
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
// Slide the window: the current block becomes the previous one, the next becomes current.
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
const int channels = image_in.channels();
switch (channels)
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
throw("Unsupported format.");
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased maintainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.

Using Gaussian Filter in C++ AMP returning wrong colours

I'm trying to use a Gaussian filter on a generated Mandelbrot set, and I have a solution that will work sequentially. However, I do want to have a solution that works using GPU processing.
I'm using C++ AMP for the GPU processing. One issue with storing the individual colour channels of the pixels is that AMP does not support unsigned 8-bit integers ( uint8_t ), so I've had to store the channels in unsigned 32-bit integers ( uint32_t ). The logic in the AMP code is otherwise the same as in the sequential version; however, the image generated by the AMP version has the wrong colours, although the shape of the Mandelbrot set can still be seen.
So, I believe it's the way that I'm recombining the channels into the pixel that's giving the wrong colour. If any more details are needed, I can provide them.
Sequential Output
Sequential Code:
// Applies the 5x5 `kernel` to the global `image` sequentially and prints the
// elapsed time in milliseconds.
// Pixels are packed as 0x00RRGGBB. Kernel taps that fall outside the image are
// skipped and the result is normalised by the sum of the taps actually used.
// NOTE(review): the result is written back into `image` while neighbouring
// pixels are still being read from it, so later pixels see already-filtered
// values. This in-place behaviour is kept as in the original.
void applyFilter(){
	const int kernelRadius = 5 / 2;

	the_clock::time_point start = the_clock::now();

	for (int y = 0; y < HEIGHT; y++) { //loop through image height
		for (int x = 0; x < WIDTH; x++) { //loop through image width
			float redTotal = 0.0f, blueTotal = 0.0f, greenTotal = 0.0f;
			float kernelTotal = 0.0f;

			for (int v = 0; v < 5; v++) { //loop through for kernel height
				for (int u = 0; u < 5; u++) { //loop through for kernel width
					// Current position
					const int cX = x + u - kernelRadius;
					const int cY = y + v - kernelRadius;

					//Make sure we stay in boundaries: skip taps outside the image
					if (cX < 0 || cX > WIDTH - 1 || cY < 0 || cY > HEIGHT - 1)
						continue;

					//Get colour channels of current pixel (narrowing keeps the low 8 bits)
					const uint8_t r = (image[cY][cX] >> 16);
					const uint8_t g = (image[cY][cX] >> 8);
					const uint8_t b = (image[cY][cX]);

					//Calculate totals
					redTotal += r * kernel[v][u];
					greenTotal += g * kernel[v][u];
					blueTotal += b * kernel[v][u];
					kernelTotal += kernel[v][u];
				}
			}

			//Calculate new pixel values
			const uint8_t r = (redTotal / kernelTotal);
			const uint8_t g = (greenTotal / kernelTotal);
			const uint8_t b = (blueTotal / kernelTotal);

			image[y][x] = (r << 16 | g << 8 | b);
		}
	}

	the_clock::time_point end = the_clock::now();
	auto time_taken = duration_cast<milliseconds>(end - start).count();
	cout << "Time taken to apply kernel(sequential) " << time_taken << "ms" << endl;
}
Parallel Output
Parallel Code:
void applyFilterAMP() {
the_clock::time_point start = the_clock::now();
uint32_t *pImage = &(image[0][0]);
uint32_t *pFilteredImage = &(filteredImage[0][0]);
float *pKernel = &(kernel[0][0]);
uint32_t pi[HEIGHT/3][WIDTH/3][3];
uint32_t* pPi = &(pi[0][0][0]);
array_view<float, 2>k(5, 5, pKernel);
array_view<uint32_t, 2> m(HEIGHT, WIDTH, pImage);
array_view<uint32_t, 2> fi(HEIGHT, WIDTH, pFilteredImage);
//array_view<uint32_t, 3> piColor(HEIGHT/3, WIDTH/3, 3, pPi);
static const int TileSize = 16;
parallel_for_each(fi.extent, [=](concurrency::index<2> idx) restrict(amp) {
//parallel_for_each(fi.extent.tile<TileSize, TileSize>(), [=](tiled_index<TileSize, TileSize> tidx) restrict(amp) {
int kernelCentreX, kernelCentreY;// center index of kernel
int kernelRadius = 5/2;
int y = idx[0];
int x = idx[1];
//int y =[0];
//int x =[1];
float kernelTotal = 0.0;
float redTotal = 0.0, blueTotal = 0.0, greenTotal = 0.0;
float sum=0.0;
uint32_t r, g, b, newPixel;
for (int v = 0; v < 5; v++) { //loop through for kernel height
for (int u = 0; u < 5; u++) { //loop through for kernel width
// Current position
int cX = x + u - kernelRadius;
int cY = y + v - kernelRadius;
//int cX =[1] + u - kernelRadius;
//int cY =[0] + v - kernelRadius;
//Make sure we stay in boundries
if ((cX < 0 || cX > WIDTH - 1 || cY < 0 || cY > HEIGHT - 1))
//Get colour channels of pixel
r = m[cY][cX] >> 16;
g = m[cY][cX] >> 8;
b = m[cY][cX];
//Calculate Totals
redTotal += r *k[v][u];
greenTotal += g *k[v][u];
blueTotal += b *k[v][u];
kernelTotal += k[v][u];
//Calculate new pixel values
r = (redTotal / kernelTotal);
g = (greenTotal / kernelTotal);
b = (blueTotal / kernelTotal);
fi[y][x] = (r << 16 | g << 8 | b);
the_clock::time_point end = the_clock::now();
auto time_taken = duration_cast<milliseconds>(end - start).count();
cout << "Time taken to apply kernel(parallel) " << time_taken << "ms" << endl;
errno_t err = memcpy_s(image, sizeof(uint32_t)*(HEIGHT * WIDTH), filteredImage, sizeof(uint32_t)*(HEIGHT * WIDTH));
if (err)
printf("Error executing memcpy_s.\n");
/*for (int j = 0; j < HEIGHT; j++) {
for (int i = 0; i < WIDTH; i++) {
image[j][i] = filteredImage[j][i];

meshgrid matlab to c++ different results

here is the matlab code i'm trying to convert to c++
size(Iorig) == 1334X 2026
%% label checkers
Label = zeros(size(Iorig));
Margins = 11;
[X,Y] = meshgrid(1:size(Iorig,2),1:size(Iorig,1));
k = 1;
for i = 1:4
for j = 1:6
rr = rect{i,j};
x1 = rr(1);
x2 = rr(1) + rr(3);
y1 = rr(2);
y2 = rr(2) + rr(4);
Label(X>=x1+Margins&X<x2-Margins&Y>=y1+Margins&Y<y2-Margins) = k;
k = k+1;
I understand that we want to label the rectangles which are found in the previous step, there are 24 of those.
but I don't understand how to convert this line into easy c++ code without allocating a huge buffer of X and Y which basically just holds... indices..
thanks for your help here is what i started doing.
//label Checkers
List<List<int>^>^ label = gcnew List<List<int>^>();
int margins = 11;
int k = 1;
for (size_t i = 0; i < 4; i++)
for (size_t j = 0; j < 6; j++)
MacbethCheckerBatchesColor^ rect = autoDetectMacbethResult[i * 6 + j];
Point^ r = rect->Points[0];
int x1 = r->X;
int y1 = r->Y;
r = rect->Points[2];
int x2 = r->X;
int y2 = r->Y;
for (int h = 0; h < inputImage->HeightLines; h++)
List<int>^ tempRow = gcnew List<int>();
for (int w = 0; w < inputImage->WidthColumns; w++)
if ( (w>= x1+margins) & (w<x2-margins) & (h >= y1+margins) & (h<y2-margins) )
k= k+100;//i tried here many other numbers... same result
Here is my result. Can you please help me find my mistake? The rectangles are the same, so I guess I have some other logical mistake.

Implement RGBtoHSV C++ , wrong H output

I am trying to apply the Sobel operator in the HSV colour space (I was told to do this in HSV by my guide, but I don't understand why it would work better in HSV than in RGB).
I have built a function that converts from RGB to HSV. While I have some mediocre knowledge of C++, I am getting confused by the image processing, so I tried to keep the code as simple as possible, meaning I don't care (at this stage) about time or space.
Looking at the results I got as gray-level BMP photos, my V and S seem to be fine, but my H looks like gibberish.
I got 2 questions here :
1. How a normal H photo in gray level should look a like comparing to the source photo ?
2. Where was I wrong in the code :
void RGBtoHSV(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS],
double Rn, Gn, Bn;
double C;
double H, S, V;
for (int row = 0; row < NUMBER_OF_ROWS; row++)
for (int column = 0; column < NUMBER_OF_COLUMNS; column++)
Rn = (1.0*image[row][column][R]) / 255;
Gn = (1.0*image[row][column][G] )/ 255;
Bn = (1.0*image[row][column][B] )/ 255;
//double RGBn[3] = { Rn, Gn, Bn };
double max = Rn;
if (max < Gn) max = Gn;
if (max < Bn) max = Bn;
double min = Rn;
if (min > Gn) min = Gn;
if (min > Bn) min = Bn;
C = max - min;
H = 0;
if (max==0)
S = 0;
H = -1; //undifined;
V = max;
/* if (max == Rn)
H = (60.0* ((int)((Gn - Bn) / C) % 6));
else if (max == Gn)
H = 60.0*( (Bn - Rn)/C + 2);
H = 60.0*( (Rn - Gn)/C + 4);
if (max == Rn)
H = ( 60.0* ( (Gn - Bn) / C) ) ;
else if (max == Gn)
H = 60.0*((Bn - Rn) / C + 2);
H = 60.0*((Rn - Gn) / C + 4);
V = max; //AKA lightness
S = C / max; //saturation
while (H < 0)
H += 360;
while (H>360)
H -= 360;
Him[row][column] = (float)H;
Vim[row][column] = (float)V;
Sim[row][column] = (float)S;
also my hsvtorgb :
void HSVtoRGB(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS],
double R1, G1, B1;
double C;
double V;
double S;
double H;
int Htag;
double Htag2;
double x;
double m;
for (int row = 0; row < NUMBER_OF_ROWS; row++)
for (int column = 0; column < NUMBER_OF_COLUMNS; column++)
H = (double)Him[row][column];
S = (double)Sim[row][column];
V = (double)Vim[row][column];
C = V*S;
Htag = (int) (H / 60.0);
Htag2 = H/ 60.0;
//x = C*(1 - abs(Htag % 2 - 1));
double tmp1 = fmod(Htag2, 2);
double temp=(1 - abs(tmp1 - 1));
x = C*temp;
//switch (Htag)
switch (Htag)
case 0 :
R1 = C;
G1 = x;
B1 = 0;
case 1:
R1 = x;
G1 = C;
B1 = 0;
case 2:
R1 = 0;
G1 = C;
B1 = x;
case 3:
R1 = 0;
G1 = x;
B1 = C;
case 4:
R1 = x;
G1 = 0;
B1 = C;
case 5:
R1 = C;
G1 = 0;
B1 = x;
R1 = 0;
G1 = 0;
B1 = 0;
m = V - C;
//this is also good change I found
//image[row][column][R] = unsigned char( (R1 + m)*255);
//image[row][column][G] = unsigned char( (G1 + m)*255);
//image[row][column][B] = unsigned char( (B1 + m)*255);
image[row][column][R] = round((R1 + m) * 255);
image[row][column][G] = round((G1 + m) * 255);
image[row][column][B] = round((B1 + m) * 255);
void HSVfloattoGrayconvert(unsigned char grayimage[NUMBER_OF_ROWS] [NUMBER_OF_COLUMNS], float hsvimage[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS], char hsv)
//grayimage , flaotimage , h/s/v
float factor;
if (hsv == 'h' || hsv == 'H') factor = (float) 1 / 360;
else factor = 1;
for (int row = 0; row < NUMBER_OF_ROWS; row++)
for (int column = 0; column < NUMBER_OF_COLUMNS; column++)
grayimage[row][column] = (unsigned char) (0.5f + 255.0f * (float)hsvimage[row][column] / factor);
and my main:
unsigned char HimageGray[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char VimageGray[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char SimageGray[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char HAfterSobel[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char VAfterSobel[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char SAfterSobal[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
int KernelX[3][3] = {
{-1,0,+1}, {-2,0,2}, {-1,0,1 }
int KernelY[3][3] = {
{-1,-2,-1}, {0,0,0}, {1,2,1}
void main()
LoadBgrImageFromTrueColorBmpFile(ColorImage1, "P22A.bmp");
// add noise
AddSaltAndPepperNoiseRGB(ColorImage1, 350, 255);
StoreBgrImageAsTrueColorBmpFile(ColorImage1, "saltandpepper.bmp");
AddGaussNoiseCPPstileRGB(ColorImage1, 0.0, 1.0);
StoreBgrImageAsTrueColorBmpFile(ColorImage1, "Saltandgauss.bmp");
//saves hsv in float array
RGBtoHSV(ColorImage1, Himage, Vimage, Simage);
//saves hsv float arrays in unsigned char arrays
HSVfloattoGrayconvert(HimageGray, Himage, 'h');
HSVfloattoGrayconvert(VimageGray, Vimage, 'v');
HSVfloattoGrayconvert(SimageGray, Simage, 's');
StoreGrayImageAsGrayBmpFile(HimageGray, "P22H.bmp");
StoreGrayImageAsGrayBmpFile(VimageGray, "P22V.bmp");
StoreGrayImageAsGrayBmpFile(SimageGray, "P22S.bmp");
edit : Changed Code + add sources for equations :
Source for the equations:
After following #gpasch's advice, using a better reference and deleting the mod 6, I am now able to restore the original RGB photo! Unfortunately, my H photo in grayscale is now even more chaotic than before.
I'll edit the code about so it will have more info about how I am saving the H grayscale photo .
That is the peril of going through garbage web sites; I suggest the following:
That mod 6 seems fishy there.
You also need to make sure you understand that H is in degrees, from 0 to 360; if your filter expects 0..1, you have to rescale it.
I am trying to do Sobel operator in the HSV dimension (told to do this in the HSV by my guide but I dont understand why it will work better on HSV than on RGB)
It depends on what you are trying to achieve. If you're trying to do edge detection based on brightness for example, then just working with say the V channel might be simpler than processing all three channels of RGB and combining them afterwards.
How a normal H photo in gray level should look a like comparing to the source photo ?
You would see regions which are a similar colour appear as a similar shade of grey, and for a real-world scene you would still see gradients. But where there are spatially adjacent regions with colours far apart in hue, there would be a sharp jump. The shapes would generally be recognisable though.
Where was I wrong in the code :
There are two main problems with your code. The first is that the hue scaling in HSVfloattoGrayconvert is wrong. Your code is setting factor=1.0/360.0f but then dividing by the factor, which means it's multiplying by 360. If you simply multiply by the factor, it produces the expected output. This is because the earlier calculation uses normalised values (0..1) for S and V but angle in degrees for H, so you need to divide by 360 to normalise H.
Second, the conversion back to RGB has a problem, mainly to do with calculating Htag where you want the original value for calculating x but the floor only when switching on the sector.
Note that despite what #gpasch suggested, the mod 6 operation is actually correct. This is because the conversion you are using is based on the hexagonal colour space model for HSV, and this is used to determine which sector your colour is in. For a continuous model, you could use a radial conversion instead which is slightly different. Both are well explained on Wikipedia.
I took your code, added a few functions to generate input data and save output files so it is completely standalone, and fixed the bugs above while making minimal changes to the source.
Given the following generated input image:
the Hue channel extracted is:
The saturation channel is:
and finally value:
After fixing up the HSV to RGB conversion, I verified that the resulting output image matches the original.
The updated code is below (as mentioned above, changed minimally to make a standalone test):
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <string>
enum ColorIndex
R = 0,
G = 1,
B = 2,
const unsigned NUMBER_OF_COLUMNS = 256;
const unsigned NUMBER_OF_ROWS = 256;
const unsigned NUMBER_OF_COLORS = 3;
void RGBtoHSV(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS],
double Rn, Gn, Bn;
double C;
double H, S, V;
for (int row = 0; row < NUMBER_OF_ROWS; row++)
for (int column = 0; column < NUMBER_OF_COLUMNS; column++)
Rn = image[row][column][R] / 255.0;
Gn = image[row][column][G] / 255.0;
Bn = image[row][column][B] / 255.0;
double max = Rn;
if (max < Gn) max = Gn;
if (max < Bn) max = Bn;
double min = Rn;
if (min > Gn) min = Gn;
if (min > Bn) min = Bn;
C = max - min;
H = 0;
if (max==0)
S = 0;
H = 0; // Undefined
V = max;
if (max == Rn)
H = 60.0*fmod((Gn - Bn) / C, 6.0);
else if (max == Gn)
H = 60.0*((Bn - Rn) / C + 2);
H = 60.0*((Rn - Gn) / C + 4);
V = max; //AKA lightness
S = C / max; //saturation
while (H < 0)
H += 360.0;
while (H > 360)
H -= 360.0;
Him[row][column] = (float)H;
Vim[row][column] = (float)V;
Sim[row][column] = (float)S;
void HSVtoRGB(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS],
double R1, G1, B1;
double C;
double V;
double S;
double H;
double Htag;
double x;
double m;
for (int row = 0; row < NUMBER_OF_ROWS; row++)
for (int column = 0; column < NUMBER_OF_COLUMNS; column++)
H = (double)Him[row][column];
S = (double)Sim[row][column];
V = (double)Vim[row][column];
C = V*S;
Htag = H / 60.0;
double x = C*(1.0 - fabs(fmod(Htag, 2.0) - 1.0));
int i = floor(Htag);
switch (i)
case 0 :
R1 = C;
G1 = x;
B1 = 0;
case 1:
R1 = x;
G1 = C;
B1 = 0;
case 2:
R1 = 0;
G1 = C;
B1 = x;
case 3:
R1 = 0;
G1 = x;
B1 = C;
case 4:
R1 = x;
G1 = 0;
B1 = C;
case 5:
R1 = C;
G1 = 0;
B1 = x;
R1 = 0;
G1 = 0;
B1 = 0;
m = V - C;
image[row][column][R] = round((R1 + m) * 255);
image[row][column][G] = round((G1 + m) * 255);
image[row][column][B] = round((B1 + m) * 255);
// Converts one float HSV plane into an 8-bit grayscale plane.
//   grayimage - destination plane (NUMBER_OF_ROWS x NUMBER_OF_COLUMNS)
//   hsvimage  - source plane of floats
//   hsv       - 'h' or 'H' scales the input by 1/360 (hue in degrees);
//               any other value uses a scale of 1 (S and V are already 0..1)
// Each value is mapped to 0..255 with +0.5 rounding and clamped to that range
// before narrowing, since converting an out-of-range float to unsigned char is
// undefined behaviour.
void HSVfloattoGrayconvert(unsigned char grayimage[][NUMBER_OF_COLUMNS], float hsvimage[][NUMBER_OF_COLUMNS], char hsv)
{
    const float factor = (hsv == 'h' || hsv == 'H') ? 1.0f/360.0f : 1.0f;

    // Unsigned indices match the unsigned NUMBER_OF_* constants.
    for (unsigned row = 0; row < NUMBER_OF_ROWS; row++)
    {
        for (unsigned column = 0; column < NUMBER_OF_COLUMNS; column++)
        {
            float scaled = 0.5f + 255.0f * hsvimage[row][column] * factor;
            if (scaled < 0.0f) scaled = 0.0f;
            if (scaled > 255.0f) scaled = 255.0f;
            grayimage[row][column] = (unsigned char) scaled;
        }
    }
}
int KernelX[3][3] = {
{-1,0,+1}, {-2,0,2}, {-1,0,1 }
int KernelY[3][3] = {
{-1,-2,-1}, {0,0,0}, {1,2,1}
void GenerateTestImage(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS])
for (unsigned y = 0; y < NUMBER_OF_ROWS; y++)
for (unsigned x = 0; x < NUMBER_OF_COLUMNS; x++)
image[y][x][R] = x % 256;
image[y][x][G] = y % 256;
image[y][x][B] = (255-x) % 256;
void GenerateTestImage(unsigned char image[][NUMBER_OF_COLUMNS])
for (unsigned y = 0; y < NUMBER_OF_ROWS; y++)
for (unsigned x = 0; x < NUMBER_OF_COLUMNS; x++)
image[x][y] = x % 256;
// Color (three channel) images
// Writes the image as a binary PPM (P6) file: a text header followed by the raw
// interleaved pixel bytes in R, G, B order.
//   image    - NUMBER_OF_ROWS x NUMBER_OF_COLUMNS x NUMBER_OF_COLORS pixel data
//   filename - path of the file to create
// On failure to open the file an error is reported with perror() and nothing is written.
void SaveImage(unsigned char image[][NUMBER_OF_COLUMNS][NUMBER_OF_COLORS], const std::string& filename)
{
    // "wb": the payload is binary, so newline translation must be disabled.
    FILE* fp = fopen(filename.c_str(), "wb");
    if (!fp)
    {
        perror(filename.c_str());
        return;
    }
    fprintf(fp, "P6\n%u %u\n255\n", NUMBER_OF_COLUMNS, NUMBER_OF_ROWS);
    // The header alone is not a valid image; the pixel data has to follow it.
    fwrite(image, 1, NUMBER_OF_ROWS*NUMBER_OF_COLUMNS*NUMBER_OF_COLORS, fp);
    fclose(fp);
}
// Grayscale (single channel) images
// Writes the image as a binary PGM (P5) file: a text header followed by the raw
// pixel bytes.
//   image    - NUMBER_OF_ROWS x NUMBER_OF_COLUMNS pixel data
//   filename - path of the file to create
// On failure to open the file an error is reported with perror() and nothing is written.
void SaveImage(unsigned char image[][NUMBER_OF_COLUMNS], const std::string& filename)
{
    // "wb": the payload is binary, so newline translation must be disabled.
    FILE* fp = fopen(filename.c_str(), "wb");
    if (!fp)
    {
        perror(filename.c_str());
        return;
    }
    fprintf(fp, "P5\n%u %u\n255\n", NUMBER_OF_COLUMNS, NUMBER_OF_ROWS);
    fwrite(image, 1, NUMBER_OF_ROWS*NUMBER_OF_COLUMNS, fp);
    // Close the stream so buffered data is flushed and the handle is released.
    fclose(fp);
}
unsigned char Himage[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char Simage[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
unsigned char Vimage[NUMBER_OF_ROWS][NUMBER_OF_COLUMNS];
int main()
// Test input
SaveImage(ColorImage1, "test_input.ppm");
//saves hsv in float array
RGBtoHSV(ColorImage1, HimageGray, VimageGray, SimageGray);
//saves hsv float arrays in unsigned char arrays
HSVfloattoGrayconvert(Himage, HimageGray, 'h');
HSVfloattoGrayconvert(Vimage, VimageGray, 'v');
HSVfloattoGrayconvert(Simage, SimageGray, 's');
SaveImage(Himage, "P22H.pgm");
SaveImage(Vimage, "P22V.pgm");
SaveImage(Simage, "P22S.pgm");
// Convert back to get the original test image
HSVtoRGB(ColorImage1, HimageGray, VimageGray, SimageGray);
SaveImage(ColorImage1, "test_output.ppm");
return 0;
The input image was generated by a very simple algorithm which gives us gradients in each dimension, so we can easily inspect and verify the expected output. I used ppm/pgm files as they are simpler to write and more portable than BMP.
Hope this helps - let me know if you have any questions.

Opencv 2 multi-threshold Otsu method [duplicate]

I'm trying to implement multi-level Otsu's thresholding, more specifically I need 3 thresholds/4 classes.
I'm aware of 2 similar questions on SO about it: #34856019 and #22706742.
The problem is that I don't get good results: I've read several articles with sample images and thresholds found by that code differ from the ones in these papers.
Let's say I have a picture with 3 circles on the black background, the brightness of the circles differ from very bright to dark:
Sample Image
Am I right to suppose to get as a result 4 classes: black background and 3 more classes according to circles' intensity?
My program gives me these threshold values: 226, 178, 68
As a result, the third circle is completely invisible - it's in the same class as the background.
Can someone please check these values and/or the source code? Maybe it is possible to check this image using Matlab or somehow else...
By the way, what's the best way to handle divisions by zero, which happen often with zero values in histogram?
The source code:
void MultilevelThresholding(cv::Mat& src)
int histogram[256] = { 0 };
int pixelsCount = src.cols * src.rows;
for (int y = 0; y < src.rows; y++)
for (int x = 0; x < src.cols; x++)
uchar value =<uchar>(y, x);
double c = 0;
double Mt = 0;
double p[256] = { 0 };
for (int i = 0; i < 256; i++)
p[i] = (double) histogram[i] / (double) pixelsCount;
Mt += i * p[i];
int optimalTreshold1 = 0;
int optimalTreshold2 = 0;
int optimalTreshold3 = 0;
double maxBetweenVar = 0;
double w0 = 0;
double m0 = 0;
double c0 = 0;
double p0 = 0;
double w1 = 0;
double m1 = 0;
double c1 = 0;
double p1 = 0;
double w2 = 0;
double m2 = 0;
double c2 = 0;
double p2 = 0;
for (int tr1 = 0; tr1 < 256; tr1++)
p0 += p[tr1];
w0 += (tr1 * p[tr1]);
if (p0 != 0)
m0 = w0 / p0;
c0 = p0 * (m0 - Mt) * (m0 - Mt);
c1 = 0;
w1 = 0;
m1 = 0;
p1 = 0;
for (int tr2 = tr1 + 1; tr2 < 256; tr2++)
p1 += p[tr2];
w1 += (tr2 * p[tr2]);
if (p1 != 0)
m1 = w1 / p1;
c1 = p1 * (m1 - Mt) * (m1 - Mt);
c2 = 0;
w2 = 0;
m2 = 0;
p2 = 0;
for (int tr3 = tr2 + 1; tr3 < 256; tr3++)
p2 += p[tr3];
w2 += (tr3 * p[tr3]);
if (p2 != 0)
m2 = w2 / p2;
c2 = p2 * (m2 - Mt) * (m2 - Mt);
c = c0 + c1 + c2;
if (maxBetweenVar < c)
maxBetweenVar = c;
optimalTreshold1 = tr1;
optimalTreshold2 = tr2;
optimalTreshold3 = tr3;
So, I've figured it out. The final source code for 4 classes (3 thresholds) Otsu thresholding:
// cv::Mat& src - source image's matrix
// Multi-level Otsu thresholding with 3 thresholds / 4 classes.
// Exhaustively searches all tr1 < tr2 < tr3 and keeps the triple that maximises
// the between-class variance  sum_k p_k * (m_k - Mt)^2, where the classes are
// [0..tr1], [tr1+1..tr2], [tr2+1..tr3] and [tr3+1..255].
// Results are left in optimalTreshold1..3.
// Assumes src is a non-empty 8-bit single-channel image — TODO confirm.

// Gray-level histogram of the image.
int histogram[256] = { 0 };
int pixelsCount = src.cols * src.rows;

for (int y = 0; y < src.rows; y++)
{
    for (int x = 0; x < src.cols; x++)
    {
        uchar value = src.at<uchar>(y, x);
        histogram[value]++;
    }
}

// Normalised histogram p[i] and global mean Mt.
double Mt = 0;
double p[256] = { 0 };
for (int i = 0; i < 256; i++)
{
    p[i] = (double) histogram[i] / (double) pixelsCount;
    Mt += i * p[i];
}

int optimalTreshold1 = 0;
int optimalTreshold2 = 0;
int optimalTreshold3 = 0;

double maxBetweenVar = 0;

// For class k: pk = probability mass, wk = first moment, mk = mean,
// ck = contribution to the between-class variance.
double w0 = 0;
double m0 = 0;
double c0 = 0;
double p0 = 0;

double w1 = 0;
double m1 = 0;
double c1 = 0;
double p1 = 0;

double w2 = 0;
double m2 = 0;
double c2 = 0;
double p2 = 0;

for (int tr1 = 0; tr1 < 256; tr1++)
{
    // Class 0 grows by one gray level per iteration.
    p0 += p[tr1];
    w0 += (tr1 * p[tr1]);
    if (p0 != 0)
    {
        m0 = w0 / p0;
    }
    c0 = p0 * (m0 - Mt) * (m0 - Mt);

    // Class 1 restarts empty for every tr1.
    c1 = 0;
    w1 = 0;
    m1 = 0;
    p1 = 0;
    for (int tr2 = tr1 + 1; tr2 < 256; tr2++)
    {
        p1 += p[tr2];
        w1 += (tr2 * p[tr2]);
        if (p1 != 0)
        {
            m1 = w1 / p1;
        }
        c1 = p1 * (m1 - Mt) * (m1 - Mt);

        // Class 2 restarts empty for every tr2.
        c2 = 0;
        w2 = 0;
        m2 = 0;
        p2 = 0;
        for (int tr3 = tr2 + 1; tr3 < 256; tr3++)
        {
            p2 += p[tr3];
            w2 += (tr3 * p[tr3]);
            if (p2 != 0)
            {
                m2 = w2 / p2;
            }
            c2 = p2 * (m2 - Mt) * (m2 - Mt);

            // Class 3 is whatever remains above tr3.
            double p3 = 1 - (p0 + p1 + p2);
            double w3 = Mt - (w0 + w1 + w2);
            // An empty (or, through rounding, slightly negative) class contributes
            // nothing; this also avoids the division by zero.
            double c3 = 0;
            if (p3 > 0)
            {
                double m3 = w3 / p3;
                c3 = p3 * (m3 - Mt) * (m3 - Mt);
            }

            double c = c0 + c1 + c2 + c3;

            if (maxBetweenVar < c)
            {
                maxBetweenVar = c;
                optimalTreshold1 = tr1;
                optimalTreshold2 = tr2;
                optimalTreshold3 = tr3;
            }
        }
    }
}
Source image
Result: 3 thresholds / 4 classes
threshold values: 179, 92, 25