Negative index runtime error - c++

I am using this as my reference to implement my version of Bicubic interpolation for resizing the images. Here is the function that I have so far with some changes.
IplImage * bicubic(IplImage *img, int newWidth, int newHeight)
{
IplImage *img2 ;
img2 = createImage(newWidth,newHeight);
uchar * data = (uchar*)img->imageData;
uchar * Data = (uchar*)img2->imageData;
//int a,b,c,index;
uchar Cc;
uchar C[5];
uchar d0,d2,d3,a0,a1,a2,a3;
int i,j,k,jj;
int x,y;
float dx,dy;
float tx,ty;
tx = (float)img->width /newWidth ;
ty = (float)img->height / newHeight;
printf("New Width = %d, New Height = %d WidthStep = %d", newWidth, newHeight,img->widthStep);
for(i = 0; i< newHeight; i++)
{
for(j = 0; j< newWidth; j++)
{
x = (int)(tx * j);
y = (int)(ty * i);
dx = tx * j - x;
dy = ty * i - y;
for(k = 0;k < 3;k++)
{
for(jj = 0;jj <= 3 ;jj++)
{
int z = (y - 1 + jj);
//if(z > -1){
a0 = data[z * img->widthStep + (x)*img->nChannels +k];//===>Throws of runtime error
d0 = data[z * img->widthStep + (x-1)*img->nChannels +k] - a0 ;
d2 = data[z * img->widthStep + (x+1)*img->nChannels +k] - a0 ;
d3 = data[z * img->widthStep + (x+2)*img->nChannels +k] - a0 ;
a1 = -1.0/3 * d0 + d2 -1.0/6*d3;
a2 = 1.0/2 * d0 + 1.0/2*d2;
a3 = -1.0/6 * d0 - 1.0/2*d2 + 1.0/6*d3;
C[jj] = a0 + a1*dx + a2*dx*dx + a3*dx*dx*dx;
d0 = C[0]-C[1];
d2 = C[2]-C[1];
d3 = C[3]-C[1];
a0 = C[1];
a1 = -1.0/3*d0 + d2 -1.0/6*d3;
a2 = 1.0/2*d0 + 1.0/2*d2;
a3 = -1.0/6*d0 - 1.0/2*d2 + 1.0/6*d3;
Cc = a0 + a1*dy + a2*dy*dy + a3*dy*dy*dy;
Data[i*img2->widthStep +j*img2->nChannels +k ] = Cc;
//}
}
}
}
}
return img2;
}
The problem that I am facing is that when I call this bicubic function, it throws off an invalid access runtime error at the line where I find out the value of a0. I am using VS 2012 debugger and it tells me that the value of z is calculated as -1. This causes the index to access the invalid part of memory of data array.
My question is, why is this happening? Am I missing something in OpenCV's image library that can help in getting right indices so that I dont run into this error? Or am I making some mistake in accessing the correct indices?

for(i = 0; i< newHeight; i++)
{
for(j = 0; j< newWidth; j++)
{
x = (int)(tx * j);
y = (int)(ty * i);
dx = tx * j - x;
dy = ty * i - y;
for(k = 0;k < 3;k++)
{
for(jj = 0;jj <= 3 ;jj++)
{
int z = (y - 1 + jj);
//if(z > -1){
a0 = data[z * img->widthStep + (x)*img->nChannels +k];//===>Throws of runtime error
d0 = data[z * img->widthStep + (x-1)*img->nChannels +k] - a0 ;
On the first iteration, i and j are 0. as are k and jj
This means that:
y = (int)(ty * i); //y = ty * 0 (== 0)
int z = (y - 1 + jj); //z = 0 - 1 + 0 (==-1)
And so in the line:
a0 = data[z * img->widthStep + (x)*img->nChannels +k];//===>Throws of runtime error
the index is:
(-1) * img->widthStep + (x)*img->nChannels +k
simplifies to:
(-1) * img->widthStep + 0 + 0
which is:
-img->widthStep
This is of course out of bounds, leading to the crash.

Related

SSE mean filter in c++ and OpenCV

I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.
Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
{
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
const __m128i v = _mm_loadu_si128((__m128i *)(src + offset));
vh = _mm_unpacklo_epi8(v, _mm_setzero_si128());
vl = _mm_unpackhi_epi8(v, _mm_setzero_si128());
}
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
__m128i v = _mm_packus_epi16(vh, vl);
_mm_storeu_si128((__m128i *)(dest + offset), v);
}
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
vsuml_1 = _mm_set1_epi16(0);
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
const int channels = image_in.channels();
switch (channels)
{
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
break;
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
break;
default:
throw("Unsupported format.");
break;
}
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased mantainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.

Opencv 2 multi-threshold Otsu method [duplicate]

I'm trying to implement multi-level Otsu's thresholding, more specifically I need 3 thresholds/4 classes.
I'm aware of 2 similair questions on SO about it: #34856019 and #22706742.
The problem is that I don't get good results: I've read several articles with sample images and thresholds found by that code differ from the ones in these papers.
Let's say I have a picture with 3 circles on the black background, the brightness of the circles differ from very bright to dark:
Sample Image
Am I right to suppose to get as a result 4 classes: black background and 3 more classes according to circles' intensity?
My program gives me these threshold values: 226, 178, 68
As a result, the third circle is completely invisible - it's in the same class as the background.
Can someone please check these values and/or the source code? Maybe it is possible to check this image using Matlab or somehow else...
By the way, what's the best way to handle divisions by zero, which happen often with zero values in histogram?
The source code:
void MultilevelThresholding(cv::Mat& src)
{
int histogram[256] = { 0 };
int pixelsCount = src.cols * src.rows;
for (int y = 0; y < src.rows; y++)
{
for (int x = 0; x < src.cols; x++)
{
uchar value = src.at<uchar>(y, x);
histogram[value]++;
}
}
double c = 0;
double Mt = 0;
double p[256] = { 0 };
for (int i = 0; i < 256; i++)
{
p[i] = (double) histogram[i] / (double) pixelsCount;
Mt += i * p[i];
}
int optimalTreshold1 = 0;
int optimalTreshold2 = 0;
int optimalTreshold3 = 0;
double maxBetweenVar = 0;
double w0 = 0;
double m0 = 0;
double c0 = 0;
double p0 = 0;
double w1 = 0;
double m1 = 0;
double c1 = 0;
double p1 = 0;
double w2 = 0;
double m2 = 0;
double c2 = 0;
double p2 = 0;
for (int tr1 = 0; tr1 < 256; tr1++)
{
p0 += p[tr1];
w0 += (tr1 * p[tr1]);
if (p0 != 0)
{
m0 = w0 / p0;
}
c0 = p0 * (m0 - Mt) * (m0 - Mt);
c1 = 0;
w1 = 0;
m1 = 0;
p1 = 0;
for (int tr2 = tr1 + 1; tr2 < 256; tr2++)
{
p1 += p[tr2];
w1 += (tr2 * p[tr2]);
if (p1 != 0)
{
m1 = w1 / p1;
}
c1 = p1 * (m1 - Mt) * (m1 - Mt);
c2 = 0;
w2 = 0;
m2 = 0;
p2 = 0;
for (int tr3 = tr2 + 1; tr3 < 256; tr3++)
{
p2 += p[tr3];
w2 += (tr3 * p[tr3]);
if (p2 != 0)
{
m2 = w2 / p2;
}
c2 = p2 * (m2 - Mt) * (m2 - Mt);
c = c0 + c1 + c2;
if (maxBetweenVar < c)
{
maxBetweenVar = c;
optimalTreshold1 = tr1;
optimalTreshold2 = tr2;
optimalTreshold3 = tr3;
}
}
}
}
So, I've figured it out. The final source code for 4 classes (3 thresholds) Otsu thresholding:
// cv::Mat& src - source image's matrix
int histogram[256] = { 0 };
int pixelsCount = src.cols * src.rows;
for (int y = 0; y < src.rows; y++)
{
for (int x = 0; x < src.cols; x++)
{
uchar value = src.at<uchar>(y, x);
histogram[value]++;
}
}
double c = 0;
double Mt = 0;
double p[256] = { 0 };
for (int i = 0; i < 256; i++)
{
p[i] = (double) histogram[i] / (double) pixelsCount;
Mt += i * p[i];
}
int optimalTreshold1 = 0;
int optimalTreshold2 = 0;
int optimalTreshold3 = 0;
double maxBetweenVar = 0;
double w0 = 0;
double m0 = 0;
double c0 = 0;
double p0 = 0;
double w1 = 0;
double m1 = 0;
double c1 = 0;
double p1 = 0;
double w2 = 0;
double m2 = 0;
double c2 = 0;
double p2 = 0;
for (int tr1 = 0; tr1 < 256; tr1++)
{
p0 += p[tr1];
w0 += (tr1 * p[tr1]);
if (p0 != 0)
{
m0 = w0 / p0;
}
c0 = p0 * (m0 - Mt) * (m0 - Mt);
c1 = 0;
w1 = 0;
m1 = 0;
p1 = 0;
for (int tr2 = tr1 + 1; tr2 < 256; tr2++)
{
p1 += p[tr2];
w1 += (tr2 * p[tr2]);
if (p1 != 0)
{
m1 = w1 / p1;
}
c1 = p1 * (m1 - Mt) * (m1 - Mt);
c2 = 0;
w2 = 0;
m2 = 0;
p2 = 0;
for (int tr3 = tr2 + 1; tr3 < 256; tr3++)
{
p2 += p[tr3];
w2 += (tr3 * p[tr3]);
if (p2 != 0)
{
m2 = w2 / p2;
}
c2 = p2 * (m2 - Mt) * (m2 - Mt);
double p3 = 1 - (p0 + p1 + p2);
double w3 = Mt - (w0 + w1 + w2);
double m3 = w3 / p3;
double c3 = p3 * (m3 - Mt) * (m3 - Mt);
double c = c0 + c1 + c2 + c3;
if (maxBetweenVar < c)
{
maxBetweenVar = c;
optimalTreshold1 = tr1;
optimalTreshold2 = tr2;
optimalTreshold3 = tr3;
}
}
}
}
Source image
Result: 3 thresholds / 4 classes
threshold values: 179, 92, 25

How to optimize YUV to RGB color conversion code

I have written a function to convert an image in YUV420P to RGB but it is taking 30 millisecond to convert an image (size: 1280 x 720) into RGB, but when I am using ffmpeg function ( as this) to convert YUV image into RGB its taking only 2 millisecond for the same image. What is the problem with my code ? How can I optimize the code that I have written ??
My code is given below
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
for (int i = 0; i<origImage->height; i++)
{
for (int j=0; j<origImage->width; j++)
{
float Y = data[i*step + j];
float U = data[ (int)(size + (i/2)*(step/2) + j/2) ];
float V = data[ (int)(size*1.25 + (i/2)*(step/2) + j/2)];
float R = Y + 1.402 * (V - 128);
float G = Y - 0.344 * (U - 128) - 0.714 * (V - 128);
float B = Y + 1.772 * (U - 128);
if (R < 0){ R = 0; } if (G < 0){ G = 0; } if (B < 0){ B = 0; }
if (R > 255 ){ R = 255; } if (G > 255) { G = 255; } if (B > 255) { B = 255; }
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}
Here, try this(should reduce to 25 milliseconds):
int step = origImage->widthStep;
uchar *data = (uchar *)origImage->imageData;
int size = origImage->width * origImage->height;
IplImage* img1 = cvCreateImage(cvGetSize(origImage), IPL_DEPTH_8U, 3);
int stepDb2=step /2;
float sizeMb1d25=size*1.25 ;
int origImagePTheight=origImage->height;
int origImagePTwidth=origImage->width;
for (int i = 0; i<origImagePTheight; i++)
{
float idb2=i/2;
int iStep=i*step;
for (int j=0; j<origImagePTwidth; j++)
{
float variable=idb2*stepDb2 + j/2;
float Y = data[iStep + j];
float U = -128 + data[ (int)(size + variable) ];
float V = -128 + data[ (int)(sizeMb1d25 + variable)];
float R = Y + 1.402 * V ;
float G = Y - 0.344 * U - 0.714 * V;
float B = Y + 1.772 * U;
R= R * !(R<0);
G= G * !(G<0);
B= B * !(B<0);
R=R*(!(R>255)) + 255 * (R>255);
G=G*(!(G>255)) + 255 * (G>255);
B=B*(!(B>255)) + 255 * (B>255);
cvSet2D(img1, i, j,cvScalar(B,G,R));
}
}

Bi-Cubic Interpolation Algorithm for Image Scaling

I'm trying to write a basic bicubic resize algorithm to resize a 24-bit RGB bitmap. I have a general understanding of the math involved, and I'm using this implementation from Google Code as a guide. I'm not using any external libraries here - I'm just experimenting with the algorithm itself. The bitmap is represented as a plain std::vector<unsigned char>:
inline unsigned char getpixel(const std::vector<unsigned char>& in,
std::size_t src_width, std::size_t src_height, unsigned x, unsigned y, int channel)
{
if (x < src_width && y < src_height)
return in[(x * 3 * src_width) + (3 * y) + channel];
return 0;
}
std::vector<unsigned char> bicubicresize(const std::vector<unsigned char>& in,
std::size_t src_width, std::size_t src_height, std::size_t dest_width, std::size_t dest_height)
{
std::vector<unsigned char> out(dest_width * dest_height * 3);
const float tx = float(src_width) / dest_width;
const float ty = float(src_height) / dest_height;
const int channels = 3;
const std::size_t row_stride = dest_width * channels;
unsigned char C[5] = { 0 };
for (int i = 0; i < dest_height; ++i)
{
for (int j = 0; j < dest_width; ++j)
{
const int x = int(tx * j);
const int y = int(ty * i);
const float dx = tx * j - x;
const float dy = ty * i - y;
for (int k = 0; k < 3; ++k)
{
for (int jj = 0; jj < 4; ++jj)
{
const int z = y - 1 + jj;
unsigned char a0 = getpixel(in, src_width, src_height, z, x, k);
unsigned char d0 = getpixel(in, src_width, src_height, z, x - 1, k) - a0;
unsigned char d2 = getpixel(in, src_width, src_height, z, x + 1, k) - a0;
unsigned char d3 = getpixel(in, src_width, src_height, z, x + 2, k) - a0;
unsigned char a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
unsigned char a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
unsigned char a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
d0 = C[0] - C[1];
d2 = C[2] - C[1];
d3 = C[3] - C[1];
a0 = C[1];
a1 = -1.0 / 3 * d0 + d2 -1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
out[i * row_stride + j * channels + k] = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
}
}
}
}
return out;
}
Problem: When I use this algorithm to downscale an image, it works except the output image contains all black pixels on the right side for some reason, giving the appearance that it's been "cropped".
Example:
INPUT IMAGE:
OUTPUT IMAGE:
Question: Reviewing the algorithm, I can't see why this would happen. Does anyone see the flaw here?
try not exchanging width and height.
for (int i = 0; i < dest_width; ++i)
{
for (int j = 0; j < dest_height; ++j)
I suggest don't use this function because it was written very bad. You need to make two convolutions: at first by X coordinate then by Y. In this function all these convolutions are making in the same time that leads to very slow work. And if You would look at jj loop body you could notice that all second part of body begining from "d0 = C[0] - C[1];" could be moved outside jj loop because only the last iteration of this loop takes effect on out[] array (all previous iterations results will be overwrited).
You should switch the x and z when you call getpixel, and in getpixel you should index the array using:
[(y * 3 * src_width) + (3 * x) + channel]
In getpixel(in, src_width, src_height, z, x, k):
z mean horizontal offset
x mean vertical offset
So just need patch the getpixel function, below is the patched code:
inline unsigned char getpixel(const std::vector<unsigned char>& in,
std::size_t src_width, std::size_t src_height, unsigned y, unsigned x, int channel)
{
if (x < src_width && y < src_height)
return in[(y * 3 * src_width) + (3 * x) + channel];
return 0;
}
std::vector<unsigned char> bicubicresize(const std::vector<unsigned char>& in,
std::size_t src_width, std::size_t src_height, std::size_t dest_width, std::size_t dest_height)
{
std::vector<unsigned char> out(dest_width * dest_height * 3);
const float tx = float(src_width) / dest_width;
const float ty = float(src_height) / dest_height;
const int channels = 3;
const std::size_t row_stride = dest_width * channels;
unsigned char C[5] = { 0 };
for (int i = 0; i < dest_height; ++i)
{
for (int j = 0; j < dest_width; ++j)
{
const int x = int(tx * j);
const int y = int(ty * i);
const float dx = tx * j - x;
const float dy = ty * i - y;
for (int k = 0; k < 3; ++k)
{
for (int jj = 0; jj < 4; ++jj)
{
const int z = y - 1 + jj;
unsigned char a0 = getpixel(in, src_width, src_height, z, x, k);
unsigned char d0 = getpixel(in, src_width, src_height, z, x - 1, k) - a0;
unsigned char d2 = getpixel(in, src_width, src_height, z, x + 1, k) - a0;
unsigned char d3 = getpixel(in, src_width, src_height, z, x + 2, k) - a0;
unsigned char a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
unsigned char a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
unsigned char a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
d0 = C[0] - C[1];
d2 = C[2] - C[1];
d3 = C[3] - C[1];
a0 = C[1];
a1 = -1.0 / 3 * d0 + d2 -1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
out[i * row_stride + j * channels + k] = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
}
}
}
}
return out;
}

My Particle Swarm Optimization code generates different answers in C++ and MATLAB

I have written a global version of Particle Swarm Optimization algorithm in C++.
I tried to write it exactly as same as my MATLAB PSO code that have written before, but this code generates different and so worst answers.
The MATLAB code is:
clear all;
numofdims = 30;
numofparticles = 50;
c1 = 2;
c2 = 2;
numofiterations = 1000;
V = zeros(50, 30);
initialpop = V;
Vmin = zeros(30, 1);
Vmax = Vmin;
Xmax = ones(30, 1) * 100;
Xmin = -Xmax;
pbestfits = zeros(50, 1);
worsts = zeros(50, 1);
bests = zeros(50, 1);
meanfits = zeros(50, 1);
pbests = zeros(50, 30);
initialpop = Xmin + (Xmax - Xmin) .* rand(numofparticles, numofdims);
X = initialpop;
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
gbestfit = minfit;
gbest = X(minfitidx, :);
for i = 1:numofdims
Vmax(i) = 0.2 * (Xmax(i) - Xmin(i));
Vmin(i) = -Vmax(i);
end
for t = 1:1000
w = 0.9 - 0.7 * (t / numofiterations);
for i = 1:numofparticles
if(fitnesses(i) < pbestfits(i))
pbestfits(i) = fitnesses(i);
pbests(i, :) = X(i, :);
end
end
for i = 1:numofparticles
for j = 1:numofdims
V(i, j) = min(max((w * V(i, j) + rand * c1 * (pbests(i, j) - X(i, j))...
+ rand * c2 * (gbest(j) - X(i, j))), Vmin(j)), Vmax(j));
X(i, j) = min(max((X(i, j) + V(i, j)), Xmin(j)), Xmax(j));
end
end
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
if(minfit < gbestfit)
gbestfit = minfit;
gbest = X(minfitidx, :);
end
worsts(t) = max(fitnesses);
bests(t) = gbestfit;
meanfits(t) = mean(fitnesses);
end
In which, testfunc1 is:
function [out] = testfunc1(R)
out = sum(R .^ 2, 2);
end
The C++ code is:
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#define rand_01 ((float)rand() / (float)RAND_MAX)
const int numofdims = 30;
const int numofparticles = 50;
using namespace std;
void fitnessfunc(float X[numofparticles][numofdims], float fitnesses[numofparticles])
{
memset(fitnesses, 0, sizeof (float) * numofparticles);
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
fitnesses[i] += (pow(X[i][j], 2));
}
}
}
float mean(float inputval[], int vallength)
{
int addvalue = 0;
for(int i = 0; i < vallength; i++)
{
addvalue += inputval[i];
}
return (float)(addvalue / vallength);
}
void PSO(int numofiterations, float c1, float c2,
float Xmin[numofdims], float Xmax[numofdims], float initialpop[numofparticles][numofdims],
float worsts[], float meanfits[], float bests[], float *gbestfit, float gbest[numofdims])
{
float V[numofparticles][numofdims] = {0};
float X[numofparticles][numofdims];
float Vmax[numofdims];
float Vmin[numofdims];
float pbests[numofparticles][numofdims];
float pbestfits[numofparticles];
float fitnesses[numofparticles];
float w;
float minfit;
int minfitidx;
memcpy(X, initialpop, sizeof(float) * numofparticles * numofdims);
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
for(int i = 0; i < numofdims; i++)
{
Vmax[i] = 0.2 * (Xmax[i] - Xmin[i]);
Vmin[i] = -Vmax[i];
}
for(int t = 0; t < 1000; t++)
{
w = 0.9 - 0.7 * (float) (t / numofiterations);
for(int i = 0; i < numofparticles; i++)
{
if(fitnesses[i] < pbestfits[i])
{
pbestfits[i] = fitnesses[i];
memcpy(pbests[i], X[i], sizeof(float) * numofdims);
}
}
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
V[i][j] = min(max((w * V[i][j] + rand_01 * c1 * (pbests[i][j] - X[i][j])
+ rand_01 * c2 * (gbest[j] - X[i][j])), Vmin[j]), Vmax[j]);
X[i][j] = min(max((X[i][j] + V[i][j]), Xmin[j]), Xmax[j]);
}
}
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
if(minfit < *gbestfit)
{
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
}
worsts[t] = *max_element(fitnesses, fitnesses + numofparticles);
bests[t] = *gbestfit;
meanfits[t] = mean(fitnesses, numofparticles);
}
}
int main()
{
time_t t;
srand((unsigned) time(&t));
float xmin[30], xmax[30];
float initpop[50][30];
float worsts[1000], bests[1000];
float meanfits[1000];
float gbestfit;
float gbest[30];
for(int i = 0; i < 30; i++)
{
xmax[i] = 100;
xmin[i] = -100;
}
for(int i = 0; i < 50; i++)
for(int j = 0; j < 30; j++)
{
initpop[i][j] = rand() % (100 + 100 + 1) - 100;
}
PSO(1000, 2, 2, xmin, xmax, initpop, worsts, meanfits, bests, &gbestfit, gbest);
cout<<"fitness: "<<gbestfit<<endl;
return 0;
}
I have debugged two codes many times but can not find the difference which makes answers different.
It is making me crazy!
May you help me please?
Update:
Please consider that, the function mean is just used for reporting some information and is not used in the optimization procedure.
You've got integer division in the following line
w = 0.9 - 0.7 * (float) (t / numofiterations);
w will be 0.2 for every iteration, change it to
w = 0.9 - 0.7 * t / numofiterations;
The first multiplication will automatically promote t to a double the division should then promote numof iterations to a double.
The parenthesis means it will be done first and therefore not be promoted as wo integers is involved in the division.
This could be a mistake in function mean:
return (float)(addvalue / vallength);
This is integer division, so the result is truncated down, then cast to float. It is unlikely this is what you want.