So my code is suppsed to work like this:
-take in_martix of NxN elements and R factor
-it should give back a matrix of size [N-2R]*[N-2R] with each element being a sum of in_matrix elements in R radius it should work like this for N=4 R=1
Even though my code works for smaller matrixes, for bigger ones like 1024 or 2048 or even bigger R factors it gives back a matrix of 0's. Is it a problem inside my code or just my GPU can't compute more calculations ?
Code: (for testing purposes initial matrix is filled with 1's so every element of out_matrix should == (2R+1)^2
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;
__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int out_local = 0;
for (int col = tx; col <= tx + 2 * d_R ; col++)
for (int row = ty; row <= ty + 2 * d_R ; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
void random_ints(int tab[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
tab[i][j] = 1;
}
int main() {
static int tab[N][N];
random_ints(tab);
int tab_size = sizeof(int) * N * N;
int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);
dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE )), ceil((float)N / (float)(THREAD_BLOCK_SIZE )));
void** d_tab;
void** d_out;
cudaMalloc((void**)&d_tab, tab_size);
cudaMalloc((void**)&d_out, out_size);
cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);
int* d_tab_begin = (int*)(d_tab);
int* d_out_begin = (int*)(d_out);
MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);
int* out = (int*)malloc(out_size);
cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int col = 0; col < N - 2 * R; col++)
{
for (int row = 0; row < N - 2 * R; row++)
{
cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
}
cout << endl;
}
}
Finally thanks to Robert I found how to make the code work - by adding if statment
if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
for (int col = tx; col <= tx + 2 * d_R; col++)
for (int row = ty; row <= ty + 2 * d_R; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
I just tried to finish a problem:Path Finder #3: the Alpinist in Codewars. I had passed all basic test cases and there were any errors under my own test cases. But when i submited my solution, my code failed for random test cased. My solution of problem is graph searching based Dijkstra algorithm and priority_queue. I think there my some potential errors i didn't consider. Please help me check it. I have tried achieve it for three hours.
My solution is below.
#include <iostream>
#include <cmath>
#include <vector>
#include <queue>
using namespace std;
const int INF = 1e9;
const int WHITE = -1;
const int GRAY = 0;
const int BLACK = 1;
int path_finder(string maze)
{
int result = 0;
vector<pair<int, int>> element;
vector<vector<pair<int, int>>> altitude;
int width = (-1 + sqrt(5 + 4 * maze.size())) / 2;
auto tem = maze.find('\n');
while (tem != string::npos)
{
maze.erase(tem, 1);
tem = maze.find('\n');
}
for (int i = 0; i < width; ++i)
{
for (int j = 0; j < width; ++j)
{
altitude.push_back(element);
if (i >= 1)
altitude[i * width + j].push_back(make_pair(i * width + j - width, abs(maze[i * width + j] - maze[i * width + j - width])));
if (i < width - 1)
altitude[i * width + j].push_back(make_pair(i * width + j + width, abs(maze[i * width + j] - maze[i * width + j + width])));
if (j >= 1)
altitude[i * width + j].push_back(make_pair(i * width + j - 1, abs(maze[i * width + j] - maze[i * width + j - 1])));
if (j < width - 1)
altitude[i * width + j].push_back(make_pair(i * width + j + 1, abs(maze[i * width + j] - maze[i * width + j + 1])));
}
}
int* distance = new int[width * width];
int* state = new int[width * width];
for (int i = 0; i < width * width; ++i)
{
distance[i] = INF;
state[i] = WHITE;
}
priority_queue<pair<int, int>> unfinished;
unfinished.push(make_pair(0, 0));
state[0] = GRAY;
distance[0] = 0;
while (!unfinished.empty())
{
pair<int, int> tem = unfinished.top();
unfinished.pop();
state[tem.second] = BLACK;
if(distance[tem.second] < tem.first * (-1))
continue;
for (int i = 0; i < altitude[tem.second].size(); ++i)
{
if(state[altitude[tem.second][i].first] != BLACK)
{
unfinished.push(make_pair(-1 * altitude[tem.second][i].second, altitude[tem.second][i].first));
if (distance[tem.second] + altitude[tem.second][i].second < distance[altitude[tem.second][i].first])
{
distance[altitude[tem.second][i].first] = distance[tem.second] + altitude[tem.second][i].second;
state[altitude[tem.second][i].first] = GRAY;
}
}
}
}
result = distance[width * width - 1];
return result;
}
Here is a test case where your code has the wrong answer.
"53072\n"
"09003\n"
"29977\n"
"31707\n"
"59844"
The least cost is 13, with this path:
{1 1 0 0 0}
{0 1 0 0 0}
{0 1 1 1 1}
{0 0 0 0 1}
{0 0 0 0 1}
But your program outputs 15.
I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.
Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
{
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
const __m128i v = _mm_loadu_si128((__m128i *)(src + offset));
vh = _mm_unpacklo_epi8(v, _mm_setzero_si128());
vl = _mm_unpackhi_epi8(v, _mm_setzero_si128());
}
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
__m128i v = _mm_packus_epi16(vh, vl);
_mm_storeu_si128((__m128i *)(dest + offset), v);
}
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
vsuml_1 = _mm_set1_epi16(0);
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
const int channels = image_in.channels();
switch (channels)
{
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
break;
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
break;
default:
throw("Unsupported format.");
break;
}
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased mantainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.
I'm trying to create a class that can procedurally create prisms (or cylinders if the precision is high enough) but only the sides of the 3d model are showing (not the top and bottom). This is using openGL and c++. Not going for efficiency, just modifying a previous class that made a sphere.
#define numSlices 2
Prism::Prism() {
init(3);
}
Prism::Prism(int prec) {
init(prec);
}
float Prism::toRadians(float degrees) { return (degrees * 2.0f * 3.14159f) / 360.0f; }
void Prism::init(int prec) {
prec = (prec < 3) ? 3 : prec;
numVertices = (prec + 1) * (numSlices+1);
numIndices = prec * numSlices * 6;
for (int i = 0; i < numVertices; i++) { vertices.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { texCoords.push_back(glm::vec2()); }
for (int i = 0; i < numVertices; i++) { normals.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { tangents.push_back(glm::vec3()); }
for (int i = 0; i < numIndices; i++) { indices.push_back(0); }
// calculate triangle vertices
for (int i = 0; i <= numSlices; i++) {
for (int j = 0; j <= prec; j++) {
float y = i;
float x = -(float)cos(toRadians(j * 360.0f / (float)prec));
float z = (float)sin(toRadians(j * 360.0f / (float)prec));
vertices[i * (prec + 1) + j] = glm::vec3(x, y, z);
texCoords[i * (prec + 1) + j] = glm::vec2(((float)j / prec), ((float)i / numSlices));
}
}
// calculate triangle indices
for (int i = 0; i < numSlices; i++) {
for (int j = 0; j < prec; j++) {
indices[6 * (i * prec + j) + 0] = i * (prec + 1) + j;
indices[6 * (i * prec + j) + 1] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 2] = (i + 1) * (prec + 1) + j;
indices[6 * (i * prec + j) + 3] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 4] = (i + 1) * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 5] = (i + 1) * (prec + 1) + j;
}
}
}
Any tips or solutions that stick closely to the code already written would much appreciated.
To render the top and bottom of the cylinder, you can create a "triangle fan" that starts from a vertex at the center of the top/bottom of the cylinder and creates one triangle for every side.
Adapting your code: (untested, I may have made mistakes against winding order)
int bottom_center = vertices.length(); vertices.push_back(glm::vec3(0,0,0));
int top_center = vertices.length(); vertices.push_back(glm::vec3(0,numSlices,0));
// Bottom
for (int j = 0; j < prec; j++) {
int base = 0;
indices.push_back(bottom_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
// Top
for (int j = 0; j < prec; j++) {
int base = numSlices * (prec+1);
indices.push_back(top_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
See http://www.songho.ca/opengl/gl_cylinder.html for a more worked-out example.
I am trying to apply the sobel filter algorithm to a given picture (grayscale in this case) given my approach to accessing the pixels of the picture. Since I am accessing them in a way that doesn't use libraries, I am having trouble figuring out how to apply the algorithm given this approach. This first part of the code is just accessing pixel data:
Part 1:
CKingimageDoc* pDoc = GetDocument(); // get picture
int iBitPerPixel = pDoc->_bmp->bitsperpixel; // used to see if grayscale(8 bits) or RGB (24 bits)
int iWidth = pDoc->_bmp->width;
int iHeight = pDoc->_bmp->height;
BYTE *pImg = pDoc->_bmp->point; // pointer used to point at pixels in the image
const int area = iWidth * iHeight;
int Wp = iWidth;
int intensity;
if (iBitPerPixel == 8) ////Grayscale 8 bits image
{
int r = iWidth % 4; // pixels leftover from width (remainder has to be factor of 8 or 24)
int p = (4-r) % 4; // has to be a factor of number of bits in pixel, num leftover to take care of
Wp = iWidth + p;
Part 2 (The actual application of the sobel filter algorithm):
float kernelx[3][3] = { { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
float kernely[3][3] = { { -1, -2, -1 },
{ 0, 0, 0 },
{ 1, 2, 1 } };
double magX = 0.0; // this is your magnitude
for (int a = 0; a < 3; a++) {
for (int b = 0; b < 3; b++) {
magX += pImg[i*Wp + j] * kernelx[a][b]; // where i get confused
}
}
}
Any and all help is greatly appreciated.
You have to use appropriate pixel from neighborhood of center pixel to multiply with kernel entry:
//row, col - coordinates of central pixel for calculation
for (int row = 1; row < height - 1; row++) {
for (int col = 1; col < width - 1; col++) {
double magX = 0.0; // this is your magnitude
for (int a = 0; a < 3; a++) {
for (int b = 0; b < 3; b++) {
magX += pImg[(row - 1 + a) * Wp + col - 1 + b] * kernelx[a][b];
}
}
resultImg[row * Wp + col] = magX;
}
}
I omitted border pixels
CKingimageDoc* pDoc = GetDocument(); // get picture
int iBitPerPixel = pDoc->_bmp->bitsperpixel; // used to see if grayscale(8b) or RGB(24b)
int iWidth = pDoc->_bmp->width;
int iHeight = pDoc->_bmp->height;
BYTE *pImg = pDoc->_bmp->point; // pointer used to point at pixels in the image
const int area = iWidth * iHeight;
BYTE *pImg2 = new BYTE[area];
if (iBitPerPixel == 8) // Grayscale 8bit image
{
int pixel_x;
int pixel_y;
float sobel_x[3][3] =
{ { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
float sobel_y[3][3] =
{ { -1, -2, -1 },
{ 0, 0, 0 },
{ 1, 2, 1 } };
for (int x=1; x < iWidth-1; x++)
{
for (int y=1; y < iHeight-1; y++)
{
pixel_x = (sobel_x[0][0] * pImg[iWidth * (y-1) + (x-1)])
+ (sobel_x[0][1] * pImg[iWidth * (y-1) + x ])
+ (sobel_x[0][2] * pImg[iWidth * (y-1) + (x+1)])
+ (sobel_x[1][0] * pImg[iWidth * y + (x-1)])
+ (sobel_x[1][1] * pImg[iWidth * y + x ])
+ (sobel_x[1][2] * pImg[iWidth * y + (x+1)])
+ (sobel_x[2][0] * pImg[iWidth * (y+1) + (x-1)])
+ (sobel_x[2][1] * pImg[iWidth * (y+1) + x ])
+ (sobel_x[2][2] * pImg[iWidth * (y+1) + (x+1)]);
pixel_y = (sobel_y[0][0] * pImg[iWidth * (y-1) + (x-1)])
+ (sobel_y[0][1] * pImg[iWidth * (y-1) + x ])
+ (sobel_y[0][2] * pImg[iWidth * (y-1) + (x+1)])
+ (sobel_y[1][0] * pImg[iWidth * y + (x-1)])
+ (sobel_y[1][1] * pImg[iWidth * y + x ])
+ (sobel_y[1][2] * pImg[iWidth * y + (x+1)])
+ (sobel_y[2][0] * pImg[iWidth * (y+1) + (x-1)])
+ (sobel_y[2][1] * pImg[iWidth * (y+1) + x ])
+ (sobel_y[2][2] * pImg[iWidth * (y+1) + (x+1)]);
int val = (int)sqrt((pixel_x * pixel_x) + (pixel_y * pixel_y));
if(val < 0) val = 0;
if(val > 255) val = 255;
pImg2[iHeight * y + x] = val;
}
}
}