list of white pixels indices in image using CUDA - c++

Given a binary image, I want to return the list of indices for white pixels in it using GPU (Compute Unified Device Architecture). How to determine the index for points vector?
Here is the CUDA Kernel .
//copy only active pixel locations
__global__ void get_white_pixels_kernel(unsigned char* bin_image,
float * points,
int width,
int height,
int grayWidthStep)
int row_index = threadIdx.y+ blockIdx.y*blockDim.y;
int col_index = threadIdx.x+blockIdx.x*blockDim.x;
if ((col_index < width) && (row_index < height))
//Location of gray pixel in output
const int gray_tid = row_index * grayWidthStep + col_index;
points[--here is the index]= Point2f(row_index,col_index);

Following is a naive method to achieve the desired functionality:
Generate a mask of pixel indices, writing a dummy value for every pixel whose value is zero.
Count the number of non-zero pixels
Create an output vector with length equal to non-zero count.
Copy the non-zero pixel indices from the generated mask to the output vector (a process known as stream-compaction)
Following is a sample code for the above mentioned process.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/count.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <opencv2/opencv.hpp>
// Reports a failed CUDA runtime call together with the call site that issued it.
//   err  - status code returned by the CUDA runtime call
//   file - source file of the call site (CHECK_ERR passes __FILE__)
//   line - line number of the call site (CHECK_ERR passes __LINE__)
// Successful calls (cudaSuccess) print nothing. The function only reports;
// it does not terminate the program.
static void _check_err(cudaError_t err, const char* file, int line)
{
    if (err == cudaSuccess)
        return;

    const char* err_str = cudaGetErrorString(err);
    printf("CUDA Error: %s\nFile: %s\nLine: %d\n", err_str, file, line);
}
#define CHECK_ERR(err) _check_err((err), __FILE__, __LINE__)
// Builds a per-pixel index mask for an 8-bit single-channel image.
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads that fall
// outside the image do nothing.
//   input   - device pointer to the image bytes; row y starts at offset y * step
//   width   - image width in pixels
//   height  - image height in pixels
//   step    - distance in bytes between the starts of consecutive rows
//   indices - device array with width * height entries, written at y * width + x
// A non-zero pixel stores its own (x, y); a zero pixel stores the sentinel (-1, -1).
__global__ void kernel_find_indices(const unsigned char* input, int width, int height, int step, int2* indices)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        const int tidPixel = y * step + x;   // position in the (possibly padded) image
        const int tidIndex = y * width + x;  // position in the densely packed mask

        const unsigned char value = input[tidPixel];

        int2 index_to_write;
        if (value)
        {
            //Write actual index to pixels with non-zero value
            index_to_write.x = x;
            index_to_write.y = y;
        }
        else
        {
            //Write dummy index to pixels with zero value
            index_to_write.x = -1;
            index_to_write.y = -1;
        }

        indices[tidIndex] = index_to_write;
    }
}
//Predicate: true when an index belongs to a non-zero pixel, i.e. neither
//coordinate carries the -1 sentinel. Callable from host and device code.
struct isNonZeroIndex
{
    __host__ __device__ bool operator()(const int2 &idx) const
    {
        return (idx.x != -1) && (idx.y != -1);
    }
};
// Returns the (x, y) coordinates of every non-zero pixel of `input`.
// The image is copied to the device, kernel_find_indices marks each pixel with
// either its coordinates or a (-1, -1) sentinel, and the sentinels are then
// removed on the device before the result is copied back.
//   input - image to scan; the indexing assumes one byte per pixel (CV_8UC1)
// Returns an empty vector for an empty image.
std::vector<cv::Point> getIndicesOfNonZeroPixels(cv::Mat input)
{
    std::vector<cv::Point> output;
    if (input.empty())
        return output;

    // The single cudaMemcpy below copies step * rows bytes starting at the first
    // row, which is only valid when the rows are stored back to back.
    if (!input.isContinuous())
        input = input.clone();

    const int pixelCount = input.cols * input.rows;
    const size_t imageBytes = input.step * input.rows;

    unsigned char* image_d = NULL;
    thrust::device_vector<int2> index_buffer_d(pixelCount);

    //Allocate device memory for input image
    CHECK_ERR(cudaMalloc(&image_d, imageBytes));

    //Copy input image to device
    CHECK_ERR(cudaMemcpy(image_d, input.ptr(), imageBytes, cudaMemcpyHostToDevice));

    dim3 block(16,16);
    dim3 grid;
    grid.x = (input.cols + block.x - 1) / block.x;
    grid.y = (input.rows + block.y - 1) / block.y;

    //Generate an index mask with dummy values for indices with zero pixel value
    kernel_find_indices<<<grid, block>>>(image_d, input.cols, input.rows, static_cast<int>(input.step), thrust::raw_pointer_cast(index_buffer_d.data()));
    CHECK_ERR(cudaGetLastError());

    const size_t nonZeroCount = thrust::count_if(index_buffer_d.begin(), index_buffer_d.end(), isNonZeroIndex());

    //Keep only those indices whose pixel value is non-zero (stream compaction)
    thrust::device_vector<int2> compacted(nonZeroCount);
    thrust::copy_if(index_buffer_d.begin(), index_buffer_d.end(), compacted.begin(), isNonZeroIndex());

    //The device copy of the image is no longer needed
    CHECK_ERR(cudaFree(image_d));

    //Copy non-zero pixel indices to host (destination must already have room)
    std::vector<int2> output_int2(nonZeroCount);
    thrust::copy(compacted.begin(), compacted.end(), output_int2.begin());

    //Convert vector<int2> to vector<cv::Point>
    output.resize(nonZeroCount);
    for (size_t i = 0; i < nonZeroCount; i++)
        output[i] = cv::Point(output_int2[i].x, output_int2[i].y);

    return output;
}
void run_test()
//Generate a sample test image
cv::Mat test = cv::Mat::zeros(100,100, CV_8UC1);
cv::rectangle(test, cv::Rect(5,5,20,20), cv::Scalar::all(255), CV_FILLED);
//Get pixel indices of non-zero pixels
std::vector<cv::Point> indices = getIndicesOfNonZeroPixels(test);
//Display those indices
for(size_t i=0; i<indices.size(); i++)
printf("%d, %d\n", indices[i].x, indices[i].y);
//Show image
cv::imshow("Sample", test);
int main(int argc, char** argv)
return 0;
Compilation Command
nvcc -o nz -arch=sm_61 -L/usr/local/lib -lopencv_core
-lopencv_highgui -lopencv_imgproc
Please keep in mind that this code is for image of type 8UC1 (8 bit, single channel) only. You can easily extend it to other data-types as required.


CUDA Speed Slower than expected - Image Processing

I am new to CUDA development and wanted to write a simple benchmark to test some image processing feasibility. I have 32 images that are each 720x540, one byte per pixel greyscale.
I am running benchmarks for 10 seconds, and counting how many times they are able to process. There are three benchmarks I am running:
The first is just transferring the images into the GPU global memory, via cudaMemcpy
The second is transferring and processing the images.
The third is running the equivalent test on a CPU.
For a starting, simple test, the image processing is just counting the number of pixels above a certain greyscale value. I'm finding that accessing global memory on the GPU is very slow. I have my benchmark structured such that it creates one block per image, and one thread per row in each image. Each thread counts its pixels into a shared memory array, after which the first thread sums them up (See below).
The issue I am having is that this all runs very slowly - about 50fps. Much slower than a CPU version - about 230fps. If I comment out the pixel value comparison, resulting in just a count of all pixels, I get 6x the performance. I tried using texture memory but didn't see a performance gain. I am running a Quadro K2000. Also: the image copy only benchmark is able to copy at around 330fps, so that doesn't appear to be the issue.
Any help / pointers would be appreciated. Thank you.
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
extern __shared__ int row_counts[];//this parameter to kernel call "<<<, ,>>>" sets the size
//see here for indexing
int myImage = blockIdx.y * gridDim.x + blockIdx.x;
int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
unsigned char *imageStart = AllPixels[myImage];
unsigned char *pixelStart = imageStart + myStartRow * W;
unsigned char *pixelEnd = pixelStart + W;
unsigned char *pixelItr = pixelStart;
int row_count = 0;
while(pixelItr < pixelEnd)
if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
row_counts[myStartRow] = row_count;
if (myStartRow == 0)
{//first thread sums up for the while image
int image_count = 0;
for (int i = 0; i < H; i++)
image_count += row_counts[i];
AllReturns[myImage] = image_count;
extern "C" void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
W, H,
//wait for all blocks to finish
Two changes to your kernel design can result in a significant speedup:
Perform the operations column-wise instead of row-wise. The general background for why this matters/helps is described here.
Replace your final operation with a canonical parallel reduction.
According to my testing, those 2 changes result in ~22x speedup in kernel performance:
$ cat t49.cu
#include <iostream>
#include <helper_cuda.h>
typedef unsigned char U8;
// Baseline kernel: counts, per image, the pixels whose value exceeds Threshold.
// Expected launch (see cuda_Benchmark): one block per image, one thread per
// image row, and H * sizeof(int) bytes of dynamic shared memory.
//   Threshold  - pixels strictly greater than this value are counted
//   W, H       - image width and height in pixels
//   AllPixels  - device array of device pointers, one per image (W * H bytes each)
//   AllReturns - device array receiving one count per image
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[];//this parameter to kernel call "<<<, ,>>>" sets the size

    //see here for indexing
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);

    unsigned char *imageStart = AllPixels[myImage];
    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd = pixelStart + W;
    unsigned char *pixelItr = pixelStart;

    int row_count = 0;
    while(pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;

    // Every row count must be visible before thread 0 starts adding them up.
    __syncthreads();

    if (myStartRow == 0)
    {//first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}
// Column-wise variant: counts, per image, the pixels whose value exceeds Threshold.
// Neighbouring threads read neighbouring bytes of the same row on every
// iteration, and the per-column counts are combined with a shared-memory
// tree reduction.
// Expected launch (see cuda_Benchmark1): one block per image, W threads in x,
// W * sizeof(int) bytes of dynamic shared memory.
//   Threshold  - pixels strictly greater than this value are counted
//   W, H       - image width and height in pixels
//   AllPixels  - device array of device pointers, one per image (W * H bytes each)
//   AllReturns - device array receiving one count per image
//   rsize      - first reduction stride; cuda_Benchmark1 passes next_power_of_2(W/2)
__global__ void ThreadPerColCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns, int rsize)
{
    extern __shared__ int col_counts[];//this parameter to kernel call "<<<, ,>>>" sets the size

    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned char *imageStart = AllPixels[myImage];
    int myStartCol = (threadIdx.y * blockDim.x + threadIdx.x);

    int col_count = 0;
    for (int i = 0; i < H; i++) if (imageStart[myStartCol+i*W]> Threshold) col_count++;
    col_counts[threadIdx.x] = col_count;

    // All partial counts must be stored before any thread reads a neighbour's slot.
    __syncthreads();

    for (int i = rsize; i > 0; i>>=1){
        // The first condition skips partners beyond the W valid slots (W need not be a power of two).
        if ((threadIdx.x+i < W) && (threadIdx.x < i)) col_counts[threadIdx.x] += col_counts[threadIdx.x+i];
        // Barrier sits outside the conditional so every thread of the block reaches it.
        __syncthreads();
    }

    if (!threadIdx.x) AllReturns[myImage] = col_counts[0];
}
// Runs the row-wise baseline kernel over all images and waits for it to finish.
//   nImages    - number of images; one thread block is launched per image
//   W, H       - image width and height in pixels; H threads are launched per block
//   AllPixels  - device array of device pointers to the images
//   AllReturns - device array receiving one count per image
//   Threshold  - pixels strictly greater than this value are counted
void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    checkCudaErrors(cudaGetLastError());

    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
// Rounds v up to the nearest power of two; a value that already is a power of
// two is returned unchanged.
// The decrement keeps exact powers of two from being doubled, the shifts smear
// the highest set bit into every lower position, and the increment carries
// into the next bit.
// Written for a 32-bit unsigned. v == 0 yields 0, and values above 2^31 wrap to 0.
unsigned next_power_of_2(unsigned v){
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}
// Runs the column-wise kernel over all images and waits for it to finish.
//   nImages    - number of images; one thread block is launched per image
//   W, H       - image width and height in pixels; W threads are launched per block
//   AllPixels  - device array of device pointers to the images
//   AllReturns - device array receiving one count per image
//   Threshold  - pixels strictly greater than this value are counted
void cuda_Benchmark1(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    // First stride of the in-kernel tree reduction over the W per-column counts.
    int rsize = next_power_of_2(W/2);

    ThreadPerColCounter<<<nImages, W, sizeof(int)*W>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns, rsize);
    checkCudaErrors(cudaGetLastError());

    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
// Test driver: fills n_img random images, runs the row-wise and the
// column-wise counters on the same data and reports the first image whose
// two counts disagree.
int main(){
    const int my_W = 720;
    const int my_H = 540;
    const int n_img = 128;
    const int my_thresh = 10;
    const size_t n_pixels = (size_t)n_img*my_W*my_H;

    U8 **img_p, **img_ph;
    U8 *img, *img_h;
    int *res, *res_h, *res_h1;

    // Host-side table of device pointers, one per image.
    img_ph = (U8 **)malloc(n_img*sizeof(U8*));
    checkCudaErrors(cudaMalloc(&img_p, n_img*sizeof(U8*)));
    checkCudaErrors(cudaMalloc(&img, n_pixels*sizeof(U8)));

    // Pixel values lie in [0, 20), so a threshold of 10 splits them roughly in half.
    img_h = new U8[n_pixels];
    for (size_t i = 0; i < n_pixels; i++) img_h[i] = rand()%20;
    checkCudaErrors(cudaMemcpy(img, img_h, n_pixels*sizeof(U8), cudaMemcpyHostToDevice));

    for (int i = 0; i < n_img; i++) img_ph[i] = img+my_W*my_H*i;
    checkCudaErrors(cudaMemcpy(img_p, img_ph, n_img*sizeof(U8*), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc(&res, n_img*sizeof(int)));

    // Reference result from the row-wise kernel.
    cuda_Benchmark(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h = new int[n_img];
    checkCudaErrors(cudaMemcpy(res_h, res, n_img*sizeof(int), cudaMemcpyDeviceToHost));

    // Result under test from the column-wise kernel.
    cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h1 = new int[n_img];
    checkCudaErrors(cudaMemcpy(res_h1, res, n_img*sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < n_img; i++)
        if (res_h[i] != res_h1[i]) {
            std::cout << "mismatch at: " << i << " was: " << res_h1[i] << " should be: " << res_h[i] << std::endl;
            break;  // report the first mismatch only, then fall through to cleanup
        }

    delete[] res_h1;
    delete[] res_h;
    delete[] img_h;
    free(img_ph);
    checkCudaErrors(cudaFree(res));
    checkCudaErrors(cudaFree(img));
    checkCudaErrors(cudaFree(img_p));
    return 0;
}
$ nvcc -o t49 t49.cu -I/usr/local/cuda/samples/common/inc
$ cuda-memcheck ./t49
========= ERROR SUMMARY: 0 errors
$ nvprof ./t49
==1756== NVPROF is profiling process 1756, command: ./t49
==1756== Profiling application: ./t49
==1756== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 72.02% 54.325ms 1 54.325ms 54.325ms 54.325ms ThreadPerRowCounter(int, int, int, unsigned char**, int*)
24.71% 18.639ms 2 9.3195ms 1.2800us 18.638ms [CUDA memcpy HtoD]
3.26% 2.4586ms 1 2.4586ms 2.4586ms 2.4586ms ThreadPerColCounter(int, int, int, unsigned char**, int*, int)
0.00% 3.1040us 2 1.5520us 1.5360us 1.5680us [CUDA memcpy DtoH]
API calls: 43.63% 59.427ms 3 19.809ms 18.514us 59.159ms cudaMalloc
41.70% 56.789ms 2 28.394ms 2.4619ms 54.327ms cudaDeviceSynchronize
14.02% 19.100ms 4 4.7749ms 17.749us 18.985ms cudaMemcpy
0.52% 705.26us 96 7.3460us 203ns 327.21us cuDeviceGetAttribute
0.05% 69.268us 1 69.268us 69.268us 69.268us cuDeviceTotalMem
0.04% 50.688us 1 50.688us 50.688us 50.688us cuDeviceGetName
0.04% 47.683us 2 23.841us 14.352us 33.331us cudaLaunchKernel
0.00% 3.1770us 1 3.1770us 3.1770us 3.1770us cuDeviceGetPCIBusId
0.00% 1.5610us 3 520ns 249ns 824ns cuDeviceGetCount
0.00% 1.0550us 2 527ns 266ns 789ns cuDeviceGet
(Quadro K2000, CUDA 9.2.148, Fedora Core 27)
(The next_power_of_2 code is lifted from this answer)
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the questions in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

Average Filter with OpenCV

I'm trying to implement an Averaging filter with a 5x5 kernel, although there is a function within OpenCV for this, I need to do it without it.
Something is wrong, and I suspect the uchar variables, but I also tried int, float and double and the resulting image is still not correct. I use an image with a padding of 7.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
#include "filter.h"
#include <iostream>
#include <fstream>
using namespace std;
using namespace cv;
cv::Mat filter::mean_filter(cv::Mat& image_in){
int centro = 7;
float total = 0.0;
double window[25];
double mean= 0.0;
int final=0;
int nlines, ncols;
cv::Mat kernel = cv::Mat::ones(5, 5, CV_32S);
cv::Mat image_out = cv::Mat::zeros(nlines,ncols,CV_32S);
for (unsigned int j=centro; j<nlines - centro; j++){
for (unsigned int z=centro; z<ncols - centro; z++){
window[2]<uchar>(j ,z-2);
window[7]<uchar>(j ,z-1);
window[12]<uchar>(j ,z);
window[17]<uchar>(j ,z+2);
window[22]<uchar>(j ,z+1);
for (unsigned int k=0; k<25; k++){
return image_out;
I changed your code a bit and have a working solution. It is quite a primitive approach, but it works.
Possible improvements could be to reuse some of the already accumulated pixel-values by tracking which pixels leave the kernel area and which pixels enter it.
Another possibility for improvement is to parallelise the loop over the image.
// Box (mean) filter for 8-bit grayscale images.
//   image_in - source image; must be of type CV_8UC1
//   kernel   - side length of the square averaging window; must be odd and >= 1
// Returns a new CV_8UC1 image of the same size as image_in. Pixels outside the
// image are treated as 0, so results darken towards the border.
cv::Mat mean_filter(cv::Mat& image_in, int kernel)
{
    // Make sure you get a grayscale image.
    assert(image_in.type() == CV_8UC1);
    // Make sure your kernel is an uneven number
    assert(kernel % 2 == 1);
    // Make sure your kernel is at least 1
    assert(kernel >= 1);

    // for padding calculate the border needed
    const int padding = (kernel - 1) / 2;
    const int area = kernel * kernel;

    const int nlines = image_in.size().height;
    const int ncols = image_in.size().width;

    // Make proper padding. Here it is done with 0. Padding describes the adding of a border
    // to the image in order to avoid a cropping by applying a filter-mask.
    cv::Mat img_temp;
    cv::copyMakeBorder(image_in, img_temp, padding, padding, padding, padding, cv::BORDER_CONSTANT, 0);

    // allocate the output image as grayscale as the input is grayscale as well
    cv::Mat image_out = cv::Mat::zeros(nlines, ncols, CV_8UC1);

    // loop over whole image (coordinates are those of the padded image)
    for (int j = padding; j < nlines + padding; j++){
        for (int z = padding; z < ncols + padding; z++){
            int sum = 0;

            // loop over kernel area
            for (int x = -padding; x <= padding; x++){
                for (int y = -padding; y <= padding; y++){
                    // accumulate all pixel-values
                    sum += img_temp.at<uchar>(j + x, z + y);
                }
            }

            // Round to nearest in integer arithmetic; a plain sum / area would
            // truncate before any rounding could take effect.
            const int rounded = (sum + area / 2) / area;

            // cast result to uchar and set pixel in output image
            image_out.at<uchar>(j - padding, z - padding) = (uchar)rounded;
        }
    }

    return image_out;
}

Cuda - Device values 0 after kernel execution

For some reason, when I execute my program the device variables hold zero values after the kernel runs. Just before I launch the CUDA kernel the device variables have the correct values. The output image is completely black, at the original image size. All the memory allocations and the copies to and from the host seem to be correct.
Thanks for any help!
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define NOMINMAX
# include <windows.h>
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
// This function effectively swirls an image
// This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
// Each thread on the GPU will process exactly one pixel
// Before doing anything, we need to determine the current pixel we are calculating in this thread
// Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
// Test to see if we're testing a valid pixel
if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location
double cX = (double)width/2.0f;
double cY = (double)height/2.0f;
double relY = cY-i;
double relX = j-cX;
// relX and relY are points in our UV space
// Calculate the angle our points are relative to UV origin. Everything is in radians.
double originalAngle;
if (relX != 0)
originalAngle = atan(abs(relY)/abs(relX));
if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
else if (relX <=0 && relY <0) originalAngle += C_PI;
// Take care of rare special case
if (relY >= 0) originalAngle = 0.5f * C_PI;
else originalAngle = 1.5f * C_PI;
// Calculate the distance from the center of the UV using pythagorean distance
double radius = sqrt(relX*relX + relY*relY);
// Use any equation we want to determine how much to rotate image by
//double newAngle = originalAngle + factor*radius; // a progressive twist
double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
// Transform source UV coordinates back into bitmap coordinates
int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
srcX += cX;
srcY += cY;
srcY = height - srcY;
// Clamp the source to legal image pixel
if (srcX < 0) srcX = 0;
else if (srcX >= width) srcX = width-1;
if (srcY < 0) srcY = 0;
else if (srcY >= height) srcY = height-1;
// Set the pixel color
// Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
// Declaration, forward
void runTest(int argc, char **argv);
// Program main
int main(int argc, char **argv)
printf("%s starting...\n", sampleName);
// Process command-line arguments
if (argc > 1)
if (checkCmdLineFlag(argc, (const char **) argv, "input"))
getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
printf("-reference flag should be used with -input flag");
runTest(argc, argv);
printf("%s completed",
//exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
//! Run a simple test for CUDA
void runTest(int argc, char **argv)
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
printf("Unable to source image file: %s\n", imageFilename);
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
//Do the Convolution
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
Your code is writing in the source image:
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
which writes to device_array_Image which is the source, not the destination you are expecting results in.
Moreover, I am very curious on the output of printf("DResult : '%.8f'\n",device_array_Result); as device_array_Result is in GPU address space and allocated with cudaMalloc. On which device are you running ?

calling c function from MATLAB?

I want to call a c function from matlab, for that I tried writing a wrapper function using MEX. While compiling I am getting
error C2109: subscript requires array or pointer type
and error C2440: 'function' : cannot convert from 'double *' to 'double'
Can anyone help me find where I made the mistake?
#include "mex.h"
#include "matrix.h"
#include "CVIPtoolkit.h"
#include "CVIPtools.h"
#include "CVIPmatrix.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
void midd(double outdata, int type, int height, int width){
Image *outputImage;
byte **output;
int r,c;
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
outputImage=new_Image (PGM, GRAY_SCALE, 0, height, width, CVIP_BYTE, REAL );
outputImage = h_image(type, height,width);
output = getData_Image(outputImage, 0);
for(r=0; r < height; r++) {
for(c=0; c < width; c++)
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
outdata[r+height*c+height*width] =output[r][c]; /* passing data back to MATLAB variable from CVIPtools variable */
/* MEX gateway: out = f(type, height, width).
 *   prhs[0] - mask type (scalar)
 *   prhs[1] - number of rows (scalar)
 *   prhs[2] - number of columns (scalar)
 *   plhs[0] - height-by-width real double matrix filled by midd()
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    double *outdata;
    int type, height, width;

    (void)nlhs;

    /* prhs[0..2] are read below, so refuse calls that do not supply them. */
    if (nrhs != 3) {
        mexErrMsgTxt("Three inputs required: type, height, width.");
    }

    // double *indata = (double *)mxGetData(prhs[0]);
    type = (int)mxGetScalar(prhs[0]);
    height = (int)mxGetScalar(prhs[1]);
    width = (int)mxGetScalar(prhs[2]);

    if (height < 1 || width < 1) {
        mexErrMsgTxt("height and width must be positive.");
    }

    mexPrintf("type %d\n", type);
    mexPrintf("height %d\n", height);
    mexPrintf("width %d\n", width);

    plhs[0] = mxCreateDoubleMatrix(height,width,mxREAL);
    outdata = mxGetPr(plhs[0]);

    midd(outdata, type, height, width);
}
The c function i am trying to call is as follows:
#include "CVIPtoolkit.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
Image *
h_image(int type, unsigned int height, unsigned int width){
/* type = 1, Constant
* type = 2, Fixed mask
* type = 3, Gaussian
unsigned int r, c, hf_w = width/2, hf_h = height/2;
Image *outimage;
float **outdata, sum = 0.0, sigma, tmp1, tmp2, tmp;
if (height < 3 || width < 3) {
fprintf(stderr, "Masksize too small, at least 3x3\n");
return (Image *)NULL;
outimage = new_Image(PGM, GRAY_SCALE, 1, height, width, CVIP_FLOAT, REAL);
outdata = (float **)getData_Image(outimage, 0);
switch (type) {
case 1:
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
outdata[r][c] = 1.0;
sum += outdata[r][c];
case 2:
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
outdata[r][c] = 1.0;
sum += outdata[r][c];
outdata[height/2][width/2] = height * width;
sum = sum - 1.0 + outdata[height/2][width/2];
case 3:
c = (width + height) /4;
r = (width + height) /2;
sigma = sqrt(c*c / (2 * log(2) + (r - 3) * log(3)));
sigma = 1.0 / 2.0 /sigma/sigma;
tmp = width * height;
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
tmp1 = (r-hf_h)*(r-hf_h); tmp2 = (c-hf_w)*(c-hf_w);
outdata[r][c] = tmp*exp(- (tmp1 + tmp2) * sigma);
sum += outdata[r][c];
fprintf(stderr, "Incorrect mask type number: %d\n", type);
return (Image *)NULL;
return outimage;
In your main function, outdata is a pointer to a double, yet your function midd takes in an actual double itself. That's why you're getting that error in type.
Simply change your function declaration so that the first input accepts a pointer to a double:
void midd(double *outdata, int type, int height, int width)
// ^^^^^^^^
Minor Note
I question your copying of your image data back to a MEX array here:
outdata[r+height*c+height*width] =output[r][c];
You don't need height*width as the offset. r + height*c is enough to access a single channel 2D matrix in column-major order. You only need to offset by height*width if you have a multi-channel image. That offset allows you to access image data in other channels... and since you only have single channel data (it looks like so...), this offset isn't required.
Therefore, you simply need to do:
outdata[r + height*c] = output[r][c];
If you don't do this, I suspect you will eventually get segmentation faults because you'll eventually access parts of memory you aren't allowed to access.
Also, once you fully test your code, get rid of the mexPrintf statements. It's going to unnecessarily flood your Command Prompt with print messages since you have it inside a nested for loop. I suspect you did this for debugging, and that's perfectly fine, but I would suggest you attach the MEX function to an actual debugger and debug your code properly instead of the print statements.
See my post on how to get that set up here: Preventing a MEX file from crashing in MATLAB

c++: Convert 24bpp to 8 bpp or 1bpp image

I have to convert a 24bpp image to a 1bpp image or 8bpp image based on a color table. The caller expects an unsigned char* in either case (which would be further processed, or for now written out for debugging by setting BITMAPINFOHEADER.biBitCount to its proper value, 8 or 1).
I have code to extract the color index into the palette (colorIndexArray is from color conversion or dithering algorithms)... I can get the info for an 8bpp bitmap...
But my problem is, I don't know how to put this info into a 1bpp bitmap
typedef struct {
unsigned int size;
unsigned char* pixels;
} ColorIndexArray;
unsigned char* convertImage(const ColorIndexArray& colorIndexArray, unsigned int paletteSize)
unsigned char* outputImage;
if (paleteSize > 2)
outputImage = (unsigned char*)LocalAlloc(LPTR, colorIndexArray.size);
for (int i=0; i<colorIndexArray.size; i++)
*(outputImage+i) = colorIndexArray.pixels[i];
// this works great
else // monochrome, caller has palette colors likely b/w (or purple/magenta or anything), must be 1bpp
outputImage = (unsigned char*)LocalAlloc(LPTR, colorIndexArray.size / 8);
// how can i place the unsigned char* info (which is already
// determined based on desired algorithm, representing index in
// color table) into the output image inside a single bit ?
// (obviously its value for a monochrome image would be 0 or 1 but
// it is saved as unsigned char* at the algorithm output)
// And how do I advance the pointer ?
// Will it be type safe ? Aligned to byte ? or do I have to fill
// with something at the end to make multiple of 8 bits ?
return outputImage;
Trying this after comment suggestion:
#include <GdiPlus.h>
else {
Gdiplus::Bitmap monoBitmap(w, h, PixelFormat1bppIndexed);
Gdiplus::BitmapData monoBitmapData;
Gdiplus::Rect rect(0, 0, w, h);
monoBitmap.LockBits(&rect, Gdiplus::ImageLockModeWrite, PixelFormat1bppIndexed, &monoBitmapData);
outputImage = (unsigned char*)monoBitmapData.Scan0;
for (unsigned int y = 0; y < h; y++)
for (unsigned int x = 0; x < w; x++)
if (colorIndexArray.pixels[x + y * w])
outputImage[y*monoBitmapData.Stride + x / 8] |= (unsigned char)(0x80 >> (x % 8));
return outputImage;
(Also need to allocate the memory for outputImage)
Based on the example suggested by Hans Passant (thank you also for pointing out how important the stride is), I wrote this little conversion
unsigned long stride = (((w + 31) & ~31) >> 3);
outputImage = (unsigned char*)LocalAlloc(LPTR, stride * h);
for (unsigned int y = 0; y < h; y++)
unsigned char* b = (unsigned char*)LocalAlloc(LPTR, stride);
for (unsigned int x = 0; x < w; x++)
if (colorIndexArray.pixels[x + y * w])
b[x / 8] |= (unsigned char)(0x80 >> (x % 8));
CopyMemory(outputImage + stride * y, b, stride);