calling c function from MATLAB? - c++

I want to call a c function from matlab, for that I tried writing a wrapper function using MEX. While compiling I am getting
error C2109: subscript requires array or pointer type
and error C2440: 'function' : cannot convert from 'double *' to 'double'
Can anyone help me find where I made the mistake?
#include "mex.h"
#include "matrix.h"
#include "CVIPtoolkit.h"
#include "CVIPtools.h"
#include "CVIPmatrix.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Builds a mask image through h_image() and copies its pixels into outdata.
 *   outdata - destination for the pixel values (subscripted in the inner loop)
 *   type    - mask type, forwarded to h_image()
 *   height  - row count used by the loops and the index arithmetic
 *   width   - column count used by the loops and the index arithmetic
 * NOTE(review): outdata is declared as a plain double, yet it is subscripted in
 * the inner loop and mexFunction passes a double* for it.
 */
void midd(double outdata, int type, int height, int width){
Image *outputImage;
byte **output;
int r,c;
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
/* NOTE(review): this image is replaced by the h_image() result on the next statement and is not released here. */
outputImage=new_Image (PGM, GRAY_SCALE, 0, height, width, CVIP_BYTE, REAL );
/* NOTE(review): h_image() as shown in this file fills CVIP_FLOAT data and returns NULL for sizes below 3x3 or an unknown type; the result is used below as byte** without a NULL check. */
outputImage = h_image(type, height,width);
output = getData_Image(outputImage, 0);
for(r=0; r < height; r++) {
for(c=0; c < width; c++)
{
/* Debug output, printed once per pixel. */
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
/* NOTE(review): the height*width term moves every write one whole height x width plane past the start of outdata, while mexFunction allocates a single height x width matrix. */
outdata[r+height*c+height*width] =output[r][c]; /* passing data back to MATLAB variable from CVIPtools variable */
}
}
}
/*
 * MEX gateway. MATLAB usage: out = fn(type, height, width)
 *
 *   prhs[0] - mask type (1, 2 or 3), read as a scalar
 *   prhs[1] - number of rows, read as a scalar
 *   prhs[2] - number of columns, read as a scalar
 *   plhs[0] - height x width real double matrix filled by midd()
 *
 * Raises a MATLAB error (mexErrMsgTxt does not return) when the argument
 * count is wrong or when the values are ones that h_image() rejects.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    double *outdata;
    int type, height, width;

    /* prhs[0..2] are read below; without this check a short argument list
       would read past the end of prhs. */
    if (nrhs != 3)
        mexErrMsgTxt("Three inputs required: type, height, width.");
    if (nlhs > 1)
        mexErrMsgTxt("At most one output is returned.");

    /* mxGetScalar returns a double; make the narrowing to int explicit. */
    type = (int)mxGetScalar(prhs[0]);
    height = (int)mxGetScalar(prhs[1]);
    width = (int)mxGetScalar(prhs[2]);
    mexPrintf("type %d\n", type);
    mexPrintf("height %d\n", height);
    mexPrintf("width %d\n", width);

    /* h_image() answers these inputs with NULL, and midd() uses that result
       without checking it, so reject them before any work is done. */
    if (type < 1 || type > 3)
        mexErrMsgTxt("type must be 1, 2 or 3.");
    if (height < 3 || width < 3)
        mexErrMsgTxt("height and width must be at least 3.");

    plhs[0] = mxCreateDoubleMatrix(height,width,mxREAL);
    /* mxGetData returns void*; the matrix was created as real double. */
    outdata = (double *)mxGetData(plhs[0]);
    midd(outdata, type, height, width);
}
The c function i am trying to call is as follows:
#include "CVIPtoolkit.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * h_image - build a height x width single-band float mask image.
 *
 *   type = 1  constant:   every element is 1
 *   type = 2  fixed mask: every element is 1, except the centre element
 *                         [height/2][width/2], which is height*width
 *   type = 3  Gaussian:   centred on (height/2, width/2), peak height*width
 *
 * Returns the new image, or NULL after a message on stderr when either
 * dimension is below 3 or type is not one of the values above.
 */
Image *
h_image(int type, unsigned int height, unsigned int width){
    unsigned int r, c;
    const unsigned int hf_w = width/2, hf_h = height/2;
    Image *outimage;
    float **outdata;

    if (height < 3 || width < 3) {
        fprintf(stderr, "Masksize too small, at least 3x3\n");
        return (Image *)NULL;
    }
    /* Validate the type before allocating, so the error path has no image
       left behind. */
    if (type < 1 || type > 3) {
        fprintf(stderr, "Incorrect mask type number: %d\n", type);
        return (Image *)NULL;
    }

    outimage = new_Image(PGM, GRAY_SCALE, 1, height, width, CVIP_FLOAT, REAL);
    /* NOTE(review): assumes new_Image signals failure with NULL - confirm
       against the CVIPtools documentation. */
    if (outimage == NULL)
        return (Image *)NULL;
    outdata = (float **)getData_Image(outimage, 0);

    if (type == 3) {
        const unsigned int quarter = (width + height) / 4;
        const unsigned int half = (width + height) / 2;
        const float peak = width * height;
        /* First the spread derived from the mask size, then the exponent
           coefficient 1 / (2 * spread^2). */
        float coeff = sqrt(quarter*quarter / (2 * log(2.0) + (half - 3) * log(3.0)));
        coeff = 1.0 / 2.0 / coeff / coeff;

        for (r = 0; r < height; r++)
            for (c = 0; c < width; c++) {
                /* Take the offsets in floating point: r - hf_h in unsigned
                   arithmetic wraps around for rows above the centre. */
                const double dr = (double)r - (double)hf_h;
                const double dc = (double)c - (double)hf_w;
                const float rr = (float)(dr * dr);
                const float cc = (float)(dc * dc);
                outdata[r][c] = peak * exp(- (rr + cc) * coeff);
            }
    } else {
        /* Types 1 and 2 share the all-ones fill. */
        for (r = 0; r < height; r++)
            for (c = 0; c < width; c++)
                outdata[r][c] = 1.0;
        if (type == 2)
            outdata[hf_h][hf_w] = height * width;
    }
    return outimage;
}

In your main function, outdata is a pointer to a double, yet your function midd takes in an actual double itself. That's why you're getting that error in type.
Simply change your function declaration so that the first input accepts a pointer to a double:
void midd(double *outdata, int type, int height, int width)
// ^^^^^^^^
Minor Note
I question your copying of your image data back to a MEX array here:
outdata[r+height*c+height*width] =output[r][c];
You don't need height*width as the offset. r + height*c is enough to access a single channel 2D matrix in column-major order. You only need to offset by height*width if you have a multi-channel image. That offset allows you to access image data in other channels... and since you only have single channel data (it looks like so...), this offset isn't required.
Therefore, you simply need to do:
outdata[r + height*c] = output[r][c];
If you don't do this, I suspect you will eventually get segmentation faults because you'll eventually access parts of memory you aren't allowed to access.
Also, once you fully test your code, get rid of the mexPrintf statements. It's going to unnecessarily flood your Command Prompt with print messages since you have it inside a nested for loop. I suspect you did this for debugging, and that's perfectly fine, but I would suggest you attach the MEX function to an actual debugger and debug your code properly instead of the print statements.
See my post on how to get that set up here: Preventing a MEX file from crashing in MATLAB

Related

list of white pixels indices in image using CUDA

Given a binary image, I want to return the list of indices for white pixels in it using GPU (Compute Unified Device Architecture). How to determine the index for points vector?
Here is the CUDA Kernel .
//copy only active pixel locations
// One thread per pixel: row_index comes from the y block/thread indices, col_index from x.
//   bin_image     - image buffer parameter
//   points        - output buffer for the locations found
//   width, height - bounds used to discard threads outside the image
//   grayWidthStep - multiplier applied to row_index when computing the pixel offset
// NOTE(review): the body reads `input`, which is neither a parameter nor a local here (the image parameter is named bin_image).
// NOTE(review): points is a float* but is assigned a Point2f, and its index is still a placeholder.
__global__ void get_white_pixels_kernel(unsigned char* bin_image,
float * points,
int width,
int height,
int grayWidthStep)
{
int row_index = threadIdx.y+ blockIdx.y*blockDim.y;
int col_index = threadIdx.x+blockIdx.x*blockDim.x;
// Threads that fall outside the image do nothing.
if ((col_index < width) && (row_index < height))
{
//Location of gray pixel in output
const int gray_tid = row_index * grayWidthStep + col_index;
// Only pixels equal to 255 are recorded.
if(input[gray_tid]==255)
points[--here is the index]= Point2f(row_index,col_index);
}
}
Following is a naive method to achieve the desired functionality:
Generate a mask of pixel indices with dummy values for pixel with zero value.
Count the number of non-zero pixels
Create an output vector with length equal to non-zero count.
Copy the non-zero pixel indices from the generated mask to the output vector (a process known as stream-compaction)
Following is a sample code for the above mentioned process.
Code
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/count.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <opencv2/opencv.hpp>
// Terminates the program with a diagnostic when a CUDA runtime call reports
// a non-zero error code; does nothing otherwise.
//   err  - status returned by the CUDA call
//   file - source file of the call site (supplied by CHECK_ERR)
//   line - source line of the call site (supplied by CHECK_ERR)
static void _check_err(cudaError_t err, const char* file, int line)
{
    if(!err)
        return;

    printf("CUDA Error: %s\nFile: %s\nLine: %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
#define CHECK_ERR(err) _check_err((err), __FILE__, __LINE__)
// Writes one int2 per pixel into indices: (x, y) for a non-zero pixel and
// (-1, -1) for a zero pixel. The pixel is read at y * step + x and the
// result is stored at y * width + x.
__global__ void kernel_find_indices(const unsigned char* input, int width, int height, int step, int2* indices)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the image have nothing to write.
    if(col >= width || row >= height)
        return;

    // Start from the dummy entry and overwrite it only for a non-zero pixel.
    int2 entry;
    entry.x = -1;
    entry.y = -1;
    if(input[row * step + col])
    {
        entry.x = col;
        entry.y = row;
    }
    indices[row * width + col] = entry;
}
//Predicate that tells real pixel indices apart from the (-1, -1) dummy
//entries written for zero-valued pixels.
struct isNonZeroIndex
{
    // const-qualified so the predicate can also be invoked through a const
    // functor object; it reads nothing but its argument.
    __host__ __device__ bool operator()(const int2 &idx) const
    {
        return (idx.x != -1) && (idx.y != -1);
    }
};
// Returns the (x, y) coordinates of every non-zero pixel of an 8-bit
// single-channel image.
//
// Steps: upload the image, let kernel_find_indices write one index per pixel
// (dummy values for zero pixels), count the real indices, compact them on the
// device and copy the compacted list back to the host.
//
// input  - CV_8UC1 image; an empty image yields an empty result.
// return - one cv::Point per non-zero pixel.
std::vector<cv::Point> getIndicesOfNonZeroPixels(cv::Mat input)
{
    std::vector<cv::Point> output;

    // An empty image would produce a zero-sized grid, which is not a valid
    // launch configuration.
    if(input.empty())
        return output;

    // The kernel reads exactly one byte per pixel.
    CV_Assert(input.type() == CV_8UC1);

    // step * rows bytes are uploaded starting at the first row; for a
    // non-continuous matrix that range would run past the last row, so take
    // a compact copy first.
    if(!input.isContinuous())
        input = input.clone();

    const int pixelCount = input.cols * input.rows;
    const size_t imageBytes = input.step * input.rows;

    // device_vector owns the device copy of the image, so it is released on
    // every exit path, including an exception thrown by a thrust call.
    thrust::device_vector<unsigned char> image_d(input.ptr(), input.ptr() + imageBytes);
    thrust::device_vector<int2> index_buffer_d(pixelCount);

    dim3 block(16,16);
    dim3 grid((input.cols + block.x - 1) / block.x,
              (input.rows + block.y - 1) / block.y);

    //Generate an index mask with dummy values for indices with zero pixel value
    kernel_find_indices<<<grid, block>>>(thrust::raw_pointer_cast(image_d.data()),
                                         input.cols,
                                         input.rows,
                                         static_cast<int>(input.step),
                                         thrust::raw_pointer_cast(index_buffer_d.data()));
    // A rejected launch is only visible through cudaGetLastError.
    CHECK_ERR(cudaGetLastError());
    CHECK_ERR(cudaDeviceSynchronize());

    const size_t nonZeroCount = thrust::count_if(index_buffer_d.begin(), index_buffer_d.end(), isNonZeroIndex());

    //Keep only those indices whose pixel value is non-zero (stream compaction)
    thrust::device_vector<int2> compacted(nonZeroCount);
    thrust::copy_if(index_buffer_d.begin(), index_buffer_d.end(), compacted.begin(), isNonZeroIndex());

    //Copy non-zero pixel indices to host
    std::vector<int2> output_int2(nonZeroCount);
    thrust::copy(compacted.begin(), compacted.end(), output_int2.begin());

    //Convert vector<int2> to vector<cv::Point>
    output.reserve(nonZeroCount);
    for(size_t i = 0; i < nonZeroCount; i++)
        output.push_back(cv::Point(output_int2[i].x, output_int2[i].y));

    return output;
}
// Demo: draws a filled 20x20 rectangle at (5,5) on a black 100x100 image,
// prints the coordinates reported by getIndicesOfNonZeroPixels and shows the image.
void run_test()
{
    // Sample input image.
    cv::Mat test = cv::Mat::zeros(100,100, CV_8UC1);
    const cv::Rect whiteArea(5,5,20,20);
    cv::rectangle(test, whiteArea, cv::Scalar::all(255), CV_FILLED);

    // Locate the non-zero pixels and print one "x, y" pair per line.
    const std::vector<cv::Point> indices = getIndicesOfNonZeroPixels(test);
    for(std::vector<cv::Point>::const_iterator it = indices.begin(); it != indices.end(); ++it)
        printf("%d, %d\n", it->x, it->y);

    // Display the image until a key is pressed.
    cv::imshow("Sample", test);
    cv::waitKey();
}
// Entry point: runs the demo once. The command-line arguments are not used.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    run_test();
    return EXIT_SUCCESS;
}
Compilation Command
nvcc -o nz nz.cu -arch=sm_61 -L/usr/local/lib -lopencv_core
-lopencv_highgui -lopencv_imgproc
Please keep in mind that this code is for image of type 8UC1 (8 bit, single channel) only. You can easily extend it to other data-types as required.

C++: Segmentation fault on pthread_create

I'm relatively new to C in general and I'm trying to make a small image filter while using pthreads. After a few hours of playing around with pointers and references, it goes through the compiler but then I get a segmentation fault, the code is the following:
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace std;
using namespace cv;
#define WIDTH 3
#define HEIGHT 4
#define NUM_THREADS 4
// Arguments handed to a worker thread through pthread_create's void* parameter.
struct readThreadParams{
Mat img; // image the worker passes to getAverage as img
Mat out; // image the worker passes to getAverage as out
int yStart; // first row of the worker's row loop
int xEnd; // exclusive upper bound of the worker's column loop
int yEnd; // exclusive upper bound of the worker's row loop
int xRad; // column step of the worker, also passed to getAverage as xRad
int yRad; // row step of the worker, also passed to getAverage as yRad
};
//Find the average of all pixels in the xRad x yRad window whose top-left
//corner is (x1, y1) in img, and paint that same window in out with it.
//
//img        - source image, accessed as Vec3b (8-bit, three channels)
//out        - destination image, accessed as Vec3b
//x1, y1     - column and row of the window's top-left pixel
//xRad, yRad - window width and height in pixels
//
//The window is clipped to the bounds of both images; a window that ends up
//empty leaves out untouched and returns 0. Otherwise the return value is the
//mean of the three averaged channel values.
uchar getAverage(Mat &img, Mat &out, const float x1, const float y1, const int xRad, const int yRad){
    const int colBegin = (int)x1;
    const int rowBegin = (int)y1;

    // Clip the window so that neither image is accessed out of range.
    int colEnd = colBegin + xRad;
    if (colEnd > img.cols) colEnd = img.cols;
    if (colEnd > out.cols) colEnd = out.cols;
    int rowEnd = rowBegin + yRad;
    if (rowEnd > img.rows) rowEnd = img.rows;
    if (rowEnd > out.rows) rowEnd = out.rows;

    if (colBegin < 0 || rowBegin < 0 || colEnd <= colBegin || rowEnd <= rowBegin)
        return 0;

    // Sum in a wide type: an 8-bit accumulator overflows after a few pixels.
    long blue = 0;
    long green = 0;
    long red = 0;
    for (int r = rowBegin; r < rowEnd; r++){
        for (int c = colBegin; c < colEnd; c++){
            const Vec3b &intensity = img.at<Vec3b>(r, c);
            blue += intensity.val[0];
            green += intensity.val[1];
            red += intensity.val[2];
        }
    }

    // Divide by the number of pixels actually summed.
    const long count = (long)(colEnd - colBegin) * (rowEnd - rowBegin);
    Vec3b outColor;
    outColor[0] = (uchar)(blue / count);
    outColor[1] = (uchar)(green / count);
    outColor[2] = (uchar)(red / count);

    for (int r = rowBegin; r < rowEnd; r++){
        for (int c = colBegin; c < colEnd; c++)
            out.at<Vec3b>(r, c) = outColor;
    }

    return (uchar)((outColor[0] + outColor[1] + outColor[2]) / 3);
}
//Thread entry point: averages every xRad x yRad window of the row band
//[yStart, yEnd) across columns [0, xEnd).
//
//param  - pointer to a readThreadParams describing the band
//return - always NULL; the results are written into the out image
void* parallel_processing_task(void * param){
    struct readThreadParams *input = (struct readThreadParams*)param;

    // Copying a Mat copies only its header; the pixel data stays shared.
    Mat img = input->img;
    Mat out = input->out;
    const int yStart = input->yStart;
    const int xEnd = input->xEnd;
    const int yEnd = input->yEnd;
    const int xRad = input->xRad;
    const int yRad = input->yRad;

    // A non-positive step would keep the loops below from ever terminating.
    if (xRad <= 0 || yRad <= 0)
        return NULL;

    // Advance by whole windows; `c + xRad` alone computes a value and discards it.
    for (int c = 0; c < xEnd; c += xRad){
        for (int r = yStart; r < yEnd; r += yRad){
            getAverage(img, out, c, r, xRad, yRad);
        }
    }
    return NULL;
}
// Loads image.jpg, starts worker threads over bands of rows, joins them,
// prints the elapsed time and writes the result to output.png.
int main(int argc, char *argv[]){
//prepare variables
pthread_t threads[NUM_THREADS];
void* return_status;
struct readThreadParams input;
// NOTE(review): t has no initializer and is first used as an array index in the pthread_create call below.
int t;
// NOTE(review): the result of imread is used without checking that an image was loaded.
Mat img = imread("image.jpg", IMREAD_COLOR);
int ROWS = img.rows;
int COLS = img.cols;
Mat out(ROWS, COLS, CV_8UC3);
input.img = img;
input.out = out;
input.xEnd = COLS;
input.xRad = WIDTH;
input.yRad = HEIGHT;
double t2 = (double) getTickCount();
// NOTE(review): the third clause of this for statement only evaluates ceil(...); it never changes r.
// NOTE(review): ROWS/NUM_THREADS is an integer division, so it is already truncated before ceil sees it.
for (int r = 0; r<ROWS ; ceil(ROWS/NUM_THREADS)){
input.yStart = r;
input.yEnd = r + ceil(ROWS/NUM_THREADS);
// NOTE(review): every thread receives the address of the same input object, which this loop keeps modifying.
pthread_create(&threads[t], NULL, parallel_processing_task, (void *)&input);
}
// Wait for NUM_THREADS thread handles.
for(t=0; t<NUM_THREADS; t++){
pthread_join(threads[t], &return_status);
}
t2 = ((double) getTickCount() - t2) / getTickFrequency();
//print execution time
cout << "Execution time: " << t2 << " s" << endl;
//result image
imwrite("output.png", out);
return(0);
}
I used GDB to find the culprit and managed to get as far as finding out it's on line 107:
pthread_create(&threads[t], NULL, parallel_processing_task, (void *)&input);
At this point, I tried going all over the place to find solutions, I tried the following:
Changing the way I defined the struct, making it receive pointers, which I later found out didn't work.
Changing the way I pass arguments (such as adding or removing
(void *) where it seemed proper), which ended up in a bigger mess of
errors or simply the same error at the end.
Furthermore, being new to this language doesn't really help me out when trying to read the gdb bt output:
#0 __pthread_create_2_1 (newthread=<optimized out>, attr=<optimized out>, start_routine=<optimized out>, arg=<optimized out>) at pthread_create.c:601
#1 0x00011a00 in main(argc=1, argv=0x7efff394) at file.cpp:107
A part of me wants to think the problem is related to the optimized out parts, but looking it up yields no results, or at least, I may not be looking properly.
Any thoughts as to what I may be doing wrong here? I would very much appreciate the help!
You have not initialised t prior to using it in
pthread_create(&threads[t], NULL, parallel_processing_task, (void *)&input);
So this is likely to lead to undefined behaviour, as t may hold any value, which could make &threads[t] refer to invalid memory.

Cuda - Device values 0 after kernel execution

For some reason, when I execute my program the device variables have zero values. Just before I execute the CUDA kernel the device variables have the correct values. The output image is just black, at the original image size. All the memory allocations and the copying to and from the host seem to be correct.
Thanks for any help!
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
// Swirl kernel: each thread computes one destination pixel (row, col) by
// rotating its position around the image centre and copying the pixel found
// at the rotated position.
//   width, height  - image size in pixels
//   stride         - row pitch; it is divided by 4 when used as a float index
//   pRawBitmapOrig - buffer this kernel writes
//   pBitmapCopy    - buffer this kernel reads
//   factor         - scales the radius term of the rotation angle
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
    // Pixel handled by this thread: row from the y indices, column from x.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the image have nothing to do.
    if (row >= height || col >= width) return;

    // Position relative to the image centre, with Y pointing up.
    const double cX = (double)width/2.0f;
    const double cY = (double)height/2.0f;
    const double relY = cY-row;
    const double relX = col-cX;

    // Angle of the point around the centre, in radians within [0, 2*pi).
    double originalAngle;
    if (relX == 0)
    {
        // On the vertical axis the arctangent is undefined: straight up or straight down.
        originalAngle = (relY >= 0) ? 0.5f * C_PI : 1.5f * C_PI;
    }
    else
    {
        // Acute angle first, then move it into the right quadrant.
        const double acuteAngle = atan(abs(relY)/abs(relX));
        if (relX > 0)
            originalAngle = (relY < 0) ? 2.0f*C_PI - acuteAngle : acuteAngle;
        else
            originalAngle = (relY >= 0) ? C_PI-acuteAngle : acuteAngle + C_PI;
    }

    // Distance from the centre.
    const double radius = sqrt(relX*relX + relY*relY);

    // Rotation amount as a function of the radius.
    const double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));

    // Back from centre-relative coordinates to bitmap coordinates.
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX = (int)(srcX + cX);
    srcY = (int)(srcY + cY);
    srcY = height - srcY;

    // Keep the source position inside the image.
    if (srcX >= width) srcX = width-1;
    else if (srcX < 0) srcX = 0;
    if (srcY >= height) srcY = height-1;
    else if (srcY < 0) srcY = 0;

    // Each thread writes exactly one pixel of its own.
    pRawBitmapOrig[row*stride/4 + col] = pBitmapCopy[srcY*stride/4 + srcX];
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: prints a banner, looks at the "input" and "reference"
// command-line flags, runs the test and resets the device.
int main(int argc, char **argv)
{
    const char **args = (const char **) argv;
    const bool hasArgs = argc > 1;

    printf("%s starting...\n", sampleName);

    // "input": the helper is handed the address of imageFilename.
    // "reference" without "input": rejected with a message and a failure exit.
    if (hasArgs && checkCmdLineFlag(argc, args, "input"))
    {
        getCmdLineArgumentString(argc, args, "input", (char **) &imageFilename);
    }
    else if (hasArgs && checkCmdLineFlag(argc, args, "reference"))
    {
        printf("-reference flag should be used with -input flag");
        exit(EXIT_FAILURE);
    }

    runTest(argc, argv);
    cudaDeviceReset();
    printf("%s completed", sampleName);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
// Loads the PGM named by imageFilename, copies it and a small kernel array to
// the device, launches SwirlCu, copies the result buffer back and saves it to
// data/serial_sharptest.pgm.
void runTest(int argc, char **argv)
{
// NOTE(review): devID is assigned here and not used again in this function.
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
{
printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE);
}
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
// NOTE(review): the three cudaMalloc results are not checked, unlike the cudaMemcpy calls below that go through checkCudaErrors.
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
// NOTE(review): GenerateKernel is not defined anywhere in the visible source.
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
// NOTE(review): dimBlock and dimGrid are computed here, but the launch below uses <<<512, 512>>> instead.
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
//Do the Convolution
// NOTE(review): the %.8f conversions in the printf calls of this function are given float* pointers, not float values.
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
// NOTE(review): SwirlCu writes its fourth argument and reads its fifth, so this call writes device_array_Image and reads device_array_Result, which nothing has filled.
// The stride argument is width*4; the kernel divides it by 4 again when indexing.
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
checkCudaErrors(cudaDeviceSynchronize());
// NOTE(review): the buffer copied back is device_array_Result, and the status of this copy is not checked.
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
// NOTE(review): the first strcpy is overwritten entirely by the second one.
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
// Release device and host buffers.
cudaFree(device_array_Result);
cudaFree(device_array_Image);
cudaFree(device_array_kernel);
free(hData);
free(imagePath);
//free(host_array_Image);
free(host_array_kernel);
free(hOutputDataSharp);
//free(hOutputImage);
//free(hOutputKernel);
}
Your code is writing in the source image:
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
which writes to device_array_Image which is the source, not the destination you are expecting results in.
Moreover, I am very curious on the output of printf("DResult : '%.8f'\n",device_array_Result); as device_array_Result is in GPU address space and allocated with cudaMalloc. On which device are you running ?

CBLAS segmentation fault with large array

this is my third post and attempt to solve this problem, which first
showed up using numpy.dot(A, A.T) where A is large, 150,000 x 265 elements.
With numpy, I got back an array with many missing values, that were just zeros.
I've tried to call BLAS thru CBLAS. I'm getting a segmentation fault error
with large arrays.
I'm running this on a machine with about 250 GB free memory.
Thanks for reading...
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
#include <cblas.h> /* C BLAS BLAS */
#include "blaio.h"
/*
 * Fills a and b with ones, multiplies them with cblas_sgemm into a
 * row x row result c, and reports every element of c that differs from col.
 */
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
/* NOTE(review): none of the three malloc results is checked before use. */
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
/* NOTE(review): row * row is evaluated in int before sizeof widens the product; 100000 * 100000 = 1e10 does not fit a 32-bit int. */
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
/* NOTE(review): this loop and the checking loop below use the int product row*row as their bound, with an int index. */
for(i=0; i<(row*row); i++)
c[i] = 2.0;
// row_order transform transform rowsA colsB K alpha a lda b ldb beta c ldc
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, row, row, col, 1.0f, a, col, b, row, 0.0f, c, row);
/* With all-ones inputs every element of the product is expected to equal col. */
int num_bad = 0;
for(i=0; i<(row*row); i++)
{
if (c[i] != col)
{
printf("Bad value found: %f, at index: %i\n", c[i], i );
num_bad += 1;
}
}
printf("Number of bad values found: %i \n\n", num_bad);
//printMatrix(CblasRowMajor, row, row, c, 8, 3, NULL, NULL, NULL, NULL, NULL, "c = ");
/* NOTE(review): a, b and c are not freed before returning. */
return 0;
} /* end func main */
UPDATE:
Ray has expertly noticed that the blas I'm using via cblas, must be 32 bit and not able to access the array indices. Therefore, I've installed blas64.x86_64 and blas64-devel.x86_64.
Then, rewrote a few lines of the code above to use the direct call to sgemm without cblas.
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
float alpha = 1.0, beta = 1.0;
sgemm_('N','N', &row, &row, &col, &alpha, &a[0], &col, &b[0], &row, &beta, &c[0], &row);
I compiled with:
gcc sgemm_test_fortran.c -o test -L /usr/lib64 -lblas64
The code compiled and I think it might run.. :)
The problem is that the size of your output matrix (100,000x100,000 = 1e10 elements) can't be stored in an int (2.14e9). You can fix this in your C++ code by switching the types to size_t, but you're going to run into the same problem inside the BLAS library.
What you need to do is use a BLAS library that is compiled to use 8-byte integers; most BLAS libraries are compiled with 4-byte integers. You don't mention what BLAS library you're linking to, so it's hard to guess what the correct library name is (if it even exists) on your system.

C++/LabVIEW interop: error extracting data from LabVIEW array/ unexpected type conversion in function parameter

I'm using Cluebat-man's LabVIEW-C++ array interoperability class, and I'm getting an error extracting the data from the array. Or, rather, the data extraction appears to succeed, but the build fails when I try to use the data a line later.
(Context: The program is designed to implement Manjunath et al's peer-group filtering; the function is designed to extract the hue plane of an image. I'm fairly certain it's not a problem with the specific function, aside from perhaps its declaration of parameters, because the same problem crops up later in the program when I try to use the results from getHuePlane())
#ifndef IO_TYPE //Normal arrays or LabVIEW?
#define I_TYPE /* int* */ CLvArrayHandlePtr<unsigned __int32, 2>
#define O_TYPE /* int* */ CLvArrayHandlePtr<unsigned __int8, 2>
#define IO_TYPE
#endif
#ifndef USING_LABVIEW_DEFINED
#define USING_LABVIEW //remove if not
#define USING_LABVIEW_DEFINED
#endif
Extraction and function call:
#include "LvArrayIndexer.h"
#include "LvArrayTemplate.h"
// Peer-group filter entry point (only the beginning of the body is shown).
// The visible part resizes the input, copies it into a local int array and
// asks getHuePlane for the hue plane.
O_TYPE pgf(I_TYPE HSLimage, int width, int height, int halfWindowSize, int noiseThreshold) {
#ifdef USING_LABVIEW
size_t size[2] = {width, height};
HSLimage.Resize(size);
CLvArrayIndexer<unsigned __int32, 2 > baseImgIndexer(HSLimage);
CLvArrayHandlePtr<unsigned __int8, 2 > hueImage;
hueImage.Resize(size);
CLvArrayIndexer<unsigned __int8, 2 > hueImgIndexer(hueImage);
// NOTE(review): this array and the ones declared below are sized by run-time values (variable-length arrays).
int LvImageData[width][height];
#else
int hueImage[width][height];
#endif
int hueImageData[width][height];
int windowSize = 2 * halfWindowSize - 1;
int windowLength = windowSize * windowSize;
int window[windowSize][windowSize];
int flattenedWindow[windowLength];
vector<int> peerGroup;
int currentValue;
#ifdef USING_LABVIEW
// Copy the LabVIEW array element by element into the local int array.
for (int x = 0; x < width; x++)
for (int y = 0; y < height; y++)
LvImageData[x][y] = baseImgIndexer[x][y];
// NOTE(review): LvImageData is a two-dimensional int array, while getHuePlane declares its first parameter as int*.
// NOTE(review): hueImageData is itself an array, and it is assigned the int* returned by getHuePlane.
hueImageData = getHuePlane(LvImageData, width, height);
#else
hueImageData = getHuePlane(HSLimage, width, height);
#endif
//Function continues
}
Function definition:
// Extracts the hue value of every pixel of a width x height image into a
// local array and returns that array.
// NOTE(review): HSLimage is declared as int*, yet the loop body subscripts it twice (HSLimage[x][y]).
int* getHuePlane(int* HSLimage, int width, int height) {
// NOTE(review): hueImage is a local array sized by run-time values; it is what the function returns.
int hueImage[width][height];
double calcValue;
/*Get hue plane
*AL HU SA LU ->AL HU.SA LU -> AL HUF
*AL HU -> AL.HU -> 0.HU -> HU
*/
for (int x = 0; x < width; x++) {
for (int y = 0; y < height; y++) {
calcValue = int(double(HSLimage[x][y]) / 65536); //A-H-S-L; removes S-L
// Keep only the fractional part of calcValue/256, then scale it back by 256.
calcValue = (calcValue / 256) - int(calcValue / 256);
calcValue = calcValue * 256;
hueImage[x][y] = int(calcValue);
}
}
// NOTE(review): this returns a two-dimensional local array from a function declared to return int*.
return hueImage;
}
The error is:
pgf.cpp:88:58: error: cannot convert 'int (*)[(((unsigned int)(((int)height) + -0x000000001)) + 1)]' to 'int*' for argument '1' to 'int* getHuePlane(int*, int, int)'
System information:
IDE:Netbeans 7.1
Compiler: MinGW (gcc v4.6.2)
Make: GNU make 3.79.1
System: Windows 7 version 6.1
I'm guessing the error is on this line:
hueImageData = getHuePlane(LvImageData, width, height);
The reason is a type mismatch: LvImageData is defined as int [][] while getHuePlane expects int *.
Also, you should also get an error on this line (in getHuePlane):
calcValue = int(double(HSLimage[x][y]) / 65536); //A-H-S-L; removes S-L
This is because HSLimage in the function is an int * while you try to access it as int [][] (or int *[] as the argument type should really be).