mexcuda having breakpoint at delete[]() in .cu-file - c++

I am having some trouble finding the error I made with my memory allocation. I am currently using Visual Studio 2013, Matlab 2015b and CUDA 7.0 on a GeForce GT 630 and I am quite a newbie to GPU-programming, CUDA and mex.
When I call my code from Matlab with mexcuda it goes fine until I add the small part with colIndexStepSize to the .cu-file. The program runs normally till delete. After informing me about having reached a breakpoint here, Matlab crashes.
When I remove the code lines in question, everything runs smoothly again.
I am quite sure that there is something wrong with my memory handling but I simpy cannot find the bug. Here is the code that is making trouble:
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusparse.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <vector>
// Test-Makro : (Funktionieren die Zugriffe auf die GPU?)
#define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
if (code != cudaSuccess){
fprintf(stderr, "GPUassert: %s%s%d\n", cudaGetErrorString(code), file, line);
}
}
__global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){
unsigned int index = threadIdx.x + blockIdx.x*blockDim.x;
if (index < *length){
first[index] = (*dxmax)*ergArray[index];
last[index] = (*dxmax)*ergArray[index + 1] - 1;
}
}
void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){
unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
unsigned int n = (unsigned int)systemMatrix_coo->columnNumber;
unsigned int mNeu = detZellen;
float *measuredValues = 0; measuredValues = new float[mNeu]();
float *volumeN = 0; volumeN =new float[n]();
float *volumeAlt = 0; volumeAlt = new float[n]();
float *initValuesM = 0; initValuesM = new float[mNeu]();
float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();
unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
float *valIndex = 0; valIndex = new float[nnz]();
unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();
for (unsigned int i = 0; i < n; i++){
colIndexStepSize[i] = nnz;
}
unsigned int length = matdVoxelGrid->size();
int *ergArray = 0; ergArray = new int[length+1]();
int *first = 0; first = new int[length]();
int *last = 0; last = new int[length]();
int *cooHostColRot = 0; cooHostColRot = new int[nnz]();
int *d_cooColPtr;
int *d_cooRowPtr;
unsigned int *d_nnz;
int *d_colIndexPtr;
float *d_valIndexPtr;
unsigned int *d_colIndexStepSizePtr;
float *d_cooValuesPtr;
float *d_measuredValues;
float *d_volume_alt;
float *d_volume_neu;
int *d_ergArray;
float *d_dxmax;
unsigned int *d_length;
unsigned int *d_size;
int *d_first;
int *d_last;
int *d_cooColRotPtr;
unsigned int *d_count;
gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));
for (unsigned int i = 0; i < nnz; i++){
cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
}
gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));
// (Initialwerte sind immer gleich)
gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
unsigned int threads_nnz = threads_max_nnz;
unsigned int thread_length = length;
unsigned int block_length = 1;
unsigned int index = 0;
for (unsigned int s = 0; s < length; s++){
for (unsigned int t = 0; t <= s; t++){
index = s + 1;
ergArray[index] += (int)matdVoxelGrid->at(t);
}
}
gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));
startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);
gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)first[j]);
}
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)last[j]);
}
unsigned int size = 0;
for (unsigned int iter = 0; iter < iterationen; iter++){
for (unsigned int proj = 1; proj <= projektionen; proj++){
unsigned int begin1 = (proj - 1)*mNeu;
unsigned int end1 = proj*mNeu;
for (unsigned int j = begin1; j < end1; j++){
measuredValues[j] = detektor->at(j);
}
gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));
for (unsigned int u = 0; u < length; u++){
size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));
if (proj > 1){
for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){
if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
}
}
}
else{
for (unsigned int i = 0; i < nnz; i++) {
cooHostColRot[i] = cooColHostPtr[i];
}
}
}
// --------- troubling code starts HERE ----------------
unsigned int wert = 0, index = 0;
for (unsigned int i = 0; i < nnz; i++){
index = cooHostColRot[i];
wert = colIndexStepSize[index];
if (wert >= i){
colIndexStepSize[index] = i;
}
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, colIndexStepSize[j]);
}
gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));
// --------- troubling code ends HERE ----------------
gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));
}
}
cudaFree(d_cooRowPtr);
cudaFree(d_cooColPtr);
cudaFree(d_cooValuesPtr);
cudaFree(d_measuredValues);
cudaFree(d_volume_alt);
cudaFree(d_volume_neu);
cudaFree(d_colCount);
cudaFree(d_rowCount);
cudaFree(d_ergSumCol);
cudaFree(d_ergSumRow);
cudaFree(d_ergMult);
cudaFree(d_nnz);
cudaFree(d_faktor);
cudaFree(d_colIndexPtr);
cudaFree(d_valIndexPtr);
cudaFree(d_ergSumNNZforCol);
cudaFree(d_colIndexStepSizePtr);
cudaFree(d_deltaB);
cudaFree(d_ergArray);
cudaFree(d_dxmax);
cudaFree(d_length);
cudaFree(d_size);
cudaFree(d_first);
cudaFree(d_last);
cudaFree(d_cooColRotPtr);
cudaFree(d_count);
delete[](ergArray); ergArray = NULL;
delete[](measuredValues); measuredValues = NULL;
delete[](cooColHostPtr); cooColHostPtr = NULL;
delete[](cooRowHostPtr); cooRowHostPtr = NULL;
delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
delete[](volumeN); volumeN = NULL;
delete[](ergArray); ergArray = NULL;
delete[](initValuesM); initValuesM = NULL;
delete[](colIndex); colIndex = NULL;
delete[](valIndex); valIndex = NULL;
delete[](volumeAlt); volumeAlt = NULL;
delete[](volumeNInitZero); volumeNInitZero = NULL;
delete[](initValuesMInitZero); initValuesMInitZero = NULL;
delete[](colIndexStepSize); colIndexStepSize = NULL;
delete[](deltaBArray); deltaBArray = NULL;
delete[](first); first = NULL;
delete[](last); last = NULL;
delete[](cooHostColRot); cooHostColRot = NULL;
deltaB->~vector();
deltaB = NULL;
}
If somebody sees any mistake I made, please tell me, I am open to any advice.
Thanks in advance!
Best regards
EDIT:
#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
}
measuredValues is only mNeu elements long but I did access some elements way behind this point.
So, thanks a lot for the help !

#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
}
I just had to adjust the boundaries of the for loop and vector accessing to fit the bounds of the array.
Thanks a lot once again!

Related

Gaussian filter reads same value multiple time usin BMP image

I need to translate GaussianFilter that uses openCV to code that uses BMP image ( so i first read image, and translate it to greyscale). My function using openCV looks like ( basic GaussianFilter ) :
Mat CreateGaussFilter(int kernalHeight, int kernalWidth, double kernalArray[5][5]){
Mat image = imread("konik.jpg");
Mat grayScaleImage(image.size(),CV_8UC1);
Mat filter(image.size(),CV_8UC1);
cvtColor(image,grayScaleImage,CV_RGB2GRAY);
int rows=image.rows;
int cols=image.cols;
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
for(int row=0+verticleImageBound;row<rows-verticleImageBound;row++){
for(int col=0+horizontalImageBound;col<cols-horizontalImageBound;col++){
float value=0.0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel=grayScaleImage.at<uchar>(kRow+row-verticleImageBound,kCol+col-horizontalImageBound)*kernalArray[kRow][kCol];
value+=pixel;
}
}
filter.at<uchar>(row,col)=cvRound(value);
}
}
return filter;
}
Now for BMP image:
i have loaded it using:
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
Info readBMP(char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
std::copy(info,
info + offset,
a);
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
}
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
}
return dat;
}
turned it into grayscale usin:
void greyScale( unsigned char * src , int rows, int cols){
for( int i = 0; i < rows; i++){
for( int j = 0; j < cols; j++){
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
char linearIntensity = (char)(0.2126f * r + 0.7512f * g + 0);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
}
}
}
And now i am trying to use GaussianFilter ( translated from my OpenCV function )
void FilterCreation(double GKernel[][5]) {
// intialising standard deviation to 1.0
double sigma = 1.0;
double r, s = 2.0 * sigma * sigma;
// sum is for normalization
double sum = 0.0;
// generating 5x5 kernel
for (int x = -2; x <= 2; x++) {
for (int y = -2; y <= 2; y++) {
r = sqrt(x * x + y * y);
GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
sum += GKernel[x + 2][y + 2];
}
}
// normalising the Kernel
for (int i = 0; i < 5; ++i)
for (int j = 0; j < 5; ++j)
GKernel[i][j] /= sum;
}
unsigned char ** CreateGaussFilter(unsigned char ** src,int kernalHeight, int kernalWidth, double kernalArray[5][5], int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
unsigned char ** dst = new unsigned char *[rows];
for( int i = 0; i < rows; i++){
dst[i] = new unsigned char [cols];
}
for(int row=0+verticleImageBound;row<rows-verticleImageBound;row++){
for(int col=0+horizontalImageBound;col<cols-horizontalImageBound;col++){
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel =src[kRow+row-verticleImageBound][kCol+col-horizontalImageBound]*kernalArray[kRow][kCol];
value+=pixel;
}
}
dst[row][col] = round(value);
}
}
return dst;
}
Since grayscale values are same for every channel, istead of doing calculation like in grayscale function, i turned the data into 2d array and then back into 1d array using:
unsigned char ** return2darray(unsigned char *src, int width, int height, int size){
unsigned char **array = new unsigned char *[width];
for( int i = 0; i < width; i++ ){
array[i] = new unsigned char[height];
}
for( int i = 0; i < width; i++ ){
for( int j = 0; j < height; j++ ){
array[i][j] = src[3 * (i * height + j)];
}
}
return array;
}
unsigned char * return1darray(unsigned char **src, int width, int height, int size){
unsigned char *array = new unsigned char[size];
for( int i = 0; i < width; i++ ){
for( int j = 0; j < height; j++ ){
array[3 * (i * height + j)] = src[i][j];
array[3 * (i * height + j) + 1] = src[i][j];
array[3 * (i * height + j) + 2] = src[i][j];
}
}
return array;
}
And using it like:
int main() {
// load img
Info dat = readBMP("input.bmp");
// turn in into greyscale
greyScale(dat.data,dat.width,dat.height);
// turn 1d array into 2d
unsigned char** arr = return2darray(dat.data,dat.width,dat.height,dat.size);
double GKernel[5][5];
// geneate gausian filter
FilterCreation(GKernel);
// apply gausianFilter
unsigned char** filter = CreateGaussFilter(arr,5,5,GKernel,dat.width,dat.height,dat.size);
// convert it back into 1d array
unsigned char* ar = return1darray(filter,dat.width,dat.height,dat.size);
ofstream fout;
fout.open("out.bmp", ios::binary | ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(ar), dat.size - dat.offset );
fout.close();
return 0;
}
But for some reason, that I cannot realize for input :
the output looks like this.
It seems like it reads the same values in periodes, but that would mean the original image would have the same periods because it just reads bytes from loaded image. The GreyScale function works as it should. I am not very proficient in manipulation with images ( i was using openCV all the time ) What could cause these periods? Thanks for the help!

Generate gradient with CImg and dynamic array

I'm trying to generate png with different resolution. But if i use dynamic array its generate only gray area. This is source of my code (C++ 16 bit grayscale gradient image from 2D array)
void generate_horizontal_gradient(char fileName[], int width, int height, int offset, bool direction)
{
unsigned short** buffer = new unsigned short* [height];
for (int i = 0; i < height; i++)
{
buffer[i] = new unsigned short[width];
}
for (int i = 0; i < height; i++)
{
unsigned short temp_data = 65535;
if (direction == true) {
for (int j = width; j > 0; j--)
{
buffer[i][j] = temp_data;
if (j < width - offset)
{
temp_data -= 65535 / (width - offset);
}
}
}
else
{
for (int j = 0; j < width; j++)
{
buffer[i][j] = temp_data;
if (j > offset)
{
temp_data -= 65535 / (width - offset);
}
}
}
}
auto hold_arr = (unsigned short*) &buffer[0][0];
cimg_library::CImg<unsigned short> img(hold_arr, width, height);
img.save_png(fileName);
}
Apparently I don’t understand something yet in two-dimensional arrays. Solved the problem through a one-dimensional array:
void generate_horizontal_gradient(char fileName[], int width, int height, int offset, bool direction)
{
unsigned short* buffer = new unsigned short[height * width];
//Add values to array.
for (int i = 0; i < height; i++)
{
unsigned short temp_data = 65535;
if (direction == true) {
for (int j = width; j > 0; j--)
{
buffer[i* width +j] = temp_data;
if (j < width - offset) temp_data -= 65535 / (width - offset);
}
}
else
{
for (int j = 0; j < width; j++)
{
buffer[i * width + j] = temp_data;
if (j > offset) temp_data -= 65535 / (width - offset);
}
}
}
unsigned short* hold_arr = (unsigned short*)& buffer[0*0];
cimg_library::CImg<unsigned short> img(buffer, width, height);
img.save_png(fileName);
}

Corrupted memory issue when deleting allocated memory

I am trying to store a sparse vector using a bit mask. I allocate a char* to represent the bit mask. However, when I delete [] the mask, I get a memory corruption error. Upon investigation, I'm seeing that it's because I'm freeing memory that I'm not supposed to. This is confusing, since I don't see how this could be the case.
When I run this on my case, it prints out "ALLOCATED" and "DEALLOCATING" but nothing further.
void set_i_bit(char* mask, int i) {
int field_num = floor(i/8);
int bit_num = i %8;
mask[field_num] = (1 << bit_num) | mask[field_num];
}
int write_sparse_with_bitmask(vector<float> arr, ofstream* fout) {
int mx_sz = arr.size() - 1;
float tol = 0.5;
char* mask = 0;
for(int i = arr.size() -1; i>=0; i-=1) {
if (fabs(arr[i]) > tol) break;
mx_sz = i;
}
int sprse_cnt = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) < tol) sprse_cnt++;
}
int bitmask_sz = ceil(mx_sz/8);
if (sprse_cnt*sizeof(int16_t) + sizeof(int16_t) > bitmask_sz) {
cout<<"ALLOCATED"<<endl;
mask = new char[bitmask_sz];
for (int i =0; i<bitmask_sz; i++) mask[i] = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > coef_tol) {
set_i_bit(mask, i);
}
}
}
else {
bitmask_sz = 0;
}
uint16_t sz = mx_sz + 1;
uint16_t bt_msk = bitmask_sz + 1;
char flag = 0;
if (bitmask_sz > 0) {
flag = flag | 1;
}
fout->write((char*)&sz, sizeof(uint16_t));
fout->write((char*)&flag, sizeof(char));
int w_size = sizeof(uint16_t) + sizeof(char);
if (flag & 1) {
fout->write((char*)&bt_msk, sizeof(uint16_t));
fout->write(mask, sizeof(char)*bt_msk);
cout<<"DEALLOCATING"<<endl;
delete [] mask;
cout<<"THIS DOESN'T PRINT"<<endl;
w_size += sizeof(uint16_t) + sizeof(char)*bt_msk;
}
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > tol || !(flag & 1)) {
int16_t vl = arr[i];
fout->write((char*) &vl, sizeof(int16_t));
w_size += sizeof(int16_t);
}
}
return w_size;
}

SSE addition and conversion

Here's the thing, how can I add two unsigned char arrays and store the result in an unsigned short array by using SSE. Can anyone give me some help or hint. This is what I have done so far. I just don't know where the error is..need some help
#include<iostream>
#include<intrin.h>
#include<windows.h>
#include<emmintrin.h>
#include<iterator>
using namespace std;
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
{
unsigned char *op3 = new unsigned char[N];
unsigned char *op4 = new unsigned char[N];
__m128i *sse_op3 = (__m128i*)op3;
__m128i *sse_op4 = (__m128i*)op4;
__m128i *sse_result = (__m128i*)output;
for (int i = 0; i < N; i = i + 16)
{
__m128i src = _mm_loadu_si128((__m128i*)input1);
__m128i zero = _mm_setzero_si128();
__m128i higher = _mm_unpackhi_epi8(src, zero);
__m128i lower = _mm_unpacklo_epi8(src, zero);
_mm_storeu_si128(sse_op3, lower);
sse_op3 = sse_op3 + 1;
_mm_storeu_si128(sse_op3, higher);
sse_op3 = sse_op3 + 1;
input1 = input1 + 16;
}
for (int j = 0; j < N; j = j + 16)
{
__m128i src1 = _mm_loadu_si128((__m128i*)input2);
__m128i zero1 = _mm_setzero_si128();
__m128i higher1 = _mm_unpackhi_epi8(src1, zero1);
__m128i lower1 = _mm_unpacklo_epi8(src1, zero1);
_mm_storeu_si128(sse_op4, lower1);
sse_op4 = sse_op4 + 1;
_mm_storeu_si128(sse_op4, higher1);
sse_op4 = sse_op4 + 1;
input2 = input2 + 16;
}
__m128i *sse_op3_new = (__m128i*)op3;
__m128i *sse_op4_new = (__m128i*)op4;
for (int y = 0; y < N; y = y + 8)
{
*sse_result = _mm_adds_epi16(*sse_op3_new, *sse_op4_new);
sse_result = sse_result + 1;
sse_op3_new = sse_op3_new + 1;
sse_op4_new = sse_op4_new + 1;
}
}
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
{
for (int i = 0; i < N; i++)
output[i] = (unsigned short)input1[i] + (unsigned short)input2[i];
}
int main()
{
int n = 1023;
unsigned char *p0 = new unsigned char[n];
unsigned char *p1 = new unsigned char[n];
unsigned short *p21 = new unsigned short[n];
unsigned short *p22 = new unsigned short[n];
for (int j = 0; j < n; j++)
{
p21[j] = rand() % 256;
p22[j] = rand() % 256;
}
C_add(p0, p1, p22, n);
cout << "C_add finished!" << endl;
sse_add(p0, p1, p21, n);
cout << "sse_add finished!" << endl;
for (int j = 0; j < n; j++)
{
if (p21[j] != p22[j])
{
cout << "diff!!!!!#######" << endl;
}
}
//system("pause");
delete[] p0;
delete[] p1;
delete[] p21;
delete[] p22;
return 0;
}
Assuming everything is aligned to _Alignof(__m128i) and the size of the array is a multiple of sizeof(__m128i), something like this should work:
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) {
__m128i* r = (__m128i*) res;
__m128i* ap = (__m128i*) a;
__m128i* bp = (__m128i*) b;
for (size_t i = 0 ; i < (size / sizeof(__m128i)) ; i++) {
r[(i * 2)] = _mm_add_epi16(_mm_cvtepu8_epi16(ap[i]), _mm_cvtepu8_epi16(bp[i]));
r[(i * 2) + 1] = _mm_add_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(ap[i], 8)), _mm_cvtepu8_epi16(_mm_srli_si128(bp[i], 8)));
}
}
FWIW, NEON would be a bit simpler (using vaddl_u8 and vaddl_high_u8).
If you're dealing with unaligned data you can use _mm_loadu_si128/_mm_storeu_si128. If size isn't a multiple of 16 you'll just have to do the remainder without SSE.
Note that this may be something your compiler can do automatically (I haven't checked). You may want to try something like this:
#pragma omp simd
for (size_t i = 0 ; i < size ; i++) {
res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]);
}
That uses OpenMP 4, but there is also Cilk++ (#pragma simd), clang (#pragma clang loop vectorize(enable)), gcc (#pragma GCC ivdep), or you could just hope the compiler is smart enough without the pragma hint.

can't enter into __global__ function using cuda

I have written a code on Nsight that compiles and can be executed but the first launch can't be completed.
The strange thing is that when I run it in debug mode, it works perfectly but it is too slow.
Here is the part of the code before entering the function that access the GPU (where i think there is an error I can't find) :
void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
size_t inputBytes = range*128*sizeof(unsigned char);
size_t baseBytes = cardBase*128*sizeof(unsigned char);
size_t outputBytes = range*sizeof(int);
unsigned char * data_d;
unsigned char * descBase_d;
int * cardBase_d;
int * dataReturned_d;
cudaMalloc((void **) &data_d, inputBytes);
cudaMalloc((void **) &descBase_d, baseBytes);
cudaMalloc((void **) &cardBase_d, sizeof(int));
cudaMalloc((void **) &dataReturned_d, outputBytes);
int blockSize = 196;
int nBlocks = range/blockSize + (range%blockSize == 0?0:1);
cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice);
cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice);
FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);
cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost);
cudaFree(data_d);
cudaFree(descBase_d);
cudaFree(cardBase_d);
cudaFree(dataReturned_d);
}
And the function entering the GPU (I don't think the error is here) :
__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned char descriptor1[128], descriptor2[128];
int part = 0;
int result = 0;
int winner = 0;
int minDistance = 0;
int itelimit = *cardBase;
for (int k = 0; k < 128; k++)
{
descriptor1[k] = data[idx*128+k];
}
// initialize minDistance
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[k];
}
for (int k = 0; k < 128; k++)
{
part = (descriptor1[k]-descriptor2[k]);
part *= part;
minDistance += part;
}
// test all descriptors in the base :
for (int i = 1; i < itelimit; i++)
{
result = 0;
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[i*128+k];
// Calculate squared l2 distance :
part = (descriptor1[k]-descriptor2[k]);
part *= part;
result += part;
}
// Compare to minDistance
if (result < minDistance)
{
minDistance = result;
winner = i;
}
}
// Write the result in dataReturned
dataReturned[idx] = winner;
}
Thank you in advance if you can help me.
EDIT : the last cudaMemcpy returns the error "the launch timed out and was terminated".
linux has a watchdog mechanism. If your kernel runs for a long time (you say it is slow in debug mode) you can hit the linux watchdog, and receive the "launch timed out and was terminated" error.
In this case you have several things you might try. The options are covered here.