can't enter into __global__ function using cuda - c++

I have written a code on Nsight that compiles and can be executed but the first launch can't be completed.
The strange thing is that when I run it in debug mode, it works perfectly but it is too slow.
Here is the part of the code before entering the function that access the GPU (where i think there is an error I can't find) :
void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
size_t inputBytes = range*128*sizeof(unsigned char);
size_t baseBytes = cardBase*128*sizeof(unsigned char);
size_t outputBytes = range*sizeof(int);
unsigned char * data_d;
unsigned char * descBase_d;
int * cardBase_d;
int * dataReturned_d;
cudaMalloc((void **) &data_d, inputBytes);
cudaMalloc((void **) &descBase_d, baseBytes);
cudaMalloc((void **) &cardBase_d, sizeof(int));
cudaMalloc((void **) &dataReturned_d, outputBytes);
int blockSize = 196;
int nBlocks = range/blockSize + (range%blockSize == 0?0:1);
cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice);
cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice);
FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);
cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost);
cudaFree(data_d);
cudaFree(descBase_d);
cudaFree(cardBase_d);
cudaFree(dataReturned_d);
}
And the function entering the GPU (I don't think the error is here) :
__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned char descriptor1[128], descriptor2[128];
int part = 0;
int result = 0;
int winner = 0;
int minDistance = 0;
int itelimit = *cardBase;
for (int k = 0; k < 128; k++)
{
descriptor1[k] = data[idx*128+k];
}
// initialize minDistance
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[k];
}
for (int k = 0; k < 128; k++)
{
part = (descriptor1[k]-descriptor2[k]);
part *= part;
minDistance += part;
}
// test all descriptors in the base :
for (int i = 1; i < itelimit; i++)
{
result = 0;
for (int k = 0; k < 128; k++)
{
descriptor2[k] = base[i*128+k];
// Calculate squared l2 distance :
part = (descriptor1[k]-descriptor2[k]);
part *= part;
result += part;
}
// Compare to minDistance
if (result < minDistance)
{
minDistance = result;
winner = i;
}
}
// Write the result in dataReturned
dataReturned[idx] = winner;
}
Thank you in advance if you can help me.
EDIT : the last cudaMemcpy returns the error "the launch timed out and was terminated".

linux has a watchdog mechanism. If your kernel runs for a long time (you say it is slow in debug mode) you can hit the linux watchdog, and receive the "launch timed out and was terminated" error.
In this case you have several things you might try. The options are covered here.

Related

How is numpy so fast?

I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N) and dtype=float32, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy broadcasting, this simply writes as:
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
#define X(i, j) _X[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
Compiling using gcc 7.3.0 with -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
float* pairwise_sub_better( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
float* pairwise_sub_simd( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficials:
float* pairwise_sub_blocks( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some efforts with some optimized C++ code, I can't come anywhere close the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>
#define X(i, j) _x[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
float* pairwise_sub_better( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
float* pairwise_sub_simd( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
float* pairwise_sub_blocks( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
int main()
{
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
{
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
}
printf("Best of 5 = %f ms\n", best / 1000);
free( _x );
return 0;
}
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine:
Implementation
Time
numpy
88 ms
C++ naive
194 ms
C++ better
195 ms
C++ SIMD
178 ms
C++ blocked
258 ms
C++ blocked (gcc 8.3.1)
217 ms
As pointed out by some of the comments numpy uses SIMD in its implementation and it does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation then I get a better time compared to numpy even with the scaler version(that is the one without any optimizations).
Also in terms of SIMD and why your implementation does not perform much better than the scaler is because your memory access patterns are not ideal for SIMD usage - you do memcopy and you load into SIMD registers from locations that are far apart from each other - e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the SIMD prefetcher.
There is also a mistake in how you load the SIMD registers(if I understood correctly what you're trying to compute): a 256 bit SIMD register can load 8 single-precision floating-point numbers 8 * 32 = 256, but in your loop you jump k by "256/sizeof(float)" which is 256/4 = 64; _x and _res are float pointers and the SIMD intrinsics expect also float pointers as arguments so instead of reading all elements from those lines every 8 floats you read them every 64 floats.
The computation can be optimized further by changing the access patterns but also by observing that you repeat some computations: e.g. when iterating with line0 as a base you compute line0 - line1 but at some future time, when iterating with line1 as a base, you need to compute line1 - line0 which is basically -(line0 - line1), that is for each line after line0 a lot of results could be reused from previous computations.
A lot of times SIMD usage or parallelization requires one to change how data is accessed or reasoned about in order to provide meaningful improvements.
Here is what I have done as a first step based on your initial implementation and it is faster than the numpy(don't mind the OpenMP stuff as it's not how its supposed to be done, I just wanted to see how it behaves trying the naive way).
C++
Time scaler version: 55 ms
Time SIMD version: 53 ms
**Time SIMD 2 version: 33 ms**
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms
Python numpy
>> best of 5 = 88.794 ms
#include <cstdlib>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>
using namespace std;
float* pairwise_sub_naive (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++)
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
return output;
}
float* pairwise_sub_simd (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_simd_2 (const float* input, float* output, int n)
{
float* line_buffer = (float*) aligned_alloc(32, n * sizeof(float));
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
}
memcpy(output + outidx * n, line_buffer, n);
}
}
return output;
}
float* pairwise_sub_simd_3 (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = (idxi + j) * n;
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_openmp (const float* input, float* output, int n)
{
int i, j;
#pragma omp parallel for private(j)
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
const int idxi = i * n;
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
/*for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
for (int k = 0; k < n; k++)
{
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
}*/
return output;
}
int main ()
{
constexpr size_t n = 512;
constexpr size_t input_size = n * n;
constexpr size_t output_size = n * n * n;
float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));
iota(input, input + input_size, float(0.0));
fill(output, output + output_size, float(0.0));
iota(input_simd, input_simd + input_size, float(0.0));
fill(output_simd, output_simd + output_size, float(0.0));
iota(input_par, input_par + input_size, float(0.0));
fill(output_par, output_par + output_size, float(0.0));
std::chrono::milliseconds best_scaler{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_naive(input, output, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_scaler)
{
best_scaler = duration;
}
}
cout << "Time scaler version: " << best_scaler.count() << " ms\n";
std::chrono::milliseconds best_simd{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd)
{
best_simd = duration;
}
}
cout << "Time SIMD version: " << best_simd.count() << " ms\n";
std::chrono::milliseconds best_simd_2{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_2(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_2)
{
best_simd_2 = duration;
}
}
cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";
std::chrono::milliseconds best_simd_3{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_3(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_3)
{
best_simd_3 = duration;
}
}
cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";
std::chrono::milliseconds best_par{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_openmp(input_par, output_par, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_par)
{
best_par = duration;
}
}
cout << "Time OpenMP version: " << best_par.count() << " ms\n";
cout << "Verification\n";
if (equal(output, output + output_size, output_simd))
{
cout << "PASSED\n";
}
else
{
cout << "FAILED\n";
}
return 0;
}
Edit: Small correction as there was a wrong call related to the second version of SIMD implementation.
As you can see now, the second implementation is the fastest as it behaves the best from the point of view of the locality of reference of the cache. Examples 2 and 3 of SIMD implementations are there to illustrate for you how changing memory access patterns to influence the performance of your SIMD optimizations.
To summarize(knowing that I'm far from being complete in my advice) be mindful of your memory access patterns and of the loads and stores to\from the SIMD unit; the SIMD is a different hardware unit inside the processor's core so there is a penalty in shuffling data back and forth, hence when you load a register from memory try to do as many operations as possible with that data and do not be too eager to store it back(of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available and if you load too many then they will "spill", that is they will be stored back to temporary locations in main memory behind the scenes killing all your gains. SIMD optimization, it's a true balance act!
There is some effort to put a cross-platform intrinsics wrapper into the standard(I developed myself a closed source one in my glorious past) and even it's far from being complete, it's worth taking a look at(read the accompanying papers if you're truly interested to learn how SIMD works).
https://github.com/VcDevel/std-simd
This is a complement to the answer posted by #celakev .
I think I finally got to understand what exactly was the issue. The issue was not about allocating the memory in the main function that does the computation.
What was actually taking time is to access new (fresh) memory. I believe that the malloc call returns pages of memory which are virtual, i.e. that does not corresponds to actual physical memory -- until it is explicitly accessed. What actually takes time is the process of allocating physical memory on the fly (which I think is OS-level) when it is accessed in the function code.
Here is a proof. Consider the two following trivial functions:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
float* just_alloc( size_t N )
{
return (float*) aligned_alloc( 32, sizeof(float)*N );
}
void just_fill( float* _arr, size_t N )
{
for (size_t i = 0; i < N; i++)
_arr[i] = 1;
}
#define Time( code_to_benchmark, cleanup_code ) \
do { \
double best = 9e9; \
for( int i = 0; i < 5; i++) { \
struct timespec start, stop; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
code_to_benchmark; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
printf("Time[%d] = %f ms\n", i, t); \
if (t < best) best = t; \
cleanup_code; \
} \
printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
} while(0)
int main()
{
const size_t N = 512;
Time( float* arr = just_alloc(N*N*N), free(arr) );
float* arr = just_alloc(N*N*N);
Time( just_fill(arr, N*N*N), ; );
free(arr);
return 0;
}
I get the following timings, which I now detail for each of the calls:
Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms
Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms
As you can see, allocating memory is blazingly fast, but the first time that the memory is accessed, it is 5 times slower than the other times. So, basically the reason that my code was slow was because i was each time reallocating fresh memory that had no physical address yet. (Correct me if I'm wrong but I think that's the gist of it!)
A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ a high-level algebra manipulation capability and use SIMD under the hood. Just like numpy.
Here is the implementation
#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
#include <Eigen/Dense>
auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
for (int k = 0; k < input.cols(); ++k)
output[k] = input
// subtract matrix with repeated k-th column
- input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
}
int main() {
constexpr size_t n = 512;
// allocate input and output
Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
std::vector<Eigen::MatrixXf> output(n);
std::chrono::milliseconds best_eigen{100000};
for (int i = 0; i < 5; ++i) {
auto start = std::chrono::high_resolution_clock::now();
pairwise_eigen(input, output);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
if (duration < best_eigen)
best_eigen = duration;
}
std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";
return 0;
}
The full benchmark tests suggested by #celavek on my system are
Time scaler version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms
Time Eigen version: 76 ms
Numpy >> best of 5 = 118.489 ms
Whit Eigen there is still a noticeable improvement with respect to Numpy, but not so impressive compared to the "raw" implementations (there is certainly some overhead).
An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines
// inside the pairwise method
for (int k = 0; k < input.cols(); ++k)
output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
// at allocation time
std::vector<Eigen::MatrixXf> output(n, input);
This pushes the best of 5 down to 60 ms.

Pass 2D thrust::device_vector Complex Matrix to CUDA kernel function

I'm new in Cuda and and I'm trying to move my existing Project to GPU using Cuda.
My code are based on complex matrices and complex buffers.
For the first step, I tried to move That nested For loop Code to Cuda (the rest will be similar):
typedef thrust::complex<double> smp_t;
uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
// Create matrix.
thrust::complex<double> i_unit(0.0, 1.0);
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
// Fill the Matrix
for (size_t row = 0; row < 8; row++) {
for (size_t col = 0; col < 8; col++) {
std::complex<double> tmp =
exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
tw[row].push_back(tmp);
}
}
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
for (size_t ch = 0; ch < 8; ch++)
for (size_t k = 0; k < 8; k++)
cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
That is the Code from the .cu file that will replace the current nested for loop:
__global__ void kernel_func(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;
cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
smp_t *d_sgbuf;
smp_t *d_cnbuf;
thrust::device_vector<smp_t> d_tw(8*8);
thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
cudaMalloc((void **)&d_sgbuf, buffer_size);
cudaMalloc((void **)&d_cnbuf, buffer_size);
cudaMemcpy(d_sgbuf, sgbuf, buffer_size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_cnbuf, cnbuf, buffer_size, cudaMemcpyDeviceToHost);
thrust::raw_pointer_cast(d_tw.data());
kernel_func<<<8, 8>>>(
reinterpret_cast<cuDoubleComplex*>(d_cnbuf),
reinterpret_cast<cuDoubleComplex*>(d_sgbuf),
thrust::raw_pointer_cast(d_tw.data()),
buffer_size
);
cudaError_t varCudaError1 = cudaGetLastError();
if (varCudaError1 != cudaSuccess)
{
std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
exit(EXIT_FAILURE);
}
cudaMemcpy(sgbuf, d_sgbuf, buffer_size, cudaMemcpyHostToDevice);
cudaMemcpy(cnbuf, d_cnbuf, buffer_size, cudaMemcpyHostToDevice);
}
When I'm running the code, I get the error:
Failed to launch subDelimiterExamine kernel (error code: invalid argument)!
I think that the argument that causing the troubles is the 'd_tw'.
So, my questions are:
What am I'm doing wrong with the cast of <thrust::host_vector<thrust::host_vector smp_t>> to <thrust::device_vector smp_t>> (from 2d Matrix to one flattened arr)?
Is there a better whey to work with 2D Complex numbers in CUDA?
The documentation about Complex arrays in Cuda are very poorly, where can I read abound the work with Cuda Complex matrices?
Thanks!!!!
There were various problems. I will list a few, and probably miss some. So please refer to the example code I have given for additional differences.
The most immediate problem is here:
thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
This is what is giving rise to the invalid argument error you are seeing. Underneath the hood, thrust is going to try to use a cudaMemcpyAsync operation for this, because this is inherently a copy from host to device. We will fix this by replacing it with an ordinary cudaMemcpy operation, but to understand how to construct that, it's necessary to understand item 2.
You seem to think that a vector of vectors implies contiguous storage. It does not and that statement is not specific to thrust. Since a thrust::host_vector of vectors (or even std::vector of vectors) does not imply contiguous storage, we can't easily construct a single operation, such as cudaMemcpy or thrust::copy to copy this data. Therefore it will be necessary to explicitly flatten it.
Your directions of copy on the cudaMemcpy operations are universally backward. Where you should have had cudaMemcpyHostToDevice you had cudaMemcpyDeviceToHost, and vice-versa.
The CUDA cuComplex.h header file predates thrust, and was provided for a quick C-style method to work with complex numbers. There is no documentation for it - you have to read the file itself and work out how to use it, as seem to have already done. However, since you are using thrust::complex<> anyway, it's far simpler just to use that coding paradigm, and write you device code to look almost exactly like your host code.
You had various transfer sizes wrong. cudaMemcpy takes a size in bytes to transfer.
What follows is an example, cobbled together from the pieces you have shown, with a variety of "fixes". I'm not claiming its in any way perfect or correct, but it avoids the issues I have outlined above. Furthermore, depending on how you compile with or with a -DUSE_KERNEL define, it will either run your "original" host code and display the output, or the kernel code and display the output. According to my testing, the outputs match.
$ cat t1751.cu
#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cstdint>
#include <cuComplex.h>
typedef thrust::complex<double> smp_t;
__global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;
cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
__global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
unsigned row = blockIdx.x;
unsigned col = threadIdx.x;
unsigned idx = row*block_size+col;
for (int k = 0; k < 8; k++)
cnbuf[idx] += sgbuf[k*block_size+col] * tw[row*block_size+k];
}
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
smp_t *d_sgbuf;
smp_t *d_cnbuf;
thrust::device_vector<smp_t> d_tw(8*8);
// thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
thrust::host_vector<smp_t> htw(buffer_size*buffer_size);
for (int i = 0; i < buffer_size; i++)
for (int j = 0; j < buffer_size; j++)
htw[i*buffer_size + j] = tw[i][j];
cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), &htw[0], 8*8*sizeof(smp_t), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_sgbuf, buffer_size*buffer_size*sizeof(smp_t));
cudaMalloc((void **)&d_cnbuf, buffer_size*buffer_size*sizeof(smp_t));
cudaMemcpy(d_sgbuf, sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_cnbuf, cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
thrust::raw_pointer_cast(d_tw.data());
kernel_func<<<8, 8>>>(d_cnbuf,d_sgbuf,thrust::raw_pointer_cast(d_tw.data()),buffer_size);
cudaError_t varCudaError1 = cudaGetLastError();
if (varCudaError1 != cudaSuccess)
{
std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
exit(EXIT_FAILURE);
}
// cudaMemcpy(sgbuf, d_sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
cudaMemcpy(cnbuf, d_cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
}
int main(){
const int bufsize = 8;
const int decfactor = 8;
uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
memset(cnbuf, 0, 8*bufsize*sizeof(smp_t));
// Create matrix.
thrust::complex<double> i_unit(0.0, 1.0);
#ifndef USE_KERNEL
std::vector<std::vector<smp_t> > tw(decfactor);
#else
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
#endif
// Fill the Matrix
for (size_t row = 0; row < 8; row++) {
for (size_t col = 0; col < 8; col++) {
std::complex<double> tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
tw[row].push_back(tmp);
}
}
thrust::complex<double> test(1.0, 1.0);
for (int i = 0; i < 8*8; i++) sgbuf[i] = test;
#ifndef USE_KERNEL
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
for (size_t ch = 0; ch < 8; ch++)
for (size_t k = 0; k < 8; k++)
cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
#else
kernel_wrap(cnbuf,sgbuf,tw,bufsize);
#endif
}
$ nvcc -o t1751 t1751.cu -std=c++11
$ ./t1751 >out_host.txt
$ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
$ ./t1751 >out_device.txt
$ diff out_host.txt out_device.txt
$
Remember, this is mostly your code, I am not claiming it is correct, or defect-free, or suitable for any particular purpose. Use it at your own risk.

Gaussian filter reads same value multiple time usin BMP image

I need to translate GaussianFilter that uses openCV to code that uses BMP image ( so i first read image, and translate it to greyscale). My function using openCV looks like ( basic GaussianFilter ) :
Mat CreateGaussFilter(int kernalHeight, int kernalWidth, double kernalArray[5][5]){
Mat image = imread("konik.jpg");
Mat grayScaleImage(image.size(),CV_8UC1);
Mat filter(image.size(),CV_8UC1);
cvtColor(image,grayScaleImage,CV_RGB2GRAY);
int rows=image.rows;
int cols=image.cols;
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
for(int row=0+verticleImageBound;row<rows-verticleImageBound;row++){
for(int col=0+horizontalImageBound;col<cols-horizontalImageBound;col++){
float value=0.0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel=grayScaleImage.at<uchar>(kRow+row-verticleImageBound,kCol+col-horizontalImageBound)*kernalArray[kRow][kCol];
value+=pixel;
}
}
filter.at<uchar>(row,col)=cvRound(value);
}
}
return filter;
}
Now for BMP image:
i have loaded it using:
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
Info readBMP(char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
std::copy(info,
info + offset,
a);
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
}
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
}
return dat;
}
turned it into grayscale usin:
void greyScale( unsigned char * src , int rows, int cols){
for( int i = 0; i < rows; i++){
for( int j = 0; j < cols; j++){
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
char linearIntensity = (char)(0.2126f * r + 0.7512f * g + 0);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
}
}
}
And now i am trying to use GaussianFilter ( translated from my OpenCV function )
void FilterCreation(double GKernel[][5]) {
// intialising standard deviation to 1.0
double sigma = 1.0;
double r, s = 2.0 * sigma * sigma;
// sum is for normalization
double sum = 0.0;
// generating 5x5 kernel
for (int x = -2; x <= 2; x++) {
for (int y = -2; y <= 2; y++) {
r = sqrt(x * x + y * y);
GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
sum += GKernel[x + 2][y + 2];
}
}
// normalising the Kernel
for (int i = 0; i < 5; ++i)
for (int j = 0; j < 5; ++j)
GKernel[i][j] /= sum;
}
unsigned char ** CreateGaussFilter(unsigned char ** src,int kernalHeight, int kernalWidth, double kernalArray[5][5], int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
unsigned char ** dst = new unsigned char *[rows];
for( int i = 0; i < rows; i++){
dst[i] = new unsigned char [cols];
}
for(int row=0+verticleImageBound;row<rows-verticleImageBound;row++){
for(int col=0+horizontalImageBound;col<cols-horizontalImageBound;col++){
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel =src[kRow+row-verticleImageBound][kCol+col-horizontalImageBound]*kernalArray[kRow][kCol];
value+=pixel;
}
}
dst[row][col] = round(value);
}
}
return dst;
}
Since grayscale values are same for every channel, istead of doing calculation like in grayscale function, i turned the data into 2d array and then back into 1d array using:
unsigned char ** return2darray(unsigned char *src, int width, int height, int size){
unsigned char **array = new unsigned char *[width];
for( int i = 0; i < width; i++ ){
array[i] = new unsigned char[height];
}
for( int i = 0; i < width; i++ ){
for( int j = 0; j < height; j++ ){
array[i][j] = src[3 * (i * height + j)];
}
}
return array;
}
unsigned char * return1darray(unsigned char **src, int width, int height, int size){
unsigned char *array = new unsigned char[size];
for( int i = 0; i < width; i++ ){
for( int j = 0; j < height; j++ ){
array[3 * (i * height + j)] = src[i][j];
array[3 * (i * height + j) + 1] = src[i][j];
array[3 * (i * height + j) + 2] = src[i][j];
}
}
return array;
}
And using it like:
int main() {
// load img
Info dat = readBMP("input.bmp");
// turn in into greyscale
greyScale(dat.data,dat.width,dat.height);
// turn 1d array into 2d
unsigned char** arr = return2darray(dat.data,dat.width,dat.height,dat.size);
double GKernel[5][5];
// geneate gausian filter
FilterCreation(GKernel);
// apply gausianFilter
unsigned char** filter = CreateGaussFilter(arr,5,5,GKernel,dat.width,dat.height,dat.size);
// convert it back into 1d array
unsigned char* ar = return1darray(filter,dat.width,dat.height,dat.size);
ofstream fout;
fout.open("out.bmp", ios::binary | ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(ar), dat.size - dat.offset );
fout.close();
return 0;
}
But for some reason, that I cannot realize for input :
the output looks like this.
It seems like it reads the same values in periodes, but that would mean the original image would have the same periods because it just reads bytes from loaded image. The GreyScale function works as it should. I am not very proficient in manipulation with images ( i was using openCV all the time ) What could cause these periods? Thanks for the help!

mexcuda having breakpoint at delete[]() in .cu-file

I am having some trouble finding the error I made with my memory allocation. I am currently using Visual Studio 2013, Matlab 2015b and CUDA 7.0 on a GeForce GT 630 and I am quite a newbie to GPU-programming, CUDA and mex.
When I call my code from Matlab with mexcuda it goes fine until I add the small part with colIndexStepSize to the .cu-file. The program runs normally till delete. After informing me about having reached a breakpoint here, Matlab crashes.
When I remove the code lines in question, everything runs smoothly again.
I am quite sure that there is something wrong with my memory handling but I simpy cannot find the bug. Here is the code that is making trouble:
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusparse.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <vector>
// Test-Makro : (Funktionieren die Zugriffe auf die GPU?)
#define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
if (code != cudaSuccess){
fprintf(stderr, "GPUassert: %s%s%d\n", cudaGetErrorString(code), file, line);
}
}
__global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){
unsigned int index = threadIdx.x + blockIdx.x*blockDim.x;
if (index < *length){
first[index] = (*dxmax)*ergArray[index];
last[index] = (*dxmax)*ergArray[index + 1] - 1;
}
}
void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){
unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
unsigned int n = (unsigned int)systemMatrix_coo->columnNumber;
unsigned int mNeu = detZellen;
float *measuredValues = 0; measuredValues = new float[mNeu]();
float *volumeN = 0; volumeN =new float[n]();
float *volumeAlt = 0; volumeAlt = new float[n]();
float *initValuesM = 0; initValuesM = new float[mNeu]();
float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();
unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
float *valIndex = 0; valIndex = new float[nnz]();
unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();
for (unsigned int i = 0; i < n; i++){
colIndexStepSize[i] = nnz;
}
unsigned int length = matdVoxelGrid->size();
int *ergArray = 0; ergArray = new int[length+1]();
int *first = 0; first = new int[length]();
int *last = 0; last = new int[length]();
int *cooHostColRot = 0; cooHostColRot = new int[nnz]();
int *d_cooColPtr;
int *d_cooRowPtr;
unsigned int *d_nnz;
int *d_colIndexPtr;
float *d_valIndexPtr;
unsigned int *d_colIndexStepSizePtr;
float *d_cooValuesPtr;
float *d_measuredValues;
float *d_volume_alt;
float *d_volume_neu;
int *d_ergArray;
float *d_dxmax;
unsigned int *d_length;
unsigned int *d_size;
int *d_first;
int *d_last;
int *d_cooColRotPtr;
unsigned int *d_count;
gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));
for (unsigned int i = 0; i < nnz; i++){
cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
}
gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));
// (Initialwerte sind immer gleich)
gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
unsigned int threads_nnz = threads_max_nnz;
unsigned int thread_length = length;
unsigned int block_length = 1;
unsigned int index = 0;
for (unsigned int s = 0; s < length; s++){
for (unsigned int t = 0; t <= s; t++){
index = s + 1;
ergArray[index] += (int)matdVoxelGrid->at(t);
}
}
gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));
startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);
gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)first[j]);
}
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)last[j]);
}
unsigned int size = 0;
for (unsigned int iter = 0; iter < iterationen; iter++){
for (unsigned int proj = 1; proj <= projektionen; proj++){
unsigned int begin1 = (proj - 1)*mNeu;
unsigned int end1 = proj*mNeu;
for (unsigned int j = begin1; j < end1; j++){
measuredValues[j] = detektor->at(j);
}
gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));
for (unsigned int u = 0; u < length; u++){
size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));
if (proj > 1){
for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){
if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
}
}
}
else{
for (unsigned int i = 0; i < nnz; i++) {
cooHostColRot[i] = cooColHostPtr[i];
}
}
}
// --------- troubling code starts HERE ----------------
unsigned int wert = 0, index = 0;
for (unsigned int i = 0; i < nnz; i++){
index = cooHostColRot[i];
wert = colIndexStepSize[index];
if (wert >= i){
colIndexStepSize[index] = i;
}
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, colIndexStepSize[j]);
}
gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));
// --------- troubling code ends HERE ----------------
gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));
}
}
cudaFree(d_cooRowPtr);
cudaFree(d_cooColPtr);
cudaFree(d_cooValuesPtr);
cudaFree(d_measuredValues);
cudaFree(d_volume_alt);
cudaFree(d_volume_neu);
cudaFree(d_colCount);
cudaFree(d_rowCount);
cudaFree(d_ergSumCol);
cudaFree(d_ergSumRow);
cudaFree(d_ergMult);
cudaFree(d_nnz);
cudaFree(d_faktor);
cudaFree(d_colIndexPtr);
cudaFree(d_valIndexPtr);
cudaFree(d_ergSumNNZforCol);
cudaFree(d_colIndexStepSizePtr);
cudaFree(d_deltaB);
cudaFree(d_ergArray);
cudaFree(d_dxmax);
cudaFree(d_length);
cudaFree(d_size);
cudaFree(d_first);
cudaFree(d_last);
cudaFree(d_cooColRotPtr);
cudaFree(d_count);
delete[](ergArray); ergArray = NULL;
delete[](measuredValues); measuredValues = NULL;
delete[](cooColHostPtr); cooColHostPtr = NULL;
delete[](cooRowHostPtr); cooRowHostPtr = NULL;
delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
delete[](volumeN); volumeN = NULL;
delete[](ergArray); ergArray = NULL;
delete[](initValuesM); initValuesM = NULL;
delete[](colIndex); colIndex = NULL;
delete[](valIndex); valIndex = NULL;
delete[](volumeAlt); volumeAlt = NULL;
delete[](volumeNInitZero); volumeNInitZero = NULL;
delete[](initValuesMInitZero); initValuesMInitZero = NULL;
delete[](colIndexStepSize); colIndexStepSize = NULL;
delete[](deltaBArray); deltaBArray = NULL;
delete[](first); first = NULL;
delete[](last); last = NULL;
delete[](cooHostColRot); cooHostColRot = NULL;
deltaB->~vector();
deltaB = NULL;
}
If somebody sees any mistake I made, please tell me, I am open to any advice.
Thanks in advance!
Best regards
EDIT:
#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
}
measuredValues is only mNeu elements long but I did access some elements way behind this point.
So, thanks a lot for the help !
#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
}
I just had to adjust the boundaries of the for loop and vector accessing to fit the bounds of the array.
Thanks a lot once again!

CUDA: please help me to find error in my code

There's code, that uses GPU:
__global__ void gpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
float* in_d, *w_d, *out_d;
cudaMalloc((void**)&in_d, psize * sizeof(float));
cudaMalloc((void**)&w_d, psize * size * sizeof(float));
cudaMalloc((void**)&out_d, size * sizeof(float));
for(size_t i = 0; i < size; i++)
output[i] = 0;
cudaMemcpy(in_d, input, psize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w_d, weights, psize * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(out_d, output, size * sizeof(float), cudaMemcpyHostToDevice);
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(ceil(float(rx) / block_x), ceil(float(ry) / block_y));
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
cudaThreadSynchronize();
cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(in_d);
cudaFree(out_d);
cudaFree(w_d);
}
There's code, that do the same thing, but uses only CPU:
int blockIdxx, blockIdxy, blockDimx, blockDimy, threadIdxx, threadIdxy;
void cpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdxx*blockDimx + threadIdxx;
int j = blockIdxy*blockDimy + threadIdxy;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
for(size_t i = 0; i < size; i++)
output[i] = 0;
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
blockDimx = block_x;
blockDimy = block_y;
int gridDimx = ceil(float(rx) / block_x), gridDimy = ceil(float(ry) / block_y);
for(blockIdxx = 0; blockIdxx < gridDimx; blockIdxx++)
for(blockIdxy = 0; blockIdxy < gridDimy; blockIdxy++)
for(threadIdxx = 0; threadIdxx < blockDimx; threadIdxx++)
for(threadIdxy = 0; threadIdxy < blockDimy; threadIdxy++)
cpu_process(input, weights, output, psize, size);
}
Why CPU variant works correctly but GPU variant returns garbage in output? What differs in
Version of cuda-toolkit: 4.0
OS: Debian GNU/Linux, cuda installed from it's repositories.
GPU: NVIDIA GeForce GT 525M.
cudaThreadSyncronize is deprecated and should not be used, instead use cudaDeviceSyncronize, check the error codes of these, since they will return an error if a thread has failed. These also block all code thereafter until the task is completed, so you could also add some timing code inbetween to find bottlenecks.