CUDA Visual Profiler unspecified launch failure - profiling

When I run my code normally it executes perfectly, but when I run it in Visual Profiler (which seems to want to run the program seven times) it works the first time, and on the second run it results in an unspecified launch failure. Why would that happen? My code looks like below, and my error checking tells me the error occurs at
cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
(Probably easiest to find in the code by searching for memcpy11; the failing call is the line above it.)
I can't think of a reason a program would essentially segfault the second time it's run but not the first, and if I run it multiple times in the terminal it's totally fine. Can anyone come up with what might be going on?
Thanks!
void fillin(node *p, node *left, node *rt)
{
    size_t stepsize = chars * sizeof(long);
    size_t sitesize = chars * sizeof(sitearray);
    steptr lsteps;   // added: the posted snippet was missing this declaration
    seqptr lsites;
    cudaMalloc((void **) &lsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &lsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy7");
    cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy8");
    steptr rsteps;
    seqptr rsites;
    cudaMalloc((void **) &rsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &rsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy9");
    cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy");
    //call kernel
    int block_size = 1;
    int n_blocks = chars;
    fillinBoth<<<n_blocks, block_size>>>(lsteps, lsites, rsteps, rsites, chars);
    cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy10");
    cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy11");
    cudaFree(rsites); cudaFree(rsteps);
    cudaFree(lsites); cudaFree(lsteps);
    checkCUDAError("free");
}
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
    boolean counted;
    aas aa;
    long s;
    long i, j, k, n;
    int idx = blockIdx.x;
    //reduce array references; may or may not be useful
    __shared__ long ls[3];
    __shared__ long rs[3];
    __shared__ long qs[3];
    counted = false;
    k = 0;
    //computation from original program, but now need to do manual address calculation
    if(idx < max){
        for(i = 0; i < 3; i++){
            rs[i] = rsite[idx][i];
            ls[i] = lsite[idx][i];
        }
        n = lsteps[idx] + rsteps[idx];
        counted = false;
        for (i = 0; i <= 5; i++) {
            if (k < 3) {
                switch (i) {
                    case 0:
                        s = ls[0] & rs[0];
                        break;
                    case 1:
                        s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
                        break;
                    case 2:
                        s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
                        break;
                    case 3:
                        s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
                        break;
                    case 4:
                        s = ls[1] | (ls[2] & rs[2]) | rs[1];
                        break;
                    case 5:
                        s = ls[2] | rs[2];
                        break;
                }
                if (counted || s != 0) {
                    qs[k] = s;
                    k++;
                    counted = true;
                } else if (!counted)
                    n += cudaWeight[idx];
            }
        }
        for (i = 0; i <= 1; i++) {
            for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
                if (((1L << ((long)aa)) & qs[i]) != 0) {
                    for (j = i + 1; j <= 2; j++)
                        qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
                }
            }
        }
        rsteps[idx] = n;
        for(i = 0; i < 3; i++)
            rsite[idx][i] = qs[i];
    }
}

Try disabling all counters in the profiler session settings. Also try removing all files like "temp_compute_profiler_1_1.csv" from your working folder (see the "Working Folder" profiler setting; by default it is the same as the location of your executable).
The same error has been reported for OpenCL over CUDA: http://www.khronos.org/message_boards/viewtopic.php?t=4324
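For reference, since checkCUDAError isn't shown in the question, here is a plausible sketch of such a helper (an assumption, not the actual code from the question):
// Plausible sketch of the checkCUDAError helper used above
// (the original isn't shown, so this is an assumption):
#include <cstdio>
#include <cstdlib>

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();   // returns and clears the last error state
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
Because kernel launches are asynchronous, an error raised while fillinBoth executes is only reported by a later call, which is why the unspecified launch failure surfaces at the memcpy11 copy rather than at the launch itself.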

Related

What's the correct way to assign one GPU memory buffer value from another GPU memory buffer with some arithmetic on each source buffer's element?

I'm a newbie to GPU programming with the CUDA toolkit, and I have to write some code offering the functionality mentioned in the title.
I'll paste the code to show exactly what I want to do.
void CTrtModelWrapper::forward(void **bindings,
                               unsigned height,
                               unsigned width,
                               short channel,
                               ColorSpaceFmt colorFmt,
                               PixelDataType pixelType) {
    uint16_t *devInRawBuffer_ptr = (uint16_t *) bindings[0];
    uint16_t *devOutRawBuffer_ptr = (uint16_t *) bindings[1];
    const unsigned short bit = 16;
    float *devInputBuffer_ptr = nullptr;
    float *devOutputBuffer_ptr = nullptr;
    unsigned volume = height * width * channel;
    common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
    common::cudaCheck(cudaMalloc((void **) &devOutputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
    unsigned short npos = 0;
    switch (pixelType) {
        case PixelDataType::PDT_INT8: // high 8 bits
            npos = bit - 8;
            break;
        case PixelDataType::PDT_INT10: // high 10 bits
            npos = bit - 10;
            break;
        default:
            break;
    }
    switch (colorFmt) {
        case CFMT_RGB: {
            for (unsigned i = 0; i < volume; ++i) {
                devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION FAULT at this line
            }
        }
        break;
        default:
            break;
    }
    void *rtBindings[2] = {devInputBuffer_ptr, devOutputBuffer_ptr};
    // forward
    this->_forward(rtBindings);
    // convert output
    unsigned short ef_bit = bit - npos;
    switch (colorFmt) {
        case CFMT_RGB: {
            for (unsigned i = 0; i < volume; ++i) {
                devOutRawBuffer_ptr[i] = clip< uint16_t >((uint16_t) devOutputBuffer_ptr[i],
                                                          0,
                                                          (uint16_t) pow(2, ef_bit)) << npos;
            }
        }
        break;
        default:
            break;
    }
}
bindings is a pointer to an array. The first element in the array is a device pointer to a buffer allocated on the GPU using cudaMalloc; each element in that buffer is a 16-bit integer. The second element is the same, and is used to store the output data.
height, width, channel, colorFmt (RGB here), and pixelType (PDT_INT8, i.e. 8-bit) correspond to the image height, width, channel count, color space, and the number of bits used to store one pixel value.
The _forward function requires a pointer to an array similar to bindings, except that each element in the buffer should be a 32-bit float.
So I make the transformation using a loop:
for (unsigned i = 0; i < volume; ++i) {
    devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION FAULT at this line
}
The >> operation is needed because the actual 8-bit data is stored in the high 8 bits.
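For instance, with hypothetical values just to illustrate the layout:
// hypothetical values, only to illustrate the bit layout:
uint16_t raw = 0xAB00;           // an 8-bit sample 0xAB stored in the high byte (PDT_INT8, npos = 8)
float sample = float(raw >> 8);  // raw >> 8 == 0x00AB == 171, so sample == 171.0f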
The SEGMENTATION FAULT occurred at the line devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); with i equal to 0.
I tried separating this code into several lines:
uint16_t value = devInRawBuffer_ptr[i];
float transferd = float(value >> npos);
devInputBuffer_ptr[i] = transferd;
and the SEGMENTATION FAULT occurred at the line uint16_t value = devInRawBuffer_ptr[i];
I wonder: is this a valid way to assign values to an allocated GPU memory buffer?
PS: The buffers given in bindings are totally fine. They are filled from host memory using cudaMemcpy before the call to forward, but I'll paste that code below anyway.
nvinfer1::DataType type = nvinfer1::DataType::kHALF;
HostBuffer hostInputBuffer(volume, type);
DeviceBuffer deviceInputBuffer(volume, type);
HostBuffer hostOutputBuffer(volume, type);
DeviceBuffer deviceOutputBuffer(volume, type);
// HxWxC --> WxHxC
auto *hostInputDataBuffer = static_cast<unsigned short *>(hostInputBuffer.data());
for (unsigned w = 0; w < W; ++w) {
    for (unsigned h = 0; h < H; ++h) {
        for (unsigned c = 0; c < C; ++c) {
            hostInputDataBuffer[w * H * C + h * C + c] = (unsigned short)(*(ppm.buffer.get() + h * W * C + w * C + c));
        }
    }
}
auto ret = cudaMemcpy(deviceInputBuffer.data(), hostInputBuffer.data(), volume * getElementSize(type),
                      cudaMemcpyHostToDevice);
if (ret != 0) {
    std::cout << "CUDA failure: " << ret << std::endl;
    return EXIT_FAILURE;
}
void *bindings[2] = {deviceInputBuffer.data(), deviceOutputBuffer.data()};
model->forward(bindings, H, W, C, sbsisr::ColorSpaceFmt::CFMT_RGB, sbsisr::PixelDataType::PDT_INT8);
In CUDA, it's generally not advisable to dereference a device pointer in host code. For example, you are creating a "device pointer" when you use cudaMalloc:
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
From the code you have posted it's not possible to confirm that for devInRawBuffer_ptr, but I'll assume it is also a device pointer.
In that case, to perform this operation:
for (unsigned i = 0; i < volume; ++i) {
    devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos);
}
You would launch a CUDA kernel, something like this:
// put this function definition at file scope
__global__ void shift_kernel(float *dst, uint16_t *src, size_t sz, unsigned short npos){
    for (size_t idx = blockIdx.x*blockDim.x+threadIdx.x; idx < sz; idx += gridDim.x*blockDim.x)
        dst[idx] = (float)((src[idx]) >> npos);
}
// call it like this in your code:
shift_kernel<<<160, 1024>>>(devInputBuffer_ptr, devInRawBuffer_ptr, volume, npos);
(coded in browser, not tested)
If you'd like to learn more about what's going on here, you may wish to study CUDA. For example, you can get most of the basic concepts here and by studying the CUDA sample code vectorAdd. The grid-stride loop is discussed here.
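Since that kernel was coded in the browser, it may also be worth checking the launch result afterwards, along these lines:
// a sketch of an after-launch check (not part of the original answer):
cudaError_t err = cudaGetLastError();                   // catches bad launch configurations
if (err == cudaSuccess) err = cudaDeviceSynchronize();  // catches errors raised during kernel execution
if (err != cudaSuccess) fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));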

Segmentation fault, (core dumped) when changing number of processors in lpthreads

I'm getting a segmentation fault when running my code on 8 processors but it works fine for 1 and 4 processors.
I'm using the pthread library, and this is the function I execute in each thread.
If any more code is needed I can add more.
void *compute_gauss(void *threadid){
    int local_row, local_norm, col;
    float multiplier;
    long tid;
    tid = (long)threadid;
    fprintf(stdout, "Thread %ld has started\n", tid);
    while (global_norm < N){
        while (global_row < N) {
            pthread_mutex_lock(&global_row_lock);
            local_row = global_row;
            global_row++;
            pthread_mutex_unlock(&global_row_lock);
            print_inputs();
            multiplier = A[local_row][global_norm] / A[global_norm][global_norm];
            for (col = global_norm; col < N; col++) {
                A[local_row][col] -= A[global_norm][col] * multiplier;
            }
            B[local_row] -= B[global_norm] * multiplier;
        }
        pthread_barrier_wait(&barrier);
        if (tid == 0){
            global_norm++;
            global_row = global_norm + 1;
        }
        pthread_barrier_wait(&barrier); // wait until all threads arrive
    }
    return NULL; // added: the function must return a value
}
Here is the calling function where I initialize barriers:
void gauss() {
    int norm, row, col;  /* Normalization row, and zeroing
                          * element row and col */
    int i = 0;
    float multiplier;
    pthread_t threads[procs]; // array of threads, equal in size to # of processors
    global_norm = 0;
    global_row = global_norm + 1;
    printf("Computing Parallelized Algorithm.\n");
    pthread_barrier_init(&barrier, NULL, procs);
    /* Gaussian elimination */
    for (i = 0; i < procs; i++){
        pthread_create(&threads[i], NULL, &compute_gauss, (void *)i);
    }
    printf("finished creating threads\n");
    for (i = 0; i < procs; i++){
        pthread_join(threads[i], NULL);
    }
    printf("finished joining threads\n");
    /* (Diagonal elements are not normalized to 1. This is treated in back
     * substitution.) */
    fprintf(stdout, "pre back substitution");
    /* Back substitution */
    for (row = N - 1; row >= 0; row--) {
        X[row] = B[row];
        for (col = N - 1; col > row; col--) {
            X[row] -= A[row][col] * X[col];
        }
        X[row] /= A[row][row];
    }
    fprintf(stdout, "post back substitution");
}
Here is one example of how the code steps outside an array boundary; please point out if I am wrong:
// suppose global_row = N - 1;
while (global_row < N) {
    pthread_mutex_lock(&global_row_lock);  // thread 2 waits here; global_row is N - 1
    local_row = global_row;                // thread 1 is here; global_row is N - 1
    global_row++;
    pthread_mutex_unlock(&global_row_lock);
    // when thread 2 gets here, local_row is going to be N, out of the array boundary.
    multiplier = A[local_row][global_norm] / A[global_norm][global_norm];
You haven't included enough code for me to test your program. However, I'm pretty sure the problem is that you don't have a mutex protecting global_norm, global_row and print_inputs(). You need to protect them with a mutex, or use an atomic increment operation. You're not seeing the crash under the debugger because the debugger changes your timing.
Shouldn't you be checking the return value of pthread_barrier_wait and checking for PTHREAD_BARRIER_SERIAL_THREAD?
It's also not clear why you call pthread_barrier_wait twice.
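A minimal sketch of one way to close that race (keeping the rest of the structure as posted): the bounds check and the increment must happen under the same lock.
// The check and the increment form one critical section; a thread that
// finds no rows left gives up instead of using a stale local_row.
for (;;) {
    pthread_mutex_lock(&global_row_lock);
    if (global_row >= N) {                 // re-check while holding the lock
        pthread_mutex_unlock(&global_row_lock);
        break;
    }
    local_row = global_row++;
    pthread_mutex_unlock(&global_row_lock);
    /* ... eliminate row local_row as before ... */
}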

C++ Multithreading optimization

I am new to threading in C++, but I've done enough reading to at least get what I'm working on to compile. So far it hasn't improved performance at all. Right now I have it create as many threads as there are loop iterations, but I imagine that would quite quickly make the system thrash. Is there a better alternative to brute-force controlling the number of threads? I also intend to run this on the WestGrid computing system, where I can specify the number of processors to use. What is the best way to set the number of threads to make optimal use of the available processors?
void ExecuteCRTProcess(const long &numberOfRows, ZZ* rij, const ZZ &powRoh, const int &rowLength, long* PublicKey, ZZ* rQ0, const ZZ &Q0, ZZ* primes, const ZZ &productOfPrimes, ZZ* resultsArray, const bool IsItPrimeArray, long* caseStudy, long* caseStudyTracker, const ZZ &X0, const long &Roh){
    int rc;
    pthread_t threads[numberOfRows];
    struct parameters ThreadParameters[numberOfRows];
    for(int i = 0; i < numberOfRows; i++){
        FillR(rij, powRoh, rowLength); // fill up the vector rij with random numbers between 0 and powRoh
        MultiplyVectorByTwo(rij, rowLength, i, IsItPrimeArray); // multiply the rij vector by 2; if calculating Xi', also add 1 to the appropriate values
        ThreadParameters[i].rij = rij;
        ThreadParameters[i].rQ0 = rQ0;
        ThreadParameters[i].primes = primes;
        ThreadParameters[i].rowLength = rowLength;
        ThreadParameters[i].Q0 = Q0;
        ThreadParameters[i].i = i;
        ThreadParameters[i].X0 = X0;
        rc = pthread_create(&threads[i], NULL, &CRTNew, (void *)&ThreadParameters[i]);
        if(rc){
            cout << "Error: unable to create thread, " << rc << endl;
            exit(-1);
        }
        /* debugging output, commented out (the opening marker was missing in the post):
        for(long j = 0; j < rowLength; j++){
            cout << (resultsArray[i] % primes[j]) << " ";
        }
        cout << endl; */
    }
    for(int i = 0; i < numberOfRows; i++){
        pthread_join(threads[i], NULL);
        resultsArray[i] = ThreadParameters[i].result;
    }
}
The threads created run this function
void* CRTNew(void *threadArg){
    struct parameters *local_data;
    local_data = (struct parameters *) threadArg;
    ZZ a, p, A, P, crt;
    long Z, Public;
    a = local_data->rQ0[local_data->i];
    p = local_data->Q0;
    A = local_data->rij[0];
    P = local_data->primes[0];
    for(int i = 1; i <= local_data->rowLength; i++){
        A = A % P;
        Z = CRT(a, p, A, P);
        A = local_data->rij[i]; P = local_data->primes[i];
        if(i == local_data->rowLength) Public = Z;
    }
    if(a < 0) crt = a + p;
    else crt = a % p;
    local_data->result = crt % local_data->X0;
    pthread_exit(NULL);
}
"What is the best way to set the number of threads to optimize for the number of processors?"
1) Create [num of cores] threads, (or maybe a few more), at app startup.
2) Never create any more threads
3) Never let the threads terminate.
4) Have them wait for work tasks on a producer-consumer queue, in the manner of a pool.
Alternatively, use a thread pool class or equivalent parallel language feature that already works.
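For illustration, a minimal sketch of that fixed-pool idea with raw pthreads (my sketch, assuming each row's work is independent; do_row is a hypothetical stand-in for the per-row CRT computation):
#include <pthread.h>
#include <unistd.h>   // sysconf

static long next_row = 0;
static long total_rows = 0;
static pthread_mutex_t row_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *) {
    for (;;) {
        pthread_mutex_lock(&row_lock);
        long row = next_row++;              // hand out the next unit of work
        pthread_mutex_unlock(&row_lock);
        if (row >= total_rows) return NULL; // no work left
        // do_row(row);                     // hypothetical per-row work (the CRT computation)
    }
}

void run_pool(long rows) {
    total_rows = rows;
    next_row = 0;
    long cores = sysconf(_SC_NPROCESSORS_ONLN);  // roughly one thread per core
    if (cores < 1) cores = 1;
    if (cores > 64) cores = 64;
    pthread_t threads[64];
    for (long i = 0; i < cores; i++) pthread_create(&threads[i], NULL, worker, NULL);
    for (long i = 0; i < cores; i++) pthread_join(threads[i], NULL);
}
This way the number of threads tracks the hardware rather than the problem size, and each thread stays busy until the work queue (here just a shared counter) is exhausted.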

SDL_GetPixel pointer problems

This is my very first question.
The first of the two functions below works fine, to some extent:
Uint32 AWSprite::get_pixelColor_location(SDL_Surface * surface, int x, int y) {
    int bpp = surface->format->BytesPerPixel;
    /* Here p is the address of the pixel we want to retrieve */
    Uint8 *p = (Uint8 *)surface->pixels + y * surface->pitch + x * bpp;
    switch (bpp) {
        case 1:
            return *p;
        case 2:
            return *(Uint16 *)p;
        case 3:
            if (SDL_BYTEORDER == SDL_BIG_ENDIAN)
                return p[0] << 16 | p[1] << 8 | p[2];
            else
                return p[0] | p[1] << 8 | p[2] << 16;
        case 4:
            return *(Uint32 *)p;
        default:
            return 0;
    }
}
void AWSprite::set_all_frame_image_actual_size() {
    /* This function finds entire rows that are transparent,
       then stores the number of such rows in a Frame_image_absolute structure */
    absolute_sprite = new Frame_image_absolute*[howManyFrames];
    for (int f = 0; f < howManyFrames; f++) {
        SDL_LockSurface(frames[f]);
        int top_gap = 0; int bottom_gap = 0;
        int per_transparent_px_count = 1;
        for (int i = 0; i < frames[f]->h; i++) {
            int per_transparent_px_count = 1;
            if (this->get_pixelColor_location(frames[f], j, i) == transparentColour) per_transparent_px_count++;
            if (per_transparent_px_count >= frames[f]->w) {
                if (i < frames[f]->h / 2) {
                    per_transparent_px_count = 1;
                    top_gap++;
                } else {
                    per_transparent_px_count = 1;
                    bottom_gap++;
                }
            }
        }
        int realHeight = frames[f]->h - (top_gap + bottom_gap);
        absolute_sprite[f] = new Frame_image_absolute();
        absolute_sprite[f]->offset_y = top_gap;
        absolute_sprite[f]->height = realHeight;
    }
}
When I run this I get:
Unhandled exception at 0x00173746 in SE Game.exe: 0xC0000005: Access violation reading location 0x03acc0b8.
When I went through it in the debugger, I found that it crashes when the iterator variables are f == 31, i == 38, j == 139,
and it stops in AWSprite::get_pixelColor_location() at the line return *(Uint32 *)p;
I found that if I run it again and step through line by line, it sometimes works and sometimes doesn't, so it crashes at seemingly random values of f (> 30), i and j.
What is going on...
I cannot comment on the question yet, but here are some questions:
Where does j come from? Based on the get_pixelColor_location function I would assume that you're iterating over the width of the surface. This part seems to be missing from the code you posted.
Did you validate that i and j are within the bounds of your surface?
Also, you don't seem to unlock the surface afterwards (SDL_UnlockSurface).
Running your function seems to work adequately here so I suspect you're reading outside of your buffer with invalid parameters.
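Putting those points together, here is a guess at the intended shape of the scanning loop (my sketch: j iterates the width, and the surface is unlocked when done):
// Sketch of the presumed intent: scan every pixel of each row, count the
// transparent ones, and classify fully transparent rows as top or bottom gap.
SDL_LockSurface(frames[f]);
int top_gap = 0, bottom_gap = 0;
for (int i = 0; i < frames[f]->h; i++) {
    int transparent_px = 0;
    for (int j = 0; j < frames[f]->w; j++) {          // the missing loop over the width
        if (get_pixelColor_location(frames[f], j, i) == transparentColour)
            transparent_px++;
    }
    if (transparent_px >= frames[f]->w) {             // the whole row is transparent
        if (i < frames[f]->h / 2) top_gap++;
        else bottom_gap++;
    }
}
SDL_UnlockSurface(frames[f]);                          // don't forget to unlock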

How to speed up my sparse matrix solver?

I'm writing a sparse matrix solver using the Gauss-Seidel method. By profiling, I've determined that about half of my program's time is spent inside the solver. The performance-critical part is as follows:
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
    for (size_t x = 1; x < d_nx - 1; ++x) {
        d_x[ic] = d_b[ic]
            - d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
            - d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
        ++ic; ++iw; ++ie; ++is; ++in;
    }
    ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
All arrays involved are of float type. Actually, they are not arrays but objects with an overloaded [] operator, which (I think) should be optimized away, but is defined as follows:
inline float &operator[](size_t i) { return d_cells[i]; }
inline float const &operator[](size_t i) const { return d_cells[i]; }
For d_nx = d_ny = 128, this can be run about 3500 times per second on an Intel i7 920. This means that the inner loop body runs 3500 * 128 * 128 = 57 million times per second. Since only some simple arithmetic is involved, that strikes me as a low number for a 2.66 GHz processor.
Maybe it's not limited by CPU power, but by memory bandwidth? Well, one 128 * 128 float array eats 65 kB, so all 6 arrays should easily fit into the CPU's L3 cache (which is 8 MB). Assuming that nothing is cached in registers, I count 15 memory accesses in the inner loop body. On a 64-bit system this is 120 bytes per iteration, so 57 million * 120 bytes = 6.8 GB/s. The L3 cache runs at 2.66 GHz, so it's the same order of magnitude. My guess is that memory is indeed the bottleneck.
To speed this up, I've attempted the following:
Compile with g++ -O3. (Well, I'd been doing this from the beginning.)
Parallelizing over 4 cores using OpenMP pragmas. I had to change to the Jacobi algorithm to avoid reads from and writes to the same array. This requires doing twice as many iterations, leading to a net result of about the same speed.
Fiddling with implementation details of the loop body, such as using pointers instead of indices. No effect.
What's the best approach to speed this guy up? Would it help to rewrite the inner body in assembly (I'd have to learn that first)? Should I run this on the GPU instead (which I know how to do, but it's such a hassle)? Any other bright ideas?
(N.B. I do take "no" for an answer, as in: "it can't be done significantly faster, because...")
Update: as requested, here's a full program:
#include <iostream>
#include <cstdlib>
#include <cstring>
using namespace std;
size_t d_nx = 128, d_ny = 128;
float *d_x, *d_b, *d_w, *d_e, *d_s, *d_n;
void step() {
    size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
    for (size_t y = 1; y < d_ny - 1; ++y) {
        for (size_t x = 1; x < d_nx - 1; ++x) {
            d_x[ic] = d_b[ic]
                - d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
                - d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
            ++ic; ++iw; ++ie; ++is; ++in;
        }
        ic += 2; iw += 2; ie += 2; is += 2; in += 2;
    }
}
void solve(size_t iters) {
    for (size_t i = 0; i < iters; ++i) {
        step();
    }
}
void clear(float *a) {
    memset(a, 0, d_nx * d_ny * sizeof(float));
}
int main(int argc, char **argv) {
    size_t n = d_nx * d_ny;
    d_x = new float[n]; clear(d_x);
    d_b = new float[n]; clear(d_b);
    d_w = new float[n]; clear(d_w);
    d_e = new float[n]; clear(d_e);
    d_s = new float[n]; clear(d_s);
    d_n = new float[n]; clear(d_n);
    solve(atoi(argv[1]));
    cout << d_x[0] << endl; // prevent the thing from being optimized away
}
I compile and run it as follows:
$ g++ -o gstest -O3 gstest.cpp
$ time ./gstest 8000
0
real 0m1.052s
user 0m1.050s
sys 0m0.010s
(It does 8000 instead of 3500 iterations per second because my "real" program does a lot of other stuff too. But it's representative.)
Update 2: I've been told that uninitialized values may not be representative, because NaN and Inf values may slow things down. The example code now clears the memory. It makes no difference in execution speed for me, though.
Couple of ideas:
Use SIMD. You could load four floats at a time from each array into a SIMD register (e.g. SSE on Intel, VMX on PowerPC). The disadvantage of this is that some of the d_x values will be "stale", so your convergence rate will suffer (but not as badly as a Jacobi iteration); it's hard to say whether the speedup offsets it.
Use SOR. It's simple, doesn't add much computation, and can improve your convergence rate quite well, even for a relatively conservative relaxation value (say 1.5); see the sketch after this list.
Use conjugate gradient. If this is for the projection step of a fluid simulation (i.e. enforcing incompressibility), you should be able to apply CG and get a much better convergence rate. A good preconditioner helps even more.
Use a specialized solver. If the linear system arises from the Poisson equation, you can do even better than conjugate gradient using FFT-based methods.
If you can explain more about what the system you're trying to solve looks like, I can probably give some more advice on #3 and #4.
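To make idea #2 concrete, here is a minimal sketch of SOR applied to the loop from the question (my sketch; omega is an assumed relaxation factor, and the index setup is the same as in step()):
// SOR on the original loop; omega = 1 reproduces plain Gauss-Seidel,
// 1 < omega < 2 over-relaxes and usually converges in fewer iterations.
const float omega = 1.5f;
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
    for (size_t x = 1; x < d_nx - 1; ++x) {
        float gs = d_b[ic]
            - d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
            - d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
        d_x[ic] = (1.0f - omega) * d_x[ic] + omega * gs;  // blend old and new value
        ++ic; ++iw; ++ie; ++is; ++in;
    }
    ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}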
I think I've managed to optimize it. Here's the code: create a new project in VC++, add this code, and simply compile under "Release".
#include <iostream>
#include <cstdlib>
#include <cstring>
#define _WIN32_WINNT 0x0400
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <conio.h>
using namespace std;
size_t d_nx = 128, d_ny = 128;
float *d_x, *d_b, *d_w, *d_e, *d_s, *d_n;
void step_original() {
    size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
    for (size_t y = 1; y < d_ny - 1; ++y) {
        for (size_t x = 1; x < d_nx - 1; ++x) {
            d_x[ic] = d_b[ic]
                - d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
                - d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
            ++ic; ++iw; ++ie; ++is; ++in;
        }
        ic += 2; iw += 2; ie += 2; is += 2; in += 2;
    }
}
void step_new() {
    //size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
    float
        *d_b_ic,
        *d_w_ic,
        *d_e_ic,
        *d_x_ic,
        *d_x_iw,
        *d_x_ie,
        *d_x_is,
        *d_x_in,
        *d_n_ic,
        *d_s_ic;
    // start each pointer at the same offset the index version uses,
    // so both functions compute the same values
    d_b_ic = d_b + d_ny + 1;
    d_w_ic = d_w + d_ny + 1;
    d_e_ic = d_e + d_ny + 1;
    d_x_ic = d_x + d_ny + 1;
    d_x_iw = d_x + d_ny;
    d_x_ie = d_x + d_ny + 2;
    d_x_is = d_x + 1;
    d_x_in = d_x + 2 * d_ny + 1;
    d_n_ic = d_n + d_ny + 1;
    d_s_ic = d_s + d_ny + 1;
    for (size_t y = 1; y < d_ny - 1; ++y)
    {
        for (size_t x = 1; x < d_nx - 1; ++x)
        {
            /*d_x[ic] = d_b[ic]
                - d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
                - d_s[ic] * d_x[is] - d_n[ic] * d_x[in];*/
            *d_x_ic = *d_b_ic
                - *d_w_ic * *d_x_iw - *d_e_ic * *d_x_ie
                - *d_s_ic * *d_x_is - *d_n_ic * *d_x_in;
            //++ic; ++iw; ++ie; ++is; ++in;
            d_b_ic++;
            d_w_ic++;
            d_e_ic++;
            d_x_ic++;
            d_x_iw++;
            d_x_ie++;
            d_x_is++;
            d_x_in++;
            d_n_ic++;
            d_s_ic++;
        }
        //ic += 2; iw += 2; ie += 2; is += 2; in += 2;
        d_b_ic += 2;
        d_w_ic += 2;
        d_e_ic += 2;
        d_x_ic += 2;
        d_x_iw += 2;
        d_x_ie += 2;
        d_x_is += 2;
        d_x_in += 2;
        d_n_ic += 2;
        d_s_ic += 2;
    }
}
void solve_original(size_t iters) {
    for (size_t i = 0; i < iters; ++i) {
        step_original();
    }
}
void solve_new(size_t iters) {
    for (size_t i = 0; i < iters; ++i) {
        step_new();
    }
}
void clear(float *a) {
    memset(a, 0, d_nx * d_ny * sizeof(float));
}
int main(int argc, char **argv) {
    size_t n = d_nx * d_ny;
    d_x = new float[n]; clear(d_x);
    d_b = new float[n]; clear(d_b);
    d_w = new float[n]; clear(d_w);
    d_e = new float[n]; clear(d_e);
    d_s = new float[n]; clear(d_s);
    d_n = new float[n]; clear(d_n);
    if(argc < 3) {
        printf("app.exe (x)iters (o/n)algo\n");
        return 1; // added: argv[1] and argv[2] are required below
    }
    bool bOriginalStep = (argv[2][0] == 'o');
    size_t iters = atoi(argv[1]);
    /*printf("Press any key to start!");
    _getch();
    printf(" Running speed test..\n");*/
    __int64 freq, start, end, diff;
    if(!::QueryPerformanceFrequency((LARGE_INTEGER*)&freq))
        throw "Not supported!";
    freq /= 1000000; // microseconds!
    {
        ::QueryPerformanceCounter((LARGE_INTEGER*)&start);
        if(bOriginalStep)
            solve_original(iters);
        else
            solve_new(iters);
        ::QueryPerformanceCounter((LARGE_INTEGER*)&end);
        diff = (end - start) / freq;
    }
    printf("Speed (%s)\t\t: %u\n", (bOriginalStep ? "original" : "new"), (unsigned)diff); // cast added: diff is 64-bit, %u expects unsigned
    //_getch();
    //cout << d_x[0] << endl; // prevent the thing from being optimized away
}
Run it like this:
app.exe 10000 o
app.exe 10000 n
"o" means old code, yours.
"n" is mine, the new one.
My results:
Speed (original):
1515028
1523171
1495988
Speed (new):
966012
984110
1006045
Improvement of about 30%.
The logic behind it:
You were using index counters to access and manipulate the arrays; I use pointers.
While running, set a breakpoint at one of the calculation lines in VC++'s debugger and press F8: you'll get the disassembly window, where you can see the generated opcodes (assembly code).
Anyway, look:
int *x = ...;
x[3] = 123;
This tells the CPU to put the pointer x in a register (say EAX), then add 3 * sizeof(int) to it, and only then store the value 123.
The pointer approach is better, as you can see, because we cut out that repeated address computation: we advance the pointers ourselves and can optimize as needed.
I hope this helps.
A side note to the stackoverflow.com staff:
Great website; I wish I'd heard of it long ago!
For one thing, there seems to be a pipelining issue here. The loop reads from the value in d_x that has just been written to, but apparently it has to wait for that write to complete. Just rearranging the order of the computation, doing something useful while it's waiting, makes it almost twice as fast:
d_x[ic] = d_b[ic]
    - d_e[ic] * d_x[ie]
    - d_s[ic] * d_x[is] - d_n[ic] * d_x[in]
    - d_w[ic] * d_x[iw] /* d_x[iw] has just been written to, process this last */;
It was Eamon Nerbonne who figured this out. Many upvotes to him! I would never have guessed.
Poni's answer looks like the right one to me.
I just want to point out that in this type of problem, you often gain benefits from memory locality. Right now, the b,w,e,s,n arrays are all at separate locations in memory. If you could not fit the problem in L3 cache (mostly in L2), then this would be bad, and a solution of this sort would be helpful:
size_t d_nx = 128, d_ny = 128;
float *d_x;
struct D { float b,w,e,s,n; };
D *d;
void step() {
    size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
    for (size_t y = 1; y < d_ny - 1; ++y) {
        for (size_t x = 1; x < d_nx - 1; ++x) {
            d_x[ic] = d[ic].b
                - d[ic].w * d_x[iw] - d[ic].e * d_x[ie]
                - d[ic].s * d_x[is] - d[ic].n * d_x[in];
            ++ic; ++iw; ++ie; ++is; ++in;
        }
        ic += 2; iw += 2; ie += 2; is += 2; in += 2;
    }
}
void solve(size_t iters) { for (size_t i = 0; i < iters; ++i) step(); }
void clear(float *a) { memset(a, 0, d_nx * d_ny * sizeof(float)); }
int main(int argc, char **argv) {
    size_t n = d_nx * d_ny;
    d_x = new float[n]; clear(d_x);
    d = new D[n]; memset(d, 0, n * sizeof(D));
    solve(atoi(argv[1]));
    cout << d_x[0] << endl; // prevent the thing from being optimized away
}
For example, this solution at 1280x1280 is a little less than 2x faster than Poni's solution (13s vs 23s in my test--your original implementation is then 22s), while at 128x128 it's 30% slower (7s vs. 10s--your original is 10s).
(Iterations were scaled up to 80000 for the base case, and 800 for the 100x larger case of 1280x1280.)
I think you're right about memory being a bottleneck. It's a pretty simple loop with just some simple arithmetic per iteration. The ic, iw, ie, is, and in indices seem to be on opposite sides of the matrix, so I'm guessing there are a bunch of cache misses there.
I'm no expert on the subject, but I've seen that there are several academic papers on improving the cache usage of the Gauss-Seidel method.
Another possible optimization is the use of the red-black variant, where points are updated in two sweeps in a chessboard-like pattern. In this way, all updates in a sweep are independent and can be parallelized.
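For illustration, a minimal sketch of that red-black sweep using the same arrays and indexing as step() (my sketch; step_color is a hypothetical helper):
// Hypothetical helper: update only cells whose (x + y) parity matches color.
// Each updated cell reads only neighbors of the opposite color, so within
// one sweep the updates are independent and can be parallelized freely.
// Assumes the row stride d_ny implied by step()'s index setup.
void step_color(int color) {   // color is 0 or 1
    for (size_t y = 1; y < d_ny - 1; ++y) {
        for (size_t x = 1 + ((y + 1 + color) & 1); x < d_nx - 1; x += 2) {
            size_t ic = y * d_ny + x;
            d_x[ic] = d_b[ic]
                - d_w[ic] * d_x[ic - 1]    - d_e[ic] * d_x[ic + 1]
                - d_s[ic] * d_x[ic - d_ny] - d_n[ic] * d_x[ic + d_ny];
        }
    }
}
// One full iteration is then: step_color(0); step_color(1);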
I suggest putting in some prefetch statements, and also researching "data-oriented design":
void step_original() {
    size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
    // one declaration per temporary (the original listed de_ic and db_ic twice)
    float db_ic, dw_ic, de_ic, ds_ic, dn_ic;
    float dx_iw, dx_ie, dx_is, dx_in;
    for (size_t y = 1; y < d_ny - 1; ++y) {
        for (size_t x = 1; x < d_nx - 1; ++x) {
            // Perform the prefetch
            // Sorting these statements by array may increase speed;
            // although sorting by index name may increase speed too.
            db_ic = d_b[ic];
            dw_ic = d_w[ic];
            dx_iw = d_x[iw];
            de_ic = d_e[ic];
            dx_ie = d_x[ie];
            ds_ic = d_s[ic];
            dx_is = d_x[is];
            dn_ic = d_n[ic];
            dx_in = d_x[in];
            // Calculate
            d_x[ic] = db_ic
                - dw_ic * dx_iw - de_ic * dx_ie
                - ds_ic * dx_is - dn_ic * dx_in;
            ++ic; ++iw; ++ie; ++is; ++in;
        }
        ic += 2; iw += 2; ie += 2; is += 2; in += 2;
    }
}
This differs from your second method since the values are copied to local temporary variables before the calculation is performed.