I am writing a program that runs several copies of an integration in parallel to speed up its evaluation for, say, 50,000 different inputs. The code takes about 0.0005 s per evaluation in C++ on a single CPU core, and in my tests its internal loop runs for around 3000 iterations, so I think it should not be too complicated to run on a GPU thread. I use Visual Studio 2013 and a GTX 860M, and I have no problems compiling other programs written in CUDA C.
Here is my code (you can get the data file from here: ElCentro-import2.txt):
#include <stdio.h>
#include <iostream>
#include <math.h>
#include <fstream>
#include <cmath>
#include <cuda_runtime.h>
using namespace std;
__global__ void
vectorAdd(const float *A, const float *p, float *u, float *v, float *a, float *fs, float *rhat, float *phat, int j, int numElements)
{
int ni = blockDim.x * blockIdx.x + threadIdx.x;
int ti = 10000 * ni;   // start index of this thread's private 10000-element slice
float t = A[ni];
float m = 1.0f;
float epsilon = 10.0f;
float gamma = 0.5f;
float beta = 1.0f / 4.0f;
float pi = 3.14159f;
float ksy = 0.04f;
float dt = 0.02f;
float fy = 1000.0f;
float c = 4.0f * ksy * pi / t;
float a1 = m / (beta * dt * dt) + gamma * c / (beta * dt);
float a2 = m / (beta * dt) + c*(gamma / beta - 1.0f);
float a3 = (1.0f / (2.0f *beta) - 1.0f)* m + dt*(gamma / (2.0f *beta) - 1.0f) *c;
float k = m * (2.0f*pi / t)*(2.0f*pi / t);
float fz;
float ab;
if (ni < numElements)
{
v[ti] = 0.0f;
u[ti] = 0.0f;
fs[ti] = 0.0f;
a[ti] = 0.0f;
for (size_t i = ti; i < j - 1 + ti; i++)
{
u[i + 1] = u[i];
fs[i + 1] = fs[i];
phat[i + 1] = p[i + 1] + a1 * u[i] + a2*v[i] + a3*a[i];
rhat[i + 1] = phat[i + 1] - fs[i + 1] - a1 * u[i + 1];
ab = fabsf(rhat[i + 1]);   // fabsf, not std::fabsf, in device code
while (ab >= epsilon)
{
u[i + 1] = u[i + 1] + rhat[i + 1] / (k + a1);
fz = fs[i] + k*(u[i + 1] - u[i]);
if (fz > 0.0)
{
fs[i + 1] = fminf(fz, fy);
}
else
{
fs[i + 1] = fmaxf(fz, -fy);
}
rhat[i + 1] = phat[i + 1] - fs[i + 1] - a1 * u[i + 1];
ab = fabsf(rhat[i + 1]);
}
v[i + 1] = gamma*(u[i + 1] - u[i]) / beta / dt + (1.0f - gamma / beta)*v[i] + dt*(1.0f - gamma / 2.0f / beta)*a[i];
a[i + 1] = (u[i + 1] - u[i]) / beta / dt / dt - (1.0f / beta / dt)*v[i] + (1.0f - 1.0f / 2.0f / beta)*a[i];
}
}
}
int
main(void)
{
int numElements = 16;
int kore;
FILE* myfile;
cudaError_t err = cudaSuccess;   // return code for the CUDA API calls below
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_data = (float *)malloc(10000 * sizeof(float));   // room for up to 10000 samples from the file
float *h_datat = (float *)malloc(10000 * sizeof(float));
myfile = fopen("ElCentro-import2.txt", "r");
std::cout << "file is opened\n";
kore = 0;
while (EOF != fscanf(myfile, "%f %f \n", &h_datat[kore], &h_data[kore]))
{
kore++;
}
fclose(myfile);
std::cout << kore << " lines read from file\n";
kore--;
size_t nsize = 10000 * numElements * sizeof(float);
float *h_A = (float *)malloc(size);
float *h_u = (float *)malloc(nsize);
float *h_v = (float *)malloc(nsize);
float *h_a = (float *)malloc(nsize);
float *h_p = (float *)malloc(nsize);
for (int i = 0; i < kore; ++i)
{
h_p[i] = -10000 * h_data[i];
}
for (int i = 0; i < numElements; ++i)
{
h_A[i] = 1.0f;
}
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
float *d_p = NULL;
err = cudaMalloc((void **)&d_p, nsize);
float *d_u = NULL;
err = cudaMalloc((void **)&d_u, nsize);
float *d_v = NULL;
err = cudaMalloc((void **)&d_v, nsize);
float *d_a = NULL;
err = cudaMalloc((void **)&d_a, nsize);
float *d_fs = NULL;
err = cudaMalloc((void **)&d_fs, nsize);
float *d_rhat = NULL;
err = cudaMalloc((void **)&d_rhat, nsize);
float *d_phat = NULL;
err = cudaMalloc((void **)&d_phat, nsize);
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_p, h_p, nsize, cudaMemcpyHostToDevice);
int threadsPerBlock = 1;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
vectorAdd <<<blocksPerGrid, threadsPerBlock >>>(d_A, d_p, d_u, d_v, d_a, d_fs, d_rhat, d_phat, kore-1, numElements);
err = cudaGetLastError();
cudaDeviceSynchronize();
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_u, d_u, nsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < numElements; ++i)
{
std::cout << h_u[1000 + kore * i] << "\n";
}
printf("Test PASSED\n");
err = cudaFree(d_A);
err = cudaFree(d_p);
err = cudaFree(d_u);
err = cudaFree(d_v);
err = cudaFree(d_a);
err = cudaFree(d_fs);
err = cudaFree(d_rhat);
err = cudaFree(d_phat);
free(h_A);
free(h_p);
free(h_u);
free(h_v);
free(h_a);
free(h_data);
free(h_datat);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
My problem is that when I try to copy the results from the device to the host (say, copying d_u to h_u), it triggers my error-checking message:
Failed to copy vector u from device to host (error code unspecified launch failure)
Also, if I move the kernel sync call (cudaDeviceSynchronize()) to right after the kernel launch, it also generates an error report about the kernel launch. I am quite new to C++ and CUDA programming, and this problem has had me confused for several days.
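For context, here is a minimal error-checking sketch around the launch (not my full program) that separates launch-configuration errors from errors raised while the kernel runs:
// Minimal sketch: check the launch and the execution separately.
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_p, d_u, d_v, d_a, d_fs, d_rhat, d_phat, kore - 1, numElements);

cudaError_t launchErr = cudaGetLastError();        // invalid configuration, etc.
if (launchErr != cudaSuccess)
    fprintf(stderr, "Launch failed: %s\n", cudaGetErrorString(launchErr));

cudaError_t execErr = cudaDeviceSynchronize();     // faults that happen while the kernel runs
if (execErr != cudaSuccess)
    fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(execErr));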
This part should print the same value (~21.8) several times (for numElements > 1):
for (int i = 0; i < numElements; ++i)
{
std::cout << h_u[1000 + kore * i] << "\n";
}
UPDATE
If I set numElements to 1, the code runs well, but that defeats the purpose of parallel computing. I also checked the GPU stats in GPU-Z: memory utilization is less than 1 MB and the maximum GPU load is less than 1% (Release build).
I have changed the build from Debug to Release and the program runs well! Also, increasing the GPU timeout (WDDM TDR) setting in the Windows registry has nearly eliminated the problem.
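For anyone else hitting this: you can query at runtime whether the display watchdog applies to the device (a small sketch, assuming device 0 is the GTX 860M driving the display):
// Sketch: check whether the WDDM/display watchdog can abort long-running kernels on this device.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("kernel execution timeout enabled: %s\n",
       prop.kernelExecTimeoutEnabled ? "yes (long kernels can be killed)" : "no");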
Related
I have been working on creating a mixed wave signal. My code is in C++:
Server signal:
void server_sineWave(BitDepth buffer[], double sin_freq, double beep_freq) {
BitDepth amplitude = std::numeric_limits<BitDepth>::max() * 0.5;
QWORD c = 0;
double d = (samplerate / sin_freq);
int initial = NUM_SAMPLES / 25;
for (QWORD i = 0; i < NUM_SAMPLES; i += channels) {
buffer[i] = amplitude * sin((2 * pi * sin_freq * i) / samplerate); // sin wave generated at "freq"
if (i == initial) {
for (QWORD j = 0; j < 480; j++) {
double stream = amplitude * sin((2 * pi * sin_freq * i / samplerate));
double beep = amplitude * sin((2 * pi * beep_freq * j / samplerate));
double multiplier = .4 * (1 - cos(2 * pi * j / 480));
buffer[i] = stream + (beep * multiplier);
i++;
}
initial = i + 19200.0;
}
}
}
Client signal:
void client_sineWave(BitDepth buffer[], double sin_freq, double beep_freq) {
BitDepth amplitude = std::numeric_limits<BitDepth>::max() * 0.5;
QWORD c = 0;
double d = (samplerate / sin_freq);
int initial = NUM_SAMPLES / 25;
for (QWORD i = 0; i < NUM_SAMPLES; i += channels) {
buffer[i] = amplitude * sin(2 * pi * sin_freq * i / samplerate); // sin wave generated at "freq"
if (i == initial) {
for (QWORD j = 0; j < 480; j++) {
double stream = amplitude * sin((2 * pi * sin_freq * i / samplerate));
double beep = amplitude * sin((2 * pi * beep_freq * j / samplerate));
double multiplier = .4 * (1 - cos(2 * pi * j / 480));
buffer[i] = stream + (beep * multiplier);
// buffer[i] += (beep * multiplier);
i++;
}
initial = i + 19200.0;
//(1000 + rand() % 10000)
//double deg = 360.0 / d;
//buffer[i] = buffer[i + (1 * (channels - 1))] = sin((c++ * deg) * pi / 180) * amplitude;
}
}
}
Mixing of server and client signals:
void mix(BitDepth buffer[], BitDepth server[], BitDepth client[], double duration_milliseconds) {
QWORD num_samples = duration_milliseconds * (NUM_SAMPLES / 10000.0);
double tmp = 0;
QWORD size = NUM_SAMPLES + num_samples;
BitDepth *server_delay = new BitDepth[size];
BitDepth *client_delay = new BitDepth[size];
for (QWORD i = 0; i < size; i++) {
if (i < num_samples) {
server_delay[i] = 0;
client_delay[i + NUM_SAMPLES] = 0;
}
if (i > num_samples) {
server_delay[i] = server[i - num_samples];
client_delay[i - num_samples] = client[i - num_samples];
}
}
for (QWORD i = 0; i < NUM_SAMPLES; i += channels) {
// double multiplier = .5 * (1 - cos(2 * pi * i / NUM_SAMPLES-1));
// double multiplier = (0.54 - 0.46 * cos(2.0 * M_PI * (double) i / (double) (NUM_SAMPLES - 1)));
// server_delay[i] = multiplier * (server_delay[i]);
// client_delay[i] = multiplier * (client_delay[i]);
tmp = server_delay[i] + client_delay[i];
if (tmp > 32767) {
tmp = 32767;
} else if (tmp < -32768) {
tmp = -32768;
}
buffer[i] = tmp;
}
delete[] server_delay;   // free the temporary delay buffers
delete[] client_delay;
}
My result (spectrogram) from the above code:
Now, when I change the amplitude factor from 0.5 to 0.8 in the following line:
BitDepth amplitude = std::numeric_limits<BitDepth>::max() * 0.5;
to
BitDepth amplitude = std::numeric_limits<BitDepth>::max() * 0.8;
I get the following result:
I am new to DSP programming in C++ and I really don't know what is causing this issue or how to resolve it.
Please help me solve it.
Thanks.
As suggested by @PaulR, clipping was causing a lot of harmonics.
Your waveform is clipping (because 0.8 + 0.8 > 1.0), which will generate a lot of harmonics - look at the data in your debugger and you’ll see lots of flat peaks at +/- 32k.
So, after taking care of this limit, my issue was resolved. One minimal way to keep the sum in range is sketched below.
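This is just a sketch of one possible fix (an assumption on my part, not the only option): scale the two sources in mix() so their sum can never exceed the 16-bit range, instead of relying on the hard clamp:
for (QWORD i = 0; i < NUM_SAMPLES; i += channels) {
    // Average the two sources: two full-scale signals can then never exceed +/-32767.
    double mixed = 0.5 * (server_delay[i] + client_delay[i]);
    buffer[i] = (BitDepth)mixed;
}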
Thanks a lot.
I ran into a problem when I used constant memory. It produces this error:
ERROR: an illegal memory access was encountered
It seems the kernel function doesn't execute.
But if I don't use constant memory, everything is OK, which confuses me. I have thought about it for a long time but still don't see the reason. Can you help me solve the problem? Thank you very much.
If the variable s does not use constant memory, everything is OK. But if s uses constant memory, the program breaks.
The variable that uses constant memory is defined as follows:
#ifdef USE_CONST_MEM
__constant__ Sphere s[SPHERES];
#else
Sphere *s;
#endif
The kernel function is defined as follows:
#ifdef USE_CONST_MEM
__global__ void kernel(unsigned char *ptr) {
printf("ok2");
#else
__global__ void kernel(Sphere *s, unsigned char *ptr) {
#endif
// map from threadIdx/BlockIdx to pixel position
printf("ok2");
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
REAL ox = (x - DIM / 2);
REAL oy = (y - DIM / 2);
REAL r = 0, g = 0, b = 0;
REAL maxz = -INF;
__syncthreads();
for (int i = 0; i<SPHERES; i++) {
REAL n;
REAL t = s[i].hit(ox, oy, &n);
if (t > maxz) {
REAL fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
printf("r: %.2f g: %.2f, b %.2f\n", r, g, b);
}
}
__syncthreads();
ptr[offset * 4 + 0] = (int)(r * 255);
ptr[offset * 4 + 1] = (int)(g * 255);
ptr[offset * 4 + 2] = (int)(b * 255);
ptr[offset * 4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
CPUAnimBitmap *bitmap;
};
Here is the function that calls the kernel:
void generate_frame(DataBlock *d, int ticks) {
//START_GPU
//movin the spheres
kernelMoving << <128, 32 >> >(s, SPHERES);
printf("ok0\n");
// generate a bitmap from our sphere data
dim3 grids(DIM / 16, DIM / 16);
dim3 threads(16, 16);
#ifdef USE_CONST_MEM
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
printf("ok0-1\n");
kernel << <grids, threads >> >(d->dev_bitmap);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
if(error!=cudaSuccess)
{
fprintf(stderr,"ERROR: %s\n", cudaGetErrorString(error) );
exit(-1);
}
printf("ok0-1-1\n");
#else
printf("ok0-2\n");
kernel << <grids, threads >> >(s, d->dev_bitmap);
#endif
printf("ok1\n");
//END_GPU
HANDLE_ERROR(cudaMemcpy(d->bitmap->get_ptr(),
d->dev_bitmap,
d->bitmap->image_size(),
cudaMemcpyDeviceToHost));
}
The initialization code is as follows:
#ifdef USE_CONST_MEM
#else
HANDLE_ERROR(cudaMalloc((void**)&s,
sizeof(Sphere) * SPHERES));
#endif
// allocate temp memory, initialize it, copy to constant
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
for (int i = 0; i<SPHERES; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(10.0f) + 5;
temp_s[i].dx = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
temp_s[i].dy = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
temp_s[i].dz = STEP_SIZE * ((rand() / (float)RAND_MAX) * 2 - 1);
}
#ifdef USE_CONST_MEM
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s,
sizeof(Sphere) * SPHERES));
#else
HANDLE_ERROR(cudaMemcpy(s, temp_s, sizeof(Sphere)*SPHERES, cudaMemcpyHostToDevice));
#endif
free(temp_s);
The CUDA version is 8.0 and the system is Ubuntu 16.04.
Yeah, I know where I went wrong. When I used constant memory, I also tried to change its value in the kernelMoving function, i.e. a kernel tried to modify the constant data, so the program broke. Now that I have changed it to the following, it works:
#ifdef USE_CONST_MEM
//printf("the number of SPHERES is %d\n", SPHERES);
Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
HANDLE_ERROR(cudaMemcpyFromSymbol(temp_s, s, sizeof(Sphere) * SPHERES,0, cudaMemcpyDeviceToHost));
Sphere* dev_temp_s;
cudaMalloc((void**)&dev_temp_s, sizeof(Sphere) * SPHERES);
cudaMemcpy(dev_temp_s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice);
kernelMoving << <128, 32 >> >(dev_temp_s, SPHERES);
cudaMemcpy(temp_s, dev_temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyDeviceToHost);
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere) * SPHERES));
free(temp_s);
cudaFree(dev_temp_s);
#else
kernelMoving << <128, 32 >> >(s, SPHERES);
#endif
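For context, the rule I ran into (my summary, not from the book code): __constant__ data is read-only in device code, so any update has to be staged through the host with cudaMemcpyToSymbol / cudaMemcpyFromSymbol, which is exactly what the round trip above does. A minimal sketch with a hypothetical array coeff:
__constant__ float coeff[16];                  // hypothetical constant-memory array

__global__ void useCoeff(float *out) {
    out[threadIdx.x] = coeff[threadIdx.x];     // reading constant memory from a kernel: fine
    // coeff[threadIdx.x] = 0.0f;              // writing it from a kernel is not allowed
}

void updateCoeff(const float *hostVals) {
    // The only supported way to change __constant__ data: copy from the host.
    cudaMemcpyToSymbol(coeff, hostVals, 16 * sizeof(float));
}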
I have implemented a cascaded addition function for a large vector of float values on my GPU and my CPU. That simply means that all elements of this vector shall be summed up into one result. The CPU algorithm is quite trivial and works fine, but the GPU result is always 35200 off the desired value.
The minimal working code for the algorithm and comparison to the CPU is below.
The output is always this:
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
I checked it with cuda-memcheck, but no errors occurred in that run. I have tried many different things, but none of them worked. It is not due to the inaccuracy of the float datatype, because I changed all floats to ints and still got exactly the same result.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
}
void reductionCPU(float *result, const float *input)
{
for (int i = 0; i < SIZE; i++)
*result += input[i];
}
__global__ void reductionKernelImproved(float *result, const float *input)
{
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
__syncthreads();
#pragma unroll ILP
for (i = 0; i < ILP; i++)
{
if (index < SIZE)
{
atomicAdd(&interResult, input[index]);
index++;
}
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
}
void reductionWithCudaImproved(float *result, const float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return;
}
I think you have overlapping indices in your kernel call:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
If I am not mistaken, your blockDim.x = 4 and BLOCK_COUNT_X_IMPR = 100, so each row will jump 400 indices.
However, your col can go as high as 400 * 8.
Consider:
blockIdx = (12, 0), threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408, row = 0, index = 408
blockIdx = (0, 0), threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8, row = 1, index = 1 * 400 + 8 = 408
So I guess you should rewrite your index calculation:
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;
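Redoing the two example threads with the corrected formula (blockDim.x = 4, gridDim.x = 100, ILP = 8) shows the collision is gone:
// index = row * blockDim.x * gridDim.x * ILP + col, so rows are now 3200 apart:
// blockIdx = (12, 0), threadIdx = (3, 0): col = 408, row = 0 -> index = 408
// blockIdx = (0, 0),  threadIdx = (1, 1): col = 8,   row = 1 -> index = 1*4*100*8 + 8 = 3208
// col never exceeds (4*99 + 3)*8 = 3192 < 3200, so no two threads share a starting index.
int index = row * blockDim.x * gridDim.x * ILP + col;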
If I change the while loop (see the kernel below; it's a monstrous loop, you can't miss it) to iterate only once, it uses a negligible amount of GPU memory. However, when the loop is allowed to iterate 50,000 times as shown below, the GPU instantly takes up 2.5 GB. The problem persists even when using a "for" loop. Can someone please offer an explanation, and perhaps a solution to prevent the kernel from using so much memory? This behavior seems highly unusual to me. Thanks in advance!
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
__global__ void myKern(const float *transMatrix, float *masterForces, const double *rands, const int r_max)
{
const int iterationsx = 50000;
const int RUsizex = 26;
int threadsPerBlock = blockDim.x * blockDim.y;
int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
int globalIdx = (blockId * threadsPerBlock) + threadId;
int RU[RUsizex] = {0};
int index = 0;
float r = 0.0;
double temp = 0;
float forces[iterationsx] = {0.0};
int left[RUsizex - 2] = {0};
int right[RUsizex - 2] = {0};
curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);
int i= 0;
while( i < iterationsx)
{
for(int k = 0; k < RUsizex - 2; k++)
{
left[k] = RU[k];
right[k] = RU[k+2];
}
for(int j = 0; j < RUsizex -2; j++)
{
r = curand_uniform(&s);
index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ;
RU[j + 1]= (RU[j + 1]) + ( r < transMatrix[index]) * (transMatrix[index + 1]) +
(! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
(! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;
}
for(int z = 1; z < RUsizex - 1; z++)
{
temp = temp + (RU[z] ==4) + (RU[z] ==5);
}
forces[i] = temp/(24.0);
temp = 0.0;
i++;
}
for(int y = 0; y < iterationsx; y++)
{
masterForces[globalIdx + (r_max * y)] = forces[y];
}
}
The variable float forces[iterationsx] is a stack variable in a __global__ function. This requires a stack reservation of more than 200,000 bytes per thread. The CUDA driver must create a local memory allocation based upon the maximum number of resident threads, using the formula SmCount * MaxThreadsPerSm * (LocalMemoryPerThread + StackPerThread). For a full GK110 this would be 15 * 2048 * ~51 KiB ≈ 1.5 GiB.
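One way to avoid the large per-thread buffer (a sketch based on the kernel above, not a tested drop-in replacement) is to write each iteration's value straight to masterForces instead of staging 50,000 floats per thread in forces[]:
// Sketch: store results directly in global memory; the local array 'forces' disappears.
while (i < iterationsx)
{
    // ... same per-iteration update of left[], right[], RU[] and temp as in the original ...
    masterForces[globalIdx + (r_max * i)] = temp / 24.0;   // was: forces[i] = temp/(24.0);
    temp = 0.0;
    i++;
}
// The trailing copy loop over forces[] is then no longer needed.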
I have written a global version of the Particle Swarm Optimization algorithm in C++.
I tried to write it exactly the same as the MATLAB PSO code I had written before, but this code generates different and much worse answers.
The MATLAB code is:
clear all;
numofdims = 30;
numofparticles = 50;
c1 = 2;
c2 = 2;
numofiterations = 1000;
V = zeros(50, 30);
initialpop = V;
Vmin = zeros(30, 1);
Vmax = Vmin;
Xmax = ones(30, 1) * 100;
Xmin = -Xmax;
pbestfits = zeros(50, 1);
worsts = zeros(50, 1);
bests = zeros(50, 1);
meanfits = zeros(50, 1);
pbests = zeros(50, 30);
initialpop = Xmin + (Xmax - Xmin) .* rand(numofparticles, numofdims);
X = initialpop;
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
gbestfit = minfit;
gbest = X(minfitidx, :);
for i = 1:numofdims
Vmax(i) = 0.2 * (Xmax(i) - Xmin(i));
Vmin(i) = -Vmax(i);
end
for t = 1:1000
w = 0.9 - 0.7 * (t / numofiterations);
for i = 1:numofparticles
if(fitnesses(i) < pbestfits(i))
pbestfits(i) = fitnesses(i);
pbests(i, :) = X(i, :);
end
end
for i = 1:numofparticles
for j = 1:numofdims
V(i, j) = min(max((w * V(i, j) + rand * c1 * (pbests(i, j) - X(i, j))...
+ rand * c2 * (gbest(j) - X(i, j))), Vmin(j)), Vmax(j));
X(i, j) = min(max((X(i, j) + V(i, j)), Xmin(j)), Xmax(j));
end
end
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
if(minfit < gbestfit)
gbestfit = minfit;
gbest = X(minfitidx, :);
end
worsts(t) = max(fitnesses);
bests(t) = gbestfit;
meanfits(t) = mean(fitnesses);
end
Here, testfunc1 is:
function [out] = testfunc1(R)
out = sum(R .^ 2, 2);
end
The C++ code is:
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#define rand_01 ((float)rand() / (float)RAND_MAX)
const int numofdims = 30;
const int numofparticles = 50;
using namespace std;
void fitnessfunc(float X[numofparticles][numofdims], float fitnesses[numofparticles])
{
memset(fitnesses, 0, sizeof (float) * numofparticles);
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
fitnesses[i] += (pow(X[i][j], 2));
}
}
}
float mean(float inputval[], int vallength)
{
int addvalue = 0;
for(int i = 0; i < vallength; i++)
{
addvalue += inputval[i];
}
return (float)(addvalue / vallength);
}
void PSO(int numofiterations, float c1, float c2,
float Xmin[numofdims], float Xmax[numofdims], float initialpop[numofparticles][numofdims],
float worsts[], float meanfits[], float bests[], float *gbestfit, float gbest[numofdims])
{
float V[numofparticles][numofdims] = {0};
float X[numofparticles][numofdims];
float Vmax[numofdims];
float Vmin[numofdims];
float pbests[numofparticles][numofdims];
float pbestfits[numofparticles];
float fitnesses[numofparticles];
float w;
float minfit;
int minfitidx;
memcpy(X, initialpop, sizeof(float) * numofparticles * numofdims);
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
for(int i = 0; i < numofdims; i++)
{
Vmax[i] = 0.2 * (Xmax[i] - Xmin[i]);
Vmin[i] = -Vmax[i];
}
for(int t = 0; t < 1000; t++)
{
w = 0.9 - 0.7 * (float) (t / numofiterations);
for(int i = 0; i < numofparticles; i++)
{
if(fitnesses[i] < pbestfits[i])
{
pbestfits[i] = fitnesses[i];
memcpy(pbests[i], X[i], sizeof(float) * numofdims);
}
}
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
V[i][j] = min(max((w * V[i][j] + rand_01 * c1 * (pbests[i][j] - X[i][j])
+ rand_01 * c2 * (gbest[j] - X[i][j])), Vmin[j]), Vmax[j]);
X[i][j] = min(max((X[i][j] + V[i][j]), Xmin[j]), Xmax[j]);
}
}
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
if(minfit < *gbestfit)
{
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
}
worsts[t] = *max_element(fitnesses, fitnesses + numofparticles);
bests[t] = *gbestfit;
meanfits[t] = mean(fitnesses, numofparticles);
}
}
int main()
{
time_t t;
srand((unsigned) time(&t));
float xmin[30], xmax[30];
float initpop[50][30];
float worsts[1000], bests[1000];
float meanfits[1000];
float gbestfit;
float gbest[30];
for(int i = 0; i < 30; i++)
{
xmax[i] = 100;
xmin[i] = -100;
}
for(int i = 0; i < 50; i++)
for(int j = 0; j < 30; j++)
{
initpop[i][j] = rand() % (100 + 100 + 1) - 100;
}
PSO(1000, 2, 2, xmin, xmax, initpop, worsts, meanfits, bests, &gbestfit, gbest);
cout<<"fitness: "<<gbestfit<<endl;
return 0;
}
I have debugged both codes many times but cannot find the difference that makes the answers differ.
It is driving me crazy!
Can you help me, please?
Update:
Please note that the function mean is only used for reporting some information and is not used in the optimization procedure.
You've got integer division in the following line:
w = 0.9 - 0.7 * (float) (t / numofiterations);
Since t and numofiterations are both ints, t / numofiterations is 0 for every t < numofiterations, so w stays at 0.9 for every iteration instead of decreasing. Change it to
w = 0.9 - 0.7 * t / numofiterations;
The first multiplication automatically promotes t to a double, and the division then promotes numofiterations to a double as well.
With the original parentheses, the division is evaluated first, between two integers, so no promotion takes place.
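An equivalent, more explicit form of the same fix (just a sketch of the idea) is to cast before dividing:
// Do the division in floating point, matching MATLAB's t / numofiterations.
w = 0.9f - 0.7f * ((float)t / (float)numofiterations);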
This could be a mistake in function mean:
return (float)(addvalue / vallength);
This is integer division, so the result is truncated down and then cast to float, which is unlikely to be what you want. Note also that addvalue itself is declared int, so the accumulation of the float fitness values is already truncated before the division; declaring addvalue as float fixes both problems.
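A corrected mean might look like this (a sketch; only the accumulator type and the return expression change):
float mean(float inputval[], int vallength)
{
    float addvalue = 0.0f;            // accumulate in float, not int
    for (int i = 0; i < vallength; i++)
    {
        addvalue += inputval[i];
    }
    return addvalue / vallength;      // vallength is promoted to float here
}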