I have the following code:
#pragma acc kernels
for (i = 0; i < shape1Count; i++) {
    for (j = 0; j < shape2Count; j++) {
        if (kkk == 1) {
            intersectionsCount++;
        } else {
            intersectionsCount++;
        }
    }
}
kkk is assigned 1. I found that the if-condition never seems to run: whether it is true or false, intersectionsCount is not incremented.
My assumption is that the if-condition cannot be handled by the GPU/accelerator. Is that true?
If so, what can I do to handle this?
P.S. I am very new to GPU programming.
Thanks a lot.
Often the compiler can auto-detect reductions, but not in this case, so you need to add a reduction clause yourself. The increments to the shared counter form a sum reduction; without the clause the parallel updates race with one another and increments are lost. Here's the output when using pgcc version 16.4:
% cat test.c
#include <stdlib.h>
#include <stdio.h>
int main() {
    int i, j, kkk, intersectionsCount;
    int shape1Count, shape2Count;
    shape1Count = 32;
    shape2Count = 32;
    intersectionsCount = 0;
    kkk = 1;
    #pragma acc kernels loop reduction(+:intersectionsCount)
    for (i = 0; i < shape1Count; i++) {
        for (j = 0; j < shape2Count; j++) {
            if (kkk == 1) {
                intersectionsCount++;
            } else {
                intersectionsCount++;
            }
        }
    }
    printf("%d\n", intersectionsCount);
    exit(0);
}
% pgcc test.c -Minfo=accel -acc; a.out
main:
15, Loop is parallelizable
16, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
15, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
16, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Generating reduction(+:intersectionsCount)
1024
When I compiled the code with OpenACC, I got the following warnings:
$ pgc++ -o buffer1.out -acc -gpu=managed -Minfo=accel buffer.cpp
main:
137, Generating enter data copyin(mat1)
Generating NVIDIA GPU code
143, #pragma acc loop seq collapse(2)
145, collapsed */
137, Generating default present(mat1)
143, Complex loop carried dependence of prevents parallelization
145, Complex loop carried dependence of prevents parallelization
148, Reference argument passing prevents parallelization: j
Reference argument passing prevents parallelization: i
150, Accelerator restriction: induction variable live-out from loop: j
153, Accelerator restriction: induction variable live-out from loop: i
161, Generating update self(mat1)
Generating exit data delete(mat1)
Array2D<int>::operator ()(const int &, const int &):
23, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
#include <iostream>
#include <string.h>
#include <queue>
#include <openacc.h>
using namespace std;

template<class T1>
class Array2D {
public:
    int arows = 0;
    int acols = 0;
    T1** __restrict matImg;
    Array2D(int rows, int cols, T1 defaultVal) : arows(rows), acols(cols) {
        matImg = new T1*[rows];
        for (int i = 0; i < rows; i++) {
            matImg[i] = new T1[cols];
            memset(matImg[i], defaultVal, sizeof(T1) * cols);
        }
    }
    T1 &operator()(const int& m, const int& n) {
        return matImg[m][n];
    }
    T1 * __restrict operator()(const int& k) {
        return matImg[k];
    }
    ~Array2D() {
        for (int i = 0; i < arows; i++) {
            delete[] matImg[i];
        }
        delete[] matImg;
    }
};

int main() {
    int rows = 6;
    int cols = 10;
    Array2D<int> mat1 = Array2D<int>(rows, cols, 0); //Array2D<int>(2,3);
    #pragma acc enter data copyin(mat1[0:rows][0:cols])
    #pragma acc parallel loop collapse(2) default(present)
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            // cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
            mat1(i, j) += 2;
        }
    }
    #pragma acc update self(mat1)
    #pragma acc exit data delete(mat1)
    cout << "==============================" << endl;
    for (int i = 0; i < mat1.arows; i++) {
        for (int j = 0; j < mat1.acols; j++) {
            cout << "mat1(" << i << "," << j << ") :" << mat1(i, j) << endl;
        }
    }
}
Array2D is a class that creates a 2D array object on the heap.
The compiler says a dependence prevents parallelization in the GPU
code. (Image is a Mat data type from OpenCV.) May I know why the
warnings report a dependency?
Could somebody provide any suggestions?
Pass the index variables by value instead of by reference. When they are passed by reference, the compiler must assume their addresses may be taken by a global pointer, which creates a potential dependency.
To fix:
T1 &operator()(int m, int n) {
    return matImg[m][n];
}
While the loop will now be successfully parallelized, you'll get a runtime error since "mat1" isn't a 2D array but rather a class with a 2D array data member. Instead you need to do a deep copy of "mat1" to the device.
Full fix:
% cat test.cpp
#include <iostream>
#include <string.h>
#include <queue>
#include <openacc.h>
using namespace std;

template<class T1>
class Array2D {
public:
    int arows = 0;
    int acols = 0;
    T1** __restrict matImg;
    Array2D(int rows, int cols, T1 defaultVal) : arows(rows), acols(cols) {
        matImg = new T1*[rows];
        for (int i = 0; i < rows; i++) {
            matImg[i] = new T1[cols];
            memset(matImg[i], defaultVal, sizeof(T1) * cols);
        }
    }
    T1 &operator()(int m, int n) {
        return matImg[m][n];
    }
    T1 * __restrict operator()(const int& k) {
        return matImg[k];
    }
    ~Array2D() {
        for (int i = 0; i < arows; i++) {
            delete[] matImg[i];
        }
        delete[] matImg;
    }
};

int main() {
    int rows = 6;
    int cols = 10;
    Array2D<int> mat1 = Array2D<int>(rows, cols, 0); //Array2D<int>(2,3);
    #pragma acc enter data copyin(mat1, mat1.matImg[0:rows][0:cols])
    #pragma acc parallel loop collapse(2) default(present)
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            // cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
            mat1(i, j) += 2;
        }
    }
    #pragma acc update self(mat1.matImg[0:rows][0:cols])
    #pragma acc exit data delete(mat1.matImg, mat1)
    cout << "==============================" << endl;
    for (int i = 0; i < mat1.arows; i++) {
        for (int j = 0; j < mat1.acols; j++) {
            cout << "mat1(" << i << "," << j << ") :" << mat1(i, j) << endl;
        }
    }
}
% nvc++ test.cpp -acc -Minfo=accel -O2 -V22.11 ; a.out
main:
50, Generating enter data copyin(mat1,mat1.matImg[:rows][:cols])
Generating NVIDIA GPU code
55, #pragma acc loop gang, vector(64) collapse(2) /* blockIdx.x threadIdx.x */
56, /* blockIdx.x threadIdx.x collapsed */
50, Generating default present(mat1.matImg[:6],mat1)
66, Generating update self(mat1.matImg[:rows][:cols])
Generating exit data delete(mat1.matImg[:1][:1],mat1)
Array2D<int>::operator ()(int, int):
24, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
==============================
mat1(0,0) :2
mat1(0,1) :2
mat1(0,2) :2
mat1(0,3) :2
mat1(0,4) :2
mat1(0,5) :2
mat1(0,6) :2
mat1(0,7) :2
... cut ...
mat1(5,7) :2
mat1(5,8) :2
mat1(5,9) :2
I am trying to understand/test OpenMP with GPU offload. However, I am confused because some examples/info (1, 2, 3) on the internet are analogous or similar to mine, yet my example does not work as I think it should. I am using g++ 9.4 on Ubuntu 20.04 LTS and have also installed gcc-9-offload-nvptx.
My example that does not work but is similar to this one:
#include <iostream>
#include <vector>
#include <cstdio>   // printf
#include <cstdlib>  // atoi, exit

int main(int argc, char *argv[]) {
    typedef double myfloat;
    if (argc != 2) exit(1);
    size_t size = atoi(argv[1]);
    printf("Size: %zu\n", size);

    std::vector<myfloat> data_1(size, 2);
    myfloat *data1_ptr = data_1.data();

    myfloat sum = -1;
    #pragma omp target map(tofrom:sum) map(from: data1_ptr[0:size])
    #pragma omp teams distribute parallel for simd reduction(+:sum) collapse(2)
    for (size_t i = 0; i < size; ++i) {
        for (size_t j = 0; j < size; ++j) {
            myfloat term1 = data1_ptr[i] * i;
            sum += term1 / (1 + term1 * term1 * term1);
        }
    }
    printf("sum: %.2f\n", sum);
    return 0;
}
When I compile it with g++ main.cpp -o test -fopenmp -fcf-protection=none -fno-stack-protector, I get the following:
stack_example.cpp: In function ‘main._omp_fn.0.hsa.0’:
cc1plus: warning: could not emit HSAIL for the function [-Whsa]
cc1plus: note: support for HSA does not implement non-gridified OpenMP parallel constructs.
It does compile, but when I run it with
./test 10000
the printed sum is still -1. I think the sum value passed to the GPU was not returned properly, but I explicitly map it, so shouldn't it be returned? Or what am I doing wrong?
EDIT 1
I was asked to modify my code because there was a historically grown redundant for loop and sum was initialized with -1. I fixed both and also compiled it with gcc-11, which, unlike gcc-9, did not emit a warning or note. However, the behavior is similar:
Size: 100
Number of devices: 2
sum: 0.00
I checked with nvtop: the GPU is used. Because there are two GPUs, I can even switch the device and see the change in nvtop.
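For context, the modified kernel presumably looked roughly like this (a reconstruction based on the changes described above, not the exact code):
myfloat sum = 0;  // no longer initialized to -1
#pragma omp target map(tofrom: sum) map(from: data1_ptr[0:size])
#pragma omp teams distribute parallel for reduction(+: sum)
for (size_t i = 0; i < size; ++i) {  // redundant inner j loop removed
    myfloat term1 = data1_ptr[i] * i;
    sum += term1 / (1 + term1 * term1 * term1);
}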
Solution:
The fix is very easy and stupid. Changing
map(from: data1_ptr[0:size])
to
map(tofrom: data1_ptr[0:size])
did the trick.
Even though I am not writing to the array, this seemed to be the problem: map(from:) only allocates device storage and copies it back afterwards, it never copies the host values in, so the kernel was reading uninitialized device memory.
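For reference, a minimal sketch of the corrected target region, reusing the declarations from the posted example (since the kernel only reads data1_ptr, map(to:) would work just as well):
#pragma omp target map(tofrom: sum) map(tofrom: data1_ptr[0:size])  // host values are now copied in
#pragma omp teams distribute parallel for simd reduction(+: sum) collapse(2)
for (size_t i = 0; i < size; ++i) {
    for (size_t j = 0; j < size; ++j) {
        myfloat term1 = data1_ptr[i] * i;
        sum += term1 / (1 + term1 * term1 * term1);
    }
}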
I am getting a "call to cuMemcpyDtoHsync returned error 700: Illegal address during kernel execution" error when I try to parallelize this simple loop.
#include <vector>
#include <iostream>
using namespace std;
int main() {
    vector<float> xF = {0, 1, 2, 3};
    #pragma acc parallel loop
    for (int i = 0; i < 4; ++i) {
        xF[i] = 0.0;
    }
    return 0;
}
Compiled with: $ pgc++ -fast -acc -std=c++11 -Minfo=accel -o test test.cpp
main:
6, Accelerator kernel generated
Generating Tesla code
9, #pragma acc loop gang, vector(4) /* blockIdx.x threadIdx.x */
std::vector<float, std::allocator<float>>::operator [](unsigned long):
1, include "vector"
64, include "stl_vector.h"
771, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
T3 std::__copy_move_a2<(bool)0, const float *, decltype((std::allocator_traits<std::allocator<float>>::_S_pointer_helper<std::allocator<float>>((std::allocator<float>*)0)))>(T2, T2, T3):
1, include "vector"
64, include "stl_vector.h"
/usr/bin/ld: error in /tmp/pgc++cAUEgAXViQSY.o(.eh_frame); no .eh_frame_hdr table will be created.
$ ./test
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
The code runs normally without the #pragma, but I would like to make it parallel. What am I doing wrong?
Try compiling with "-ta=tesla:managed" (with newer nvc++ releases, the equivalent flag is "-gpu=managed").
The problem here is that you aren't explicitly managing the data movement between the host and device, and the compiler can't implicitly manage it for you: a std::vector is just a class holding pointers, so the compiler can't tell the size of the data. Hence the device code uses host addresses, causing the illegal memory accesses.
While you can manage the data yourself by grabbing the vector's raw pointer and then using a data clause to copy the vector's data, as well as copying the vector itself to the device (see the sketch below), it's much easier to use CUDA Unified Memory (i.e. the "managed" flag) and have the CUDA runtime manage the data movement for you.
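A minimal sketch of that manual approach for the posted loop (untested; the device code works on the raw pointer, so the vector object itself never needs to be valid on the device):
float *xF_ptr = xF.data();  // raw pointer to the vector's storage
#pragma acc parallel loop copy(xF_ptr[0:4])
for (int i = 0; i < 4; ++i) {
    xF_ptr[i] = 0.0f;  // index the pointer, not the vector
}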
As Jerry notes, it's generally not recommended to use vectors in parallel code since they are not thread-safe. In this case it's fine, but you may encounter other issues, especially if you try to push or pop data. Better to use arrays (see the second sketch below); arrays are also easier to manage between the host and device copies.
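For illustration, the same loop with a plain array might look like this (a sketch; with a fixed-size array the compiler knows the extent, so no managed memory is required):
float xF[4] = {0, 1, 2, 3};
#pragma acc parallel loop copy(xF)
for (int i = 0; i < 4; ++i) {
    xF[i] = 0.0f;
}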
% cat test.cpp
#include <vector>
#include <iostream>
using namespace std;
int main() {
    vector<float> xF = {0, 1, 2, 3};
    #pragma acc parallel loop
    for (int i = 0; i < 4; ++i) {
        xF[i] = 0.0;
    }
    for (int i = 0; i < 4; ++i) {
        std::cout << xF[i] << std::endl;
    }
    return 0;
}
% pgc++ -ta=tesla:cc70,managed -Minfo=accel test.cpp --c++11 ; a.out
main:
6, Accelerator kernel generated
Generating Tesla code
9, #pragma acc loop gang, vector(4) /* blockIdx.x threadIdx.x */
6, Generating implicit copy(xF)
std::vector<float, std::allocator<float>>::operator [](unsigned long):
1, include "vector"
64, include "stl_vector.h"
771, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
0
0
0
0
I use android-ndk-r10d to write C++ code in Android Studio. Now I want to use OpenMP, so I added this to Android.mk:
LOCAL_CFLAGS += -fopenmp
LOCAL_LDFLAGS += -fopenmp
and added this code to myapp.cpp:
#include <omp.h>
#pragma omp parallel for
for (int i = 1, ii = 0; i < outImage[0]->height; i += 2, ii = i >> 1) {
    /* Do work... */
}
but the Gradle build fails with an error precisely because of the #pragma omp parallel for line.
How can I fix the syntax error?
The compiler probably does not accept the complex structure of the for() statement: OpenMP requires the loop to be in canonical form, with a single loop variable and an iteration count that can be computed in advance, so the two loop variables (i and ii) are a problem. Try a simple loop with fixed limits, like:
int kmax = outImage[0]->height / 2;  // i = 1, 3, 5, ... < height gives height/2 iterations
#pragma omp parallel for
for (int k = 0; k < kmax; k++)
{
    int i = k * 2 + 1;  // reconstruct the original loop variables
    int ii = i >> 1;    // note that ii is simply k
    /* Do work ... */
}
This is my first time posting, so I apologize for my novice mistakes. Please also excuse that not all variable names are in English. My problem is the following: I've written this code using OpenMP, in both Visual Studio 2010 and in Eclipse for C/C++ with the Cygwin GCC toolchain. In Visual Studio I get a speed-up, but in Eclipse I get a slowdown to twice the time of the serial version. Can someone please explain what I have done wrong? In short, I'm just measuring the speed-up of copying from an array of 3D vectors into a double array in order to send it over MPI.
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <vector>

const int NUMAR_FORME = 10;
const int NUMAR_SECUNDE_SIMULATE = 60;  // number of buffers
const int dimensiuni_forme[10] = {100, 200, 300, 400, 500, 600, 700, 800, 900, 10000};  // size of each buffer
// the buffers; currently only worker_buffer and buff are used
std::vector<std::vector<std::vector<double> > > worker_buffer;
std::vector<std::vector<double> > send_buffer, corect;
double **buff;
double **worker_buffer1;
long i, j, k, l;
int flag = 0;
int numarator = 0;  // number of tests run
clock_t start;

int main()
{
    start = clock();
    worker_buffer.resize(1);
    buff = new double*[2];
    int de_scris = 0;  // which buffer to write into; I alternate buff[0], buff[1], buff[0], buff[1]
    worker_buffer[0].resize(NUMAR_SECUNDE_SIMULATE);
    for (i = 0; i < NUMAR_SECUNDE_SIMULATE; i++)
        worker_buffer[0][i].resize(dimensiuni_forme[9]);
    while (numarator < 60)
    {
        if (numarator != 0)
            delete[] buff[de_scris];
        if (numarator != 0)
            de_scris = (de_scris + 1) % 2;
        long limita;
        limita = NUMAR_SECUNDE_SIMULATE * dimensiuni_forme[9] * 3;  // 3 comes from the fact that I will have a 3D vector structure
        buff[de_scris] = new double[limita];
        for (i = 0; i < NUMAR_SECUNDE_SIMULATE; i++)
        {
            for (j = 0; j < dimensiuni_forme[9]; j++)
            {
                worker_buffer[0][i][j] = (i * dimensiuni_forme[9] + j) * 3;
                buff[de_scris][(i * dimensiuni_forme[9] + j) * 3] = worker_buffer[0][i][j];
                buff[de_scris][(i * dimensiuni_forme[9] + j) * 3 + 1] = worker_buffer[0][i][j] + 0.5;
                buff[de_scris][(i * dimensiuni_forme[9] + j) * 3 + 2] = worker_buffer[0][i][j] + 0.75;
            }
        }
        numarator++;
    }
    start = clock() - start;
    printf("TICKS TOTAL %ld \n", start);

    // testing for correctness
    bool ad = true;
    long nr;
    for (i = 0; i < NUMAR_SECUNDE_SIMULATE * dimensiuni_forme[9] * 3; i++)
    {
        if (i % 3 == 0)
            nr = i;
        if (i % 3 == 0 && buff[de_scris][i] != i)
            ad = false;
        else if (i % 3 == 1 && buff[de_scris][i] != (nr + 0.5))
            ad = false;
        else if (i % 3 == 2 && buff[de_scris][i] != (nr + 0.75))
            ad = false;
    }
    if (ad == false)
        printf("not correct \n");

    start = clock();
    numarator = 0;
    // parallel version
    while (numarator < 60)
    {
        if (numarator != 0)
            delete[] buff[de_scris];
        long index, limita, id;
        omp_set_num_threads(2);
        if (numarator != 0)
            de_scris = (de_scris + 1) % 2;
        limita = NUMAR_SECUNDE_SIMULATE * dimensiuni_forme[9] * 3;  // 3 comes from the fact that I will have a 3D vector structure
        buff[de_scris] = new double[limita];
        #pragma omp parallel shared(worker_buffer, limita, buff) private(index, id)
        {
            printf("entering with %d threads \n", omp_get_num_threads());
            id = omp_get_thread_num();
            //index=id;
            for (index = id; (index * 3) < limita; index += omp_get_num_threads())
            {
                buff[de_scris][index * 3] = worker_buffer[0][index / dimensiuni_forme[9]][index % dimensiuni_forme[9]];  // this is where send_buff[index].x will go
                buff[de_scris][index * 3 + 1] = buff[de_scris][index * 3] + 0.5;
                buff[de_scris][index * 3 + 2] = buff[de_scris][index * 3] + 0.75;
            }
            // index+=omp_get_num_threads();
        }  // end parallel zone
        numarator++;
    }
    start = clock() - start;
    printf("TICKS TOTAL %ld \n", start);

    // testing for correctness
    ad = true;
    for (i = 0; i < NUMAR_SECUNDE_SIMULATE * dimensiuni_forme[9] * 3; i++)
    {
        if (i % 3 == 0)
            nr = i;
        if (i % 3 == 0 && buff[de_scris][i] != i)
            ad = false;
        else if (i % 3 == 1 && buff[de_scris][i] != (nr + 0.5))
            ad = false;
        else if (i % 3 == 2 && buff[de_scris][i] != (nr + 0.75))
            ad = false;
    }
    if (ad == false)
        printf("not correct \n");
    return 0;
}
Judging by how you organized this for loop:
for (index = id; (index * 3) < limita; index += omp_get_num_threads())
{
    buff[de_scris][index * 3] = worker_buffer[0][index / dimensiuni_forme[9]][index % dimensiuni_forme[9]];  // this is where send_buff[index].x will go
    buff[de_scris][index * 3 + 1] = buff[de_scris][index * 3] + 0.5;
    buff[de_scris][index * 3 + 2] = buff[de_scris][index * 3] + 0.75;
}
and assuming that you have 4 threads, your threads will get interleaved index values:
thread 0: 0, 4, 8, 12,...
thread 1: 1, 5, 9, 13,...
thread 2: 2, 6, 10, 14,...
thread 3: 3, 7, 11, 15,...
which may be causing cache ping-pong effects, since values written by different threads may land on the same cache line, thus slowing down your execution.
Try a simple for loop with static partitioning instead, so that each thread gets one contiguous chunk of the index range:
#pragma omp parallel for
for (index = 0; index < limita / 3; index++)
{
    buff[de_scris][index * 3] = worker_buffer[0][index / dimensiuni_forme[9]][index % dimensiuni_forme[9]];  // this is where send_buff[index].x will go
    buff[de_scris][index * 3 + 1] = buff[de_scris][index * 3] + 0.5;
    buff[de_scris][index * 3 + 2] = buff[de_scris][index * 3] + 0.75;
}