When I compiled the code with OpenACC, I got the following warning information:
$ pgc++ -o buffer1.out -acc -gpu=managed -Minfo=accel buffer.cpp
main:
137, Generating enter data copyin(mat1)
Generating NVIDIA GPU code
143, #pragma acc loop seq collapse(2)
145, collapsed */
137, Generating default present(mat1)
143, Complex loop carried dependence of prevents parallelization
145, Complex loop carried dependence of prevents parallelization
148, Reference argument passing prevents parallelization: j
Reference argument passing prevents parallelization: i
150, Accelerator restriction: induction variable live-out from loop: j
153, Accelerator restriction: induction variable live-out from loop: i
161, Generating update self(mat1)
Generating exit data delete(mat1)
Array2D<int>::operator ()(const int &, const int &):
23, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
#include<iostream>
#include <string.h>
#include<queue>
#include<openacc.h>
using namespace std;
// Simple owning 2D array: a heap-allocated row-pointer table (T1**) where each
// row is its own heap allocation. This class is the subject of the question:
// the by-reference index operator below is what blocks GPU parallelization
// (see the answer further down, which passes the indices by value).
template<class T1>
class Array2D{
public:
int arows=0;   // number of rows
int acols=0;   // number of columns
T1** __restrict matImg;   // owning row-pointer table
// Allocate rows x cols elements and byte-fill them with defaultVal.
// NOTE(review): memset fills *bytes*, not T1 values — this only initializes
// correctly when defaultVal is 0 (as used in main) or T1 is a byte type.
Array2D(int rows,int cols,T1 defaultVal):arows(rows),acols(cols){
matImg=new T1*[rows];
for(int i=0;i<rows;i++){
matImg[i]=new T1[cols];
memset(matImg[i],defaultVal,sizeof(T1)*cols);
}
}
// Element access. NOTE(review): taking m and n by const& forces the compiler
// to assume their addresses may escape, which creates the reported loop-carried
// dependences and prevents parallelization (explained in the answer below).
T1 &operator()(const int& m, const int& n){
return matImg[m][n];
}
// Row access: returns the raw pointer to row k.
T1 * __restrict operator()(const int&k){
return matImg[k];
}
// Release every row, then the row table itself.
// NOTE(review): no copy constructor/assignment is defined (Rule of Three), so
// copying an Array2D would double-free; main relies on copy elision — confirm
// before copying instances.
~Array2D(){
for(int i=0;i<arows;i++){
delete[] matImg[i];
}
delete [] matImg;
}
};
int main(){
int rows=6;
int cols=10;
// 6x10 matrix of ints, zero-initialized.
Array2D<int> mat1=Array2D<int>(rows,cols,0);//Array2D<int>(2,3);
// NOTE(review): mat1 is a class object, not a 2D array, so this copyin does
// not deep-copy the rows behind matImg — the answer below replaces it with
// copyin(mat1, mat1.matImg[0:rows][0:cols]).
#pragma acc enter data copyin(mat1[0:rows][0:cols])
#pragma acc parallel loop collapse(2) default(present)
for(int i=0;i<rows;i++){
for(int j=0;j<cols;j++){
// cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
// Add 2 to every element on the device. Because operator() takes the
// indices by const&, the compiler serializes this loop (see the Minfo
// output quoted above: "Reference argument passing prevents parallelization").
mat1(i,j)+=2;
}
}
// NOTE(review): update self(mat1) copies only the shallow object (dimensions
// and pointer), not the element data behind matImg.
#pragma acc update self(mat1)
#pragma acc exit data delete(mat1)
cout<<"=============================="<<endl;
// Print the matrix on the host to check the result.
for(int i=0;i<mat1.arows;i++){
for(int j=0;j<mat1.acols;j++){
cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
}
}
}
Array2D is a class for creating a 2D array object by allocating heap memory.
The compiler output says the array prevents parallelization in the GPU code.
(Image is a Mat data type from OpenCV.) May I know why the warning says there
is a dependency?
Could somebody provide any suggestions?
Pass the index variables by value instead of by reference. By passing them by reference, the compiler must assume that the address is taken by a global pointer thus creating a dependency.
To fix:
T1 &operator()(int m, int n){
return matImg[m][n];
}
While the loop will now be successfully parallelized, you'll get a runtime error since "mat1" isn't a 2D array, but rather a class with a 2D array data member. Instead you need to do a deep copy of "mat1" to the device.
Full fix:
% cat test.cpp
#include<iostream>
#include <string.h>
#include<queue>
#include<openacc.h>
using namespace std;
// Owning 2D array of T1: a heap-allocated row-pointer table (matImg) where
// each row is its own heap allocation; arows/acols record the dimensions for
// traversal and cleanup.
template<class T1>
class Array2D{
public:
int arows=0;          // number of rows
int acols=0;          // number of columns
T1** __restrict matImg;  // owning row-pointer table
// Allocate a rows x cols array and set every element to defaultVal.
// Fixed: the original used memset(), which fills *bytes*, so any defaultVal
// other than 0 produced garbage (e.g. 5 -> 0x05050505 per int). An explicit
// element-wise assignment initializes correctly for every T1 and every value,
// and is byte-identical in behavior for the 0 used below.
Array2D(int rows,int cols,T1 defaultVal):arows(rows),acols(cols){
    matImg=new T1*[rows];
    for(int i=0;i<rows;i++){
        matImg[i]=new T1[cols];
        for(int j=0;j<cols;j++){
            matImg[i][j]=defaultVal;
        }
    }
}
// Element access. The indices are passed BY VALUE on purpose: passing them by
// const& makes the compiler assume their addresses may escape, which blocks
// OpenACC parallelization (see the discussion above).
T1 &operator()(int m, int n){
    return matImg[m][n];
}
// Row access: returns the raw pointer to row k.
T1 * __restrict operator()(const int&k){
    return matImg[k];
}
// Release every row, then the row table itself.
// NOTE(review): no copy constructor/assignment is defined (Rule of Three), so
// copying an Array2D would double-free; main relies on copy elision — confirm
// before copying instances.
~Array2D(){
    for(int i=0;i<arows;i++){
        delete[] matImg[i];
    }
    delete [] matImg;
}
};
int main(){
int rows=6;
int cols=10;
// 6x10 matrix of ints, zero-initialized.
Array2D<int> mat1=Array2D<int>(rows,cols,0);//Array2D<int>(2,3);
// Deep copy: first the object itself (mat1), then the 2D data behind its
// matImg member; the runtime attaches the device copy of matImg to the
// device copy of mat1.
#pragma acc enter data copyin(mat1, mat1.matImg[0:rows][0:cols])
#pragma acc parallel loop collapse(2) default(present)
for(int i=0;i<rows;i++){
for(int j=0;j<cols;j++){
// cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
// Add 2 to every element on the device; with by-value indices the
// compiler now generates a gang/vector kernel (see the Minfo output below).
mat1(i,j)+=2;
}
}
// Copy the element data (not the shallow object) back to the host, then
// delete the member data and the object from the device.
#pragma acc update self(mat1.matImg[0:rows][0:cols])
#pragma acc exit data delete(mat1.matImg,mat1)
cout<<"=============================="<<endl;
// Print the updated matrix on the host: every element should be 2.
for(int i=0;i<mat1.arows;i++){
for(int j=0;j<mat1.acols;j++){
cout<<"mat1("<<i<<","<<j<<") :"<<mat1(i,j)<<endl;
}
}
}
% nvc++ test.cpp -acc -Minfo=accel -O2 -V22.11 ; a.out
main:
50, Generating enter data copyin(mat1,mat1.matImg[:rows][:cols])
Generating NVIDIA GPU code
55, #pragma acc loop gang, vector(64) collapse(2) /* blockIdx.x threadIdx.x */
56, /* blockIdx.x threadIdx.x collapsed */
50, Generating default present(mat1.matImg[:6],mat1)
66, Generating update self(mat1.matImg[:rows][:cols])
Generating exit data delete(mat1.matImg[:1][:1],mat1)
Array2D<int>::operator ()(int, int):
24, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
==============================
mat1(0,0) :2
mat1(0,1) :2
mat1(0,2) :2
mat1(0,3) :2
mat1(0,4) :2
mat1(0,5) :2
mat1(0,6) :2
mat1(0,7) :2
... cut ...
mat1(5,7) :2
mat1(5,8) :2
mat1(5,9) :2
I used nvc++ to compile and run a C++ program with OpenACC, and the error 'libgomp: TODO' was displayed.
Here is the test code:
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include<openacc.h>
using namespace std;
using namespace cv;
int main(){
// Load an image from disk; cv::Mat is a reference-counted header whose pixel
// buffer lives behind the .data pointer.
cv::Mat srcImg=cv::imread("/home/testSpace/images/blue-mountains.jpg");
if(!srcImg.data){
cout<<"The file is not loaded or does not exist"<<endl;
return -1;
}
cout<<"Matrix"<<srcImg.rows<<" "<<srcImg.cols<<endl;
// Single-channel 8-bit destination, initialized to 255.
Mat duplicate(srcImg.rows,srcImg.cols, CV_8UC1,Scalar::all(255) );
// NOTE(review): cv::Mat is a class, not a 2D array — these copyin clauses copy
// only the Mat header (the Minfo output below shows implicit copyin of
// step.p and the objects, not the pixel buffer), so the data behind
// srcImg.data is presumably never mapped to the device. Confirm against the
// deep-copy pattern shown for Array2D earlier in this document.
#pragma acc enter data copyin(srcImg[:srcImg.rows][:srcImg.cols])
#pragma acc enter data copyin(duplicate[:duplicate.rows][:duplicate.cols])
#pragma acc parallel
{
#pragma acc loop
for(int i=0;i<srcImg.rows;i++){
#pragma acc loop
for(int j=0;j<srcImg.cols;j++){
// NOTE(review): at<uchar>() assumes a single-channel 8-bit matrix; srcImg
// from imread is typically 3-channel (CV_8UC3) — verify the element type.
duplicate.at<uchar>(i,j)=srcImg.at<uchar>(i,j);
}
}
// NOTE(review): "acc data" is a *structured* directive — placed here it applies
// to an empty statement and copies nothing out; "exit data copyout" or
// "update self" outside the parallel region is presumably what was intended.
#pragma acc data copyout(duplicate[:duplicate.rows][:duplicate.cols])
#pragma acc data copyout(srcImg[:srcImg.rows][:srcImg.cols])
}
cout<<"duplicate"<<": "<<(int)duplicate.at<uchar>(23,45)<<endl;
return 0;
}
Then I got the following compiler output from nvc++, and an error when running the program.
main:
2216, Loop unrolled 4 times (completely unrolled)
36, Generating enter data copyin(duplicate,srcImg)
Generating NVIDIA GPU code
38, #pragma acc loop gang /* blockIdx.x */
40, #pragma acc loop vector(128) /* threadIdx.x */
36, Generating implicit copyin(duplicate.step.p[:1],srcImg.step.p[:1],srcImg,duplicate)[if not already present]
40, Loop is parallelizable
Loop not vectorized/parallelized: not countable
cv::Matx<double, (int)4, (int)1>::Matx():
The final result is :Matrix810 1440
libgomp: TODO
Can anyone please provide any hint? Moreover, I don't know why I got the error
Generating implicit
copyin(duplicate.step.p[:1],srcImg.step.p[:1],srcImg,duplicate)[if not
already present]
I used the same way to allocate memory for srcImg in GPU.
The content of run scripts:
nvc++ -g -O3 -acc -gpu=cc60,cc70 -Minfo ``pkg-config opencv4 --cflags --libs`` -nomp -o nvcpp.out test.cpp
Please check lib_information here
I am trying to understand/test OpenMP with GPU offload. However, I am confused because some examples/info (1, 2, 3) in the internet are analogous or similar to mine but my example does not work as I think it should. I am using g++ 9.4 on Ubuntu 20.04 LTS and also installed gcc-9-offload-nvptx.
My example that does not work but is similar to this one:
#include <iostream>
#include <vector>
int main(int argc, char *argv[]) {
typedef double myfloat;
// The array size comes from the first command-line argument.
if (argc != 2) exit(1);
size_t size = atoi(argv[1]);
printf("Size: %zu\n", size);
// Host data: `size` doubles, all set to 2; the raw pointer is what gets mapped.
std::vector<myfloat> data_1(size, 2);
myfloat *data1_ptr = data_1.data();
myfloat sum = -1;
// NOTE(review): map(from: data1_ptr[0:size]) does NOT copy the host values TO
// the device, so the kernel reads uninitialized device data — the solution
// further below changes this to map(tofrom: data1_ptr[0:size]).
#pragma omp target map(tofrom:sum) map(from: data1_ptr[0:size])
#pragma omp teams distribute parallel for simd reduction(+:sum) collapse(2)
for (size_t i = 0; i < size; ++i) {
for (size_t j = 0; j < size; ++j) {
// NOTE(review): j is never used — term1 depends only on i, so the inner loop
// adds the same term `size` times (acknowledged as redundant in EDIT 1 below).
myfloat term1 = data1_ptr[i] * i;
sum += term1 / (1 + term1 * term1 * term1);
}
}
// sum starts at -1, so even a zero-work run prints -1.00.
printf("sum: %.2f\n", sum);
return 0;
}
When I compile it with: g++ main.cpp -o test -fopenmp -fcf-protection=none -fno-stack-protector I get the following
stack_example.cpp: In function ‘main._omp_fn.0.hsa.0’:
cc1plus: warning: could not emit HSAIL for the function [-Whsa]
cc1plus: note: support for HSA does not implement non-gridified OpenMP parallel constructs.
It does compile but when using it with
./test 10000
the printed sum is still -1. I think the sum value passed to the GPU was not returned properly but I explicitly map it, so shouldn't it be returned? Or what am I doing wrong?
EDIT 1
I was asked to modify my code because there was a historically grown redundant for loop, and also because sum was initialized with -1. I fixed that and also compiled it with gcc-11, which did not emit the warning or note that gcc-9 did. However, the behavior is similar:
Size: 100
Number of devices: 2
sum: 0.00
I checked with nvtop, the GPU is used. Because there are two GPUs I can even switch the device and can be seen by nvtop.
Solution:
The fix is very easy and stupid. Changing
map(from: data1_ptr[0:size])
to
map(tofrom: data1_ptr[0:size])
did the trick.
Even though I am not writing to the array this seemed to be the problem.
I'm using GCC to compile the following program which uses OpenMP's target directives to offload work to a GPU:
#include <iostream>
#include <cmath>
int main(){
const int SIZE = 400000;
// Heap buffer to be filled on the device via OpenMP target offload.
double *m;
m = new double[SIZE];
// Offload the fill: each iteration is independent, so it is distributed
// across teams and threads.
// NOTE(review): OpenMP does not implicitly map the pointee of a raw pointer —
// an explicit map(from: m[0:SIZE]) is presumably needed for the results to
// reach the host on non-unified-memory systems; confirm with the runtime used.
#pragma omp target teams distribute parallel for
for(int i=0;i<SIZE;i++)
m[i] = std::sin((double)i);
// Print all values on the host.
for(int i=0;i<SIZE;i++)
std::cout<<m[i]<<"\n";
delete[] m;  // fixed: the original leaked the allocation
}
My compilation string is as follows:
g++ -O3 test2.cpp -fopenmp -omptargets=nvptx64sm_35-nvidia-linux
Compilation succeeds, but quietly.
Using PGI+OpenACC, I'm used to a series of outputs that tell me what the compiler actually did with the directive, like so:
main:
8, Accelerator kernel generated
Generating Tesla code
11, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
8, Generating implicit copyout(m[:400000])
How can I get similar information out of GCC? -fopt-info-all is a big mess.
I have the following code:
// Fragment from the question (declarations live outside this snippet): nested
// loops counting shape-pair intersections. Both branches increment the same
// counter, so the nest effectively adds shape1Count*shape2Count.
// NOTE(review): intersectionsCount is accumulated across iterations; without a
// reduction(+:intersectionsCount) clause the accelerator cannot parallelize
// the update safely (see the answer below, which adds the clause).
#pragma acc kernels
for (i = 0; i<shape1Count; i++){
for (j = 0; j<shape2Count; j++){
if (kkk==1){
intersectionsCount++;
} else {
intersectionsCount++;
}
}
}
kkk is assigned 1. I found that the if-condition does not even seem to run; no matter whether it is true or false, intersectionsCount is not incremented.
My assumption is, if-condition cannot be handled by GPU/accelerator. Is it true?
If it is true, what can I do to handle this?
p.s. I am very new to GPU programming.
Thanks a lot
Often the compiler can auto-detect reductions but not in this case. Hence, you need to add a reduction clause yourself. Here's the output when using pgcc version 16.4:
% cat test.c
#include <stdlib.h>
#include <stdio.h>
// Answer's complete example: same loop nest as the question, but with an
// explicit reduction clause so the compiler can parallelize the counter update.
int main() {
int i, j, kkk, intersectionsCount;
int shape1Count,shape2Count;
shape1Count=32;
shape2Count=32;
intersectionsCount=0;
kkk=1;
// The reduction clause tells the compiler that intersectionsCount is summed
// across all iterations; each gang/vector accumulates privately and the
// partial sums are combined at the end (see the Minfo output below).
#pragma acc kernels loop reduction(+:intersectionsCount)
for (i = 0; i<shape1Count; i++){
for (j = 0; j<shape2Count; j++){
if (kkk==1){
intersectionsCount++;
} else {
intersectionsCount++;
}
}
}
// Expected: 32*32 = 1024, matching the run output quoted below.
printf("%d\n",intersectionsCount);
exit(0);
}
% pgcc test.c -Minfo=accel -acc; a.out
main:
15, Loop is parallelizable
16, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
15, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
16, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Generating reduction(+:intersectionsCount)
1024