cublasSasum causing segmentation fault in custom TensorFlow op [duplicate] - c++

The cuBLAS code below crashes with a core dump at the call "cublasSnrm2(handle,row,dy,incy,de)". Could you give some advice?
main.cu
#include <iostream>
#include "cublas.h"
#include "cublas_v2.h"
#include "helper_cuda.h"
using namespace std;
// Copies a 10-element vector of ones to the device, asks cuBLAS for its
// 2-norm via cublasSnrm2, copies the one-float result back and prints it.
// Returns 0 unconditionally; command-line arguments are not used.
int main(int argc,char *args[])
{
float y[10] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
int dev=0;
checkCudaErrors(cudaSetDevice(dev));
//cublas init
cublasStatus stat;
cublasInit();
cublasHandle_t handle;
stat = cublasCreate(&handle);
if (stat !=CUBLAS_STATUS_SUCCESS)
{
printf("cublas handle create failed!\n");
cublasShutdown();
// NOTE(review): execution continues after this failure branch (no return).
}
// dy: device copy of y, de: device slot for the result, e: host slot for the result.
float * dy,*de,*e;
int incy = 1,ONE = 1,row = 10;
e = (float *)malloc(sizeof(float)*ONE);
e[0]=0.0f;
checkCudaErrors(cudaMalloc(&dy,sizeof(float)*row));
checkCudaErrors(cudaMalloc(&de,sizeof(float)*ONE));
checkCudaErrors(cudaMemcpy(dy,y,row*sizeof(float),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(de,e,ONE*sizeof(float),cudaMemcpyHostToDevice));
// NOTE(review): `de` was allocated with cudaMalloc, i.e. it is a device
// pointer. Per the answer below, the result argument is treated as a host
// pointer by default, so this is the reported crash site unless
// cublasSetPointerMode(handle,CUBLAS_POINTER_MODE_DEVICE) is called first
// (or the host pointer `e` is passed instead).
stat = cublasSnrm2(handle,row,dy,incy,de);
if (stat !=CUBLAS_STATUS_SUCCESS)
{
printf("norm2 compute failed!\n");
cublasShutdown();
// NOTE(review): execution continues after this failure branch (no return).
}
checkCudaErrors(cudaMemcpy(e,de,ONE*sizeof(float),cudaMemcpyDeviceToHost));
std::cout<<e[0]<<endl;
// NOTE(review): dy, de, e and the cuBLAS handle are not released before returning.
return 0;
}
makefile is below:
NVIDIA = $(HOME)/NVIDIA_CUDA-5.0_Samples
CUDA = /usr/local/cuda-5.0
NVIDINCADD = -I$(NVIDIA)/common/inc
CUDAINCADD = -I$(CUDA)/include
CC = -L/usr/lib64/ -lstdc++
GCCOPT = -O2 -fno-rtti -fno-exceptions
INTELOPT = -O3 -fno-rtti -xW -restrict -fno-alias
DEB = -g
NVCC = -G
ARCH = -arch=sm_35
bcg:main.cu
nvcc $(DEB) $(NVCC) $(ARCH) $(CC) -lm $(NVIDINCADD) $(CUDAINCADD) -lcublas -I./ -o $(#) $(<)
clean:
rm -f bcg
rm -f hyb
My OS is linux redhat 6.2,CUDA's version is 5.0, GPU is K20M.

The problem is here:
cublasSnrm2(handle,row,dy,incy,de);
By default, the last parameter is a host pointer. So either pass e to the snrm2 call rather than de or do this:
cublasSetPointerMode(handle,CUBLAS_POINTER_MODE_DEVICE);
stat = cublasSnrm2(handle,row,dy,incy,de);
The pointer mode needs to be set to device if you want to pass a device pointer to store the result.

Related

linker error: undefined symbol

What I'm trying to do
I am attempting to create 2 c++ classes.
One, named Agent that will be implemented as a member of class 2
Two, named Env that will be exposed to Python through boost.python (though I suspect this detail to be inconsequential to my problem)
The problem
After successful compilation with my make file, I attempt to run my python script and I receive an import error on my extension module (the c++ code) that reads "undefined symbol: _ZN5AgentC1Effff". All the boost-python stuff aside, I believe this to be a simple c++ linker error.
Here are my files:
Agent.h
// State record for a single agent. All members are public and are written
// directly by other code in this file (e.g. Env accesses xy_vel).
class Agent {
public:
float xy_pos[2]; // filled from the constructor's x_pos / y_pos arguments
float xy_vel[2]; // zeroed by the constructor
float yaw;       // intended to receive the constructor's yaw argument
float z_pos;     // intended to receive the constructor's z_pos argument
// Builds an agent from a planar position, a yaw value and a z position.
Agent(float x_pos, float y_pos, float yaw, float z_pos);
};
Agent.cpp
#include "Agent.h"
// Constructs an agent at planar position (x_pos, y_pos) with the given yaw
// and z position; the planar velocity starts at zero.
Agent::Agent(float x_pos, float y_pos, float yaw, float z_pos)
{
    xy_vel[0] = 0;
    xy_vel[1] = 0;
    xy_pos[0] = x_pos;
    xy_pos[1] = y_pos;
    // The parameters `z_pos` and `yaw` shadow the members of the same name,
    // so a plain `z_pos = z_pos;` only assigns the parameter to itself and
    // leaves the member uninitialised. Qualify with this-> to reach the member.
    this->z_pos = z_pos;
    this->yaw = yaw;
}
test_ext.cpp (where my Env class lives)
#include "Agent.h"
#include <boost/python.hpp>
// Environment object exposed to Python. Holds a single heap-allocated Agent
// and offers accessors for the agent's planar velocity.
// NOTE(review): `np` and `p` are used as namespace aliases but are not
// declared in the visible snippet — presumably boost::python::numpy and
// boost::python; confirm against the full file.
class Env {
public:
    Agent* agent;
    //some other members

    // Creates the agent with the fixed start values (13, 10, 0, 2).
    // NOTE(review): the Agent is never deleted in the visible code.
    Env() {
        agent = new Agent(13, 10, 0, 2);
    }

    // Returns a 2-element float ndarray built with np::from_data directly
    // over agent->xy_vel (no owner object is supplied).
    np::ndarray get_agent_vel() {
        return np::from_data(agent->xy_vel, np::dtype::get_builtin<float>(),
                             p::make_tuple(2),
                             p::make_tuple(sizeof(float)),
                             p::object());
    }

    // Copies the first two elements of `vel` into agent->xy_vel.
    void set_agent_vel(np::ndarray vel) {
        agent->xy_vel[0] = p::extract<float>(vel[0]);
        agent->xy_vel[1] = p::extract<float>(vel[1]);
    }
};  // a class definition must be terminated with ';' (missing in the original)
// Python module definition: exposes Env with its two velocity accessors
// under the module name `test_ext`.
BOOST_PYTHON_MODULE(test_ext) {
    using namespace boost::python;
    class_<Env>("Env")
        .def("set_agent_vel", &Env::set_agent_vel)
        .def("get_agent_vel", &Env::get_agent_vel);  // terminate the expression statement
}
Makefile
PYTHON_VERSION = 3.5
PYTHON_INCLUDE = /usr/include/python$(PYTHON_VERSION)
# location of the Boost Python include files and library
BOOST_INC = /usr/local/include/boost_1_66_0
BOOST_LIB = /usr/local/include/boost_1_66_0/stage/lib/
# compile mesh classes
TARGET = test_ext
CFLAGS = --std=c++11
$(TARGET).so: $(TARGET).o
g++ -shared -Wl,--export-dynamic $(TARGET).o -L$(BOOST_LIB) -lboost_python3 -lboost_numpy3 -L/usr/lib/python3.5/config-3.5m-x86_64-linux-gnu -lpython3.5 -o $(TARGET).so
$(TARGET).o: $(TARGET).cpp Agent.o
g++ -I$(PYTHON_INCLUDE) -I$(BOOST_INC) -fPIC -c $(TARGET).cpp $(CFLAGS)
Agent.o: Agent.cpp Agent.h
g++ -c -Wall Agent.cpp $(CFLAGS)
You never link with Agent.o anywhere.
First of all you need to build it like you build test_ext.o with the same flags. Then you need to actually link with Agent.o when creating the shared library.

Error: 'ALIGN' undeclared (first use in this function) with ALIGN defined into macro

I get a strange error when compiling the following source:
#include <stdio.h>
#include <stdlib.h>
#include <mach/mach_time.h>
#include <mm_malloc.h>
#ifdef SSE
#include <x86intrin.h>
#define ALIGN 16
// Element-wise sum: c[k] = a[k] + b[k] for every k in [0, size).
//
// size : number of doubles in each array (values <= 0 do nothing)
// a, b : input arrays, at least `size` elements each
// c    : output array, at least `size` elements
//
// Pairs of elements are added with SSE2 unaligned loads/stores; when `size`
// is odd the last element is added with scalar code so that nothing beyond
// index size-1 is ever read or written.
void addition_tab(int size, double *a, double *b, double *c)
{
    int i;
    // Main loop: only full pairs (128 bits = 2 * sizeof(double))
    for (i = 0; i + 1 < size; i += 2)
    {
        // Intrinsic SSE syntax
        const __m128d x = _mm_loadu_pd(a + i); // Load two x elements
        const __m128d y = _mm_loadu_pd(b + i); // Load two y elements
        const __m128d sum = _mm_add_pd(x, y);  // Compute two sum elements
        _mm_storeu_pd(c + i, sum);             // Store two sum elements
    }
    // Tail: a single leftover element when size is odd
    if (i < size)
    {
        c[i] = a[i] + b[i];
    }
}
#endif
// Benchmark driver (excerpt): reads the array size from argv[1], queries the
// Mach timebase and allocates three double arrays of that size, either with
// malloc (NOVEC defined) or with _mm_malloc aligned to ALIGN (otherwise).
// NOTE(review): argv[1] is read without checking argc.
// NOTE(review): no return statement and no deallocation in the shown excerpt.
int main(int argc, char *argv[])
{
// Array index
int i;
// Array size as argument
int size = atoi(argv[1]);
// Time elapsed
uint64_t t1, t2;
float duration;
// Two input arrays
double *tab_x;
double *tab_y;
double *tab_z;
// Get the timebase info
mach_timebase_info_data_t info;
mach_timebase_info(&info);
#ifdef NOVEC
// Allocation
tab_x = (double*) malloc(size*sizeof(double));
tab_y = (double*) malloc(size*sizeof(double));
tab_z = (double*) malloc(size*sizeof(double));
#else
// Allocation
// This branch is compiled whenever NOVEC is NOT defined, but ALIGN is only
// #defined inside the `#ifdef SSE` section above; building with neither
// macro (or with a misspelled one) therefore leaves ALIGN undeclared here.
tab_x = (double*) _mm_malloc(size*sizeof(double),ALIGN);
tab_y = (double*) _mm_malloc(size*sizeof(double),ALIGN);
tab_z = (double*) _mm_malloc(size*sizeof(double),ALIGN);
#endif
}
If I compile with :
gcc-mp-4.9 -DNOVEC -O0 main.c -o exe
compilation is done but with :
gcc-mp-4.9 -DSSE -O3 -msse main.c -o exe
I get the following error :
main.c: In function 'main':
main.c:96:52: error: 'ALIGN' undeclared (first use in this function)
tab_x = (double*) _mm_malloc(size*sizeof(double),ALIGN);
However, variable ALIGN is defined if I pass SSE macro with gcc-mp-4.9 -DSSE, isn't it ?
I found the root cause in your script: you are not isolating the novec case, so the compilation with the NOVEC macro is always done. You could isolate it using:
if [ "$1" == "novec" ]; then
# Compile no vectorized and vectorized executables
$GCC -DNOVEC -O0 main_benchmark.c -o noVectorizedExe
$GCC -DNOVEC -O0 main_benchmark.c -S -o noVectorizedExe.s
elif [ "$1" == "sse" ]; then
# Compile with SSE
$GCC -DSSE -O3 -msse main_benchmark.c -o vectorizedExe
$GCC -DSSE -O3 -msse main_benchmark.c -S -o vectorizedExe.s
echo "Test"
elif [ "$1" == "avx" ]; then
# Compile with AVX256
$GCC -DAVX256 -O3 -mavx main_benchmark.c -o vectorizedExe
$GCC -DAVX256 -O3 -mavx main_benchmark.c -S -o vectorizedExe.s
fi
EDIT
I found it: you have a typo!
$GCC -DNOVEV -O0 main_benchmark.c -S -o noVectorizedExe.s
should be
$GCC -DNOVEC -O0 main_benchmark.c -S -o noVectorizedExe.s

error: 'CFLAGS' does not name a type

This is my makefile:
CFLAGS=-Wall -g -O2
clean:
rm -f ex1
And when I run a script, for example, this one (ex3.c):
#include <stdio.h>
/* Prints a fixed age and height, one message per line, then exits with 0. */
int main()
{
    /* Values substituted into the two messages below. */
    const int years_old = 10;
    const int inches_tall = 72;

    printf("I am %d years old.\n", years_old);
    printf("I am %d inches tall.\n", inches_tall);

    return 0;
}
I get the following error:
$ g++ Makefile.c -o makefile
Makefile.c:1:1: error: 'CFLAGS' does not name a type
CFLAGS=-Wall -g
^g++ ex3.c -o ex3
$
Please don't compile the makefile.
Use the make utility instead.
Synonyms include nmake and gmake.
The makefile should be passed to the make program or build utility.

cuda shared library linking: undefined reference to cudaRegisterLinkedBinary

Goal:
create a shared library containing my CUDA kernels that has a CUDA-free wrapper/header.
create a test executable for the shared library.
Problem
shared library MYLIB.so seems to compile fine. (no problem).
Error in linking:
./libMYLIB.so: undefined reference to __cudaRegisterLinkedBinary_39_tmpxft_000018cf_00000000_6_MYLIB_cpp1_ii_74c599a1
simplified makefile:
libMYlib.so : MYLIB.o
g++ -shared -Wl,-soname,libMYLIB.so -o libMYLIB.so MYLIB.o -L/the/cuda/lib/dir -lcudart
MYLIB.o : MYLIB.cu MYLIB.h
nvcc -m64 -arch=sm_20 -dc -Xcompiler '-fPIC' MYLIB.cu -o MYLIB.o -L/the/cuda/lib/dir -lcudart
test : test.cpp libMYlib.so
g++ test.cpp -o test -L. -ldl -Wl,-rpath,. -lMYLIB -L/the/cuda/lib/dir -lcudart
indeed
nm libMYLIB.so shows that all CUDA api functions are "undefined symbols":
U __cudaRegisterFunction
U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
U cudaEventRecord
U cudaFree
U cudaGetDevice
U cudaGetDeviceProperties
U cudaGetErrorString
U cudaLaunch
U cudaMalloc
U cudaMemcpy
So CUDA somehow did not get linked to the shared library MYLIB.so
What am I missing?
CUDA did not even get linked to the object file somehow:
nm MYLIB.o
U __cudaRegisterFunction
U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
U cudaEventRecord
U cudaFree
U cudaGetDevice
U cudaGetDeviceProperties
U cudaGetErrorString
U cudaLaunch
U cudaMalloc
U cudaMemcpy
(same as above)
Here's an example linux shared object creation along the lines you indicated:
create a shared library containing my CUDA kernels that has a
CUDA-free wrapper/header.
create a test executable for the shared library.
First the shared library. The build commands for this are as follows:
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dc test1.cu test2.cu
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o
g++ -shared -o test.so test1.o test2.o link.o -L/usr/local/cuda/lib64 -lcudart
It seems you may be missing the second step above in your makefile, but I haven't analyzed if there are any other issues with your makefile.
Now, for the test executable, the build commands are as follows:
g++ -c main.cpp
g++ -o testmain main.o test.so
To run it, simply execute the testmain executable, but be sure the test.so library is on your LD_LIBRARY_PATH.
These are the files I used for test purposes:
test1.h:
int my_test_func1();
test1.cu:
#include <stdio.h>
#include "test1.h"
#define DSIZE 1024
#define DVAL 10
#define nTPB 256
// cudaCheckErrors(msg): calls cudaGetLastError(); if the result is not
// cudaSuccess, prints msg, the CUDA error string and the file/line of the
// macro use to stderr, then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the macro usable as one statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Adds DVAL to each of the first DSIZE ints of `data`.
// Layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// global index is >= DSIZE do nothing.
__global__ void my_kernel1(int *data){
  int idx = threadIdx.x + (blockDim.x *blockIdx.x);
  // The original `=+ DVAL` parses as `= (+DVAL)` (plain assignment); the
  // compound add is what was meant. On zero-initialised input, as used by
  // the host test in this file, both forms give DVAL.
  if (idx < DSIZE) data[idx] += DVAL;
}
// Host-side self-test for my_kernel1.
// Allocates DSIZE ints on host and device, zero-fills them, runs the kernel,
// copies the result back and verifies every element equals DVAL.
// Returns 0 on success; any allocation, CUDA or verification failure prints
// a message and terminates the process with exit(1).
int my_test_func1(){
  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  // Ceil-divide so the grid covers DSIZE even when it is not a multiple of nTPB.
  my_kernel1<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Synchronise so that launch and execution errors are visible to the check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers; the original leaked them on every call.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 1 passed!\n");
  return 0;
}
test2.h:
int my_test_func2();
test2.cu:
#include <stdio.h>
#include "test2.h"
#define DSIZE 1024
#define DVAL 20
#define nTPB 256
// cudaCheckErrors(msg): calls cudaGetLastError(); if the result is not
// cudaSuccess, prints msg, the CUDA error string and the file/line of the
// macro use to stderr, then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the macro usable as one statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Adds DVAL to each of the first DSIZE ints of `data`.
// Layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// global index is >= DSIZE do nothing.
__global__ void my_kernel2(int *data){
  int idx = threadIdx.x + (blockDim.x *blockIdx.x);
  // The original `=+ DVAL` parses as `= (+DVAL)` (plain assignment); the
  // compound add is what was meant. On zero-initialised input, as used by
  // the host test in this file, both forms give DVAL.
  if (idx < DSIZE) data[idx] += DVAL;
}
// Host-side self-test for my_kernel2.
// Allocates DSIZE ints on host and device, zero-fills them, runs the kernel,
// copies the result back and verifies every element equals DVAL.
// Returns 0 on success; any allocation, CUDA or verification failure prints
// a message and terminates the process with exit(1).
int my_test_func2(){
  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  // Ceil-divide so the grid covers DSIZE even when it is not a multiple of nTPB.
  my_kernel2<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Synchronise so that launch and execution errors are visible to the check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers; the original leaked them on every call.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 2 passed!\n");
  return 0;
}
main.cpp:
#include <stdio.h>
#include "test1.h"
#include "test2.h"
// Test driver: runs the two library self-checks in order and exits with 0.
// The int results of the checks are not inspected here.
int main()
{
    (void)my_test_func1();
    (void)my_test_func2();

    return 0;
}
When I compile according to the commands given, and run ./testmain I get:
$ ./testmain
Results check 1 passed!
Results check 2 passed!
Note that if you prefer, you may generate a libtest.so instead of test.so, and then you may use a modified build sequence for the test executable:
g++ -c main.cpp
g++ -o testmain main.o -L. -ltest
I don't believe it makes any difference, but it may be more familiar syntax.
I'm sure there is more than one way to accomplish this. This is just an example.
You may wish to also review the relevant section of the nvcc manual and also review the examples.
EDIT: I tested this under cuda 5.5 RC, and the final application link step complained about not finding the cudart lib (warning: libcudart.so.5.5., needed by ./libtest.so, not found). However the following relatively simple modification (example Makefile) should work under either cuda 5.0 or cuda 5.5.
Makefile:
testmain : main.cpp libtest.so
g++ -c main.cpp
g++ -o testmain -L. -ldl -Wl,-rpath,. -ltest -L/usr/local/cuda/lib64 -lcudart main.o
libtest.so : link.o
g++ -shared -Wl,-soname,libtest.so -o libtest.so test1.o test2.o link.o -L/usr/local/cuda/lib64 -lcudart
link.o : test1.cu test2.cu test1.h test2.h
nvcc -m64 -arch=sm_20 -dc -Xcompiler '-fPIC' test1.cu test2.cu
nvcc -m64 -arch=sm_20 -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o
clean :
rm -f testmain test1.o test2.o link.o libtest.so main.o
The other answers did not work for me (maybe because I’m using cuda 10).
The solution that worked for me was compiling the cuda files as:
nvcc -dc -o cuda_file.o cuda_file.cu
Then compiling the C++ file as:
g++ -c -o cpp_file.o cpp_file.cpp
And finally linking all using nvcc:
nvcc -o my_prog cpp_file.o cuda_file.o -lcudart -lcuda -L<other stuff>
Don’t take this code literally. But the core of the solution to the error was using nvcc instead of g++ in the final linking step.

How do you wrap C++ OpenCV code with Boost::Python?

I want to wrap my C++ OpenCV code with boost::python, and to learn how to do it, I tried a toy example, in which
I use the Boost.Numpy project to provide me with boost::numpy::ndarray.
The C++ function to be wrapped, square() takes a boost::numpy::ndarray and modifies it in place by squaring each element in it.
The exported Python module name is called test.
The square() C++ function is exported as the square name in the exported module.
I am not using bjam because IMO it is too complicated and just doesn't work for me no matter what. I'm using good old make.
Now, here's the code:
// test.cpp
#include <boost/python.hpp>
#include <boost/numpy.hpp>
#include <boost/scoped_array.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
namespace py = boost::python;
namespace np = boost::numpy;
// Squares every element of a 2-D int ndarray in place.
//
// The squares are first computed into a temporary CV_32S cv::Mat and then
// copied back into the array, honouring the array's own row/column strides.
//
// Raises (as Python exceptions via throw_error_already_set):
//   TypeError  - the array's dtype is not the builtin int dtype
//   ValueError - the array is not 2-dimensional
void square(np::ndarray& array)
{
    if (array.get_dtype() != np::dtype::get_builtin<int>())
    {
        PyErr_SetString(PyExc_TypeError, "Incorrect array data type.");
        py::throw_error_already_set();
    }
    // shape(1)/strides(1) below are only meaningful for a 2-D array.
    if (array.get_nd() != 2)
    {
        PyErr_SetString(PyExc_ValueError, "Expected a 2-D array.");
        py::throw_error_already_set();
    }

    const int rows = static_cast<int>(array.shape(0));
    const int cols = static_cast<int>(array.shape(1));
    // Strides are reported in bytes and may be negative; keep the division
    // signed so that such a stride is not turned into a huge unsigned step.
    const Py_intptr_t elem_size = static_cast<Py_intptr_t>(sizeof(int));
    const Py_intptr_t stride_row = array.strides(0) / elem_size;
    const Py_intptr_t stride_col = array.strides(1) / elem_size;

    int* const base = reinterpret_cast<int*>(array.get_data());
    cv::Mat mat(rows, cols, CV_32S);

    // Pass 1: array -> mat, squaring on the way.
    int* row_iter = base;
    for (int i = 0; i < rows; i++, row_iter += stride_row)
    {
        int* col_iter = row_iter;
        int* mat_row = mat.ptr<int>(i);
        for (int j = 0; j < cols; j++, col_iter += stride_col)
        {
            mat_row[j] = (*col_iter) * (*col_iter);
        }
    }

    // Pass 2: mat -> array. The row cursor must restart at the first row;
    // the original kept the cursor left over from pass 1 and therefore wrote
    // past the end of the array.
    row_iter = base;
    for (int i = 0; i < rows; i++, row_iter += stride_row)
    {
        int* col_iter = row_iter;
        const int* mat_row = mat.ptr<int>(i);
        for (int j = 0; j < cols; j++, col_iter += stride_col)
        {
            *col_iter = mat_row[j];
        }
    }
}
// Python module `test`: exports the C++ function square() under the name "square".
BOOST_PYTHON_MODULE(test)
{
    boost::python::def("square", square);
}
And here's the Makefile:
PYTHON_VERSION = 2.7
PYTHON_INCLUDE = /usr/include/python$(PYTHON_VERSION)
BOOST_INC = /usr/local/include
BOOST_LIB = /usr/local/lib
OPENCV_LIB = $$(pkg-config --libs opencv)
OPENCV_INC = $$(pkg-config --cflags opencv)
TARGET = test
$(TARGET).so: $(TARGET).o
g++ -shared -Wl,--export-dynamic \
$(TARGET).o -L$(BOOST_LIB) -lboost_python \
$(OPENCV_LIB) \
-L/usr/lib/python$(PYTHON_VERSION)/config -lpython$(PYTHON_VERSION) \
-o $(TARGET).so
$(TARGET).o: $(TARGET).cpp
g++ -I$(PYTHON_INCLUDE) $(OPENCV_INC) -I$(BOOST_INC) -fPIC -c $(TARGET).cpp
With this scheme, I can type make and test.so gets created. But when I try to import it,
In [1]: import test
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-1-73ae3ffe1045> in <module>()
----> 1 import test
ImportError: ./test.so: undefined symbol: _ZN5boost6python9converter21object_manager_traitsINS_5numpy7ndarrayEE10get_pytypeEv
In [2]:
This is a linker error which I can't seem to fix. Can anyone please help me with what's going on? Do you have (links to) code that already does integrate OpenCV, numpy and Boost.Python without things like Py++ or the likes?.
Okay I fixed this. It was a simple issue, but a sleepy brain and servings of bjam had made me ignore it. In the Makefile, I'd forgotten to put -lboost_numpy that links the Boost.Numpy libs to my lib. So, the modified Makefile looks like this:
PYTHON_VERSION = 2.7
PYTHON_INCLUDE = /usr/include/python$(PYTHON_VERSION)
BOOST_INC = /usr/local/include
BOOST_LIB = /usr/local/lib
OPENCV_LIB = $$(pkg-config --libs opencv)
OPENCV_INC = $$(pkg-config --cflags opencv)
TARGET = test
$(TARGET).so: $(TARGET).o
g++ -shared -Wl,--export-dynamic \
$(TARGET).o -L$(BOOST_LIB) -lboost_python -lboost_numpy \
$(OPENCV_LIB) \
-L/usr/lib/python$(PYTHON_VERSION)/config -lpython$(PYTHON_VERSION) \
-o $(TARGET).so
$(TARGET).o: $(TARGET).cpp
g++ -I$(PYTHON_INCLUDE) $(OPENCV_INC) -I$(BOOST_INC) -fPIC -c $(TARGET).cpp