Compilation error in CUDA kernel calling/passing parameters - C++

In the actual code, my intention is to get an output array by comparing an input array to a scalar; simply put, output = input > scalar.
The simple sample host-side code shown below works as expected.
float *h_data1 = (float *)malloc(W1*H1 * sizeof(float));
bool *h_result = (bool *)malloc(H1*W2 * sizeof(bool));
float *d_data1; gpuErrchk(cudaMalloc(&d_data1, W1*H1 * sizeof(float)));
bool *d_result; gpuErrchk(cudaMalloc(&d_result, H1*W2 * sizeof(bool)));
for (int i = 0; i < W1*H1; i++) h_data1[i] = (float)i;
gpuErrchk(cudaMemcpy(d_data1, h_data1, W1*H1 * sizeof(float), cudaMemcpyHostToDevice));
float scalar = 2;
compGraterRetOut<float, bool><<<outw, outh>>>(d_data1, d_result, scalar);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
The device-side code is:
template<typename TType, typename TTypeOut>
__global__ void compGraterRetOut(TType *dataIn, TTypeOut *dataOut, const TType scalar)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
dataOut[i] = (dataIn[i] > scalar);
}
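As an aside, this sample kernel assumes the launch configuration exactly covers the array. A guarded variant (a sketch, with the element count n added as a hypothetical extra parameter) would be:
template<typename TType, typename TTypeOut>
__global__ void compGraterRetOut(TType *dataIn, TTypeOut *dataOut, const TType scalar, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)  // ignore threads that fall past the end of the array
        dataOut[i] = (dataIn[i] > scalar);
}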
Coming to the actual code, I have an image class as shown below (only part of the class is shown).
template<typename TType, ImageType TImageType>
class Image
{
public:
Image(uint32_t width, uint32_t height, uint32_t depth = 1);
private:
TType* m_data;
uint32_t m_width;
uint32_t m_height;
uint32_t m_depth;
uint32_t m_bufferSize;
};
template<typename TType, ImageType TImageType>
Image<TType, TImageType>::Image(uint32_t width, uint32_t height, uint32_t depth) :m_width(width), \
m_height(height), m_depth(depth)
{
if (width == 0 || height == 0)
return;
cudaError_t cudaStatus;
//m_data = new TType[m_width * m_height * m_depth];
gpuErrchk(cudaStatus = cudaMalloc(&m_data, sizeof(TType) * m_width * m_height * m_depth));
if (cudaStatus == cudaSuccess)
{
m_bufferSize = m_width * m_height * m_depth;
}
else
{
std::cout << "Error malloc function failed [" << cudaStatus << "]" << std::endl;
}
};
To achieve the objective out = in > scalar, operator> is overloaded as shown below. This threw a compilation error:
"member "Image::m_data [with TType=float_t,
TImageType=ImageType::WHD]""
The code looks as shown below.
template<typename TType, ImageType TImageType>
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
Image<uint32_t, TImageType> ret(m_width, m_height, m_depth);
compGraterRetOut<TType, uint32_t><<<m_width * 4, (m_height * m_depth / 4)>>>(m_data, ret.m_data, scalar);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return std::move(ret);
}
To fix the compilation error I changed operator>. Here, CUDA memory is allocated inside the function instead of inside the class's constructor.
template<class TType, ImageType TImageType>
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
cudaError_t cudaStatus;
uint32_t *dataout;
gpuErrchk(cudaMalloc(&dataout, m_width*m_height*m_depth * sizeof(uint32_t)));
Image<uint32_t, TImageType> ret(dataout, m_width, m_height, m_depth);
compGraterRetOut<TType, uint32_t><<<m_width * 4, (m_height * m_depth / 4)>>>(m_data, dataout, scalar);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return std::move(ret);
}
Finally, my question is: why did the last version compile without an error, but not the one before it?

The problem has nothing to do with CUDA. It is a matter of templates and access control. When a class template accesses a private member of its own type, that is fine; but each instantiation of a class template is a distinct class, so accessing a private member of the same template with different template arguments violates encapsulation. Inside Image<TType, TImageType>::operator>, ret is an Image<uint32_t, TImageType>, a different type, so touching ret.m_data is an access violation. The second version compiles because it only uses that type's public constructor. That is the answer.
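One common way around this, if the first form of operator> is preferred, is to make all instantiations of the template friends of each other (a sketch, not from the original post):
template<typename TType, ImageType TImageType>
class Image
{
    // grant every Image<U, I> instantiation access to this one's private members
    template<typename U, ImageType I> friend class Image;
    // ... rest of the class as before ...
};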


Sphere mesh in modern OpenGL

I'm trying to create a triangle mesh of a sphere surface and draw it with OpenGL 4.1.
This is the code that I'm currently using, obtained from the second answer of that question. The vertex layout is [x, y, z, r, g, b, a], which is why there are 7 floats for each vertex:
std::vector<float> vertices;
std::vector<unsigned int> indices;
const float dLambda = 2 * glm::pi<float>() / meridianNumber;
const float dPhi = glm::pi<float>() / parallelNumber;
unsigned int lastVertex = 0;
for (int i = 0; i < parallelNumber; ++i) {
for (int j = 0; j < meridianNumber; ++j) {
std::cout << "lot: " << glm::degrees(j * dLambda);
std::cout << "\tlat: " << glm::degrees(i * dPhi);
std::cout << std::endl;
float lambda1 = j * dLambda;
float phi1 = i * dPhi;
float lambda2 = j+1 == parallelNumber ? 2 * glm::pi<float>()
: (j+1) * dLambda;
float phi2 = i+1 == meridianNumber ? glm::pi<float>()
: (i+1) * dPhi;
// vertex 1
vertices.emplace_back(cosf(lambda1) * sinf(phi1) * radius);
vertices.emplace_back(cosf(phi1) * radius);
vertices.emplace_back(sinf(lambda1) * sinf(phi1) * radius);
vertices.emplace_back(0.5f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
// vertex 2
vertices.emplace_back(cosf(lambda1) * sinf(phi2) * radius);
vertices.emplace_back(cosf(phi2) * radius);
vertices.emplace_back(sinf(lambda1) * sinf(phi2) * radius);
vertices.emplace_back(0.5f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
// vertex 3
vertices.emplace_back(cosf(lambda2) * sinf(phi1) * radius);
vertices.emplace_back(cosf(phi1) * radius);
vertices.emplace_back(sinf(lambda2) * sinf(phi1) * radius);
vertices.emplace_back(0.5f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
// vertex 4
vertices.emplace_back(cosf(lambda2) * sinf(phi2) * radius);
vertices.emplace_back(cosf(phi2) * radius);
vertices.emplace_back(sinf(lambda2) * sinf(phi2) * radius);
vertices.emplace_back(0.5f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
vertices.emplace_back(1.0f);
indices.emplace_back(lastVertex);
indices.emplace_back(lastVertex+1);
indices.emplace_back(lastVertex+2);
indices.emplace_back(lastVertex+1);
indices.emplace_back(lastVertex+3);
indices.emplace_back(lastVertex+2);
lastVertex += 4;
}
}
But I am doing something wrong, because the rendered result comes out broken (screenshot omitted). The code that I'm using to draw is:
GLCall(glDrawElements(
GL_TRIANGLES,
indicesNumber,
GL_UNSIGNED_INT,
(const void*) 0
));
EDIT 1:
The VAO settings are pretty complicated because I wrote a little layer of abstraction over OpenGL...
I have a class called VertexBuffer that creates, keeps alive, and destroys an OpenGL array buffer.
Another class, IndexBuffer, very similar to the previous one, manages the element array buffer.
These two classes are very simple to use: they can be constructed, bound, unbound, and destroyed, nothing more.
There is a third class that represents the layout of a single vertex in an OpenGL vertex buffer; this class, called VertexLayout, contains all the data necessary to call glVertexAttribPointer.
hpp:
class VertexLayout {
private:
struct Element {
unsigned int type;
unsigned int count;
unsigned char normalized;
size_t typeSize;
Element(
unsigned int type, unsigned int count, unsigned char normalized,
size_t typeSize
);
};
std::vector<Element> elements;
unsigned int stride;
public:
VertexLayout();
template<typename T>
VertexLayout &push(unsigned int count, unsigned char normalized = GL_FALSE){
std::fputs(
"this function has to be implemented for desired type",
stderr
);
assert(false);
return *this;
}
const std::vector<Element> &getElements() const;
unsigned int getStride() const;
};
cpp:
template<>
VertexLayout &VertexLayout::push<unsigned int>(
unsigned int count, unsigned char normalized
) {
elements.emplace_back(
GL_UNSIGNED_INT, count, normalized, sizeof(unsigned int)
);
stride += count * sizeof(unsigned int);
return *this;
};
template<>
VertexLayout &VertexLayout::push<unsigned char>(
unsigned int count, unsigned char normalized
) {
elements.emplace_back(
GL_UNSIGNED_BYTE, count, normalized, sizeof(unsigned char)
);
stride += count * sizeof(unsigned char);
return *this;
};
template<>
VertexLayout &VertexLayout::push<float>(unsigned int count, unsigned char normalized){
elements.emplace_back(GL_FLOAT, count, normalized, sizeof(float));
stride += count * sizeof(float);
return *this;
}
VertexLayout::Element::Element(
unsigned int type, unsigned int count,
unsigned char normalized, size_t typeSize
) : type(type), count(count), normalized(normalized), typeSize(typeSize) {}
const std::vector<VertexLayout::Element> &VertexLayout::getElements() const {
return elements;
}
unsigned int VertexLayout::getStride() const {
return stride;
}
VertexLayout::VertexLayout() : stride(0) {}
So an instance of VertexLayout should be created for each VertexBuffer object, and for each OpenGL attribute a push<type>(numberOfElementsOfThatType) call should be made, as sketched below.
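For the sphere's 7-float vertex above, the calls would presumably look like this (a hypothetical usage sketch):
VertexLayout layout;
layout.push<float>(3);  // position: x, y, z
layout.push<float>(4);  // color: r, g, b, a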
The fourth and last class is the VertexArray class that represents a VAO: this last class keeps track of all the VertexBuffer and IndexBuffer objects that are connected to the VAO, and sets the layout by calling glVertexAttribPointer when a VertexBuffer is added using the following method:
void VertexArray::addBuffer(
const VertexBuffer &buffer, const VertexLayout &layout
) {
GLCall(glBindVertexArray(id));
buffer.bind();
const auto &elements = layout.getElements();
size_t offset = 0;
for (unsigned int i = 0; i < elements.size(); ++i) {
const auto &element = elements[i];
GLCall(glEnableVertexAttribArray(i));
GLCall(glVertexAttribPointer(
i, element.count, element.type, element.normalized,
layout.getStride(), (const void *)offset
));
offset += element.count * element.typeSize;
}
vertexBuffers.emplace_back(buffer);
}
GLCall is a macro that does nothing in release builds, while in debug builds it clears the OpenGL error queue and prints any new errors.
EDIT 2:
This is the class VertexBuffer that represents one VBO:
hpp
class VertexBuffer {
private: // static
static std::map<unsigned int, unsigned int> references;
private: // member
unsigned int rendererID;
public:
VertexBuffer();
VertexBuffer(
const void *data, unsigned long size,
unsigned int usage = GL_STATIC_DRAW
);
VertexBuffer(const VertexBuffer &oth);
VertexBuffer &operator=(const VertexBuffer &rhs);
~VertexBuffer();
void bind() const;
void unbind() const;
};
cpp:
std::map<unsigned int, unsigned int> VertexBuffer::references;
VertexBuffer::VertexBuffer(
const void *data,
unsigned long size,
unsigned int usage
) {
GLCall(glGenBuffers(1, &rendererID));
GLCall(glBindBuffer(GL_ARRAY_BUFFER, rendererID));
GLCall(glBufferData(GL_ARRAY_BUFFER, size, data, usage));
references.insert_or_assign(rendererID, 1);
}
VertexBuffer::VertexBuffer(const VertexBuffer &oth) {
if (oth.rendererID != 0){
auto ref = references.find(oth.rendererID);
assert(ref != references.end());
ref->second++;
}
rendererID = oth.rendererID;
}
VertexBuffer &VertexBuffer::operator=(const VertexBuffer &rhs) {
if (rendererID != 0) {
auto refs = references.find(rendererID);
assert(refs != references.end());
if (--refs->second == 0) {
GLCall(glDeleteBuffers(1, &rendererID));
references.erase(refs);
}
}
if (rhs.rendererID != 0){
auto ref = references.find(rhs.rendererID);
assert(ref != references.end());
ref->second++;
}
rendererID = rhs.rendererID;
return *this;
}
VertexBuffer::VertexBuffer() : rendererID(0) {}
VertexBuffer::~VertexBuffer() {
if (rendererID != 0) {
auto ref = references.find(rendererID);
assert(ref != references.end());
if (--ref->second == 0) {
GLCall(glDeleteBuffers(1, &rendererID));
references.erase(ref);
}
}
}
void VertexBuffer::bind() const {
GLCall(glBindBuffer(GL_ARRAY_BUFFER, rendererID));
}
void VertexBuffer::unbind() const {
GLCall(glBindBuffer(GL_ARRAY_BUFFER, 0));
}
In the sphere I have only one big buffer that contains both positions and colors.
I found the solution. It was a very stupid error: the constructor of the VertexBuffer class needs the size of the buffer in bytes, but when I called it I passed only the size of the std::vector, that is, the number of elements.
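In other words, the fix is to pass the byte count to the constructor (a sketch; vertices is the std::vector<float> built above):
// wrong: the element count, so only a fraction of the data is uploaded
// VertexBuffer vbo(vertices.data(), vertices.size());
// right: glBufferData expects the size in bytes
VertexBuffer vbo(vertices.data(), vertices.size() * sizeof(float));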

Cuda C++ design: reusable class with unknown compile-time size

I am looking for a convenient design in order to be able to use a class on the device which has unknown compile-time size.
Only one instance of this class needs to be sent to the device, for which there should be a single call to cudaMalloc and cudaMemcpy (ideally).
The host version of the class would look like this:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
}
double get(int i) const {
// return some processed element from table
}
private:
std::vector<int> table;
};
The kernel:
__global__ void kernel(const A *a){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
a->get(idx); // do something useful with it
}
So far, the way I would design the device version of the class is like that:
const int sizeMax = 1000;
class A {
public:
A(int size) {
// size checking + some useful initialization of table
}
__host__ __device__
double get(int i) const {
//
}
private:
int table[sizeMax];
};
And the client code:
A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);
This is rather ugly because:
- it wastes bandwidth by having to use too large a sizeMax in order to be on the safe side
- the class is not closed for modification; the value of sizeMax will inevitably need to be raised at some point
Is there any other way to achieve the same thing in a cleaner way without a negative performance impact? To be clear, I only need the device version of the class; the first version is just the equivalent non-CUDA code to illustrate the fact that the table size should be dynamic.
In my comment, I said:
1. separate host and device storage for table, contained in the class, both of which are allocated dynamically.
2. dynamic allocation of table storage size in the constructor, rather than in your client code. This could also include resizing if necessary.
3. differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) of the data, depending on whether the method is being executed in host or device code.
4. A method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa.
Here's an example of what I had in mind:
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <iostream>
template <typename T>
class gpuvec{
private:
T *h_vec = NULL;
T *d_vec = NULL;
size_t vsize = 0;
bool iscopy;
public:
__host__ __device__
T * data(){
#ifndef __CUDA_ARCH__
return h_vec;
#else
return d_vec;
#endif
}
__host__ __device__
T& operator[](size_t i) {
assert(i < vsize);
return data()[i];}
void to_device(){
assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
void to_host(){
assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
gpuvec(gpuvec &o){
h_vec = o.h_vec;
d_vec = o.d_vec;
vsize = o.vsize;
iscopy = true;}
void copy(gpuvec &o){
free();
iscopy = false;
vsize = o.vsize;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
memcpy(h_vec, o.h_vec, vsize*sizeof(T));
assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
gpuvec(size_t ds) {
assert(ds > 0);
iscopy = false;
vsize = ds;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
gpuvec(){
iscopy = false;
}
~gpuvec(){
if (!iscopy) free();}
void free(){
if (d_vec != NULL) cudaFree(d_vec);
d_vec = NULL;
if (h_vec != NULL) ::free(h_vec);
h_vec = NULL;}
__host__ __device__
size_t size() {
return vsize;}
};
template <typename T>
__global__ void test(gpuvec<T> d){
for (int i = 0; i < d.size(); i++){
d[i] += 1;
}
}
int main(){
size_t ds = 10;
gpuvec<int> A(ds);
A.to_device();
test<<<1,1>>>(A);
A.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << A[i];
std::cout << std::endl;
gpuvec<int> B;
B.copy(A);
A.free();
B.to_device();
test<<<1,1>>>(B);
B.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << B[i];
std::cout << std::endl;
B.free();
}
I'm sure quite a few criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore I'm sure there are use cases it does not cover, and it may contain outright defects. To create a robust host/device vector realization may require as much work and complexity as thrust host and device vectors. I'm not suggesting that thrust vectors are a drop-in answer for what the question seems to be asking, however.
Based on Robert Crovella's answer, here is a simplified (device only, so ignoring points 3 & 4) working solution:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
cudaMalloc((void**)&dTable, sizeof(int) * size);
cudaMemcpy(dTable, &table[0], sizeof(int) * size, cudaMemcpyHostToDevice);
}
~A() {
cudaFree(dTable);
}
__device__
double get(int i) const {
// return some processed element of dTable
}
private:
std::vector<int> table;
int *dTable;
};
Kernel and client code stay exactly the same.
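One caveat worth noting about this simplified class (an aside, not from the original answer): the destructor calls cudaFree, but the compiler-generated copy operations are still available, so copying an A would double-free dTable. A minimal guard is to delete them:
class A {
public:
    A(const A&) = delete;             // copying would double-free dTable
    A& operator=(const A&) = delete;  // via the destructor, so forbid it
    // ... rest of the class as above ...
};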

How to implement custom loss function correctly in caffe?

I am a beginner at caffe and I am implementing a custom loss function, but an error occurs at runtest.
My loss function is similar to the Euclidean loss. The original Euclidean loss equation is
E = \frac{1}{2N} \sum_{n=1}^{N} \| \hat{y}_n - y_n \|_2^2
I would like to implement a 2D distance loss, so I made the equation below:
E = \frac{1}{N} \sum_{n=1}^{N} \left[ (\hat{x}_n - x_n)^2 + (\hat{y}_n - y_n)^2 \right]
Then the runtest fails at the backward function. I think the way I wrote the back propagation is wrong, but I am not sure exactly what. I simply modified the Euclidean loss to fit my loss function and wrote the gradient of my loss function in the back propagation. Do you know why the error occurs?
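For reference, differentiating the 2D distance loss above with respect to the first bottom blob gives, per point n:
\frac{\partial E}{\partial \hat{x}_n} = \frac{2(\hat{x}_n - x_n)}{N},
\qquad
\frac{\partial E}{\partial \hat{y}_n} = \frac{2(\hat{y}_n - y_n)}{N}
Note that each component's partial derivative involves only that component; this is a useful check against the alpha_0 and alpha_1 terms in the backward code below.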
imgdist_loss_layer.cpp
#include <vector>
#include "caffe/layers/imgdist_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void ImgdistLossLayer<Dtype>::Reshape(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
LossLayer<Dtype>::Reshape(bottom, top);
CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
<< "Inputs must have the same dimension.";
diff_.ReshapeLike(*bottom[0]);
}
// forward propagation
// calculate loss
template <typename Dtype>
void ImgdistLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
int count = bottom[0]->count() / 2;
Dtype loss = 0;
for (int i = 0; i < count; ++i) {
Dtype x_sub = bottom[0]->cpu_data()[2 * i] - bottom[1]->cpu_data()[2 * i];
Dtype y_sub = bottom[0]->cpu_data()[2 * i + 1] - bottom[1]->cpu_data()[2 * i + 1];
loss += x_sub*x_sub + y_sub*y_sub;
}
loss = loss / bottom[0]->num();
top[0]->mutable_cpu_data()[0] = loss;
}
// back propagation
// calculate gradient
template <typename Dtype>
void ImgdistLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
for (int i = 0; i < 2; ++i) {
if (propagate_down[i]) {
const Dtype* bottom_data_0 = bottom[0]->cpu_data();
const Dtype* bottom_data_1 = bottom[1]->cpu_data();
Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
const int count = bottom[0]->count() / 2;
for (int j = 0; j < count; ++j) {
const Dtype x_sub = bottom_data_0[2 * j] - bottom_data_1[2 * j];
const Dtype y_sub = bottom_data_0[2 * j + 1] - bottom_data_1[2 * j + 1];
const Dtype sign = (i == 0) ? 1 : -1;
const Dtype alpha_0 = (sign * Dtype(2) * x_sub + y_sub * y_sub) / bottom[i]->num();
const Dtype alpha_1 = (x_sub * x_sub + sign * Dtype(2) * y_sub) / bottom[i]->num();
bottom_diff[2 * j] = top[0]->cpu_diff()[0] * alpha_0;
bottom_diff[2 * j + 1] = top[0]->cpu_diff()[0] * alpha_1;
} // j
}
} // i
}
#ifdef CPU_ONLY
STUB_GPU(ImgdistLossLayer);
#endif
INSTANTIATE_CLASS(ImgdistLossLayer);
REGISTER_LAYER_CLASS(ImgdistLoss);
} // namespace caffe
imgdist_loss_layer.cu
#include <vector>
#include "caffe/layers/imgdist_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
// forward propagation loop
template <typename Dtype>
__global__ void imgdistLossForwardGPU(const int nthreads,
const Dtype* input_data, const Dtype* target, Dtype* loss) {
CUDA_KERNEL_LOOP(i, nthreads) {
loss[i] = (input_data[2 * i] - target[2 * i]) * (input_data[2 * i] - target[2 * i])
+ (input_data[2 * i + 1] - target[2 * i + 1]) * (input_data[2 * i + 1] - target[2 * i + 1]);
}
}
// forward propagation
template <typename Dtype>
void ImgdistLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int count = bottom[0]->count() / 2;
const Dtype* input_data = bottom[0]->gpu_data();
const Dtype* target = bottom[1]->gpu_data();
Dtype* loss_data = bottom[0]->mutable_gpu_diff();
imgdistLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, input_data, target, loss_data);
CUDA_POST_KERNEL_CHECK;
Dtype loss;
caffe_gpu_asum(count, loss_data, &loss);
loss = loss / bottom[0]->num();
top[0]->mutable_cpu_data()[0] = loss;
}
// back propagation loop
template <typename Dtype>
__global__ void imgdistLossBackwardGPU(const int nthreads,
const Dtype* input_data, const Dtype* target, Dtype* diff,
const Dtype sign, const Dtype toploss, const Dtype bottom_num) {
CUDA_KERNEL_LOOP(i, nthreads) {
const Dtype x_sub = input_data[2 * i] - target[2 * i];
const Dtype y_sub = input_data[2 * i + 1] - target[2 * i + 1];
const Dtype alpha_0 = (sign * Dtype(2) * x_sub + y_sub * y_sub) / bottom_num;
const Dtype alpha_1 = (x_sub * x_sub + sign * Dtype(2) * y_sub) / bottom_num;
diff[2 * i] = toploss * alpha_0;
diff[2 * i + 1] = toploss * alpha_1;
}
}
// back propagation
template <typename Dtype>
void ImgdistLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
for (int i = 0; i < 2; ++i) {
if (propagate_down[i]) {
const Dtype sign = (i == 0) ? 1 : -1;
const int count = bottom[0]->count() / 2;
const Dtype* input_data = bottom[0]->gpu_data();
const Dtype* target = bottom[1]->gpu_data();
const Dtype toploss = top[0]->cpu_diff()[0];
const Dtype bottom_num = bottom[i]->num();
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
imgdistLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, input_data, target, bottom_diff, sign, toploss, bottom_num);
CUDA_POST_KERNEL_CHECK;
}
}
}
INSTANTIATE_LAYER_GPU_FUNCS(ImgdistLossLayer);
} // namespace caffe
imgdist_loss_layer.hpp (only the class name is changed)
#ifndef CAFFE_IMGDIST_LOSS_LAYER_HPP_
#define CAFFE_IMGDIST_LOSS_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/layers/loss_layer.hpp"
namespace caffe {
template <typename Dtype>
class ImgdistLossLayer : public LossLayer<Dtype> {
public:
explicit ImgdistLossLayer(const LayerParameter& param)
: LossLayer<Dtype>(param), diff_() {}
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const { return "ImgdistLoss"; }
virtual inline bool AllowForceBackward(const int bottom_index) const {
return true;
}
protected:
/// #copydoc EuclideanLossLayer
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
Blob<Dtype> diff_;
};
} // namespace caffe
#endif  // CAFFE_IMGDIST_LOSS_LAYER_HPP_
test_imgdist_loss_layer.cpp
#include <cmath>
#include <vector>
#include "gtest/gtest.h"
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layers/imgdist_loss_layer.hpp"
#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"
namespace caffe {
template<typename TypeParam>
class ImgdistLossLayerTest : public MultiDeviceTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
protected:
ImgdistLossLayerTest()
: blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
blob_bottom_label_(new Blob<Dtype>(10, 5, 1, 1)),
blob_top_loss_(new Blob<Dtype>()) {
// fill the values
FillerParameter filler_param;
GaussianFiller<Dtype> filler(filler_param);
filler.Fill(this->blob_bottom_data_);
blob_bottom_vec_.push_back(blob_bottom_data_);
filler.Fill(this->blob_bottom_label_);
blob_bottom_vec_.push_back(blob_bottom_label_);
blob_top_vec_.push_back(blob_top_loss_);
}
virtual ~ImgdistLossLayerTest() {
delete blob_bottom_data_;
delete blob_bottom_label_;
delete blob_top_loss_;
}
void TestForward() {
// Get the loss without a specified objective weight -- should be
// equivalent to explicitly specifying a weight of 1.
LayerParameter layer_param;
ImgdistLossLayer<Dtype> layer_weight_1(layer_param);
layer_weight_1.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
const Dtype loss_weight_1 =
layer_weight_1.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
// Get the loss again with a different objective weight; check that it is
// scaled appropriately.
const Dtype kLossWeight = 3.7;
layer_param.add_loss_weight(kLossWeight);
ImgdistLossLayer<Dtype> layer_weight_2(layer_param);
layer_weight_2.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
const Dtype loss_weight_2 =
layer_weight_2.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
const Dtype kErrorMargin = 1e-5;
EXPECT_NEAR(loss_weight_1 * kLossWeight, loss_weight_2, kErrorMargin);
// Make sure the loss is non-trivial.
const Dtype kNonTrivialAbsThresh = 1e-1;
EXPECT_GE(fabs(loss_weight_1), kNonTrivialAbsThresh);
}
Blob<Dtype>* const blob_bottom_data_;
Blob<Dtype>* const blob_bottom_label_;
Blob<Dtype>* const blob_top_loss_;
vector<Blob<Dtype>*> blob_bottom_vec_;
vector<Blob<Dtype>*> blob_top_vec_;
};
TYPED_TEST_CASE(ImgdistLossLayerTest, TestDtypesAndDevices);
TYPED_TEST(ImgdistLossLayerTest, TestForward) {
this->TestForward();
}
TYPED_TEST(ImgdistLossLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
const Dtype kLossWeight = 3.7;
layer_param.add_loss_weight(kLossWeight);
ImgdistLossLayer<Dtype> layer(layer_param);
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_);
}
}
The error log is below.
C:\Projects\caffe\include\caffe/test/test_gradient_check_util.hpp(175): error: The difference between computed_gradient and estimated_gradient is 1.5981258813447825, which exceeds threshold_ * scale, where
computed_gradient evaluates to 2.755687472811343,
estimated_gradient evaluates to 1.1575615914665605, and
threshold_ * scale evaluates to 0.027556874728113429.
debug: (top_id, top_data_id, blob_id, feat_id)=0,0,1,49; feat = 1.5097962694948988; objective+ = 20.508002455868997; objective- = 20.484851224039666
[ FAILED ] ImgdistLossLayerTest/3.TestGradient, where TypeParam = struct caffe::GPUDevice<double> (204 ms)
[----------] 2 tests from ImgdistLossLayerTest/3 (222 ms total)
[----------] Global test environment tear-down
[==========] 8 tests from 4 test cases ran. (878 ms total)
[ PASSED ] 4 tests.
[ FAILED ] 4 tests, listed below:
[ FAILED ] ImgdistLossLayerTest/0.TestGradient, where TypeParam = struct caffe::CPUDevice<float>
[ FAILED ] ImgdistLossLayerTest/1.TestGradient, where TypeParam = struct caffe::CPUDevice<double>
[ FAILED ] ImgdistLossLayerTest/2.TestGradient, where TypeParam = struct caffe::GPUDevice<float>
[ FAILED ] ImgdistLossLayerTest/3.TestGradient, where TypeParam = struct caffe::GPUDevice<double>
4 FAILED TESTS

define custom scan operator

I'm trying to create my own scan operator in cub. It is working now, but only for array sizes smaller than 1024, which makes me think that it only works within a single block. Here is my code:
#include "cub/cub.cuh"
using namespace cub;
typedef int mytype;
struct CustomMin
{
template <typename T>
__host__ __device__
CUB_RUNTIME_FUNCTION __forceinline__
mytype operator()(const T &a, const T &b) const {
return (b < a) ? b : a;
}
};
int main(int argc, char *argv[])
{
int num_items = 512;
mytype *h_in;
mytype *h_out;
CustomMin min_op;
const size_t size = num_items * sizeof(mytype);
h_in = (mytype*)malloc(size);
h_out = (mytype*)malloc(size);
mytype *d_in = NULL;
cudaMalloc(&d_in, size);
mytype *d_out = NULL;
cudaMalloc(&d_out, size);
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
}
cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
printf("done!\n");
return 0;
}
It always hangs for larger input sizes.
With CUB 1.4.1 I was able to reproduce the hang when compiling like this:
nvcc -arch=sm_35 -o t25 t25.cu
after changing num_items in the posted code to 2048.
According to my testing, the issue appears to be fixed in cub 1.5.1. Please update to the latest CUB version.
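As a side note, a quick host-side check of the result can catch silent failures. With the posted initialization (h_in[i] = i, a non-decreasing sequence), the inclusive min-scan should return the first element everywhere; a sketch:
// verify the inclusive min-scan on the host:
// the running minimum of a non-decreasing sequence is always h_in[0]
bool ok = true;
for (int i = 0; i < num_items; i++) {
    if (h_out[i] != h_in[0]) { ok = false; break; }
}
printf(ok ? "scan correct!\n" : "scan mismatch!\n");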

using a pointer to vector<T>::data() for cublasSgemm

I am trying to use the vector::data() pointer with cudaMalloc, cudaMemcpy, and cublasSgemm, but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector, so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.
Here is the code I am working on:
template<typename T>
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A, Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return C;
}
The GetPointer() member function returns vector::data() of that Matrix object's vector of elements, and size is the total size of those elements in bytes.
The vector of Matrix C comes back all zeros when using the data() pointer, but contains the product of Matrix A and B when using plain T* arrays without vectors.
Is it actually possible to use vectors to store the array of elements and then the data() pointer to initialize the device copy of the array, or am I forced to use C-style array storage on the host? Also, I have tried using thrust::device_vector and that works, but I would like to stay away from raw_pointer_casts.
Thanks for your help!
Edit:
For those having trouble with copy and pasting, here is the complete example:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>
using namespace std;
template<typename T> class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;
T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector<T> value);
vector<T>& GetElements();
T* GetPointer();
Matrix<T> cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C);
private:
vector<T> elements;
T* firstElement;
};
template<typename T>
Matrix<T>::~Matrix()
{
}
template<typename T>
Matrix<T>::Matrix()
{
}
template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}
template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}
template<typename T>
vector<T>& Matrix<T>::GetElements()
{
return elements; //row major order return
}
template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}
template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
elements = value;
}
template<typename T>
T* Matrix<T>::GetPointer()
{
return firstElement;
}
template<typename T>
//Matrix Multiplication using CUDA
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
//Thrust usage
/*thrust::device_vector<T> d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector<T> d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector<T> d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return C;
}
int main()
{
Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C;
vector<float> aE(4,2);
vector<float> bE(4,4);
A.SetElements(aE);
B.SetElements(bE);
C = C.cudaProd(A, B, C); //function call to cudaProd()
for(int row = 0; row < A.height; ++row)
{
for(int col = 0; col < A.width; ++col)
{
cout<<A.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < B.height; ++row)
{
for(int col = 0; col < B.width; ++col)
{
cout<<B.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < C.height; ++row)
{
for(int col = 0; col < C.width; ++col)
{
cout<<C.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
}
If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory.
The std::vector class is an owning resource class. This means that trying to manage the underlying resource yourself through the data pointer will make you enter a world of pain.
For this very same reason:
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
and:
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
and:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cannot possibly work.
From the std::vector::data documentation, data() returns either a const-qualified or a non-const-qualified pointer, depending on whether the vector itself is const-qualified. Quoting the documentation:
If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.
Accordingly, using
firstElement = elements.data();
in the Matrix constructor is fine to read/write the data.
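As a minimal, self-contained sanity check (a sketch independent of the Matrix class), vector::data() does work directly with cudaMalloc/cudaMemcpy:
#include <cuda_runtime.h>
#include <cassert>
#include <vector>
int main() {
    std::vector<float> h(16, 3.0f);           // host data owned by the vector
    float *d = nullptr;
    cudaMalloc(&d, h.size() * sizeof(float));
    // host -> device straight from the vector's storage
    cudaMemcpy(d, h.data(), h.size() * sizeof(float), cudaMemcpyHostToDevice);
    std::vector<float> back(h.size(), 0.0f);
    // device -> host straight into another vector's storage
    cudaMemcpy(back.data(), d, h.size() * sizeof(float), cudaMemcpyDeviceToHost);
    assert(back == h);                        // round trip succeeded
    cudaFree(d);
    return 0;
}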
The main problem with your code is that you are declaring C in main, passing a reference to C to the cudaProd method, and then internally using
C = Matrix<T>(A.height, B.width);
which reassigns the Matrix from inside the method.
If you change the definition of the cudaProd method to
template<typename T>
void cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
remove the
return C;
statement, and allocate space for C in main as
Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);
your code should work correctly.