define custom scan operator - C++

I'm trying to create my own scan operator in CUB. It works now, but only for array sizes smaller than 1024, which makes me think that it only works within a single block. Here is my code:
#include "cub/cub.cuh"
using namespace cub;
typedef int mytype;
struct CustomMin
{
template <typename T>
__host__ __device__
CUB_RUNTIME_FUNCTION __forceinline__
mytype operator()(const T &a, const T &b) const {
return (b < a) ? b : a;
}
};
int main(int argc, char *argv[])
{
int num_items = 512;
mytype *h_in;
mytype *h_out;
CustomMin min_op;
const size_t size = num_items * sizeof(mytype);
h_in = (mytype*)malloc(size);
h_out = (mytype*)malloc(size);
mytype *d_in = NULL;
cudaMalloc(&d_in, size);
mytype *d_out = NULL;
cudaMalloc(&d_out, size);
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
}
cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
printf("done!\n");
return 0;
}
It always hangs for larger input sizes.

With CUB 1.4.1 I was able to reproduce the hang when compiling like this:
nvcc -arch=sm_35 -o t25 t25.cu
after changing num_items in the posted code to 2048.
According to my testing, the issue appears to be fixed in CUB 1.5.1. Please update to the latest CUB version.
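As a side note, every CUB entry point and CUDA runtime call in the posted code returns a cudaError_t, and checking those return values turns a silent hang or corruption into a printable message. A minimal sketch of that pattern (checkCuda is a hypothetical helper of mine, not part of CUB):

#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper: print the error string and abort if a CUDA call failed.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Usage around the two-phase scan:
//   checkCuda(DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
//                                       d_in, d_out, min_op, num_items), "sizing call");
//   checkCuda(cudaMalloc(&d_temp_storage, temp_storage_bytes), "cudaMalloc");
//   checkCuda(DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
//                                       d_in, d_out, min_op, num_items), "scan call");
//   checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");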

Related

CUDA C++ design: reusable class with unknown compile-time size

I am looking for a convenient design in order to be able to use a class on the device which has unknown compile-time size.
Only one instance of this class needs to be sent to the device, for which there should be a single call to cudaMalloc and cudaMemcpy (ideally).
The host version of the class would look like this:
class A {
public:
    A(int size) : table(size) {
        // some useful initialization of table
    }
    double get(int i) const {
        // return some processed element from table
    }
private:
    std::vector<int> table;
};
The kernel:
__global__ void kernel(const A *a){
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    a->get(idx); // do something useful with it
}
So far, the way I would design the device version of the class is like this:
const int sizeMax = 1000;

class A {
public:
    A(int size) {
        // size checking + some useful initialization of table
    }
    __host__ __device__
    double get(int i) const {
        //
    }
private:
    int table[sizeMax];
};
And the client code:
A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);
This is rather ugly because:
- it wastes bandwidth by having to use too large a sizeMax in order to be on the safe side
- the class is not closed for modification; the value of sizeMax will inevitably need to be raised at some point
Is there any cleaner way to achieve the same thing without a negative performance impact? To be clear, I only need the device version of the class; the first version is just the equivalent non-CUDA code to illustrate the fact that the table size should be dynamic.
In my comment, I said:
1. separate host and device storage for table, contained in the class, both of which are allocated dynamically
2. dynamic allocation of table storage size in the constructor, rather than in your client code; this could also include resizing if necessary
3. differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) to the data, depending on whether the method is being executed in host or device code
4. a method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa
Here's an example of what I had in mind:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <iostream>

template <typename T>
class gpuvec{
private:
    T *h_vec = NULL;
    T *d_vec = NULL;
    size_t vsize = 0;
    bool iscopy;
public:
    __host__ __device__
    T * data(){
#ifndef __CUDA_ARCH__
        return h_vec;
#else
        return d_vec;
#endif
    }
    __host__ __device__
    T& operator[](size_t i) {
        assert(i < vsize);
        return data()[i];}
    void to_device(){
        assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
    void to_host(){
        assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
    gpuvec(gpuvec &o){
        h_vec = o.h_vec;
        d_vec = o.d_vec;
        vsize = o.vsize;
        iscopy = true;}
    void copy(gpuvec &o){
        free();
        iscopy = false;
        vsize = o.vsize;
        h_vec = (T *)malloc(vsize*sizeof(T));
        assert(h_vec != NULL);
        assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
        memcpy(h_vec, o.h_vec, vsize*sizeof(T));
        assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
    gpuvec(size_t ds) {
        assert(ds > 0);
        iscopy = false;
        vsize = ds;
        h_vec = (T *)malloc(vsize*sizeof(T));
        assert(h_vec != NULL);
        assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
    gpuvec(){
        iscopy = false;}
    ~gpuvec(){
        if (!iscopy) free();}
    void free(){
        if (d_vec != NULL) cudaFree(d_vec);
        d_vec = NULL;
        if (h_vec != NULL) ::free(h_vec);
        h_vec = NULL;}
    __host__ __device__
    size_t size() {
        return vsize;}
};

template <typename T>
__global__ void test(gpuvec<T> d){
    for (int i = 0; i < d.size(); i++){
        d[i] += 1;
    }
}

int main(){
    size_t ds = 10;
    gpuvec<int> A(ds);
    A.to_device();
    test<<<1,1>>>(A);
    A.to_host();
    for (size_t i = 0; i < ds; i++)
        std::cout << A[i];
    std::cout << std::endl;
    gpuvec<int> B;
    B.copy(A);
    A.free();
    B.to_device();
    test<<<1,1>>>(B);
    B.to_host();
    for (size_t i = 0; i < ds; i++)
        std::cout << B[i];
    std::cout << std::endl;
    B.free();
}
I'm sure quite a few criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore I'm sure there are use cases it does not cover, and it may contain outright defects. To create a robust host/device vector realization may require as much work and complexity as thrust host and device vectors. I'm not suggesting that thrust vectors are a drop-in answer for what the question seems to be asking, however.
Based on Robert Crovella's answer, here is a simplified (device only, so ignoring points 3 & 4) working solution:
class A {
public:
    A(int size) : table(size) {
        // some useful initialization of table
        cudaMalloc((void**)&dTable, sizeof(int) * size);
        cudaMemcpy(dTable, &table[0], sizeof(int) * size, cudaMemcpyHostToDevice);
    }
    ~A() {
        cudaFree(dTable);
    }
    __device__
    double get(int i) const {
        // return some processed element of dTable
    }
private:
    std::vector<int> table;
    int *dTable;
};
Kernel and client code stay exactly the same.
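One caveat with this simplified class: it owns a raw device pointer, so copying an instance would make two destructors call cudaFree on the same dTable. A minimal guard, assuming C++11 is available, is to delete the copy operations:

class A {
public:
    // constructor / destructor / get() as above ...
    A(const A &) = delete;            // two owners of dTable would double-free
    A &operator=(const A &) = delete;
private:
    std::vector<int> table;
    int *dTable;
};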

BloomFilter in C++ using the MurmurHash3 hash function

I am trying to code a C++ implementation of a Bloom filter using the MurmurHash3 hash function. My implementation is based on this site: http://blog.michaelschmatz.com/2016/04/11/how-to-write-a-bloom-filter-cpp/
Somehow, in my BloomFilter header file, the hash function triggers an incomplete type error; also, when I use the hash function inside the add function, I get a "hash is ambiguous" error.
What can I do to fix this? I am somewhat new to C++ so I'm not exactly sure if I am using the interface/implementation of a structure correctly.
I am also using a main function that will include this file and run some tests to analyze the false positive rate, number of bits, filter size, etc.
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H

#include "MurmurHash3.h"
#include <vector>

//basic structure of a bloom filter object
struct BloomFilter {
    BloomFilter(uint64_t size, uint8_t numHashes);
    void add(const uint8_t *data, std::size_t len);
    bool possiblyContains(const uint8_t *data, std::size_t len) const;
private:
    uint8_t m_numHashes;
    std::vector<bool> m_bits;
};

//Bloom filter constructor
BloomFilter::BloomFilter(uint64_t size, uint8_t numHashes)
    : m_bits(size),
      m_numHashes(numHashes) {}

//Hash array created using the MurmurHash3 code
std::array<uint64_t, 2> hash(const uint8_t *data, std::size_t len)
{
    std::array<uint64_t, 2> hashValue;
    MurmurHash3_x64_128(data, len, 0, hashValue.data());
    return hashValue;
}

//Hash array created using the MurmurHash3 code
inline uint64_t nthHash(uint8_t n,
                        uint64_t hashA,
                        uint64_t hashB,
                        uint64_t filterSize) {
    return (hashA + n * hashB) % filterSize;
}

//Adds an element to the array
void BloomFilter::add(const uint8_t *data, std::size_t len) {
    auto hashValues = hash(data, len);
    for (int n = 0; n < m_numHashes; n++)
    {
        m_bits[nthHash(n, hashValues[0], hashValues[1], m_bits.size())] = true;
    }
}

//Returns true or false based on a probabilistic assessment of the array using MurmurHash3
bool BloomFilter::possiblyContains(const uint8_t *data, std::size_t len) const {
    auto hashValues = hash(data, len);
    for (int n = 0; n < m_numHashes; n++)
    {
        if (!m_bits[nthHash(n, hashValues[0], hashValues[1], m_bits.size())])
        {
            return false;
        }
    }
    return true;
}

#endif
If your MurmurHash3_x64_128 returns two 64-bit numbers as a hash value, I'd treat that as 4 distinct uint32_t hashes as long as you don't need more than 4 billion bits in your bit string. Most likely you don't need more than 2-3 hashes, but that depends on your use case. To figure out how many hashes you need you can check "How many hash functions does my bloom filter need?".
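For reference, the standard sizing formulas behind that question can be written down directly; a small self-contained sketch, where n and p are illustrative assumptions:

#include <cmath>
#include <cstdio>

int main()
{
    double n = 1e6;   // expected number of elements (assumed)
    double p = 0.01;  // target false-positive rate (assumed)
    // optimal bit count: m = -n * ln(p) / (ln 2)^2
    double m = -n * std::log(p) / (std::log(2.0) * std::log(2.0));
    // optimal number of hash functions: k = (m / n) * ln 2
    double k = (m / n) * std::log(2.0);
    std::printf("bits: %.0f, hashes: %.1f\n", m, k); // ~9585059 bits, ~6.6 hashes
    return 0;
}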
Using MurmurHash3_x64_128 I'd do it this way (if I were to treat it as 4 x uint32_t hashes):
void BloomFilter::add(const uint8_t *data, std::size_t len) {
    auto hashValues = hash(data, len);
    uint32_t* hx = reinterpret_cast<uint32_t*>(&hashValues[0]);
    assert(m_numHashes <= 4);
    for (int n = 0; n < m_numHashes; n++)
        m_bits[hx[n] % m_bits.size()] = true;
}
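The membership test would mirror add(); a sketch under the same 4 x uint32_t assumption:

bool BloomFilter::possiblyContains(const uint8_t *data, std::size_t len) const {
    auto hashValues = hash(data, len);
    const uint32_t* hx = reinterpret_cast<const uint32_t*>(&hashValues[0]);
    assert(m_numHashes <= 4);
    for (int n = 0; n < m_numHashes; n++)
        if (!m_bits[hx[n] % m_bits.size()])
            return false; // one unset bit rules the element out for certain
    return true;
}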
Your code has some type-conversion issues, which is why it didn't compile:
- missing #include <array>
- you have to use size_t for size (it might be a 32-bit or 64-bit unsigned int)
- it's better to rename your hash function to something else (e.g. myhash) and make it static.
Here's a version of your code with these corrections, which should work:
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H

#include "MurmurHash3.h"
#include <vector>
#include <array>

//basic structure of a bloom filter object
struct BloomFilter {
    BloomFilter(size_t size, uint8_t numHashes);
    void add(const uint8_t *data, std::size_t len);
    bool possiblyContains(const uint8_t *data, std::size_t len) const;
private:
    uint8_t m_numHashes;
    std::vector<bool> m_bits;
};

//Bloom filter constructor
BloomFilter::BloomFilter(size_t size, uint8_t numHashes)
    : m_bits(size),
      m_numHashes(numHashes) {}

//Hash array created using the MurmurHash3 code
static std::array<uint64_t, 2> myhash(const uint8_t *data, std::size_t len)
{
    std::array<uint64_t, 2> hashValue;
    MurmurHash3_x64_128(data, len, 0, hashValue.data());
    return hashValue;
}

//Hash array created using the MurmurHash3 code
inline size_t nthHash(int n,
                      uint64_t hashA,
                      uint64_t hashB,
                      size_t filterSize) {
    return (hashA + n * hashB) % filterSize; // <- not sure if that is OK, perhaps it is.
}

//Adds an element to the array
void BloomFilter::add(const uint8_t *data, std::size_t len) {
    auto hashValues = myhash(data, len);
    for (int n = 0; n < m_numHashes; n++)
    {
        m_bits[nthHash(n, hashValues[0], hashValues[1], m_bits.size())] = true;
    }
}

//Returns true or false based on a probabilistic assessment of the array using MurmurHash3
bool BloomFilter::possiblyContains(const uint8_t *data, std::size_t len) const {
    auto hashValues = myhash(data, len);
    for (int n = 0; n < m_numHashes; n++)
    {
        if (!m_bits[nthHash(n, hashValues[0], hashValues[1], m_bits.size())])
        {
            return false;
        }
    }
    return true;
}

#endif
Run this code on ideone.
If you are just starting with C++, start with a basic example first; try to use std::hash, maybe? Create a working implementation, then extend it with an optional hash function parameter. If you need your BloomFilter to be fast, I'd probably stay away from vector<bool> and use an array of unsigned ints instead.
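For illustration, such a starter built only on std::hash could look like this sketch (my own illustration, with a crudely derived second hash; not the implementation below):

#include <functional>
#include <string>
#include <vector>

class SimpleBloom {
public:
    SimpleBloom(size_t bits, int k) : m_bits(bits), m_k(k) {}
    void add(const std::string &s) {
        size_t h1, h2;
        seed(s, h1, h2);
        for (int i = 0; i < m_k; i++)
            m_bits[(h1 + i * h2) % m_bits.size()] = true;
    }
    bool test(const std::string &s) const {
        size_t h1, h2;
        seed(s, h1, h2);
        for (int i = 0; i < m_k; i++)
            if (!m_bits[(h1 + i * h2) % m_bits.size()])
                return false;
        return true;
    }
private:
    static void seed(const std::string &s, size_t &h1, size_t &h2) {
        h1 = std::hash<std::string>{}(s);
        h2 = (h1 >> 27) * 0x9e3779b97f4a7c15ull + 1; // crude second hash; +1 keeps it nonzero
    }
    std::vector<bool> m_bits; // fine for a first cut; a packed uint array is faster
    int m_k;
};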
A basic implementation using MurmurHash3 could look something like this, provided that you have MurmurHash3 implemented:
#include <cassert>
#include <cstdint>
#include <string>

uint32_t MurmurHash3(const char *str, size_t len);

class BloomFilter
{
public:
    BloomFilter(int count_elements = 0, double bits_per_element = 10)
    {
        mem = NULL;
        init(count_elements, bits_per_element);
    }
    ~BloomFilter()
    {
        delete[] mem;
    }
    void init(int count_elements, double bits_per_element)
    {
        assert(!mem);
        sz = (uint32_t)(count_elements*bits_per_element + 0.5);
        mem = new uint8_t[sz / 8 + 8](); // the () zero-initializes the bit array
    }
    void add(const std::string &str)
    {
        add(str.data(), str.size());
    }
    void add(const char *str, size_t len)
    {
        if (len <= 0)
            return;
        add(MurmurHash3(str, len));
    }
    bool test(const std::string &str)
    {
        return test(str.data(), str.size());
    }
    bool test(const char *str, size_t len)
    {
        return test_hash(MurmurHash3(str, len));
    }
    bool test_hash(uint32_t h)
    {
        h %= sz;
        if (0 != (mem[h / 8] & (1u << (h % 8))))
            return true;
        return false;
    }
    int mem_size() const
    {
        return (sz + 7) / 8;
    }
private:
    void add(uint32_t h)
    {
        h %= sz;
        mem[h / 8] |= (1u << (h % 8));
    }
public:
    uint32_t sz;
    uint8_t *mem;
};
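Usage might look like this (assuming a MurmurHash3 implementation is linked in):

#include <cstdio>

int main()
{
    BloomFilter bf(1000, 10); // room for ~1000 elements at 10 bits per element
    bf.add("hello");
    bf.add("world");
    std::printf("hello: %d\n", (int)bf.test("hello")); // 1: added elements always test true
    std::printf("xyzzy: %d\n", (int)bf.test("xyzzy")); // almost certainly 0 at this load factor
    return 0;
}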

How to obtain sizes of arrays in a loop

I am aligning several arrays in order and performing some sort of classification. I created an array to hold other arrays in order to simplify the operations that I want to perform.
Sadly, my program crashed when I ran it, and when I debugged it I finally realized that the sizeof operator was giving me the sizes of pointers, not arrays, within the loop. So I resorted to the cumbersome solution and my program worked.
How can I avoid this cumbersome method? I want to calculate the sizes within a loop!
#include <iostream>
#include <cstdio>
#include <cstring>

#define ARRSIZE(X) (sizeof(X) / sizeof(*(X)))

int classify(const char *asset, const char ***T, size_t T_size, size_t *index);

int main(void)
{
    const char *names[] = { "book","resources","vehicles","buildings" };
    const char *books[] = { "A","B","C","D" };
    const char *resources[] = { "E","F","G" };
    const char *vehicles[] = { "H","I","J","K","L","M" };
    const char *buildings[] = { "N","O","P","Q","R","S","T","U","V" };
    const char **T[] = { books,resources,vehicles,buildings };
    size_t T_size = sizeof(T) / sizeof(*T);
    size_t n, *index = new size_t[T_size];
    /* This will yield the size of pointers, not arrays...
    for (n = 0; n < T_size; n++) {
        index[n] = ARRSIZE(T[n]);
    }
    */
    /* Cumbersome solution */
    index[0] = ARRSIZE(books);
    index[1] = ARRSIZE(resources);
    index[2] = ARRSIZE(vehicles);
    index[3] = ARRSIZE(buildings);
    const char asset[] = "L";
    int i = classify(asset, T, T_size, index);
    if (i < 0) {
        printf("asset is alien !!!\n");
    }
    else {
        printf("asset ---> %s\n", names[i]);
    }
    delete[] index; // array form of delete to match new[]
    return 0;
}

int classify(const char *asset, const char ***T, size_t T_size, size_t *index)
{
    size_t x, y;
    for (x = 0; x < T_size; x++) {
        for (y = 0; y < index[x]; y++) {
            if (strcmp(asset, T[x][y]) == 0) {
                return (int)x;
            }
        }
    }
    return -1;
}
As you are including <string> and <iostream>, I assume that the question is about C++ and not C. To avoid all this complication, simply use containers. E.g.:
#include <vector>
std::vector<int> vect = std::vector<int>(3,0);
std::cout << vect.size() << std::endl; // prints 3
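Applied to the code in the question, the whole index bookkeeping disappears once the tables themselves are vectors; a sketch of the idea (the names mirror the question, but this is an illustration, not a drop-in patch):

#include <string>
#include <vector>

int classify(const std::string &asset,
             const std::vector<std::vector<std::string>> &T)
{
    for (size_t x = 0; x < T.size(); x++)
        for (size_t y = 0; y < T[x].size(); y++) // .size() replaces the index array
            if (asset == T[x][y])
                return (int)x;
    return -1;
}

// Usage:
// std::vector<std::vector<std::string>> T = {
//     { "A","B","C","D" }, { "E","F","G" },
//     { "H","I","J","K","L","M" }, { "N","O","P","Q","R","S","T","U","V" } };
// int i = classify("L", T); // yields 2 (vehicles)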
One solution if you are coding in C is to terminate your array with a special item, like NULL
const char *books[] = { "A","B","C","D", NULL };
size_t size(const char *arr[])
{
    const char **p = arr;
    while (*p)
    {
        p++;
    }
    return p - arr;
}
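With the sentinel in place, the size computation from the question can finally live inside the loop (this assumes books, resources, vehicles, and buildings were all declared with a trailing NULL):

for (size_t n = 0; n < T_size; n++) {
    index[n] = size(T[n]); // walks each table to its NULL sentinel at run time
}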
You can specify the array sizes explicitly:
size_t n, index[] = {ARRSIZE(books), ARRSIZE(resources), ARRSIZE(vehicles), ARRSIZE(buildings)};
or if you want to avoid double typing you can use X-Macros to roll out everything:
#define TBL \
    X(books) \
    X(resources) \
    X(vehicles) \
    X(buildings)

const char **T[] = {
#define X(x) x,
    TBL
};
#undef X

size_t n, index[] = {
#define X(x) ARRSIZE(x),
    TBL
};
which produces the same result. See Running Demo.

How to pass non-static member function of a class into CUDA kernel function (__global__ function)

Finally, I have been able to pass a host function as a function pointer to a CUDA kernel function (__global__ function); thanks to Robert Crovella and njuffa for the answer. I have been able to pass a class member function (CPU function) as a function pointer to a CUDA kernel. But the main problem is that I can only pass a static class member function; I am not able to pass a function that is not declared as static.
My question is: how do I pass a non-static member function to a CUDA kernel?
For example:
__host__ __device__
static int CellfunPtr(void *ptr, int a);
The above function works because this member function is declared as a static member function. If I do not declare this member function as static, as in
__host__ __device__
int CellfunPtr(void *ptr, int a);
then it does not work.
The complete code has four files.
fundef.h
typedef int (*pFunc_t)(void* ptr, int N);
solver.h file
class CalcVars {
    int eqnCount;
    int numCell;
    int numTri;
    int numTet;
public:
    double* cellVel;
    double* cellPre;
    /** Constructor */
    CalcVars(
        const int eqnCount_,
        const int numCell_,
        const int numTri_,
        const int numTet_
    );
    /** Destructor */
    ~CalcVars(void);
public:
    void CalcAdv();
    __host__ __device__
    static int CellfunPtr(
        void *ptr, int a
    );
};
solver.cu
#include "solver.h"
#include "fundef.h"
#include <stdio.h>
__device__ pFunc_t pF1_d = CalcVars::CellfunPtr;
pFunc_t pF1_h ;
__global__ void kernel(int*a, pFunc_t func, void* thisPtr_){
int tid = threadIdx.x;
a[tid] = (*func)(thisPtr_, a[tid]);
};
/* Constructor */
CalcVars::CalcVars(
const int eqnCount_,
const int numCell_,
const int numTri_,
const int numTet_
)
{
this->eqnCount = eqnCount_;
this->numCell = numCell_;
this->numTri = numTri_;
this->cellVel = (double*) calloc((size_t) eqnCount, sizeof(double));
this->cellPre = (double*) calloc((size_t) eqnCount, sizeof(double));
}
/* Destructor */
CalcVars::~CalcVars(void)
{
free(this->cellVel);
free(this->cellPre);
}
void
CalcVars::CalcAdv(
){
/*int b1 = 0;
b1 = CellfunPtr(this, 1);*/
int Num = 50;
int *a1, *a1_dev;
a1 = (int *)malloc(Num*sizeof(int));
cudaMalloc((void**)&a1_dev, Num*sizeof(int));
for(int i = 0; i <Num; i++){
a1[i] = i;
}
cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
//copy addresses of device functions to host
cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(pFunc_t));
kernel<<<1,42>>>(a1_dev, pF1_h, this);
cudaDeviceSynchronize();
cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
};
int
CalcVars::CellfunPtr(
void* ptr, int a
){
//CalcVars* ClsPtr = (CalcVars*)ptr;
printf("Printing from CPU function\n");
//int eqn_size = ClsPtr->eqnCount;
//printf("The number is %d",eqn_size);
return a-1;
};
main.cpp file
#include "solver.h"
int main(){
int n_Eqn, n_cell, n_tri, n_tetra;
n_Eqn = 100;
n_cell = 200;
n_tri = 300;
n_tetra = 400;
CalcVars* calcvars;
calcvars = new CalcVars(n_Eqn, n_cell, n_tri, n_tetra );
calcvars->CalcAdv();
system("pause");
}
The type of a member function is different:
typedef int (CalcVars::*MethodPtr)(int N);
__device__ MethodPtr pF1_d = &CalcVars::CellfunPtr;
You would then call it using:
__global__ void kernel(int *a, MethodPtr func, void *thisPtr_)
{
    int tid = threadIdx.x;
    CalcVars *c = ((CalcVars*)thisPtr_);
    a[tid] = (c->*func)(a[tid]);
};
BUT the this pointer you are passing to the kernel is a host pointer:
kernel<<<1,42>>>(a1_dev, pF1_h, this);
This will result in invalid memory access in the kernel.
You would have to pass a device pointer of a CalcVars instance to the kernel in order to make it work.
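That last step might look roughly like this (a sketch with error checking omitted; d_this is my own name, not from the question):

CalcVars *d_this = NULL;
cudaMalloc(&d_this, sizeof(CalcVars));
// Note: this is a shallow copy. Pointer members such as cellVel/cellPre
// would still hold host addresses and need their own device allocations.
cudaMemcpy(d_this, this, sizeof(CalcVars), cudaMemcpyHostToDevice);
kernel<<<1,42>>>(a1_dev, pF1_h, d_this);
cudaDeviceSynchronize();
cudaFree(d_this);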
As requested, here is a complete compilable example, which is a condensed version of yours and still suffers from the this pointer issue I described above.
demo.cu
#include <stdio.h>
#include <stdlib.h>

struct CalcVars
{
    void CalcAdv();
    __host__ __device__
    int CellfunPtr(int a);
};

typedef int (CalcVars::*MethodPtr)(int N);

__device__ MethodPtr pF1_d = &CalcVars::CellfunPtr;
MethodPtr pF1_h;

__global__ void kernel(int *a, MethodPtr func, void *thisPtr_)
{
    int tid = threadIdx.x;
    CalcVars *c = ((CalcVars*)thisPtr_);
    a[tid] = (c->*func)(a[tid]);
};

void CalcVars::CalcAdv()
{
    int Num = 50;
    int *a1, *a1_dev;
    a1 = (int *)malloc(Num*sizeof(int));
    cudaMalloc((void**)&a1_dev, Num*sizeof(int));
    for (int i = 0; i < Num; i++)
    {
        a1[i] = i;
    }
    cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(MethodPtr));
    // DON'T pass the host this pointer here in real code
    kernel<<<1,42>>>(a1_dev, pF1_h, this);
    cudaDeviceSynchronize();
    cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
};

int CalcVars::CellfunPtr(int a)
{
    printf("Printing from CPU function\n");
    return a-1;
};

int main()
{
    CalcVars calcvars;
    calcvars.CalcAdv();
}

using a pointer to vector<T>::data() for cublasSgemm

I am trying to use the vector::data() pointer with cudaMalloc, cudaMemcpy, and cublasSgemm, but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector, so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.
Here is the code I am working on:
Matrix<T> Matrix<T>::cudaProd(Matrix<T> &A, Matrix<T> &B, Matrix<T> &C)
{
    C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
    //A[m][n]*B[n][k]=C[m][k]
    int m = A.height;
    int n = B.height;
    int k = B.width;
    float alpha = 1.0f;
    float beta = 0.0f;
    T* d_a = A.GetPointer();
    T* d_b = B.GetPointer();
    T* d_c = C.GetPointer();
    cudaMalloc(&d_a, A.size);
    cudaMalloc(&d_b, B.size);
    cudaMalloc(&d_c, C.size);
    cudaMemcpy(d_a, A.GetPointer(), A.size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, B.GetPointer(), B.size, cudaMemcpyHostToDevice);
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! CUBLAS initialization error\n";
    }
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, k, m, n, &alpha, d_b, k, d_a, n, &beta, d_c, k);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! kernel execution error.\n";
    }
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! shutdown error (A)\n";
    }
    cudaMemcpy(C.GetPointer(), d_c, C.size, cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
The GetPointer() member function returns vector::data() for that Matrix object's vector of elements. size is the vector's total size in bytes.
The vector of Matrix C returns all zeros when using the data() pointer, and returns the product of Matrix A and B when using T* aArray pointers without vectors.
Is it actually possible to use vectors to store the array of elements and then use the data() pointer to initialize the device copy of the array, or am I forced to use C-style array storage on the host? Also, I have tried using thrust::device_vector and that works, but I would like to stay away from creating raw_pointer_casts.
Thanks for your help!
Edit:
For those having trouble with copy and pasting, here is the complete example:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>

using namespace std;

template<typename T> class Matrix
{
public:
    ~Matrix();
    Matrix();
    Matrix(int rows, int columns);
    int width;
    int height;
    int stride;
    size_t size;
    T &GetElement(int row, int column);
    void SetElement(int row, int column, T value);
    void SetElements(vector<T> value);
    vector<T>& GetElements();
    T* GetPointer();
    Matrix<T> cudaProd(Matrix<T> &A, Matrix<T> &B, Matrix<T> &C);
private:
    vector<T> elements;
    T* firstElement;
};

template<typename T>
Matrix<T>::~Matrix()
{
}

template<typename T>
Matrix<T>::Matrix()
{
}

template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
    height = rows;
    width = columns;
    stride = columns; //in row major order this is equal to the # of columns
    elements.resize(rows*columns);
    firstElement = elements.data();
    size = height*width*sizeof(T);
}

template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
    return elements[row*width + column]; //row major order return
}

template<typename T>
vector<T>& Matrix<T>::GetElements()
{
    return elements; //row major order return
}

template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
    elements[row*width + column] = value; //row major order return
}

template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
    elements = value;
}

template<typename T>
T* Matrix<T>::GetPointer()
{
    return firstElement;
}

//Matrix Multiplication using CUDA
template<typename T>
Matrix<T> Matrix<T>::cudaProd(Matrix<T> &A, Matrix<T> &B, Matrix<T> &C)
{
    C = Matrix<T>(A.height, B.width);
    //A[m][n]*B[n][k]=C[m][k]
    int m = A.height;
    int n = B.height;
    int k = B.width;
    float alpha = 1.0f;
    float beta = 0.0f;
    //Thrust usage
    /*thrust::device_vector<T> d_A = A.GetElements();
    T* d_a = thrust::raw_pointer_cast(&d_A[0]);
    thrust::device_vector<T> d_B = B.GetElements();
    T* d_b = thrust::raw_pointer_cast(&d_B[0]);
    thrust::device_vector<T> d_C = C.GetElements();
    T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/
    T* d_a = A.GetPointer();
    T* d_b = B.GetPointer();
    T* d_c = C.GetPointer();
    cudaMalloc(&d_a, A.size);
    cudaMalloc(&d_b, B.size);
    cudaMalloc(&d_c, C.size);
    cudaMemcpy(d_a, A.GetPointer(), A.size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, B.GetPointer(), B.size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, C.GetPointer(), C.size, cudaMemcpyHostToDevice);
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! CUBLAS initialization error\n";
    }
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, k, m, n, &alpha, d_b, k, d_a, n, &beta, d_c, k);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! kernel execution error.\n";
    }
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "!!!! shutdown error (A)\n";
    }
    //thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());
    cudaMemcpy(C.GetPointer(), d_c, C.size, cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return C;
}

int main()
{
    Matrix<float> A(2,2);
    Matrix<float> B(2,2);
    Matrix<float> C;
    vector<float> aE(4,2);
    vector<float> bE(4,4);
    A.SetElements(aE);
    B.SetElements(bE);
    C = C.cudaProd(A, B, C); //function call to cudaProd()
    for(int row = 0; row < A.height; ++row)
    {
        for(int col = 0; col < A.width; ++col)
        {
            cout << A.GetElement(row, col) << " "; //h_c is stored on device in column major order, need to switch to row major order
        }
        printf("\n");
    }
    printf("\n");
    for(int row = 0; row < B.height; ++row)
    {
        for(int col = 0; col < B.width; ++col)
        {
            cout << B.GetElement(row, col) << " "; //h_c is stored on device in column major order, need to switch to row major order
        }
        printf("\n");
    }
    printf("\n");
    for(int row = 0; row < C.height; ++row)
    {
        for(int col = 0; col < C.width; ++col)
        {
            cout << C.GetElement(row, col) << " "; //h_c is stored on device in column major order, need to switch to row major order
        }
        printf("\n");
    }
    printf("\n");
}
If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory.
The std::vector class is an owning resource class. It means that trying to manage the underlying resource yourself with the data pointer will make you enter a world of pain.
For this very same reason:
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
and:
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
and:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cannot possibly work.
From the std::vector::data documentation, data() returns either a const or a non-const qualified pointer, depending on whether the vector itself is const-qualified. Quoting the documentation:
If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.
Accordingly, using
firstElement = elements.data();
in the Matrix constructor is fine to read/write the data.
The main problem with your code is that you are declaring C in the main, passing a reference to C to the cudaProd method, and then internally using
C = Matrix<T>(A.height, B.width);
which reassigns the Matrix. The compiler-generated assignment copies firstElement from a temporary whose internal vector is destroyed immediately afterwards, so GetPointer() then returns a dangling pointer rather than the address of C's live elements.
If you change the definition of the cudaProd method to
template<typename T>
void Matrix<T>::cudaProd(Matrix<T> &A, Matrix<T> &B, Matrix<T> &C)
remove the
return C;
statement and allocate space for C in the main as
Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);
your code should work correctly.
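A further hardening step worth considering, since the cached firstElement is what turned the reassignment into a dangling pointer in the first place: have GetPointer query the vector directly. A one-line sketch:

template<typename T>
T* Matrix<T>::GetPointer()
{
    return elements.data(); // always the live buffer; nothing cached to go stale
}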