Is it possible to use C++ libraries inside an OpenCL kernel?
I'm trying to implement a kernel that performs the task shown in the following code. Two things could make this really difficult: (1) I'm using the GLM math library, and (2) I'm using structs (land_map_t).
For example, if I wanted to use a kernel to loop through a large 3-dimensional array, is it possible to include the GLM math library inside of the kernel and utilize its functionalities such as glm::simplex? I've heard that modern C++ functionalities such as classes aren't compatible with kernels.
And if that's not possible, how would one pass a struct to the kernel? Should I define the same struct in both the kernel and my host implementation? All the struct contains is a 3-dimensional array, so I could easily turn it into a plain built-in type if necessary.
/// Builds the block map for the chunk located at `pos`.
///
/// Every cell of the (chunkSize + 2)^3 grid is set to BLOCK::DIRT when it lies
/// below the noise-derived surface height of its (x, z) column, and to
/// BLOCK::AIR otherwise.
///
/// @param pos  Offset added to the local cell coordinates before sampling the
///             noise (x, z) and before comparing against the surface (y).
/// @return A land_map_t allocated with `new`; the caller owns it and is
///         responsible for deleting it.
land_map_t * Chunk::terrain_gen(glm::ivec3 pos)
{
    const float frequency = 500;
    land_map_t* landmap = new land_map_t;
    for (int x = 0; x < chunkSize + 2; x++) {
        for (int z = 0; z < chunkSize + 2; z++) {
            // The noise only depends on the (x, z) column, so it is evaluated
            // once per column instead of once per cell.
            const float noise_1 = glm::simplex(
                glm::vec2(glm::ivec2(x, z) + glm::ivec2(pos.x, pos.z)) / frequency);
            // Square by multiplication instead of pow(); the arithmetic stays in
            // double, as it was with pow(noise_1, 2) * 40.0 + 6.0.
            const double surface_height =
                static_cast<double>(noise_1) * noise_1 * 40.0 + 6.0;
            for (int y = 0; y < chunkSize + 2; y++) {
                if (surface_height > (y + pos.y)) {
                    landmap->i[x][y][z] = BLOCK::DIRT;
                } else {
                    landmap->i[x][y][z] = BLOCK::AIR;
                }
            }
        }
    }
    return landmap;
}
You cannot include C++ libraries in OpenCL C. OpenCL is C99, not C++. There are no classes and only 1D arrays in OpenCL. Within a kernel there is also no dynamic memory allocation possible with the new operator.
The best solution is to split the class components up into separate 1D arrays and to use linear indexing within each array. For a rectangular box of size (Lx,Ly,Lz), the position (x, y, z) maps to the linear index n=x+(y+z*Ly)*Lx, and the linear index maps back to the position via (x, y, z)=(n%(Lx*Ly)%Lx, n%(Lx*Ly)/Lx, n/(Lx*Ly)).
Your code in OpenCL could look like this:
// Terrain generation kernel: one work item per cell of the (chunkSize+2)^3 box.
//   landmap_flags  output, one block flag (BLOCK_DIRT or BLOCK_AIR) per cell, indexed by the linear cell index n
//   pos            position offset read per cell as pos[n]
// NOTE(review): the host version uses a single chunk position for all cells; if the host
// uploads only one position, this kernel must read pos[0] instead of pos[n] - confirm with the caller.
// SimplexNoise must be defined at file scope before this kernel.
kernel void terrain_gen(global uchar* landmap_flags, global float3* pos)
{
    const uint n = get_global_id(0);
    const uint side = chunkSize+2; // edge length of the box, including the border cells
    // linear index n = x+(y+z*side)*side, decomposed back into (x, y, z)
    const uint x = n%(side*side)%side;
    const uint y = n%(side*side)/side;
    const uint z = n/(side*side);
    SimplexNoise simplexnoise;
    simplexnoise.initialize(); // fills the lookup tables that noise() reads
    const float frequency = 500;
    // sample the noise at the scaled world-space (x, z) position, as the host code does:
    // simplex((local.xz + pos.xz) / frequency)
    const float noise_1 = simplexnoise.noise((x+pos[n].x)/frequency, (z+pos[n].z)/frequency);
    landmap_flags[n] = (noise_1*noise_1*40.0f+6.0f>(y+pos[n].y)) ? BLOCK_DIRT : BLOCK_AIR;
}
Regarding GLM, you have to port over the required functions into OpenCL C. For simplex noise, you can use something like this:
// 2D simplex noise generator.
// Usage: create an instance, call initialize() once to fill perm/perm12, then call noise(x, y).
// NOTE(review): member functions and default member initializers are C++ features; presumably this
// needs a C++-capable OpenCL compiler rather than plain OpenCL C - confirm for the target platform.
struct SimplexNoise { // simplex noise in 2D, sources: https://gist.github.com/Ellpeck/3df75965a542e2163d1ae9cf3e4777bb, https://github.com/stegu/perlin-noise/tree/master/src
// 12 gradient vectors; the 2D noise() below only reads their x and y components (through dot())
const float3 grad3[12] = {
(float3)( 1, 1, 0), (float3)(-1, 1, 0), (float3)( 1,-1, 0), (float3)(-1,-1, 0),
(float3)( 1, 0, 1), (float3)(-1, 0, 1), (float3)( 1, 0,-1), (float3)(-1, 0,-1),
(float3)( 0, 1, 1), (float3)( 0,-1, 1), (float3)( 0, 1,-1), (float3)( 0,-1,-1)
};
// 256-entry lookup table used to hash the integer cell coordinates into a gradient index
const uchar p[256] = {
151,160,137, 91, 90, 15,131, 13,201, 95, 96, 53,194,233, 7,225,140, 36,103, 30, 69,142, 8, 99, 37,240, 21, 10, 23,190, 6,148,
247,120,234, 75, 0, 26,197, 62, 94,252,219,203,117, 35, 11, 32, 57,177, 33, 88,237,149, 56, 87,174, 20,125,136,171,168, 68,175,
74,165, 71,134,139, 48, 27,166, 77,146,158,231, 83,111,229,122, 60,211,133,230,220,105, 92, 41, 55, 46,245, 40,244,102,143, 54,
65, 25, 63,161, 1,216, 80, 73,209, 76,132,187,208, 89, 18,169,200,196,135,130,116,188,159, 86,164,100,109,198,173,186, 3, 64,
52,217,226,250,124,123, 5,202, 38,147,118,126,255, 82, 85,212,207,206, 59,227, 47, 16, 58, 17,182,189, 28, 42,223,183,170,213,
119,248,152, 2, 44,154,163, 70,221,153,101,155,167, 43,172, 9,129, 22, 39,253, 19, 98,108,110,79,113,224,232,178,185, 112,104,
218,246, 97,228,251, 34,242,193,238,210,144, 12,191,179,162,241, 81, 51,145,235,249, 14,239,107, 49,192,214, 31,181,199,106,157,
184, 84,204,176,115,121, 50, 45,127, 4,150,254,138,236,205, 93,222,114, 67, 29, 24, 72,243,141,128,195, 78, 66,215, 61,156,180
};
// only F2 and G2 are used by the 2D noise() below; F3/G3/F4/G4 are not referenced in this struct
const float F2=0.5f*(sqrt(3.0f)-1.0f), G2=(3.0f-sqrt(3.0f))/6.0f; // skewing and unskewing factors for 2, 3, and 4 dimensions
const float F3=1.0f/3.0f, G3=1.0f/6.0f;
const float F4=(sqrt(5.0f)-1.0f)*0.25f, G4=(5.0f-sqrt(5.0f))*0.05f;
// perm holds p repeated twice; perm12 holds perm modulo 12, i.e. an index into grad3; both are filled by initialize()
uchar perm[512]; // to remove the need for index wrapping, double the permutation table length
uchar perm12[512];
//int floor(const float x) const { return (int)x-(x<=0.0f); }
// 2D dot product of the gradient's (x, y) components with the offset (x, y); g.z is ignored
float dot(const float3 g, const float x, const float y) const { return g.x*x+g.y*y; }
// fills perm and perm12 from p; noise() reads both tables, so call this before noise()
void initialize() {
for(int i=0; i<512; i++) {
perm[i] = p[i&255];
perm12[i] = (uchar)(perm[i]%12);
}
}
// evaluates the noise at the point (x, y) and returns 70 times the sum of the three corner contributions
float noise(float x, float y) const { // 2D simplex noise
float n0, n1, n2; // noise contributions from the three corners, skew the input space to determine simplex cell
float s = (x+y)*F2; // hairy factor for 2D
int i=floor(x+s), j=floor(y+s);
float t = (i+j)*G2;
float X0=i-t, Y0=j-t; // unskew the cell origin back to (x,y) space
float x0=x-X0, y0=y-Y0; // the x,y distances from the cell origin
// for the 2D case, the simplex shape is an equilateral triangle, determine simplex
int i1, j1; // offsets for second (middle) corner of simplex in (i,j) coords
if(x0>y0) { i1=1; j1=0; } // lower triangle, XY order: (0,0)->(1,0)->(1,1)
else /**/ { i1=0; j1=1; } // upper triangle, YX order: (0,0)->(0,1)->(1,1)
float x1=x0- i1+ G2, y1=y0- j1+ G2; // offsets for middle corner in (x,y) unskewed coords
float x2=x0-1.0f+2.0f*G2, y2=y0-1.0f+2.0f*G2; // offsets for last corner in (x,y) unskewed coords
// ii and jj are masked to 0..255, so the largest index below is 255+1+255 = 511, inside the 512-entry tables
int ii=i&255, jj=j&255; // work out the hashed gradient indices of the three simplex corners
int gi0 = perm12[ii +perm[jj ]];
int gi1 = perm12[ii+i1+perm[jj+j1]];
int gi2 = perm12[ii+ 1+perm[jj+ 1]];
// each corner contributes (0.5 - d^2)^4 * dot(gradient, offset), or nothing once 0.5 - d^2 is negative
float t0 = 0.5f-x0*x0-y0*y0; // calculate the contribution from the three corners
if(t0<0) n0 = 0.0f; else { t0 *= t0; n0 = t0*t0*dot(grad3[gi0], x0, y0); } // (x,y) of grad3 used for 2D gradient
float t1 = 0.5f-x1*x1-y1*y1;
if(t1<0) n1 = 0.0f; else { t1 *= t1; n1 = t1*t1*dot(grad3[gi1], x1, y1); }
float t2 = 0.5f-x2*x2-y2*y2;
if(t2<0) n2 = 0.0f; else { t2 *= t2; n2 = t2*t2*dot(grad3[gi2], x2, y2); }
return 70.0f*(n0+n1+n2); // add contributions from each corner to get the final noise value, result is scaled to stay inside [-1,1]
}
};
I am implementing a DCGAN network in LibTorch/Pytorch. I am following the official example in https://github.com/pytorch/examples/blob/master/cpp/dcgan/dcgan.cpp .
The only differences between my problem and the example are:
My dataset is composed of RGB pictures (the CelebA dataset), while the one from the example is black and white (MNIST).
The dimensions of my pictures are 64x64 while MNIST pictures are 28x28
That said here is my code:
#include <torch/torch.h>
#include <cmath>
#include <cstdio>
#include <iostream>
#include "CustomDataset.h"
#include "parameters.h"
// The size of the noise vector fed to the generator.
const int64_t kNoiseSize = 100;
// The batch size for training.
const int64_t kBatchSize = 64;
// The number of epochs to train.
const int64_t kNumberOfEpochs = 30;
// Folder holding the training data.
// NOTE(review): not referenced anywhere in the visible code; main() hard-codes the dataset
// glob instead. Presumably a leftover from the MNIST example - confirm before relying on it.
const char* kDataFolder = "./data";
// After how many batches to create a new checkpoint periodically.
const int64_t kCheckpointEvery = 20;
// How many images to sample at every checkpoint.
const int64_t kNumberOfSamplesPerCheckpoint = 10;
// After how many batches to log a new update with the loss value.
// NOTE(review): the visible training loop never reads this constant, so no loss is logged.
const int64_t kLogInterval = 10;
using namespace torch;
/// Generator of the DCGAN: five transposed convolutions, the first four each
/// followed by batch normalization and ReLU, the last one followed by tanh.
///
/// For an input of shape {N, kNoiseSize, 1, 1} the spatial size grows
/// 1 -> 4 -> 8 -> 16 -> 32 -> 64 and the result has 3 channels.
struct DCGANGeneratorImpl : nn::Module {
  /// @param kNoiseSize  Number of input channels of the first transposed convolution.
  DCGANGeneratorImpl(int kNoiseSize)
      // Members are listed in declaration order, which is the order they are
      // initialized in regardless of how the list is written.
      : conv1(nn::ConvTranspose2dOptions(kNoiseSize, 256, 4).bias(false)),
        conv2(nn::ConvTranspose2dOptions(256, 128, 4).stride(2).padding(1).bias(false)),
        conv3(nn::ConvTranspose2dOptions(128, 64, 4).stride(2).padding(1).bias(false)),
        conv4(nn::ConvTranspose2dOptions(64, 32, 4).stride(2).padding(1).bias(false)),
        conv5(nn::ConvTranspose2dOptions(32, 3, 4).stride(2).padding(1).bias(false)),
        batch_norm1(256),
        batch_norm2(128),
        batch_norm3(64),
        batch_norm4(32)
  {
    // Convolutions are registered first, then the batch norms.
    register_module("conv1", conv1);
    register_module("conv2", conv2);
    register_module("conv3", conv3);
    register_module("conv4", conv4);
    register_module("conv5", conv5);
    register_module("batch_norm1", batch_norm1);
    register_module("batch_norm2", batch_norm2);
    register_module("batch_norm3", batch_norm3);
    register_module("batch_norm4", batch_norm4);
  }

  /// Maps a batch of noise tensors to a batch of images with values in [-1, 1] (tanh output).
  torch::Tensor forward(torch::Tensor x)
  {
    const torch::Tensor stage1 = torch::relu(batch_norm1(conv1(x)));
    const torch::Tensor stage2 = torch::relu(batch_norm2(conv2(stage1)));
    const torch::Tensor stage3 = torch::relu(batch_norm3(conv3(stage2)));
    const torch::Tensor stage4 = torch::relu(batch_norm4(conv4(stage3)));
    return torch::tanh(conv5(stage4));
  }

  nn::ConvTranspose2d conv1, conv2, conv3, conv4, conv5;
  nn::BatchNorm2d batch_norm1, batch_norm2, batch_norm3, batch_norm4;
};
TORCH_MODULE(DCGANGenerator);
/// Trains the DCGAN on the images matched by the hard-coded dataset glob.
///
/// Each batch performs one discriminator update (real batch with labels drawn
/// from [0.8, 1.0], then a generated batch with zero labels) followed by one
/// generator update. Every kCheckpointEvery batches the models, the optimizers
/// and a set of generated samples are saved to files in the working directory.
int main(int argc, const char* argv[]) {
  torch::manual_seed(1);

  // Create the device we pass around based on whether CUDA is available.
  torch::Device device(torch::kCPU);
  if (torch::cuda::is_available()) {
    std::cout << "CUDA is available! Training on GPU." << std::endl;
    device = torch::Device(torch::kCUDA);
  }

  DCGANGenerator generator(kNoiseSize);
  generator->to(device);

  nn::Sequential discriminator(
      // Layer 1
      nn::Conv2d(
          nn::Conv2dOptions(3, 64, 4).stride(2).padding(1).bias(false)),
      nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
      // output is 32x32
      // Layer 2
      nn::Conv2d(
          nn::Conv2dOptions(64, 128, 4).stride(2).padding(1).bias(false)),
      nn::BatchNorm2d(128),
      nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
      // output is 16x16
      // Layer 3
      nn::Conv2d(
          nn::Conv2dOptions(128, 64, 4).stride(2).padding(1).bias(false)),
      nn::BatchNorm2d(64),
      nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
      // output is 8x8
      // Layer 4
      nn::Conv2d(
          nn::Conv2dOptions(64, 32, 5).stride(1).padding(0).bias(false)),
      nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
      // output is 4x4
      // Layer 5
      nn::Conv2d(
          nn::Conv2dOptions(32, 1, 4).stride(1).padding(0).bias(false)),
      nn::Sigmoid());
  // output is 1x1, i.e. a tensor of shape {N, 1, 1, 1}
  discriminator->to(device);

  // Where all my pictures are;
  std::string file_location{"dataset/img_align_celeba/*.jpg"};
  auto dataset = CustomDataset(file_location).map(data::transforms::Stack<>());

  auto data_loader = torch::data::make_data_loader(
      std::move(dataset),
      torch::data::DataLoaderOptions().batch_size(kBatchSize).workers(2));

  torch::optim::Adam generator_optimizer(
      generator->parameters(), torch::optim::AdamOptions(2e-4).beta1(0.5));
  torch::optim::Adam discriminator_optimizer(
      discriminator->parameters(), torch::optim::AdamOptions(2e-4).beta1(0.5));

  int64_t checkpoint_counter = 1;
  for (int64_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) {
    int64_t batch_index = 0;
    for (torch::data::Example<>& batch : *data_loader) {
      // Train discriminator with real images.
      discriminator->zero_grad();
      torch::Tensor real_images = batch.data.to(device);
      torch::Tensor real_labels =
          torch::empty(batch.data.size(0), device).uniform_(0.8, 1.0);
      // The discriminator returns {N, 1, 1, 1} while the labels are {N};
      // binary_cross_entropy needs both to have the same shape.
      torch::Tensor real_output =
          discriminator->forward(real_images).reshape(real_labels.sizes());
      torch::Tensor d_loss_real =
          torch::binary_cross_entropy(real_output, real_labels);
      d_loss_real.backward();

      // Train discriminator with fake images.
      torch::Tensor noise =
          torch::randn({batch.data.size(0), kNoiseSize, 1, 1}, device);
      torch::Tensor fake_images = generator->forward(noise);
      torch::Tensor fake_labels = torch::zeros(batch.data.size(0), device);
      // detach() keeps this backward pass from reaching the generator.
      torch::Tensor fake_output =
          discriminator->forward(fake_images.detach())
              .reshape(fake_labels.sizes());
      torch::Tensor d_loss_fake =
          torch::binary_cross_entropy(fake_output, fake_labels);
      d_loss_fake.backward();
      discriminator_optimizer.step();

      // Train generator: it is rewarded when the discriminator labels its
      // images as real, hence the labels are flipped to 1.
      generator->zero_grad();
      fake_labels.fill_(1);
      fake_output =
          discriminator->forward(fake_images).reshape(fake_labels.sizes());
      torch::Tensor g_loss =
          torch::binary_cross_entropy(fake_output, fake_labels);
      g_loss.backward();
      generator_optimizer.step();
      batch_index++;

      if (batch_index % kCheckpointEvery == 0) {
        // Checkpoint the model and optimizer state.
        torch::save(generator, "generator-checkpoint.pt");
        torch::save(generator_optimizer, "generator-optimizer-checkpoint.pt");
        torch::save(discriminator, "discriminator-checkpoint.pt");
        torch::save(
            discriminator_optimizer, "discriminator-optimizer-checkpoint.pt");
        // Sample the generator and save the images.
        torch::Tensor samples = generator->forward(torch::randn(
            {kNumberOfSamplesPerCheckpoint, kNoiseSize, 1, 1}, device));
        torch::save(
            samples,
            torch::str("dcgan-sample-", checkpoint_counter, ".pt"));
        std::cout << "\n-> checkpoint " << ++checkpoint_counter << '\n';
      }
    }
  }

  std::cout << "Training complete!" << std::endl;
}
I save samples from time to time and plot the result of feeding noise to the generator. The problem is that in the MNIST example the results are correct, but in my case each output picture shows something like 9 smaller pictures with faces instead of one (see the attached picture).
How is it possible that the generator is outputting a correct shape but with 9 almost identical faces instead of one?
I am trying to get the color of each sample during Cycles rendering. In order to do so, I am calling the following function
void update_bcd_inputs(int x, int y, float sampleR, float sampleG, float sampleB){
float current_valueR = sStats->m_meanImage.getValue(bcd::PixelPosition(x, y), 0); // += sampleR;
float current_valueG = sStats->m_meanImage.getValue(bcd::PixelPosition(x, y), 1); // += sampleG;
float current_valueB = sStats->m_meanImage.getValue(bcd::PixelPosition(x, y), 2); // += sampleB;
sStats->m_meanImage.set(x, y, 0, current_valueR + sampleR);
sStats->m_meanImage.set(x, y, 1, current_valueG + sampleG);
sStats->m_meanImage.set(x, y, 2, current_valueB + sampleB);
}
in blender/intern/cycles/device/device_cpu.cpp
/// Renders tile.num_samples samples for every pixel of `tile`, starting at tile.start_sample,
/// and forwards the first three floats of each pixel's buffer entry, scaled by
/// 1/tile.num_samples, to update_bcd_inputs() after every kernel call.
///
/// NOTE(review): the values read back are whatever the kernel left in the render buffer; if the
/// kernel accumulates across samples these are running sums rather than single-sample colors - confirm.
void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
{
    float *render_buffer = (float*)tile.buffer;
    const int first_sample = tile.start_sample;
    const int last_sample = tile.start_sample + tile.num_samples;
    const float inv_weight = 1.0f/tile.num_samples;

    for(int sample = first_sample; sample < last_sample; sample++) {
        // Stop on cancellation unless the task asked for its queue to be finished.
        if((task.get_cancel() || task_pool.canceled()) && task.need_finish_queue == false)
            break;

        for(int y = tile.y; y < tile.y + tile.h; y++) {
            for(int x = tile.x; x < tile.x + tile.w; x++) {
                path_trace_kernel()(kg, render_buffer,
                                    sample, x, y, tile.offset, tile.stride);

                // Start of this pixel's entry in the buffer, in floats.
                int step = tile.offset + x + y*tile.stride;
                step *= kernel_data.film.pass_stride;
                const float *pixel = render_buffer + step;

                update_bcd_inputs(x, y,
                                  pixel[0]*inv_weight,
                                  pixel[1]*inv_weight,
                                  pixel[2]*inv_weight);
            }
        }

        tile.sample = sample + 1;
        task.update_progress(&tile, tile.w*tile.h);
    }
}
SamplesStatisticsImages sStats is an attribute of CPUDevice
/// Bundle of four float DeepImages holding per-pixel sample statistics.
/// Copy and move operations are the compiler-generated member-wise ones.
struct SamplesStatisticsImages
{
SamplesStatisticsImages() = default;
/// Sized constructor; its definition is not part of this excerpt.
/// NOTE(review): presumably allocates the four images for an i_width x i_height
/// picture with i_nbOfBins histogram bins - confirm against the definition.
SamplesStatisticsImages(int i_width, int i_height, int i_nbOfBins);
SamplesStatisticsImages(const SamplesStatisticsImages&) = default;
SamplesStatisticsImages(SamplesStatisticsImages&&) = default;
SamplesStatisticsImages& operator=(const SamplesStatisticsImages&) = default;
SamplesStatisticsImages& operator=(SamplesStatisticsImages&&) = default;
// NOTE(review): the meaning of each image is inferred from its name - confirm.
DeepImage<float> m_nbOfSamplesImage;
// Accumulated by update_bcd_inputs(), which reads and writes channels 0, 1 and 2 of a pixel.
DeepImage<float> m_meanImage;
DeepImage<float> m_covarImage;
DeepImage<float> m_histoImage;
};
Here are the files DeepImage.hpp and DeepImage.h.
The problem is that the call to update_bcd_inputs makes Blender crash without even trying to render an image. I just change the renderer to Cycles, try to add a new material, and boom, it crashes.
I figured out it is the set function that is the problem (when I remove it, the app doesn't crash).
Can someone explain why? I don't have strong C++ skills, so I must be missing something.
Here is the crash log as well.
Thanks !
Edit: more details
Here is the modified device_cpu.cpp.
The sStats pointer is initialized with nullptr in the CPUDevice constructor
CPUDevice(DeviceInfo& info_, Stats &stats_, bool background_)
: Device(info_, stats_, background_),
texture_info(this, "__texture_info", MEM_TEXTURE),
histoParams(),
#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
...
#undef REGISTER_KERNEL
{
...
sStats = nullptr;
}
and then allocated in
void thread_render(DeviceTask& task)
{
...
sStats = new bcd::SamplesStatisticsImages(task.w, task.h, histoParams.m_nbOfBins);
...
}
histoParams is an attribute of CPUDevice as well.
/// Histogram settings; a default-constructed instance carries the defaults
/// given by the member initializers below.
struct HistogramParameters
{
    /// Leaves every field at its default. Kept user-provided so the struct
    /// stays a non-aggregate.
    HistogramParameters() {}

    int m_nbOfBins = 20;
    float m_gamma = 2.2f; ///< exponent for the exponential size increase of histogram bins
    float m_maxValue = 2.5f;
};