OpenCL - results are not the same as CPU version

OpenCL - results are not the same as CPU version - c++

I want to use openCl for a pixel value comparison between two images. One image should be "transformed" by a transformation matrix.
1) I am facing one problem, that the result of the openCL version is not the same as the CPU version.
The pixel value difference is in my example images (imageA: all pixels are 5, imagesB: all pixel are 6) always one, so in total with 1000*1000 pixels it should be 1000000.
The CPU version is always correct, but the openCL version is always a little unprecise and also differs from time to time (e.g. 998895 or 998829).
2) Another issue I have is the runtime, because adding the difference of two compared pixel to a result variable, takes a long time. But my feelings says that it could be solved through another memory layout.
Any ideas for the problems I have?
Maybe also the way of using a two dimensional workingset leads to mistakes?
Thank you and kind regards
Hendrik
Here is the kernel:
Basically it gets two images and 700 transformation matrices (at the moment all are representing the identity).
__kernel void compliance(
__read_only image2d_t imageA,
__read_only image2d_t imageB,
__constant float *matrix,
__global int *result
)
{
for (int i = 0; i < 700; i++)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
float t1 = matrix[0 + i * 6];
float t2 = matrix[1 + i * 6];
float t3 = matrix[2 + i * 6];
float t4 = matrix[3 + i * 6];
float t5 = matrix[4 + i * 6];
float t6 = matrix[5 + i * 6];
//calculate the other coords of the comparing pixel
int x_new = x * t1 + y * t2 + 1 * t3;
int y_new = x * t4 + y * t5 + 1 * t6;
int a = (read_imagei(imageA, (int2)(x, y)).x);
int b = (read_imagei(imageB, (int2)(x_new, y_new)).x);
int diff = b - a;
//add every different of two compared pixels to the result
result[i] += diff;
}
}
Here is my host code:
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <opencv2\core.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\highgui.hpp>
using namespace std;
int main(int argc, char** argv) {
//700 transformation matrices
int numberMatrices = 700;
bool opencl = true;
//iamge width
int width = 1000;
//image height
int height = 1000;
//total number of pixels of one image
int size = width*height;
// Create two example images
const int LIST_SIZE = size;
int *imageA = new int[LIST_SIZE];
int *imageB = new int[LIST_SIZE];
for (int i = 0; i < LIST_SIZE; i++) {
//every pixel value of imageA is 5
imageA[i] = 5;
//every pixel value of imageA is 6
imageB[i] = 6;
}
//creation of n transformation matrices
const int MATRIX_SIZE = 6* numberMatrices;
float *indi = new float[MATRIX_SIZE];
//all the matrices are the same
for (int i = 0; i < numberMatrices; i++)
{
//identity matrix
indi[0 + i * 6] = 1;
indi[1 + i * 6] = 0;
indi[2 + i * 6] = 0;
indi[3 + i * 6] = 0;
indi[4 + i * 6] = 1;
indi[5 + i * 6] = 0;
}
//array to save the results of the comparison
const int RESULT_SIZE = numberMatrices;
int *result = new int[RESULT_SIZE];
if (opencl)
{
try {
// Get available platforms
vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cerr << "Platform number is: " << platforms.size() << std::endl;
std::string platformVendor;
platforms[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
std::cerr << "Platform is by: " << platformVendor << "\n";
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
cl::Context context(CL_DEVICE_TYPE_CPU, cps);
vector<cl::ImageFormat> format;
context.getSupportedImageFormats(CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format);
/* for (int i = 0; i < format.size(); i++)
{
cout << "Channel Data Type: " << format.at(i).image_channel_data_type
<< " Channel order: " << format.at(i).image_channel_order << endl;
}*/
// Get a list of devices on this platform
vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
for (int i = 0; i < devices.size(); i++)
{
cout << "Device: " << devices.at(i).getInfo<CL_DEVICE_NAME>() << endl;
cout << "DOUBLE FP: " << devices.at(i).getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>() << endl;
cout << "Image Max Height: " << devices.at(i).getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() << endl;
cout << "Image Support: " << devices.at(i).getInfo<CL_DEVICE_IMAGE_SUPPORT>() << endl;
cout << "Local Memory Size: " << devices.at(i).getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << endl;
cout << "Clock Frequency: " << devices.at(i).getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << endl;
cout << "CUs: " << devices.at(i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
cout << "Driver: " << devices.at(i).getInfo<CL_DRIVER_VERSION>() << endl;
cout << "Version: " << devices.at(i).getInfo<CL_DEVICE_VERSION>() << endl;
cout << "Work Group: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;
cout << "Items: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>();
cout << endl;
}
//Create opencl image
cl::Image2D clImage_A = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageA);
cl::Image2D clImage_B = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageB);
// Create a command queue and use the first device
cl::CommandQueue queue = cl::CommandQueue(context, devices[0]);
// Read kernel source file
std::ifstream sourceFile("difference.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));
// Make program of the source code in the context
cl::Program program = cl::Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
cl::Kernel kernel(program, "compliance");
// Create memory buffers
cl::Buffer buffer_matrix = cl::Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(float));
cl::Buffer buffer_result = cl::Buffer(context, CL_MEM_READ_WRITE, RESULT_SIZE * sizeof(int));
// Copy list of results to the memory buffers
queue.enqueueWriteBuffer(buffer_matrix, CL_TRUE, 0, MATRIX_SIZE * sizeof(float), indi);
// Set arguments to kernel
kernel.setArg(0, clImage_A);
kernel.setArg(1, clImage_B);
kernel.setArg(2, buffer_matrix);
kernel.setArg(3, buffer_result);
cl::Event event;
std::cout << "Start OpenCL processing.." << endl;
chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();
// Run the kernel n-times on specific ND range
for (int i = 0; i < 1; i++)
{
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange((size_t)width, (size_t)height),
cl::NDRange(1, 1),
NULL,
&event);
cout << i << " ";
event.wait();
}
chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now();
auto duration_opencl = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
std::cout << "OpenCL processing done.." << endl;
std::cout << "Start CPU Processing.." << endl;
// Read buffer_result into result
queue.enqueueReadBuffer(buffer_result, CL_TRUE, 0, RESULT_SIZE * sizeof(int), result);
//cpu version to calculate the difference between the two arryays
t1 = chrono::high_resolution_clock::now();
int different = 0;
int x_new;
int x;
for (int i = 0; i < numberMatrices; i++)
{
different = 0;
for (int n = 0; n < LIST_SIZE; n++)
{
x = imageA[n];
x_new = x;;
int a = imageA[x];
int b = imageB[x_new];
int diff = imageB[x_new] - imageA[x];
different += diff;
}
}
t2 = chrono::high_resolution_clock::now();
auto duration_cpu = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
std::cout << "CPU processing done.." << endl;
//output of the results
std::cout << "opencl: diff " << result[0] << endl;
std::cout << "Runtime opencl: " << duration_opencl << endl;
std::cout << "CPU: diff " << different << endl;
std::cout << "Runtime CPU: " << duration_cpu << endl;
double times = (double)duration_cpu / (double)duration_opencl;
std::cout << "OpenCL is " << times << " times faster!!!" << endl;
char c;
std::cin >> c;
}
catch (cl::Error error) {
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
char c;
std::cin >> c;
}
}
return 0;
}

Don't you think that you have race condition here case in this line result[i] += diff; of your OpenCL code your program do it in each work item simultaneously? So maybe it could be a problem.

The floating point arithmetics differ on the platforms. You are very likely seeing the effect of a hardware specific MAD optimization performed by the OpenCL compiler. As far as I know, disabling optimizations using -cl-opt-disable will not help in this case.

Related

Calling Armadillo function in OpenMP parallel for loop causes data corruption

I'm trying to parallelise a large linear system solve using Armadillo and OpenMP using arma::solve. Instead of directly calling the solver I would like to split the problems into smaller chunks of RHS vectors and call them in parallel in an OpenMP loop as shown in the Listing below.
This should result in the same answer since the multiple RHS are independent problems but I often get a few columns mangled when I run the code in this manner. I've even tried to enclose the write back in an omp critical section but it still fails.
Is Armadillo safe to run in this manner or am I missing something here?
// to compile the code run as
// g++ parals.cpp -I$ARMADILLO_INCLUDE_DIR -L$ARMADILLO_INCLUDE_DIR/../lib64
// -larmadillo -fopenmp -lopenblas -o parals.out
#define ARMA_DONT_USE_WRAPPER
#define ARMA_USE_BLAS
#define ARMA_USE_LAPACK
#include <armadillo>
#include <omp.h>
#include <iostream>
using namespace arma;
/*
* Solves LS for a single problem,
*
* ||AX - B ||_F^2
*/
int main(int argc, char *argv[]) {
int m = atoi(argv[1]); // A is of size m \times n
int n = atoi(argv[2]); // A is of size m \times n
int k = atoi(argv[3]); // B is of size m \times k
int seed = atoi(argv[4]); // seed for random inits
int chunk = atoi(argv[5]); // chunk size to group RHS
arma::arma_rng::set_seed(seed);
std::cout << "m::" << m << "::n::" << n << "::k::" << k
<< "::seed::" << seed << "::chunk::" << chunk
<< std::endl;
mat A(m,n,arma::fill::randu);
//std::cout << "A::" << std::endl << A << std::endl;
mat B(m,k,arma::fill::randu);
//std::cout << "B::" << std::endl << B << std::endl;
mat AtA = A.t() * A;
mat AtB = A.t() * B;
// solve sequentially
mat Xseq = arma::solve(AtA, AtB, arma::solve_opts::likely_sympd);
int num_chunks = AtB.n_cols / chunk;
if (num_chunks * chunk < AtB.n_cols) num_chunks++;
mat Xchunk(n,k,arma::fill::zeros);
for (int nt = 0; nt < num_chunks; nt++) {
int spanStart = nt * chunk;
int spanEnd = (nt + 1) * chunk - 1;
if (spanEnd > AtB.n_cols - 1) {
spanEnd = AtB.n_cols - 1;
}
mat rhs = AtB.cols(spanStart, spanEnd);
mat Y = arma::solve(AtA, rhs, arma::solve_opts::likely_sympd);
Xchunk.cols(spanStart, spanEnd) = Y;
}
bool chkchunk = arma::approx_equal(Xseq, Xchunk, "absdiff", 0.0001);
std::cout << "(Xseq == Xchunk) ? = " << chkchunk << std::endl;
if (!chkchunk) {
std::cout << "Xseq::" << std::endl << Xseq << std::endl;
std::cout << "Xchunk::" << std::endl << Xchunk << std::endl;
}
mat Xpar(n,k,arma::fill::zeros);
#pragma omp parallel for schedule(static,1)
for (int nt = 0; nt < num_chunks; nt++) {
int spanStart = nt * chunk;
int spanEnd = (nt + 1) * chunk - 1;
if (spanEnd > AtB.n_cols - 1) {
spanEnd = AtB.n_cols - 1;
}
mat rhs = AtB.cols(spanStart, spanEnd);
mat Y = arma::solve(AtA, rhs, arma::solve_opts::likely_sympd);
Xpar.cols(spanStart, spanEnd) = Y;
}
bool chkpar = arma::approx_equal(Xseq, Xpar, "absdiff", 0.0001);
std::cout << "(Xseq == Xpar) ? = " << chkpar << std::endl;
if (!chkpar) {
std::cout << "Xseq::" << std::endl << Xseq << std::endl;
std::cout << "Xpar::" << std::endl << Xpar << std::endl;
}
return 0;
}
A small driver script to hopefully reproduce my errors. My Armadillo instance is linked with OpenBLAS and I don't specify the OMP_NUM_THREADS variable.
#!/bin/bash
for i in {1..20}
do
echo $i
unset OMP_NUM_THREADS;
./parals.out 10 5 20 17 3
done

increase performance of opencl

I am trying to implement some image processing algorithm using opencl. But as i see when i use opencl it is taking around 0.5 ms to complet one process i.e one frame. Isn't there any way than i initialize the opencl parameters only once using class object declaration than only call a function run the main kernel? I tried like this by creating class but as i find context, device can't be declared and used seperately and needs to be created each time.
#include <CL/cl.hpp>
#include <chrono>
#include <iostream>
using namespace std::chrono;
using namespace std;
namespace Color {
enum Code {
FG_RED = 31,
FG_GREEN = 32,
FG_BLUE = 34,
FG_DEFAULT = 39,
BG_RED = 41,
BG_GREEN = 42,
BG_BLUE = 44,
BG_DEFAULT = 49
};
class Modifier {
Code code;
public:
Modifier(Code pCode) : code(pCode) {}
friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
return os << "\033[" << mod.code << "m";
}
};
} // namespace Color
class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
cl::Buffer buffer_A;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;
// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
}
void useOpenCL::backgroundSub() {
int A[size], B[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
auto start1 = high_resolution_clock::now();
cl::Context context({default_device});
cl::Program program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create buffers on the device
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * size);
// create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);
// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
/*cl::KernelFunctor
simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange);
simple_add(buffer_A,buffer_B,buffer_C);*/
// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
queue.finish();
int C[size];
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
/*std::cout<<" result: \n";
for(int i=0;i<size;i++){
std::cout<<C[i]<<"\t";
}*/
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
int main() {
useOpenCL img;
while (true) {
img.backgroundSub();
}
return 0;
}
It is giving me below results:
Segmentation FPS=13.2557 Execution Time(sec)=0.075439
Segmentation FPS=15.7602 Execution Time(sec)=0.063451
Segmentation FPS=14.3872 Execution Time(sec)=0.069506
Segmentation FPS=12.7525 Execution Time(sec)=0.078416
Which is not good since fps is only 12, 13 fps. So how can i make this program faster?

Put the initialization part that you only need to call once in the beginning in the constructor. This initialization should contain ALL memory allocation, OpenCL C code compilation and any initial memory transfers from host to device:
useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;
// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
context = cl::Context({default_device});
program = cl::Program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create queue to which we will push commands for the device.
queue = cl::CommandQueue(context, default_device);
// create buffers on host
int A[size], B[size];
int C[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
// create buffers on the device
buffer_A = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_B = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_C = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
}
Therefore make context, program, queue, buffer_A, buffer_B, buffer_C member variables of your class useOpenCL. Especially the memory allocation and compilation take a long time, so do them only once and reuse the buffers.
class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_C;
cl::Context context;
cl::Program program;
cl::CommandQueue queue;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
Then only the kernel call and eventually memory transfers host<->device remain for every frame calculation:
void useOpenCL::backgroundSub() {
auto start1 = high_resolution_clock::now();
// write arrays A and B to the device (ONLY IF NECESSARY FOR EVERY FRAME)
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
queue.finish();
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
The latter code can be called over and over again and should be much faster than if you re-initialize everything over and over again. Also make sure that size is large enough, otherwise the GPU might not be utilized at its full potential and the latencies for host<->device memory transfers will make every frame disproportunately slower.

What would be a more efficient way of storing variables?

I am working on a music program that calls notes from the chromatic scale based on intervals. These interval variables (h - half step, w - whole step and wh -whole and a half step) will be used for determining scale incriments (Major = WWHWWWH) and will later be used to measure interval lengths across a vector of strings to potentially output measurements like "3 Whole Steps and a Half Step".
I'm wondering what would be the more efficient way to store the simple variables, as I would eventually like to make a cellphone app out of it and want it to be as easy on the battery/memory as possible. . And I am still learning. Here are my thoughts:
int H = 1;
int W = 2;
int WH = 3;
Int Fiv = 5;
Int Sev = 7;
or
int H = 1;
int W = H+H;
int WH = W + H;
int Fiv = WH+W;
int Sev = Fiv + W;
Int H = 1; int W = H*2; int WH = W+H; etc..
I'm primarily interested in how the differentiation of initialization will effect both memory and performance if at all?
I know I shouldn't have everything in main, but this is a work in progress, and I am obviously new to programming - so please look past the layout .. here is the code it's presently being used in..
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include <map>
const std::vector<std::string> st_sharps{"C","C#","D","D#","E","F","F#","G","G#","A","A#","B" };
const std::vector<std::string> st_flats{"C","Db","D","Eb","E","F","Gb","G","Ab","A","Bb","B" };
struct steps{ int maj = 0; int min = 0;} step;
constexpr int H = 1;
constexpr int W = 2;
constexpr int Tre = 3;
constexpr int Fif = 5;
constexpr int Sev = 7;
const int size = st_flats.size();
const std::vector<int> Major = { W, W, H, W, W, W, H };
struct circle{
std::stringstream sharp;
std::stringstream flat;
std::stringstream minor;
std::stringstream dimin; };
struct scales{
circle fifths;
std::stringstream maj;
std::stringstream min; } scale;
int main(){
//Circle of Fifths
for (int j = 0; j < size; j++){
int five = j * Sev;
scale.fifths.sharp << st_sharps[five % size] << " ";
scale.fifths.flat << st_flats[five % size] << " ";
scale.fifths.minor << st_sharps[((size - Tre) + five) % size] << " ";
scale.fifths.dimin << st_sharps[((size - H) + five) % size] << " ";
}
std::cout << "Circle of Fifths:\n";
std::cout << "Major >> Relative Minor >> Diminished " << std::endl;
std::cout << "Maj: " << scale.fifths.sharp.str() << std::endl;
std::cout << "Min: " << scale.fifths.minor.str() << std::endl;
std::cout << "Dim: " << scale.fifths.dimin.str() << std::endl;
std::cout << "\nflats: " << scale.fifths.flat.str() << "\n" << std::endl;
//Major and Minor Scales
for (int i = 0; i < Major.size(); i++) {
scale.maj << st_sharps[step.maj] << " ";
scale.min << st_flats[((size - Tre) + step.min) % size] << " ";
step.maj += Major[i];
step.min += Major[(i + Fif) % Major.size()];
}
std::cout << "C Major:\n" << scale.maj.str() << "\n" << std::endl;
std::cout << "A Minor:\n" << scale.min.str() << "\n" << std::endl;
return 0;
}

I'd choose a version that expresses "'W' is the double of 'H'" the best way. My preferred way would therefore be:
constexpr int H = 1;
constexpr int W = 2*H;
constexpr int WH = W+H;
Note that your version int W = H++ is not what you probably intend, since H++ is not equal to H+1; it is actually equal to int W = H; H = H + 1.

nanoflann orders of magnitude slower when using high dimensional data and large samples

I'm using the vector_of_vectors example in nanoflann to find the nearest neighbors to a 128 dimensional float vector.
When using 1 Million samples everything seems fast enough: Building the tree and building the index.
But When using 10 Million samples which is 10 times larger, the tree takes a LOT more time to build and also to index.
I did this example in Python/Numpy/cKdTree and it really wasn't this slow to build the tree and index.
Is my approach wrong?
#include <nanoflann.hpp>
using namespace nanoflann;
#include "KDTreeVectorOfVectorsAdaptor.h"
#include <ctime>
#include <cstdlib>
#include <iostream>
const int SAMPLES_DIM = 128;
typedef std::vector<std::vector<float>> my_vector_of_vectors_t;
void generateRandomPointCloud(my_vector_of_vectors_t& samples,
const size_t N = 1e7,
const size_t dim = 128,
const float max_range = 1.0)
{
std::cout << "Generating " << N << " random points...";
samples.resize(N);
for (size_t i = 0; i < N; i++)
{
samples[i].resize(dim);
for (size_t d = 0; d < dim; d++)
samples[i][d] = max_range * (rand() % 1000) / (1000.0);
}
std::cout << "done\n";
}
void kdtree_demo(const size_t nSamples = 1e7, const size_t dim = 128)
{
my_vector_of_vectors_t samples;
const float max_range = 1.0;
// Generate points:
generateRandomPointCloud(samples, nSamples, dim, max_range);
// Query point:
std::vector<float> query_pt(dim);
for (size_t d = 0; d < dim; d++)
query_pt[d] = max_range * (rand() % 1000) / (1000.0);
// construct a kd-tree index:
// Dimensionality set at run-time (default: L2)
// ------------------------------------------------------------
std::cout << "Constructing Kd Tree" << std::endl;
typedef KDTreeVectorOfVectorsAdaptor<my_vector_of_vectors_t, float> my_kd_tree_t;
my_kd_tree_t mat_index(dim /*dim*/, samples, 20 /* max leaf */);
std::cout << "Building Index" << std::endl;
mat_index.index->buildIndex();
std::cout << "Initializing Indexes" << std::endl;
// do a knn search
const size_t num_results = 3;
std::vector<size_t> ret_indexes(num_results);
std::vector<float> out_dists_sqr(num_results);
std::cout << "Initializing Resultset" << std::endl;
nanoflann::KNNResultSet<float> resultSet(num_results);
resultSet.init(&ret_indexes[0], &out_dists_sqr[0]);
std::cout << "Starting " << std::endl;
mat_index.index->findNeighbors(resultSet, &query_pt[0], nanoflann::SearchParams(10));
std::cout << "knnSearch(number or results=" << num_results << "): \n";
for (size_t i = 0; i < num_results; i++)
std::cout << "ret_index[" << i << "]=" << ret_indexes[i] << " out_dist_sqr=" << out_dists_sqr[i] << std::endl;
}
int main()
{
// Randomize Seed
srand(time(NULL));
kdtree_demo(1e7 /* samples */, SAMPLES_DIM /* dim */);
}

Draw Mandelbrot using OpenCl

I want to write a program to draw a Mandelbrot set to an image.
I'm using OpenCl and cl.hpp - wrapper to c++, but I don't know why this isn't working.
I try draw image with different width and height, but mostly I get white image with random colored pixel (when in kernel I try write to output hardcoded values) or black image.
I also wondering about passing image2d_t to the kernel and "write" pixels directly to the image but when I execute my kernel I get -52 error that I set invalid arguments..
Declaration of my kernel(using image2d_t) looks like this:
__kernel syf(__write_only image2d_t img)
and I set arguments like this:
cl_mem image= clCreateImage2D(context, CL_MEM_WRITE_ONLY, &outputFormat, ImageWidth, ImageHeight, 0, 0, &err);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image)
Could you look at the code and help me?
#define __CL_ENABLE_EXCEPTIONS
//#define __NO_STD_VECTOR
#include <ctime>
#include <fstream>
#include <iostream>
#include <exception>
#include "D:/Users/cuda/cuda/bmp.cpp"
#if defined (__APPLE__) || defined (MACOSX)
#include <OpenCl/cl.cpp>
#else
#include <CL/OpenCL.h>
#include <CL/cl.hpp>
#endif
const int N = 1024 * 1024 * 3;
const char * kernelSource =
"__kernel void prepareVector(__global char *output)"
"{"
" size_t xDimension = get_global_id(0); "
" size_t yDimension = get_global_id(1); "
" "
" size_t currentIndex = 3 * 1024.0f*yDimension + 3 * xDimension; "
" float xOriginal = 3.25f*((float) xDimension / 1024.0f) - 2.0f; "
" float yOriginal = 2.5f*((float) yDimension / 1024.0f) - 1.25f;"
" "
" int iteration = 0; "
" int maxIteration = 256; "
" float temp; "
" float x = 0.0f; "
" float y = 0.0f; "
" while (x*x + y*y <= 4.0f && iteration < maxIteration)"
" { "
" temp = x*x - y*y + xOriginal; "
" y = 2.0*x*y + yOriginal; "
" x = temp; "
" ++iteration; "
" } "
" "
" if (iteration == maxIteration) "
" { "
" iteration = 0; "
" } "
" output[currentIndex] = iteration; "
" output[currentIndex] = iteration;"
" output[currentIndex] = iteration;"
"}";
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
std::vector<cl::Kernel> allKernels;
cl::Program program;
cl_int cli_err_msg;
char *host_out_c;
int main(int argc, char* argv [])
{
try
{
size_t dimensionSize = 2;
cl::NDRange *globalSize = new cl::NDRange( 6, 8 );
cl::NDRange *localSize = new cl::NDRange( 3, 4 );
cl::NDRange *offsetSize = new cl::NDRange( 0, 0 );
host_out_c = new char[N];
cl::Platform::get(&platforms);
platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
cl_context_properties properties [] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) (platforms[0])(), 0
};
//cl_context context = clCreateContext(0, devices.size(), devices, NULL, NULL, NULL);
cl::Context context(CL_DEVICE_TYPE_ALL, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program::Sources source(1, std::make_pair(kernelSource, strlen(kernelSource)));
program = cl::Program(context, source);
program.build(devices);
cl::Kernel kernel(program, "prepareVector", &cli_err_msg);
cl::Buffer device_out_c(context, CL_MEM_WRITE_ONLY, N*sizeof(char));
cl::Event event;
cl::CommandQueue queue(context, devices[0], 0, &cli_err_msg);
kernel.setArg(0, device_out_c);
queue.enqueueNDRangeKernel(kernel, *offsetSize, *globalSize, *localSize, NULL, &event);
queue.enqueueReadBuffer(device_out_c, true, 0, N*sizeof(char), (void*) host_out_c);
//printArray("kernel output:\n", host_out_c);
write_bmp("lel.bmp", 1024, 1024, host_out_c);
}
catch (cl::Error e)
{
std::cout << "Status: "<< program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]) << std::endl;
std::cout << "Options: "<< program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]) << std::endl;
std::cout << "Log: "<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
std::cout << e.what() << ": Error code: " << e.err() << std::endl;
}
return 0;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

OpenCL - results are not the same as CPU version - c++

Don't you think that you have race condition here case in this line result[i] += diff; of your OpenCL code your program do it in each work item simultaneously? So maybe it could be a problem.

The floating point arithmetics differ on the platforms. You are very likely seeing the effect of a hardware specific MAD optimization performed by the OpenCL compiler. As far as I know, disabling optimizations using -cl-opt-disable will not help in this case.

Related

Calling Armadillo function in OpenMP parallel for loop causes data corruption

increase performance of opencl

What would be a more efficient way of storing variables?

nanoflann orders of magnitude slower when using high dimensional data and large samples

Draw Mandelbrot using OpenCl

Categories

Resources