Random123 generating random numbers for opencl using visual studio - c++

http://www.thesalmons.org/john/random123/releases/1.00/docs/index.html
I have a hard time looking at the example for opencl and random123 as im new to OpenCL and i am not sure how I can use the provided information when im using Visual Studio 2010.
Anyone who can compose a guide for generating random numbers with the above lib and using visual studio 2010.
UPDATE:
I solved it as following and are now wondering how do I change the seed such i get random numbers at each run.
int main(int argc, char **argv)
{
const char *kernelname = "counthits";
unsigned count =10000;
cl_int err;
cl::Context cl_context;
cl::Program program;
cl::Kernel cl_kernel;
cl::Buffer cl_out;
cl::CommandQueue cl_queue;
size_t i, nthreads, hits_sz;
size_t cores, work_group_size;
cl_uint2 * hits_host;
double d = 0.; // timer
d = timer(&d);
progname = argv[0];
std::vector< cl::Platform > platformList;
CHECK(cl::Platform::get(&platformList));
CHECKERR( cl_context = createCLContext(CL_DEVICE_TYPE_GPU,cl_vendor::VENDOR_AMD, &err) );
std::vector<cl::Device> devices;
CHECKERR( devices = cl_context.getInfo<CL_CONTEXT_DEVICES>(&err) );
size_t length = 0;
const char * sourceStr = loadFileToString("pi_opencl_kernel.ocl","",&length);
cl::Program::Sources sources(1, std::make_pair(sourceStr, length));
program = cl::Program(cl_context, sources);
CHECK( program.build(devices,"-I D:\\libs\\Random123\\1.06\\include") );
CHECKERR(work_group_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&err) );
CHECKERR(cores = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&err) );
cores *= 16*4; //Tahiti.
if (work_group_size > 64) work_group_size /= 2;
nthreads = cores * work_group_size*32; //2048*128 = 262144
if (count == 0)
count = NTRIES/nthreads; //38
printf("Count: %lu\n",count);
hits_sz = nthreads * sizeof(hits_host[0]);//2097152
CHECKNOTZERO(hits_host = (cl_uint2 *)malloc(hits_sz));
CHECKERR ( cl_out = cl::Buffer( cl_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hits_sz, hits_host, &err));
CHECKERR ( cl_kernel = cl::Kernel(program,kernelname,&err) );
CHECK ( cl_kernel.setArg( 0, count) );
CHECK ( cl_kernel.setArg( 1, cl_out) );
CHECKERR (cl_queue = cl::CommandQueue(cl_context, devices[0], 0, &err) );
cl::Event event;
CHECK( cl_queue.enqueueNDRangeKernel(cl_kernel,cl::NullRange,cl::NDRange(nthreads), cl::NDRange(work_group_size), NULL, &event) );
event.wait();
CHECK( cl_queue.enqueueReadBuffer(cl_out, CL_TRUE, 0,hits_sz, hits_host) );
unsigned long hits = 0, tries = 0;
for (i = 0; i < nthreads; i++) {
#ifdef _DEBUG
printf("%lu %u %u\n", (unsigned long)i, hits_host[i].s[0], hits_host[i].s[1]);
#endif
hits += hits_host[i].s[0];
tries += hits_host[i].s[1];
}
return pi_check(hits, tries);
}
Kernel:
#include <Random123/threefry.h>
/*
* counthits generates 2*n x,y points and returns hits[tid] with
* the count of number of those points within the unit circle on
* each thread.
*/
__kernel void counthits(unsigned n, __global uint2 *hitsp) {
unsigned tid = get_global_id(0);
unsigned hits = 0, tries = 0;
threefry4x32_key_t k = {{tid, 0xdecafbad, 0xfacebead, 0x12345678}};
threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
while (tries < n) {
union {
threefry4x32_ctr_t c;
int4 i;
} u;
c.v[0]++;
u.c = threefry4x32(c, k);
long x1 = u.i.x, y1 = u.i.y;
long x2 = u.i.z, y2 = u.i.w;
if ((x1*x1 + y1*y1) < (1L<<62)) {
hits++;
}
tries++;
if ((x2*x2 + y2*y2) < (1L<<62)) {
hits++;
}
tries++;
}
hitsp[tid].x = hits;
hitsp[tid].y = tries;
}

I haven't tested this, but roughly speaking, something like the following:
Try changing the signature of counthits to:
_kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed)
Replace 0xdecafbad with seed
Add
char *seedstr = getenv("COUNTHITS_SEED");
unsigned seed = seedstr ? atoi(seedstr) : 0xdecafbad;
...
CHECK ( cl_kernel.setArg( 2, seed) );
to the main program (this setArg comes after setArg( 1, ...), and you can, of).

Related

Sound playback not working when using QueueAudio on C++

I am trying to play a sin wave sound with SDL2 by using the audio queue on C++. In order to do that, I have created a class "Speaker", which has a pushBeep function that is called every time a beep needs to be generated. I have created an AudioDevice successfully, and it is also successful when I do the QueueAudio to the device (I have checked on the debugger) but I can't seem to get any sound out of it.
I have tried changing the way I generate the samples in numerous ways, also, as I said previously, I have checked that the device is properly opened and the QueueAudio returns 0 for success.
This is the class
Speaker::Speaker()
{
SDL_AudioSpec ds;
ds.freq = Speaker::SPEAKER_FREQUENCY;
ds.format = AUDIO_F32;
ds.channels = 1;
ds.samples = 4096;
ds.callback = NULL;
ds.userdata = this;
SDL_AudioSpec os;
this->dev = SDL_OpenAudioDevice(NULL, 0, &ds, &os, NULL);
std::cout << "DEVICE: " << this->dev << std::endl;
SDL_PauseAudioDevice(this->dev, 0);
}
Speaker::~Speaker()
{
SDL_CloseAudioDevice(this->dev);
}
void Speaker::pushBeep(double freq, int duration) {
int nSamples = duration * Speaker::SPEAKER_FREQUENCY / 1000;
float* samples = new float[nSamples];
double v = 0.0;
for (int idx = 0; idx < nSamples; idx++) {
//float value = (float)Speaker::SPEAKER_AMPLITUDE * std::sin(v * 2 * M_PI / Speaker::SPEAKER_FREQUENCY);
float value = 440.0;
samples[idx] = value;
v += freq;
}
int a = SDL_QueueAudio(this->dev, (void*)samples, nSamples * sizeof(float));
std::cout << a << std::endl;
delete[] samples;
samples = NULL;
}
And this is how I call it
Speaker s;
s.pushBeep(440.0, 1000);
When I try with the sin wave generation code (commented) it gives me a "double to float loss of precision" error. When I use the fixed value (not commented) it does not give the error, but it still does not work.
I expect the program to output the sound.
Couple of things you are missing, or maybe you didn't add to your code snippet. You didn't specify an audio callback so when you call SDL_QueueAudio(); it didn't know what to do with the data I'm pretty sure. And you weren't calling SDL_PauseAudioDevice() in your example with the delay.
#include <math.h>
#include <SDL2/SDL.h>
#include <SDL2/SDL_audio.h>
#include <iostream>
namespace AudioGen
{
const int AMPLITUDE = 1;
const int SAMPLE_RATE = 44000;
// Globals
float *in_buffer;
SDL_atomic_t callback_sample_pos;
SDL_Event event;
SDL_bool running = SDL_TRUE;
/**
* Structure for holding audio metadata such as frequency
*/
struct AudioData
{
int sampleNum;
float frequency;
};
void audio_callback(void *user_data, Uint8 *raw_buffer, int bytes)
{
float *buffer = (float*)raw_buffer;
AudioData &audio_data(*static_cast<AudioData*>(user_data));
int nSamples = bytes / 4; // For F32
std::cout << nSamples << std::endl;
for(int i = 0; i < nSamples; i++, audio_data.sampleNum++)
{
double time = (double)audio_data.sampleNum / (double)SAMPLE_RATE;
buffer[i] = (float)(AMPLITUDE * sin(2.0f * M_PI * audio_data.frequency * time));
}
}
int buffer_length;
void callback(void *user_data, Uint8 *raw_buffer, int bytes)
{
float *buffer = (float*)raw_buffer;
int nSamples = bytes/4;
auto local_sample_pos = SDL_AtomicGet(&callback_sample_pos);
for(int i = 0; i < nSamples; ++i)
{
// Stop running audio if all samples are finished playing
if(buffer_length == local_sample_pos)
{
running = SDL_FALSE;
break;
}
buffer[i] = in_buffer[local_sample_pos];
++local_sample_pos;
}
SDL_AtomicSet(&callback_sample_pos, local_sample_pos);
}
class Speaker
{
public:
Speaker()
{
SDL_Init(SDL_INIT_AUDIO);
SDL_AudioSpec ds;
ds.freq = SAMPLE_RATE;
ds.format = AUDIO_F32;
ds.channels = 1;
ds.samples = 4096;
ds.callback = callback;
ds.userdata = &ad; // metadata for frequency
SDL_AudioSpec os;
dev = SDL_OpenAudioDevice(NULL, 0, &ds, &os, SDL_AUDIO_ALLOW_FORMAT_CHANGE);
}
~Speaker()
{
SDL_CloseAudioDevice(dev);
SDL_Quit();
}
void pushBeep(float frequency, int duration)
{
ad.frequency = frequency; // set the frequency for the beep
SDL_PauseAudioDevice(dev, 0);
SDL_Delay(duration); // wait while sound is playing
SDL_PauseAudioDevice(dev, 1);
}
void pushBeep2(float frequency, int duration )
{
int nSamples = duration * SAMPLE_RATE / 1000;
in_buffer = new float[nSamples];
buffer_length = nSamples;
for (int idx = 0; idx < nSamples; idx++) {
double time = (double)idx / (double)SAMPLE_RATE;
in_buffer[idx] = (float)(AMPLITUDE * std::sin(2.0f * M_PI * frequency * time));
}
SDL_QueueAudio(dev, in_buffer, nSamples * sizeof(float));
SDL_PauseAudioDevice(dev, 0);
while(running){
while(SDL_PollEvent(&event)!=0);
}
delete[] in_buffer;
}
private:
SDL_AudioDeviceID dev;
AudioData ad;
int sampleNum = 0;
};
} // End of namespace AudioGen
int main(int argc, char *argv[])
{
AudioGen::Speaker speaker;
//speaker.pushBeep(440, 1000);
speaker.pushBeep2(440.0f, 1000);
return 0;
}

how to fill the "data field" of wavfile

Hi i am trying to record from a board and i have successfully record 4 seconds. Problem is when i try to record for more time, i got an error telling me that there not enough memory. my target is to record a 5 minutes file. Until now i have create a buffer named snIn[256] where are the samples. i send it to a big buffer of [16K * 4sec] and when it is full, i create the wav file.
#include "SAI_InOut.hpp"
#include "F746_GUI.hpp"
#include "Delay.hpp"
#include "WaveformDisplay.hpp"
#include "SDFileSystem.h"
#include "wavfile.h"
using namespace Mikami;
#define RES_STR_SIZE 0x20
#define WAVFILE_SAMPLES_PER_SECOND 16000
#define REC_TIME 4
//Create an SDFileSystem object
SDFileSystem sd("sd");
bool flag = 1;
int count = 0;
char *res_buf;
int rp = 0;
const int NUM_SAMPLES = WAVFILE_SAMPLES_PER_SECOND * REC_TIME;
Array<int16_t> my_buffer(NUM_SAMPLES);
int j = 0;
static const char *target_filename = "/sd/rectest.wav";
const int SEG_SIZE = 256;
int sent_array = 0;
int rec(const char *filename, Array<int16_t> my_buffer)
{
j = 0;
flag = 0;
sent_array = 0;
WavFileResult result;
wavfile_info_t info;
wavfile_data_t data;
WAVFILE_INFO_AUDIO_FORMAT(&info) = 1;
WAVFILE_INFO_NUM_CHANNELS(&info) = 1;
WAVFILE_INFO_SAMPLE_RATE(&info) = WAVFILE_SAMPLES_PER_SECOND;
WAVFILE_INFO_BITS_PER_SAMPLE(&info) = 16;
WAVFILE_INFO_BYTE_RATE(&info) = WAVFILE_INFO_NUM_CHANNELS(&info) * WAVFILE_INFO_SAMPLE_RATE(&info) * (WAVFILE_INFO_BITS_PER_SAMPLE(&info) / 8);
WAVFILE_INFO_BLOCK_ALIGN(&info) = 2;
WAVFILE *wf = wavfile_open(filename, WavFileModeWrite, &result);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result;
} else printf ("Open file success \r\n");
rp = 0;
WAVFILE_DATA_NUM_CHANNELS(&data) = 1;
result = wavfile_write_info(wf, &info);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Write info success \r\n");
while ( rp < NUM_SAMPLES ) {
WAVFILE_DATA_CHANNEL_DATA(&data, 0) = my_buffer[rp];
result = wavfile_write_data(wf, &data);
rp += 1;
}
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Write Data file success \r\n");
result = wavfile_close(wf);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf , RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Close file success \r\n");
//UnMount the filesystem
sd.unmount();
printf("Success rec !\r\n");
return 0;
}
int main()
{
//Mount the filesystem
sd.mount();
const float MAX_DELAY = 0.5f; // 最大遅延,単位:秒
const int FS = I2S_AUDIOFREQ_16K; // 標本化周波数: 16 kHz
const uint32_t MAX_ARRAY_SIZE = (uint32_t)(MAX_DELAY*FS);
SaiIO mySai(SaiIO::BOTH, 256, FS, INPUT_DEVICE_DIGITAL_MICROPHONE_2);
Label myLabel(185, 10, "Delay System", Label::CENTER, Font16);
// ButtonGroup: "ON", "OFF"
const uint16_t BG_LEFT = 370;
const uint16_t BG_WIDTH = 100;
const uint16_t BG_HEIGHT = 45;
ButtonGroup onOff(BG_LEFT, 40, BG_WIDTH/2, BG_HEIGHT,
2, (string[]){"ON", "OFF"}, 0, 0, 2, 1);
const uint16_t SB_LEFT = BG_LEFT - 320;
const uint16_t SB_WIDTH = 270;
const uint16_t SB_Y0 = 240;
char str[20];
sprintf(str, " %3.1f [s]", MAX_DELAY);
SeekBar barDelay(SB_LEFT, SB_Y0, SB_WIDTH,
0, MAX_ARRAY_SIZE, 0, "0", "", str);
NumericLabel<float> labelDelay(SB_LEFT+SB_WIDTH/2, SB_Y0-40, "DELEY: %4.2f", 0, Label::CENTER);
DelaySystem delaySystem(MAX_ARRAY_SIZE);
WaveformDisplay displayIn(*GuiBase::GetLcdPtr(), SB_LEFT+7, 70, 256, 9,LCD_COLOR_WHITE, LCD_COLOR_CYAN,GuiBase::ENUM_BACK);
Label inLabel(SB_LEFT-30, 65, "IN");
WaveformDisplay displayOut(*GuiBase::GetLcdPtr(), SB_LEFT+7, 130, 256, 9,LCD_COLOR_WHITE, LCD_COLOR_CYAN,GuiBase::ENUM_BACK);
Label outLabel(SB_LEFT-30, 125, "OUT");
int runStop = 1;
Array<int16_t> snIn(mySai.GetLength());
Array<int16_t> snOut(mySai.GetLength());
mySai.RecordIn();
mySai.PlayOut();
mySai.PauseOut();
while (true)
{
// On/OFF
int num;
if (onOff.GetTouchedNumber(num))
if (runStop != num)
{
if (num == 0) mySai.ResumeOut();
else mySai.PauseOut();
runStop = num;
}
if (mySai.IsCompleted())
{
for (int n=0; n<mySai.GetLength() ; n++)
{
int16_t xL, xR;
mySai.Input(xL,xR);
int16_t xn = xL + xR;
snIn[n] = xn;
my_buffer[j] = xn;
j++;
if (j == NUM_SAMPLES && flag == 1) {
rec (target_filename , my_buffer); }
int16_t yn = delaySystem.Execute(xn);
mySai.Output(yn, yn);
snOut[n] = yn;
}
mySai.Reset();
displayIn.Execute(snIn);
}
}
}
I thought about a possible solution, to fill directly the "data field" of the wavefile with the snIn[256] buffer (instead of using my_buffer) again and again and at the end close the wavfile. Please let me know what you think about that and other solutions
things to note: 1) while a write operation is being performed, more data is still coming in.
At the very least I would double buffer that data, so can be writing one buffer while the other one fills.
Usually this means using an interrupt to collect the samples (into which ever buffer is currently being filed.)
the foreground program waits for the current buffer to be 'full', then initiates write operation.,
then waits again for a buffer to be 'full'
The interrupt function tracks which buffer is being filled and the current index into that buffer. When a buffer is full, set a 'global' status to let the foreground program know which buffer is ready to be written.
The foreground program writes the buffer, then resets the status for that buffer.

OpenCl: sample float4 program - Segmentation fault (core dumped)

It is simple program that read two float4 vectors from files then calculate sum of opposite numbers.
I couldn't find the problem:
MAIN file:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
void checkError(cl_int err, const char *operation)
{
if (err != CL_SUCCESS)
{
fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
exit(1);
}
}
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
tempAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
tempBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
}
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
{
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
}
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
Kernel:
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
__global float4 *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int number_of_points = 16;
outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];
}
The first input files: A.txt
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
The second input file B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
Thanks in advance
You are computing your array indices in your kernel in a fairly strange manner:
i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3
Think about what this actually translates to for different values of i (assuming number_of_points=16):
i array indices (i*16 + (0,1,2,3))
--------------------------------------
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
...
etc
This is surely not what you wanted! Your sample code appears to just be trying to perform a vectorised vector addition. If that's the case, your kernel code just needs to look something like this:
__kernel void vecadd(__global float4 *inputA,
__global float4 *inputB,
__global float4 *output)
{
int i = get_global_id(0);
output[i] = inputA[i] + inputB[i];
}
This works because were are performing the same operation to each element of the vector. If you have a kernel that needs to use these elements separately, you would write code like this:
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component

C++: Convert float[] to unsigned char* or BYTE*

I'm developing a project where I need to convert PCM 16-bits 2 channels sound into a IEEE Float 32-bits 2 channels.
To do this I'm using the following code:
void CAudioConverter::ConvI16ToF32(BYTE* pcmFrom, BYTE* floatTo, int length)
{
short* src = reinterpret_cast<short*>(pcmFrom);
float* dst = reinterpret_cast<float*>(floatTo);
for (int n = 0; n < length; n++)
{
dst[n] = static_cast<float>(src[n]) / 32768.0f;
}
}
I have initialized the variable __pcm32_bytesPerFrame with:
WAVEFORMATEX* closestFormat;
ws->default_pb_dev->GetMixFormat(&closestFormat);
__pcm32_bytesPerFrame = closestFormat->nAvgBytesPerSec * (prm->samples_per_frame * 1000 / (prm->clock_rate * closestFormat->nChannels)) / 1000;
strm->pb_max_frame_count is:
hr = ws->default_pb_dev->GetBufferSize(&ws->pb_max_frame_count);
I have a while loop in a dedicated thread the does something like:
hr = strm->default_pb_dev->GetCurrentPadding(&padding);
incoming_frame = __pcm32_bytesPerFrame / 4;
frame_to_render = strm->pb_max_frame_count - padding;
if (frame_to_render >= incoming_frame)
{
frame_to_render = incoming_frame;
} else {
/* Don't get new frame because there's no space */
frame_to_render = 0;
}
if (frame_to_render > 0)
{
pjmedia_frame frame;
hr = strm->pb_client->GetBuffer(frame_to_render, &cur_pb_buf);
if (FAILED(hr)) {
continue;
}
void* destBuffer = (void*)malloc(strm->bytes_per_frame*frame_to_render*sizeof(pj_uint16_t));
if (strm->fmt_id == PJMEDIA_FORMAT_L16) {
/* PCM mode */
frame.type = PJMEDIA_FRAME_TYPE_AUDIO;
frame.size = strm->bytes_per_frame;
frame.timestamp.u64 = strm->pb_timestamp.u64;
frame.bit_info = 0;
frame.buf = destBuffer;
}
status = (*strm->pb_cb)(strm->user_data, &frame);
CAudioConverter* conv = new CAudioConverter();
conv->ConvI16ToF32((BYTE*)destBuffer, cur_pb_buf, frame_to_render);
hr = strm->pb_client->ReleaseBuffer(frame_to_render, 0);
(...)
But, to send the sound to the WASAPI capture buffer I need a BYTE*.
How can I fill my 'floatTo' argument?
Any ideas?
Thanks
What about this:
void CAudioConverter::ConvI16ToF32(BYTE* pcmFrom, BYTE* floatTo, int length)
{
short* src = reinterpret_cast<short*>(pcmFrom);
float* dst = reinterpret_cast<float*>(floatTo);
for (int n = 0; n < length; n++)
{
dst[n] = static_cast<float>(src[n]) / 32768.0f;
}
}
Additionally make sure length indicates the number of elments in pcmFrom and floatTo, and not the number of bytes allocated. In you case pcmFrom should have allocated length*2 bytes and floatTo needs room for length*4 bytes.

Using OpenGL and Cuda get a run time error for an Access violation reading location

I would just like to start by saying I am new to CUDA and OpenGL. I get the above runtime error when it runs glutMainLoop() and the mainloop runs glDrawPixels(). I have looked everywhere to figure out why this isn't working any help would be appreciated.
#include<stdio.h>
#include<stdlib.h>
#include<gl/glut.h>
#include<stb_image.c>
#include<cuda.h>
#include<cuda_gl_interop.h>
#include<gl/glext.h>
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
system("PAUSE");
//exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define DIM 512
#define GET_PROC_ADDRESS( str ) wglGetProcAddress( str )
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;
GLuint bufferObj; /* how OpenGL calls the buffer */
cudaGraphicsResource *resource; /* how CUDA calls this same buffer */
uchar4* devPtr; /* friendly pointer to the handles above */
const int size = DIM * DIM;
struct cuComplex
{
float r;
float i;
__device__ cuComplex(float a, float b) : r(a), i(b){ }
__device__ float magnitude2(void)
{
return r * r + i * i;
}
__device__ cuComplex operator *(const cuComplex & a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex & a)
{
return cuComplex(r+a.r, i+a.i);
}
};
__device__ int julia (int x, int y)
{
const float scale = 1.5;
float jx = scale *(float)(DIM/2 - x)/(DIM/2);
float jy = scale *(float)(DIM/2 - y)/(DIM/2);
cuComplex c(-0.8, 0.156);
cuComplex a(jx,jy);
int i = 0;
for(i=0; i<200; i++)
{
a = a * a + c;
if(a.magnitude2() > 1000)
{
return 0;
}
}
return 1;
}
__global__ void kernel(uchar4 *ptr)
{
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y * gridDim.x * blockDim.x;
int juliaValue = julia(x,y);
ptr[offset].x = 255 * juliaValue;
ptr[offset].y = 0;
ptr[offset].z = 0;
ptr[offset].w = 255;
}
static void Draw( void ) {
glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE,0); //ERROR HAPPENS HERE
glutSwapBuffers();
}
void display_and_exit() {
cudaDeviceProp prop;
int dev;
memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 0;
HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );
/*
* Interoperability with OpenGL requires that the CUDA device
* be specified by cudaGLSetGLDevice() before any other runtime calls.
*/
HANDLE_ERROR( cudaGLSetGLDevice( dev ) );
/*
* a bug in the Windows GLUT implementation prevents us from
* passing zero arguments to glutInit()
*/
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize (DIM, DIM);
glutInitWindowPosition (100, 100);
glutCreateWindow("CUDA and OpenGL example" );
printf("OpenGL version: %s\n", glGetString(GL_VERSION));
dim3 grid(DIM, DIM);
kernel<<<grid,1>>>(devPtr);
glutDisplayFunc(Draw);
/*
* Get pointers to functions (glext.h)
*/
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
glutMainLoop();
}
int main() {
display_and_exit();
return(0);
}
Unless you are using a Pixel Buffer Object, then this is exactly the sort of behavior you should expect from glDrawPixels (..., 0). That last parameter is a pointer to pixel data, it should be non-NULL whenever the source of pixel data is client memory. It can be NULL if you have a Pixel Buffer Object, the same way that the pointer in a call to glVertexPointer (...) can be NULL when using a Vertex Buffer Object.
You have acquired the necessary function pointers to create a Pixel Buffer Object in your code, however have not actually created one. Thus, passing NULL to glDrawPixels (...) will cause the driver to dereference a NULL pointer.