I am having issues with Handles. I have Bytebeat (music in bytes) playing inside of a DWORD WINAPI function. When I try to terminate and close the thread, it straight up gives me the error in the title. This is my code:
#include <windows.h>
#pragma comment(lib, "Winmm.lib")
DWORD WINAPI bytebeat1(LPVOID) {
while (1) {
HWAVEOUT hwo = 0;
WAVEFORMATEX wfx = { WAVE_FORMAT_PCM, 1, 11000, 11000, 1, 8, 0 };
waveOutOpen(&hwo, WAVE_MAPPER, &wfx, 0, 0, CALLBACK_NULL);
char buffer[11000 * 6];
for (DWORD t = 0; t < sizeof(buffer); t++)
buffer[t] = static_cast<char>(t & t + t / 256) - t * (t >> 15) & 64;
WAVEHDR hdr = { buffer, sizeof(buffer), 0, 0, 0, 0, 0, 0 };
waveOutPrepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutWrite(hwo, &hdr, sizeof(WAVEHDR));
waveOutUnprepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutClose(hwo);
Sleep(6000);
}
}
DWORD WINAPI bytebeat2(LPVOID) {
while (1) {
HWAVEOUT hwo = 0;
WAVEFORMATEX wfx = { WAVE_FORMAT_PCM, 1, 8000, 8000, 1, 8, 0 };
waveOutOpen(&hwo, WAVE_MAPPER, &wfx, 0, 0, CALLBACK_NULL);
char buffer[8000 * 6];
for (DWORD t = 0; t < sizeof(buffer); t++)
buffer[t] = static_cast<char>(t, t / 5) >> t / 25 & t / 55 ^ t & 255 ^ (t / 150) ^ 2508025 * 24240835810 & (t / 100) * t / 6000 ^ 5000 * t / 2500 ^ 25 * t / 24;
WAVEHDR hdr = { buffer, sizeof(buffer), 0, 0, 0, 0, 0, 0 };
waveOutPrepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutWrite(hwo, &hdr, sizeof(WAVEHDR));
waveOutUnprepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutClose(hwo);
Sleep(6000);
}
}
int main() {
HANDLE beat1 = CreateThread(0, 0, bytebeat1, 0, 0, 0);
Sleep(6000);
TerminateThread(beat1, 0); CloseHandle(beat1);
Sleep(1000);
HANDLE beat2 = CreateThread(0, 0, bytebeat2, 0, 0, 0);
Sleep(6000);
TerminateThread(beat2, 0); CloseHandle(beat2);
}
I do not know why this is happening. The only fix is compiling it with G++ but I want it so I could just build it. Any help is appreciated. Thanks!
As the documentation makes clear, you can't use TerminateThread this way. Instead, replace the calls to Sleep with an interruptible sleep function that will terminate the thread cleanly and safely if requested to do so.
When you call TerminateThread, you are basically force-crashing your threads. They still have their own stack allocated and handles to Windows resources. They aren't cleaned up properly, causing your crash.
Here's a simple example of how to close your threads without any error. In a real-world scenario this is an unprofessional solution, but it shows the bare minimum that you need to do.
#include <windows.h>
#pragma comment(lib, "Winmm.lib")
volatile bool quit1 = false;
volatile bool quit2 = false;
DWORD WINAPI bytebeat1(LPVOID) {
while (!quit1) {
HWAVEOUT hwo = 0;
WAVEFORMATEX wfx = { WAVE_FORMAT_PCM, 1, 11000, 11000, 1, 8, 0 };
waveOutOpen(&hwo, WAVE_MAPPER, &wfx, 0, 0, CALLBACK_NULL);
char buffer[11000 * 6];
for (DWORD t = 0; t < sizeof(buffer); t++)
buffer[t] = static_cast<char>(t & t + t / 256) - t * (t >> 15) & 64;
WAVEHDR hdr = { buffer, sizeof(buffer), 0, 0, 0, 0, 0, 0 };
waveOutPrepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutWrite(hwo, &hdr, sizeof(WAVEHDR));
waveOutUnprepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutClose(hwo);
Sleep(6000);
}
return 0;
}
DWORD WINAPI bytebeat2(LPVOID) {
while (!quit2) {
HWAVEOUT hwo = 0;
WAVEFORMATEX wfx = { WAVE_FORMAT_PCM, 1, 8000, 8000, 1, 8, 0 };
waveOutOpen(&hwo, WAVE_MAPPER, &wfx, 0, 0, CALLBACK_NULL);
char buffer[8000 * 6];
for (DWORD t = 0; t < sizeof(buffer); t++)
buffer[t] = static_cast<char>(t, t / 5) >> t / 25 & t / 55 ^ t & 255 ^ (t / 150) ^ 2508025 * 24240835810 & (t / 100) * t / 6000 ^ 5000 * t / 2500 ^ 25 * t / 24;
WAVEHDR hdr = { buffer, sizeof(buffer), 0, 0, 0, 0, 0, 0 };
waveOutPrepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutWrite(hwo, &hdr, sizeof(WAVEHDR));
waveOutUnprepareHeader(hwo, &hdr, sizeof(WAVEHDR));
waveOutClose(hwo);
Sleep(6000);
}
return 0;
}
int main() {
HANDLE beat1 = CreateThread(0, 0, bytebeat1, 0, 0, 0);
Sleep(6000);
quit1 = true;
WaitForSingleObject(beat1, INFINITE);
CloseHandle(beat1);
Sleep(1000);
HANDLE beat2 = CreateThread(0, 0, bytebeat2, 0, 0, 0);
Sleep(6000);
quit2 = true;
WaitForSingleObject(beat2, INFINITE);
CloseHandle(beat2);
}
Related
When I use AUGraph to implemention record and play, I want to get audio data through RenderCallback.There is a problem in remote io unit callback with error kAudioConverterErr_InvalidInputSize(1768846202),I don't know how to solve it.
Is there any other way to get audio data?
NSAssert(NewAUGraph(&_auGraph) == noErr, #"NewAUGraph failed");
const AudioComponentDescription ioComponentDesc = {
.componentType = kAudioUnitType_Output,
.componentSubType = kAudioUnitSubType_RemoteIO,
.componentManufacturer = kAudioUnitManufacturer_Apple,
.componentFlags = 0,
.componentFlagsMask = 0
};
AUGraphAddNode(_auGraph, &ioComponentDesc, &_ioNode);
const AudioComponentDescription effectComponent = {
.componentType = kAudioUnitType_Effect,
.componentSubType = kAudioUnitSubType_Reverb2,
.componentManufacturer = kAudioUnitManufacturer_Apple,
.componentFlags = 0,
.componentFlagsMask = 0
};
AUGraphAddNode(_auGraph, &effectComponent, &_reverbNode);
const AudioComponentDescription inConvertComponentDesc = {
.componentType = kAudioUnitType_FormatConverter,
.componentSubType = kAudioUnitSubType_AUConverter,
.componentManufacturer = kAudioUnitManufacturer_Apple,
.componentFlags = 0,
.componentFlagsMask = 0
};
AUGraphAddNode(_auGraph, &inConvertComponentDesc, &_inConvertNode);
const AudioComponentDescription outConvertComponentDesc = {
.componentType = kAudioUnitType_FormatConverter,
.componentSubType = kAudioUnitSubType_AUConverter,
.componentManufacturer = kAudioUnitManufacturer_Apple,
.componentFlags = 0,
.componentFlagsMask = 0
};
AUGraphAddNode(_auGraph, &outConvertComponentDesc, &_outConvertNode);
NSAssert(AUGraphOpen(_auGraph) == noErr, #"AUGraphOpen failed");
AUGraphNodeInfo(_auGraph, _ioNode, NULL, &_ioUnit);
AUGraphNodeInfo(_auGraph, _reverbNode, NULL, &_reverbUnit);
AUGraphNodeInfo(_auGraph, _inConvertNode, NULL, &_inConvertUnit);
AUGraphNodeInfo(_auGraph, _outConvertNode, NULL, &_outConvertUnit);
UInt32 enable = 1;
AudioUnitElement inputBus = 1;
AudioUnitElement outputBus = 0;
AudioUnitSetProperty(_ioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Input, inputBus, &enable, sizeof(enable));
AudioUnitSetProperty(_ioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Output, outputBus, &enable, sizeof(enable));
AudioStreamBasicDescription absd;
memset(&absd, 0, sizeof(absd));
absd.mSampleRate = 44100;
absd.mFormatID = kAudioFormatLinearPCM;
absd.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked;
absd.mBitsPerChannel = 16;
absd.mChannelsPerFrame = 2;
absd.mBytesPerFrame = absd.mBitsPerChannel / 8 * absd.mChannelsPerFrame;
absd.mFramesPerPacket = 1;
absd.mBytesPerPacket = absd.mBytesPerFrame;
AudioUnitSetProperty(_ioUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, inputBus, &absd, sizeof(absd));
AudioUnitSetProperty(_ioUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, outputBus, &absd, sizeof(absd));
AURenderCallbackStruct callback = {0};
callback.inputProc = audioUnitRenderCallback;
callback.inputProcRefCon = (__bridge void *)self;
AudioUnitSetProperty(_ioUnit,
kAudioUnitProperty_SetRenderCallback,
kAudioUnitScope_Input,
outputBus,
&callback,
sizeof(callback));
OSStatus status;
AudioStreamBasicDescription reverbABSD;
memset(&reverbABSD, 0, sizeof(reverbABSD));
reverbABSD.mSampleRate = 44100;
reverbABSD.mFormatID = kAudioFormatLinearPCM;
reverbABSD.mFormatFlags = kAudioFormatFlagsNativeFloatPacked | kLinearPCMFormatFlagIsNonInterleaved;
reverbABSD.mBitsPerChannel = 32;
reverbABSD.mChannelsPerFrame = 2;
reverbABSD.mBytesPerFrame = reverbABSD.mBitsPerChannel / 8;
reverbABSD.mFramesPerPacket = 1;
reverbABSD.mBytesPerPacket = reverbABSD.mBytesPerFrame;
AudioUnitSetProperty(_reverbUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 0, &reverbABSD, sizeof(reverbABSD));
AudioUnitSetProperty(_reverbUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, 0, &reverbABSD, sizeof(reverbABSD));
AudioUnitSetParameter(_reverbUnit, kReverb2Param_Gain, kAudioUnitScope_Global, 0, 20, 0);
AudioUnitSetParameter(_reverbUnit, kReverb2Param_DryWetMix, kAudioUnitScope_Global, 0, 50, 0);
status = AudioUnitSetProperty(_outConvertUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 0, &reverbABSD, sizeof(reverbABSD));
status = AudioUnitSetProperty(_outConvertUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, 0, &absd, sizeof(absd));
AUGraphConnectNodeInput(_auGraph, _ioNode, 1, _inConvertNode, 0);
AUGraphConnectNodeInput(_auGraph, _inConvertNode, 0, _reverbNode, 0);
AUGraphConnectNodeInput(_auGraph, _reverbNode, 0, _outConvertNode, 0);
static OSStatus audioUnitRenderCallback(void * inRefCon,
AudioUnitRenderActionFlags * ioActionFlags,
const AudioTimeStamp * inTimeStamp,
UInt32 inBusNumber,
UInt32 inNumberFrames,
AudioBufferList * __nullable ioData)
{
XLAudioCaptureGraph *THIS = (__bridge XLAudioCaptureGraph *)inRefCon;
memset(ioData->mBuffers[0].mData, 0, ioData->mBuffers[0].mDataByteSize);
OSStatus status = AudioUnitRender(THIS.outConvertUnit, ioActionFlags, inTimeStamp, 0, inNumberFrames, ioData);
}
The audioUnitRenderCallback status return kAudioConverterErr_InvalidInputSize.
I am trying to convert a wmf file to emf file. From what I've learned on Internet, the best solution looks like this.
BYTE* buffer;
HDC hdc = CreateMetaFileA(filename);
HMETAFILE hmf = CloseMetaFile(hdc);
UINT metasize = GetMetaFileBitsEx(hmf, 0, NULL);
buffer = (BYTE*)malloc(metasize);
HENHMETAFILE hEMF = SetWinMetaFileBits(metasize, buffer, NULL, NULL);
The idea here is to use CreateMetaFileA and CloseMetaFile to get HMETAFILE hmf.
Then I tried my code and the weird thing came. The handle hmf always points ??? in memory and the metasize is always 24 with different pictures. And hEMF is always None.
This is really sad because I spend my whole night on figuring out how to make the code work.
I do read a lot of materials including
http://math2.org/luasearch-2/luadist-extract/cdlua-5.2.dist/src/win32/wmf_emf.c
https://www-user.tu-chemnitz.de/~heha/viewzip.cgi/hs/wmfsave.zip/src/wmfsave.cpp?auto=CPP
Can anyone help me here? Thanks.
You need to initialize the METAFILEPICT structure.
Minimal example:
if (hmf) {
DWORD nSize = GetMetaFileBitsEx( hmf, 0, NULL );
if (nSize) {
BYTE *lpvData = new BYTE[nSize];
if (lpvData) {
DWORD dw = GetMetaFileBitsEx( hmf, nSize, lpvData );
if (dw) {
// Fill out a METAFILEPICT structure
mp.mm = MM_ANISOTROPIC;
mp.xExt = 1000;
mp.yExt = 1000;
mp.hMF = NULL;
// Get a reference DC
hDC = GetDC( NULL );
// Make an enhanced metafile from the windows metafile
hemf = SetWinMetaFileBits( nSize, lpvData, hDC, &mp );
// Clean up
ReleaseDC( NULL, hDC );
}
delete[] lpvData;
}
DeleteMetaFile( hmf );
}
My test code:
hdcMeta = CreateMetaFile(NULL);
hBrush = CreateSolidBrush(RGB(0, 0, 255));
Rectangle(hdcMeta, 0, 0, 100, 100);
MoveToEx(hdcMeta, 0, 0, NULL);
LineTo(hdcMeta, 100, 100);
MoveToEx(hdcMeta, 0, 100, NULL);
LineTo(hdcMeta, 100, 0);
SelectObject(hdcMeta, hBrush);
Ellipse(hdcMeta, 20, 20, 80, 80);
hmf = CloseMetaFile(hdcMeta);
UINT nSize = GetMetaFileBitsEx(hmf, 0, NULL);
Debug:
You can see nSize = 114
I suspect that you use CreateMetaFileA and CloseMetaFile to directly load the file name and return a handle to a Windows-format metafile is a wrong way.
Updated:
You can get the handle of WMF file in another way.
#include <windows.h>
#include <iostream>
#include <vector>
#pragma pack(1)
typedef struct tagWIN16RECT
{
WORD left;
WORD top;
WORD right;
WORD bottom;
} WIN16RECT;
typedef struct tagPLACEABLEMETAHEADER
{
DWORD key;
WORD hmf;
WIN16RECT bbox;
WORD inch;
DWORD reserved;
WORD checksum;
} PLACEABLEMETAHEADER;
#pragma pack()
HENHMETAFILE WINAPI ConvertWMFToEWMF(IN LPCWSTR lpszMetaFile)
{
HANDLE hFile = ::CreateFileW(
lpszMetaFile,
GENERIC_READ,
0,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
NULL);
if (hFile == NULL || hFile == INVALID_HANDLE_VALUE)
return NULL;
DWORD dwSize = ::GetFileSize(hFile, NULL);
std::vector<BYTE> data(dwSize);
DWORD dwRead;
BOOL bSuccess = ::ReadFile(hFile, &data[0], dwSize, &dwRead, NULL);
::CloseHandle(hFile);
HENHMETAFILE hEnhMetaFile = NULL;
if (bSuccess)
{
PLACEABLEMETAHEADER * hdr = (PLACEABLEMETAHEADER*)&data[0];
int iPlaceableHeaderSize = sizeof(PLACEABLEMETAHEADER);
int iOffset = 0;
if (hdr->key != 0x9AC6CDD7) //not placeable header
{
iOffset = 0; //offset remains zero
}
else
{
iOffset = iPlaceableHeaderSize; //file is offset with placeable windows metafile header
}
hEnhMetaFile = ::SetWinMetaFileBits(data.size(), &data[iOffset], NULL, NULL);
if (NULL == hEnhMetaFile)
{
DWORD dwError = GetLastError();
std::cout << "Failed with error code: " << dwError;
}
else
{
std::cout << "Success! Metafile opened and returned as enhanced metafile";
}
}
return hEnhMetaFile;
}
int main()
{
HENHMETAFILE hEMF = ConvertWMFToEWMF(L"C:\\Users\\strives\\Desktop\\AN00010.WMF");
HENHMETAFILE newHEMF = CopyEnhMetaFile(hEMF, L"new EMF.emf");
return 0;
}
This worked for me
CStatic * m_pictCtrl = (CStatic *)this->GetDlgItem(PICT_STATIC);
LPCSTR file = filePath;
ALDUSMFHEADER aldusmfHeader;
DWORD wBytesRead;
double xOri, xExt, yOri, yExt;
HANDLE fh = CreateFileA(file, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
ReadFile(fh, (void *)&aldusmfHeader, ALDUSMFHEADERSIZE, &wBytesRead, NULL);
xOri = aldusmfHeader.bbox.left;
xExt = aldusmfHeader.bbox.right - xOri;
if (aldusmfHeader.bbox.bottom < aldusmfHeader.bbox.top) {
yOri = aldusmfHeader.bbox.bottom;
yExt = aldusmfHeader.bbox.top - aldusmfHeader.bbox.bottom;
}
else {
yOri = aldusmfHeader.bbox.top;
yExt = aldusmfHeader.bbox.top - aldusmfHeader.bbox.bottom;
}
if (wBytesRead == -1 || wBytesRead < ALDUSMFHEADERSIZE)
{
AfxMessageBox(L" is not a placeable Windows metafile : it cannot be converted into EMF format.");
CloseHandle(fh);
return 0;
}
// Envelope in /100 cm
double Density = static_cast<double>(aldusmfHeader.inch);
double Top = static_cast<double>(aldusmfHeader.bbox.top) / Density;
double RawBottom = static_cast<double>(aldusmfHeader.bbox.bottom) / Density;
double Left = static_cast<double>(aldusmfHeader.bbox.left) / Density;
double RawRight = static_cast<double>(aldusmfHeader.bbox.right) / Density;
// In order to correctly import the EMF metafile into WORD, add one delta
double Bottom, Right, Delta, Rate = 0.1;
if (RawBottom > RawRight)
{
Delta = Rate * RawRight;
Right = RawRight + Delta;
Bottom = Right * RawBottom / RawRight;
}
else
{
Delta = Rate * RawBottom;
Bottom = RawBottom + Delta;
Right = Bottom * RawRight / RawBottom;
}
// Metafile header
SetFilePointer(fh, ALDUSMFHEADERSIZE, NULL, FILE_BEGIN);
METAHEADER mfHeader;
ReadFile(fh, (void *)&mfHeader, sizeof(METAHEADER), &wBytesRead, NULL);
// Allocate memory in order to save into memory bits after the Aldus header in the WMF metafile
// * 2 : 16 bits API
DWORD dwSize = mfHeader.mtSize * 2 * sizeof(BYTE);
BYTE *lpMFBits = (BYTE *)malloc(dwSize);
if (lpMFBits == nullptr)
{
AfxMessageBox(L"nullptr lpmfbits");
//cout << "Not enough memory to convert " << WMFFileName << " into EMF format." << endl;
CloseHandle(fh);
return 0;
}
// Bits after the Aldus header
SetFilePointer(fh, ALDUSMFHEADERSIZE, NULL, FILE_BEGIN);
ReadFile(fh, (void *)lpMFBits, dwSize, &wBytesRead, NULL);
if (wBytesRead == -1)
{
//cout << "Error while reading " << WMFFileName << " : impossible to convert it into EMF format." << endl;
free(lpMFBits);
CloseHandle(fh);
return 0;
}
// Save these bits into a memory enhanced metafile
// The memory enhanced metafile only contains 32 bits API functions : TextOut has been converted into ExtTextOutW,
// CreateFontIndirect has been converted into ExtCreateFontIndirectW, ...
METAFILEPICT MetaFilePict;
MetaFilePict.hMF = NULL;
MetaFilePict.mm = MM_ANISOTROPIC;
double Fact = 10.0 * Density;
MetaFilePict.xExt = static_cast<LONG>(Fact * (Right - Left));
MetaFilePict.yExt = static_cast<LONG>(Fact * (Bottom - Top));
HENHMETAFILE hMemoryEnhMetafile = SetWinMetaFileBits(dwSize, lpMFBits, NULL, &MetaFilePict);
free(lpMFBits);
CloseHandle(fh);
if (m_pictCtrl->GetEnhMetaFile() == NULL)
m_pictCtrl->SetEnhMetaFile(hMemoryEnhMetafile);
My project uses DirectX 10 and some of its boilerplate to render a scene, however, it crashes with an error message "Could not initialize the model object." As far as I understand, making it up to this point means that, at the very least, the model has been successfully created, so the error must be in one of the files below, which is fortunate as the most difficult tasks are handled by the FallBodyClass.cpp that hosts OpenCL API interactions. If needed, I can try attaching parts of it in a later edit.
During debug, my IDE shows that all components of m_Model (m_vertexBuffer, m_indexBuffer etc) are shown as with _vfptr . I do not know what to make of it, but it does seem to confirm that modelclass.cpp is the point of failure.
graphicsclass.cpp
GraphicsClass::GraphicsClass()
{
m_Direct3D = 0;
m_Model = 0;
m_ColorShader = 0;
m_bodies = BODIES;
}
GraphicsClass::GraphicsClass(const GraphicsClass& other)
{}
GraphicsClass::~GraphicsClass()
{}
bool GraphicsClass::Initialize(int screenWidth, int screenHeight, HWND hwnd)
{
bool result;
// Create the Direct3D object.
m_Direct3D = new D3DClass;
if (!m_Direct3D)
{
return false;
}
// Initialize the Direct3D object.
result = m_Direct3D->Initialize(screenWidth, screenHeight, VSYNC_ENABLED, hwnd, FULL_SCREEN, SCREEN_DEPTH, SCREEN_NEAR);
if (!result)
{
MessageBox(hwnd, L"Could not initialize Direct3D", L"Error", MB_OK);
return false;
}
// Create the model object.
m_Model = new ModelClass(m_bodies);
if (!m_Model)
{
return false;
}
// Initialize the model object.
result = m_Model->Initialize(m_Direct3D->GetDevice());
if (!result)
{
MessageBox(hwnd, L"Could not initialize the model object.", L"Error", MB_OK);
return false;
}
modelclass.cpp
ModelClass::ModelClass(int bodies)
{
m_vertexBuffer = 0;
m_indexBuffer = 0;
m_positions = 0;
m_velocities = 0;
m_bodySystem = 0;
m_bodies = bodies;
}
ModelClass::ModelClass(const ModelClass& other)
{}
ModelClass::~ModelClass()
{}
bool ModelClass::Initialize(ID3D10Device* device)
{
bool result;
TwoLines twoLinesConstants = CalculateLinesConstants(M_PI_4);
m_positions = new float[COORD_DIM * m_bodies];
m_velocities = new float[VEL_DIM * m_bodies];
m_bodySystem = new class FallBodyClass(m_bodies, &m_positions, &m_velocities, twoLinesConstants, result);
if (!result) {
return false;
}
// Initialize the vertex and index buffer that hold the geometry for the triangle.
result = InitializeBuffers(device, twoLinesConstants);
if(!result)
{
return false;
}
return true;
}
FallBodyclass.cpp
FallBodyClass::FallBodyClass(int bodies, float ** positionsCPU, float ** velocitiesCPU, TwoLines twoLines, bool & success)
:bodies(bodies)
{
cl_int ret;
// getting the first available platform
cl_platform_id clPlatformID[2];
cl_platform_id GPUplatform;
cl_uint num_platforms;
//char str[1024];
ret = clGetPlatformIDs(2, clPlatformID, &num_platforms);
GPUplatform = clPlatformID[0]; //choose GPU platform
//error |= clGetPlatformInfo(GPUplatform, CL_PLATFORM_NAME, 0, NULL, NULL);
//clGetPlatformInfo(GPUplatform, CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
// getting the first GPU device
ret |= clGetDeviceIDs(GPUplatform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
//clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(str), str, NULL);
// creating the context
context = clCreateContext(0, 1, &device, NULL, NULL, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
cl_queue_properties props[] = {
CL_QUEUE_PROFILING_ENABLE
};
// creating the command queue
queue = clCreateCommandQueueWithProperties(context, device, props, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// setting the local variables
// (at the same time one of them supposed to be 0 and another to be 1)
read = 0;
write = 1;
// reading the kernel
FILE * f = NULL;
char fileName[18] = "kernel.cl";
f = fopen(fileName, "rb");
if(f == NULL)
{
success = false;
return;
}
// getting the length of the source code for the kernel
fseek(f, 0, SEEK_END);
size_t codeLength = ftell(f);
rewind(f);
char * code = (char *)malloc(codeLength + 1);
if (fread(code, codeLength, 1, f) != 1)
{
fclose(f);
free(code);
success = false;
return;
}
// closing the file and 0-terminating the source code
fclose(f);
code[codeLength] = '\0';
// creating the program
program = clCreateProgramWithSource(context, 1, (const char **)&code, &codeLength, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// clearing the memory
free(code);
// building the program
ret |= clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// creating the kernel
kernel = clCreateKernel(program, "impactManager", &ret);
// setting the local size of the group the largest possible in order to load all computational units
int numGroups;
ret |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numGroups), &numGroups, NULL);
localSize = bodies / numGroups;
// allocating pinned buffers for velocities and positions, and stuck
positionsCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, COORD_DIM * bodies * sizeof(float) , NULL, NULL);
velocitiesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, VEL_DIM * bodies * sizeof(float) , NULL, NULL);
linesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 8 * sizeof(float), NULL, NULL);
// get pointers to arrays to operate with the buffers (array map buffers here (to program) as float-arrays)
*positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
*velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
float * linesCPU = (float *)clEnqueueMapBuffer(queue, linesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, 8 * sizeof(float), 0, NULL, NULL, NULL);
// initialization of the bodies' positions and velocities, and stuck
initBodies(*positionsCPU, *velocitiesCPU);
initLines(twoLines, linesCPU);
// unmapping the pointers to arrays (invalidates array pointers)
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, *positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, *velocitiesCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, linesCPUBuffer, linesCPU, 0, NULL, NULL);
// allocate two arrays on GPU for positions and velocities
for (int i = 0; i < 2; ++i) {
positionsGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, COORD_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, positionsGPU[i], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), *positionsCPU, 0, NULL, NULL);
velocitiesGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, VEL_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, velocitiesGPU[i], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), *velocitiesCPU, 0, NULL, NULL);
}
linesGPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 8 * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, linesGPU, CL_TRUE, 0, 8 * sizeof(float), linesCPU, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
}
void FallBodyClass::initLines(IN TwoLines l, OUT float *linesCPU)
{
linesCPU[0] = l.a1;
linesCPU[1] = l.b1;
linesCPU[2] = l.R1.x;
linesCPU[3] = l.R1.y;
linesCPU[4] = l.a2;
linesCPU[5] = l.b2;
linesCPU[6] = l.R2.x;
linesCPU[7] = l.R2.y;
}
// initialization of the bodies' positions and velocities
void FallBodyClass::initBodies(float * positionsCPU, float * velocitiesCPU)
{
float scale = 0.20f;
// initialization of the memory
memset(positionsCPU, 0, COORD_DIM * bodies * sizeof(float));
memset(velocitiesCPU, 0, VEL_DIM * bodies * sizeof(float));
// for the randomization
srand((unsigned int)time(NULL));
for (int i = 0; i < bodies; i++)
{
positionsCPU[COORD_DIM * i] = 1.8*((rand() / (float)RAND_MAX) - 0.5); //x axis
positionsCPU[COORD_DIM * i + 1] = 0.9; //y axis
positionsCPU[COORD_DIM * i + 2] = 0.0f; //z axis
positionsCPU[COORD_DIM * i + 3] = 0.0f; // stuck variable
// velocities are zeros
velocitiesCPU[VEL_DIM* i] = 0.0;
velocitiesCPU[VEL_DIM* i + 1] = -2 * (rand() / (float)RAND_MAX);
velocitiesCPU[VEL_DIM* i + 2] = 0.0;
}
}
// updating the bodies' positions and velocities. Stuck is updated inside too
void FallBodyClass::update(float dt, float * positionsCPU, float * velocitiesCPU, bool & success)
{
cl_int error = CL_SUCCESS;
size_t global_work_size;
size_t local_work_size;
success = true;
if (localSize > bodies)
localSize = bodies;
local_work_size = localSize;
global_work_size = bodies;
// passing the arguments
// we write the new positions and velocities and read the previous ones
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&positionsGPU[write]);
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&velocitiesGPU[write]);
error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&positionsGPU[read]);
error |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&velocitiesGPU[read]);
error |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&dt);
error |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&linesGPU);
// just swap read and write in order not to copy the arrays
int temp;
temp = write;
write = read;
read = temp;
// executing the kernel
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
// synchronization
clFinish(queue);
// asynchronously reading the updated values
error |= clEnqueueReadBuffer(queue, positionsGPU[read], CL_FALSE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
error |= clEnqueueReadBuffer(queue, velocitiesGPU[read], CL_FALSE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
///////////
bool toReboot = positionsCPU[3]; //fourth index of the [0] first element
//bool toReboot = false;
////////////
if (toReboot) {
positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
initBodies(positionsCPU, velocitiesCPU);
// unmapping the pointers
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, velocitiesCPU, 0, NULL, NULL);
//update values on GPU side
error |= clEnqueueWriteBuffer(queue, positionsGPU[read], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
error |= clEnqueueWriteBuffer(queue, velocitiesGPU[read], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
}
return;
}
FallBodyClass::~FallBodyClass(void)
{
// synchronization (if something has to be done)
clFinish(queue);
// releasing all objects
clReleaseMemObject(linesGPU);
clReleaseMemObject(linesCPUBuffer);
clReleaseMemObject(velocitiesGPU[0]);
clReleaseMemObject(velocitiesGPU[1]);
clReleaseMemObject(positionsGPU[0]);
clReleaseMemObject(positionsGPU[1]);
clReleaseMemObject(positionsCPUBuffer);
clReleaseMemObject(velocitiesCPUBuffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}
Is it possible to create a multiclient pipe? With one server and multiple clients? From the official documentation I have read that " A pipe server could use a single pipe instance to connect with multiple pipe clients by connecting to and disconnecting from each client in sequence, but performance would be poor" ( https://msdn.microsoft.com/en-us/library/windows/desktop/aa365594(v=vs.85).aspx ). Is this behaviour standard (can be done using some flags or something like that) or I have to implement this behaviour by myself? I have written a test with multiply clients, but when I trying to connect by the second client, I got error STATUS_PIPE_NOT_AVAILABLE.
There is my code, it quite big, but functions test_multiple_client and test_multiple_client2 is the same
void test_mutiple_client( PVOID arg )
{
OBJECT_ATTRIBUTES oa;
UNICODE_STRING us;
IO_STATUS_BLOCK iosb;
HANDLE thread = 0, event = 0, client = 0;
NTSTATUS r;
CLIENT_ID id;
LARGE_INTEGER timeout;
ULONG i;
us.Buffer = pipename;
us.Length = sizeof pipename - 2;
us.MaximumLength = us.Length;
oa.Length = sizeof oa;
oa.RootDirectory = 0;
oa.ObjectName = &us;
oa.Attributes = OBJ_CASE_INSENSITIVE;
oa.SecurityDescriptor = 0;
oa.SecurityQualityOfService = 0;
r = NtCreateEvent( &event, EVENT_ALL_ACCESS, NULL, NotificationEvent, 0 );
ok(r == STATUS_SUCCESS, "return wrong (%08lx)\n", r);
r = NtOpenFile( &client, GENERIC_READ | GENERIC_WRITE, &oa, &iosb, FILE_SHARE_READ|FILE_SHARE_WRITE, 0 );
ok(r == STATUS_SUCCESS, "return wrong %08lx\n", r);
dprintf("mc: client1 pipe created\n");
int thread_id = __sync_add_and_fetch(&g__clientsCounter, 1);
while (g__clientsCounter != 2);
dprintf("thread %d stated\n", thread_id);
r = NtReadFile( client, event, 0, 0, &iosb, &i, sizeof i, 0, 0 );
if (r == STATUS_PENDING)
r = NtWaitForSingleObject( event, TRUE, 0 );
ok (r == STATUS_SUCCESS, "read %ld returned %08lx\n", i, r);
ok (i == 13, "lol?????");
r = NtClose( client );
ok( r == STATUS_SUCCESS, "return wrong %08lx\n", r);
}
void test_mutiple_client2( PVOID arg )
{
OBJECT_ATTRIBUTES oa;
UNICODE_STRING us;
IO_STATUS_BLOCK iosb;
HANDLE thread = 0, event = 0, client = 0;
NTSTATUS r;
CLIENT_ID id;
LARGE_INTEGER timeout;
ULONG i;
us.Buffer = pipename;
us.Length = sizeof pipename - 2;
us.MaximumLength = us.Length;
oa.Length = sizeof oa;
oa.RootDirectory = 0;
oa.ObjectName = &us;
oa.Attributes = OBJ_CASE_INSENSITIVE;
oa.SecurityDescriptor = 0;
oa.SecurityQualityOfService = 0;
r = NtCreateEvent( &event, EVENT_ALL_ACCESS, NULL, NotificationEvent, 0 );
ok(r == STATUS_SUCCESS, "return wrong (%08lx)\n", r);
r = NtOpenFile( &client, GENERIC_READ | GENERIC_WRITE, &oa, &iosb, FILE_SHARE_READ|FILE_SHARE_WRITE, 0 );
ok(r == STATUS_SUCCESS, "return wrong %08lx\n", r);
dprintf("mc: client1 pipe created\n");
int thread_id = __sync_add_and_fetch(&g__clientsCounter, 1);
while (g__clientsCounter != 2);
dprintf("thread %d stated\n", thread_id);
r = NtReadFile( client, event, 0, 0, &iosb, &i, sizeof i, 0, 0 );
if (r == STATUS_PENDING)
r = NtWaitForSingleObject( event, TRUE, 0 );
ok (r == STATUS_SUCCESS, "read %ld returned %08lx\n", i, r);
ok (i == 13, "lol?????");
r = NtClose( client );
ok( r == STATUS_SUCCESS, "return wrong %08lx\n", r);
}
void test_multiple_connections( )
{
OBJECT_ATTRIBUTES oa;
UNICODE_STRING us;
IO_STATUS_BLOCK iosb;
HANDLE pipe = 0, thread = 0, event = 0;
NTSTATUS r;
CLIENT_ID id;
LARGE_INTEGER timeout;
ULONG i;
us.Buffer = pipename;
us.Length = sizeof pipename - 2;
us.MaximumLength = us.Length;
oa.Length = sizeof oa;
oa.RootDirectory = 0;
oa.ObjectName = &us;
oa.Attributes = OBJ_CASE_INSENSITIVE;
oa.SecurityDescriptor = 0;
oa.SecurityQualityOfService = 0;
timeout.QuadPart = -10000LL;
r = NtCreateNamedPipeFile( &pipe, GENERIC_READ|GENERIC_WRITE|SYNCHRONIZE,
&oa, &iosb, FILE_SHARE_READ|FILE_SHARE_WRITE, FILE_OPEN_IF, 0, TRUE,
TRUE, FALSE, /*Unlimited*/ -1, 0, 0, &timeout );
ok( r == STATUS_SUCCESS, "return wrong %08lx\n", r);
dprintf("mc: server pipe created\n");
r = RtlCreateUserThread( NtCurrentProcess(), NULL, FALSE,
NULL, 0, 0, &test_mutiple_client, NULL, &thread, &id );
ok( r == STATUS_SUCCESS, "failed to create thread\n" );
r = RtlCreateUserThread( NtCurrentProcess(), NULL, FALSE,
NULL, 0, 0, &test_mutiple_client2, NULL, &thread, &id );
ok( r == STATUS_SUCCESS, "failed to create thread\n" );
r = NtCreateEvent( &event, EVENT_ALL_ACCESS, NULL, NotificationEvent, 0 );
ok(r == STATUS_SUCCESS, "return wrong (%08lx)\n", r);
r = NtFsControlFile( pipe, event, 0, 0, &iosb, FSCTL_PIPE_LISTEN, 0, 0, 0, 0 );
if (r == STATUS_PENDING) {
dprintf("mc: pending\n");
r = NtWaitForSingleObject( event, TRUE, 0 );
}
ok( r == STATUS_SUCCESS, "failed to listen %08lx\n", r );
dprintf("mc: server pipe listen\n");
i = 13;
while (g__clientsCounter != 2);
dprintf("server started\n");
r = NtWriteFile( pipe, event, 0, 0, &iosb, &i, sizeof i, 0, 0 );
if (r == STATUS_PENDING)
r = NtWaitForSingleObject( event, TRUE, 0 );
ok (r == STATUS_SUCCESS, "write %ld returned %08lx\n", i, r);
dprintf("server write data\n");
r = NtClose( pipe );
ok( r == STATUS_SUCCESS, "return wrong %08lx\n", r);
}
The output is
mc: server pipe created
mc: pending
mc: server pipe listen
mc: client1 pipe created
478: return wrong c00000ac
mc: client1 pipe created
thread 2 stated
488: read 2013057864 returned c0000008
489: lol?????492: return wrong c0000008
server started
thread 1 stated
server write data
4 failed, 38 passed
I also have seen an answer of stack ( Number of Clients that can connect to a Named Pipe ) where mentioned that windows pipes can hold up to 256 clients
I faced with the problem that the kernel writes data in wrong place or host reads data incorrectly sometimes. I write the same data (index at which I write the data) to two global arrays with different types. To ensure that the index is corrent are used the global counter which incremented by means of atom_inc. The problem occures when data are read from second array on the host.
For instance:
.....
output array index: 442: (output1 value:442.0000 output2 value:442)
output array index: 443: (output1 value:443.0000 output2 value:443)
output array index: 444: (output1 value:444.0000 output2 value:444)
output array index: 445: (output1 value:445.0000 output2 value:445)
output array index: 446: (output1 value:446.0000 output2 value:1152892928)
output array index: 447: (output1 value:447.0000 output2 value:447)
output array index: 448: (output1 value:448.0000 output2 value:1152909312)
output array index: 449: (output1 value:449.0000 output2 value:1152917504)
output array index: 450: (output1 value:450.0000 output2 value:1152925696)
......
As you can see at indicies 446, 448, 449 and 450+ output2 contains wrong values. What can be the possible reason of this?
Device: ATI Radeon HD5750
Code sample:
#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>
// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \
"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \
"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";
void art_process_sinogram(const char* tiff_filename,
const float *angles2,
const unsigned int n_angles2,
const unsigned int n_ray2s,
const float distanc2e)
{
/******************************
* OPENCL ENVIRONMENT
*/
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
cl_device_id device_id;
//discover platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
//discover devices
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
device_id = devices[1];
//create context
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status);
//create queue
cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);
/******************************
* HARDWARE PARAMETERS
*/
cl_uint wavefronts_per_SIMD = 7;
size_t global_work_size;
size_t local_work_size = 64;
cl_uint max_compute_units;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
size_t wg_count = max_compute_units * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/**************************** DATA PART *************************************/
size_t w_portion_size = 768 * sizeof(cl_float);
size_t w_pos_portion_size = 768 * sizeof(cl_uint);
size_t counters_data_size = 6 * sizeof(cl_uint);
cl_uint counters_data[6];
counters_data[0] = 1;
counters_data[1] = 2; // max number of the cells intersected by the ray
counters_data[2] = 3;
counters_data[3] = 4;
counters_data[4] = 5; // same to the number of rays
counters_data[5] = 0; // counter inside kernel
/*****************
* Main buffers
*/
cl_mem weights1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_portion_size,
NULL,
NULL);
cl_mem weights_pos1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_pos_portion_size,
NULL,
NULL);
/*****************
* Supplement buffers (constant)
*/
cl_mem counters_data_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
counters_data_size,
NULL,
&status);
cl_event supplement_buffer_ready[1];
status = clEnqueueWriteBuffer(command_queue1,
counters_data_buffer,
CL_FALSE,
0,
counters_data_size,
counters_data,
0,
NULL,
&supplement_buffer_ready[0]);
status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);
status = clEnqueueNDRangeKernel(command_queue1,
kernel_weights,
1, // work dimensional 1D, 2D, 3D
NULL, // offset
&global_work_size, // total number of WI
&local_work_size, // nomber of WI in WG
1, // num events in wait list
supplement_buffer_ready, // event wait list
NULL); // event
clFinish(command_queue1);
cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
weights1_buffer,//*pmain_weights_buffer,
CL_TRUE,
CL_MAP_READ,
0,
w_portion_size,
0, NULL, NULL, NULL);
cl_uint *output2 = malloc(w_portion_size);
status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
CL_TRUE, 0, w_pos_portion_size, output2,
0, NULL, NULL);
clFinish(command_queue1);
for(int i = 0; i < 790; ++i) {
printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
}
}
SOLUTION:
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}
You are messing up with buffer dimensions.
1) Your buffers contains 768 elements each (see initialization of w_portion_size and w_pos_portion_size)
2) Workgroup size on my machine is 896 (see initialization of wg_count)
3) You print out 790 values.
Apart from this, one conceptual error is here:
if(global_id == 0) {
counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]
You can't assume that the first virtual processor will execute this line before the others. You should completely remove this line, since you initialize counter[5] on the host side. (I believe that this is the cause of your problem, but I can't reproduce that).
After fixing these problems your code seems to run fine (intel implementation).
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}