Convert NV12 to BGR by NVIDIA Performance Primitives

Convert NV12 to BGR by NVIDIA Performance Primitives - c++

I'm trying to convert NV12 image to BGR by npp, but in the final array i have zeroes.
int lumaStepBytes, chromaStepBytes;
int rgbStepBytes;
auto dpNV12LumaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetHeight(), &lumaStepBytes);
auto dpNV12ChromaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetChromaHeight(), &chromaStepBytes);
auto dpBGRFrame = nppiMalloc_8u_C3(dec.GetWidth(), dec.GetHeight(), &rgbStepBytes);
cudaMemcpy2D(dpNV12LumaFrame, lumaStepBytes, pFrame, dec.GetWidth(),
dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);
cudaMemcpy2D(dpNV12ChromaFrame, chromaStepBytes, pFrame + dec.GetLumaPlaneSize(), dec.GetWidth(),
dec.GetWidth(), dec.GetChromaHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);
Npp8u *planesAddres[2];
planesAddres[0] = dpNV12LumaFrame;
planesAddres[1] = dpNV12ChromaFrame;
nppiNV12ToBGR_8u_P2C3R(planesAddres, lumaStepBytes,
dpBGRFrame, rgbStepBytes,
{dec.GetWidth(), dec.GetHeight()});
res.m_data.resize(dec.GetWidth() * dec.GetHeight() * 3);
cudaMemcpy2D(res.m_data.data(), dec.GetWidth(), dpBGRFrame, rgbStepBytes,
dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyDeviceToHost);
nppiFree(dpBGRFrame);
nppiFree(dpNV12ChromaFrame);
nppiFree(dpNV12LumaFrame);
dec is a video decoder which gives pFrame in NV12 format and provide additional information about that, like offsets, dimensions, NV12 planes, etc.
The same result I have if I use cu... and cuda... functions for allocating without alignment.
Do anybody have any ideas about the problem?

For questions like this the SO expectation is that you provide a complete example, see item 1 here. So I haven't tried to determine exactly what is wrong with your code.
However I can show you a complete code that converts NV12 to RGB (and other things as well) which is working correctly for me:
// sample compile command line: nvcc -o rs rs.cu -lnppicc -lnppig -DUSE_DEBUG -DUNIT_TEST
#include <nppi.h>
#include <iostream>
template <typename T>
__global__ void pack_uv(T * __restrict__ u, T * __restrict__ v, T * __restrict__ uv, const int w, const int h, const int pitch_uv, const int pitch_u, const int pitch_v){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < w) && (idy < h)){
T *o = (T *)(((char *)uv) + idy*pitch_uv);
T *iu = (T *)(((char *)u) + idy*pitch_u);
T *iv = (T *)(((char *)v) + idy*pitch_v);
int idx2 = idx >> 1;
o[idx] = (idx&1)?iv[idx2]:iu[idx2];}
}
int rs(const int ish, const int isw, const int ipitch, const int osh, const int osw, const int opitch, const unsigned char *iy, const unsigned char *iuv, unsigned char *oy, unsigned char *ouv, unsigned char *tempbuff, int method = 0, int eInterpolation = NPPI_INTER_LANCZOS){
#ifdef USE_DEBUG
if ((iy != NULL) && (tempbuff == NULL)) std::cout << "error: tempbuff is NULL" << std::endl;
if ((iy != NULL) && (iuv == NULL)) std::cout << "error: iuv is NULL" << std::endl;
if ((iy != NULL) && (oy == NULL)) std::cout << "error: oy is NULL" << std::endl;
if ((iy != NULL) && (ouv == NULL)) std::cout << "error: ouv is NULL" << std::endl;
if (isw < 2) std::cout << "error on input width: " << isw << std::endl;
if (ish < 2) std::cout << "error on input height: " << ish << std::endl;
if (ipitch < isw) std::cout << "error on input pitch: " << ipitch << std::endl;
if (osw < 1) std::cout << "error on output width: " << osw << std::endl;
if (osh < 1) std::cout << "error on output height: " << osh << std::endl;
if (opitch < osw) std::cout << "error on output pitch: " << opitch << std::endl;
#endif
cudaError_t err;
NppStatus stat;
// convert NV12 input to RGB
if (iy == NULL){ // temp buffer sizing
// for method 1
NppiSize oSrcROI;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiSize oDstROI;
oDstROI.width = osw;
oDstROI.height = osh;
int bufferSize;
stat = nppiResizeAdvancedGetBufferHostSize_8u_C1R(oSrcROI, oDstROI, &bufferSize, NPPI_INTER_LANCZOS3_ADVANCED);
return ((ish*isw + osh*osw)*3*sizeof(unsigned char))+bufferSize; // temp buffer sizing
}
if (method == 0){
const Npp8u *pSrc[2] = {iy, iuv};
NppiSize oSizeROI;
oSizeROI.width = isw;
oSizeROI.height = ish;
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#endif
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "NV12 to RGB CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "NV12 to RGB NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -1;
// perform resize
NppiSize oSrcSize;
oSrcSize.width = isw;
oSrcSize.height = ish;
NppiRect oSrcROI;
oSrcROI.x = 0;
oSrcROI.y = 0;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiRect oDstROI;
oDstROI.x = 0;
oDstROI.y = 0;
oDstROI.width = osw;
oDstROI.height = osh;
double nXFactor = osw/(double)isw;
double nYFactor = osh/(double)ish;
double nXShift = 0;
double nYShift = 0;
stat = nppiResizeSqrPixel_8u_C3R(tempbuff, oSrcSize, isw*3*sizeof(Npp8u), oSrcROI, tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "RGB LANCZOS RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "RGB LANCZOS RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -2;
// convert resized RGB to YUV420
Npp8u *pDst[3] = { oy, ouv, ouv + osh*opitch/4 };
int rDstStep[3] = { opitch, opitch/2, opitch/2 };
oSizeROI.width = osw;
oSizeROI.height = osh;
stat = nppiRGBToYUV420_8u_C3P3R(tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "RGB TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "RGB TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -3;
// pack uv
dim3 block(32, 8);
dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
pack_uv<<< grid, block >>>(ouv, ouv + osh*opitch/4, tempbuff, osw, osh/2, osw, osw/2, osw/2);
err = cudaGetLastError();
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -4;
// move packed uv to output
err = cudaMemcpy2D(ouv, opitch, tempbuff, osw*sizeof(Npp8u), osw*sizeof(Npp8u), osh/2, cudaMemcpyDeviceToDevice);
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV COPY CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -5;
}
else{ // method 1
// NV12 to YUV420 planar
const Npp8u *const pSrc[2] = {iy, iuv};
Npp8u *pDst[3] = {tempbuff, tempbuff+isw*ish, tempbuff+isw*ish+(isw*ish)/4};
int aDstStep[3] = {isw, isw/2, isw/2};
NppiSize oSizeROI;
oSizeROI.width = isw;
oSizeROI.height = ish;
stat = nppiNV12ToYUV420_8u_P2P3R(pSrc, ipitch, pDst, aDstStep, oSizeROI);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "NV12 TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "NV12 TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -6;
// resize each plane individually
NppiSize oSrcSize = oSizeROI;
NppiRect oSrcROI;
oSrcROI.x = 0;
oSrcROI.y = 0;
oSrcROI.width = isw;
oSrcROI.height = ish;
NppiRect oDstROI;
oDstROI.x = 0;
oDstROI.y = 0;
oDstROI.width = osw;
oDstROI.height = osh;
double nXFactor = osw/(double)isw;
double nYFactor = osh/(double)ish;
// resize Y
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff, oSrcSize, isw, oSrcROI, oy, opitch, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "Y RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "Y RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -7;
// resize U
oSrcSize.width /= 2;
oSrcSize.height /= 2;
oSrcROI.width /= 2;
oSrcROI.height /= 2;
oDstROI.width /= 2;
oDstROI.height /= 2;
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw, oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "U RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "U RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -8;
// resize V
stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw+(ish*isw/4), oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3)+(osh*osw/4), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "V RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
if (stat != NPP_SUCCESS) std::cout << "V RESIZE NPP error: " << (int)stat << std::endl;
#endif
if (stat != NPP_SUCCESS) return -9;
// pack_uv
dim3 block(32, 8);
dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
pack_uv<<< grid, block >>>(tempbuff+(ish*isw*3), tempbuff+(ish*isw*3)+(osh*osw/4), ouv, osw, osh/2, opitch, osw/2, osw/2);
err = cudaGetLastError();
#ifdef USE_DEBUG
if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();
if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
if (err != cudaSuccess) return -10;
}
return 0;
}
#ifdef UNIT_TEST
// timing
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// bitmap file handling
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
#include <fstream>
Info readBMP(const char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
if ((i-offset+1) < (3*height*width)) std::cout << "size: " << i-offset+1 << " expected: " << height*width*3 << std::endl;
std::copy(info,
info + offset,
dat.info);
std::copy(info + offset,
info + i,
dat.data);
delete[] info;
return dat;
}
void writeBMP(const char *filename, Info dat){
std::ofstream fout;
fout.open(filename, std::ios::binary | std::ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(dat.data), dat.size - dat.offset );
fout.close();
}
int main(int argc, char *argv[]){
int eInterpolation = NPPI_INTER_LANCZOS;
if (argc > 1) eInterpolation = atoi(argv[1]);
else{
std::cout << "Must specify a valid interpolation mode:" << std::endl;
std::cout << NPPI_INTER_NN << " :NPPI_INTER_NN" << std::endl;
std::cout << NPPI_INTER_LINEAR << " :NPPI_INTER_LINEAR" << std::endl;
std::cout << NPPI_INTER_CUBIC << " :NPPI_INTER_CUBIC" << std::endl;
std::cout << NPPI_INTER_LANCZOS << " :NPPI_INTER_LANCZOS" << std::endl;
return 0;}
int method = 0;
if (argc > 2) method = atoi(argv[2]);
// input to NV12
Info rfile = readBMP("input.bmp");
const int H = rfile.height;
const int W = rfile.width;
std::cout << "Height = " << rfile.height << std::endl;
std::cout << "Width = " << rfile.width << std::endl;
Npp8u *rgbdata, *ty, *tu, *tv, *tuv;
cudaMalloc(&rgbdata, H*W*3);
cudaMalloc(&ty, H*W);
cudaMalloc(&tu, H*W/4);
cudaMalloc(&tv, H*W/4);
cudaMalloc(&tuv, H*W/2);
cudaMemcpy(rgbdata, rfile.data, H*W*3, cudaMemcpyHostToDevice);
Npp8u *pDst[3] = { ty, tu, tv};
int rDstStep[3] = { W, W/2, W/2 };
NppiSize oSizeROI;
oSizeROI.width = W;
oSizeROI.height = H;
NppStatus stat = nppiRGBToYUV420_8u_C3P3R(rgbdata, W*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
if (stat != NPP_SUCCESS) { std::cout << "Input NPP error" << std::endl; return 0;}
dim3 block(32, 8);
dim3 grid((W+block.x-1)/block.x, (H+block.y-1)/block.y);
pack_uv<<< grid, block >>>(tu, tv, tuv, W, H/2, W, W/2, W/2);
// 1:1 test
int buff_size = rs(H, W, W, H, W, W, NULL, NULL, NULL, NULL, NULL);
unsigned char *tbuff;
cudaError_t err = cudaMalloc(&tbuff, buff_size);
if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
unsigned char *oy, *ouv;
err = cudaMalloc(&oy, H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
err = cudaMalloc(&ouv, H*W*sizeof(unsigned char)/2);
if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
int error = rs(H, W, W, H, W, W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
if (error != 0) std::cout << "Function Failure: " << error << std::endl;
// output to RGB
const Npp8u *pSrc[2] = {ty, tuv};
oSizeROI.width = W;
oSizeROI.height = H;
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#endif
if (stat != NPP_SUCCESS) { std::cout << "Output NPP error" << std::endl; return 0;}
cudaMemcpy(rfile.data, rgbdata, H*W*3, cudaMemcpyDeviceToHost);
writeBMP("output.bmp", rfile);
// 2x upscale test
cudaFree(tbuff);
buff_size = rs(H, W, W, 2*H, 2*W, 2*W, NULL, NULL, NULL, NULL, NULL);
err = cudaMalloc(&tbuff, buff_size);
if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
cudaFree(oy);
cudaFree(ouv);
err = cudaMalloc(&oy, 4*H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
err = cudaMalloc(&ouv, 2*H*W*sizeof(unsigned char));
if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
unsigned long long dt = dtime_usec(0);
error = rs(H, W, W, 2*H, 2*W, 2*W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (error != 0) std::cout << "Function Failure: " << error << std::endl;
std::cout << "2x resize time: " << dt/(float)USECPSEC << "s" << std::endl;
// output to RGB
const Npp8u *pSrc2[2] = {oy, ouv};
oSizeROI.width = 2*W;
oSizeROI.height = 2*H;
cudaFree(rgbdata);
cudaMalloc(&rgbdata, H*W*12);
#ifdef USE_709
stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#else
stat = nppiNV12ToRGB_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#endif
if (stat != NPP_SUCCESS) { std::cout << "Output NPP error" << std::endl; return 0;}
delete[] rfile.data;
rfile.data = new unsigned char[H*W*12];
cudaMemcpy(rfile.data, rgbdata, H*W*12, cudaMemcpyDeviceToHost);
int osize = rfile.size - rfile.offset;
int nsizeinc = H*W*12 - osize;
rfile.size += nsizeinc;
*((int*)(rfile.info+18)) = 2*W;
*((int*)(rfile.info+22)) = 2*H;
writeBMP("output2.bmp", rfile);
return 0;
}
#endif
The above code does the following steps:
read in a .bmp file from disk into RGB storage
convert to YUV420
convert to NV12
resize the NV12 image (there are multiple steps here)
convert the resized NV12 image to RGB
write RGB image as a .bmp file

the problem went away when I changed the video card. GeForce 740 to 1080

NppiSize oSizeROI;
oSizeROI.width = frame_dec->width;
oSizeROI.height = frame_dec->height;
DBUG_PRINT("width: %d, height: %d, stepBytes: %d\n", oSizeROI.width, oSizeROI.height, stepBytes);
// NppStatus stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->width, bgrData, frame_dec->width*3*sizeof(Npp8u), oSizeROI);
NppStatus stat;
#ifdef USE_709
stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width*3, oSizeROI);
#else
stat = nppiNV12ToBGR_709HDTV_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width * 3, oSizeROI);
#endif
unsigned char *data = (unsigned char *)malloc(frame_dec->width * frame_dec->height * 3);
cudaMemcpy(data, bgrData, frame_dec->height * frame_dec->width * 3, cudaMemcpyDeviceToHost);
cv::Mat mat_test(frame_dec->height, frame_dec->width, CV_8UC3, data);
imwrite("test.jpg", mat_test);
free(data);
nppiFree(bgrData);
exit(0);
frame_dec decoded by ffmpeg cuda decoder

Related

Playing a stereo .wav file with PortAudio & sndfile, output is fuzzy and pitched down / slowed

I've been writing some code to play back a stereo .wav file in c++ using PortAudio and sndfile, but the output sound is fuzzy and pitched down (pitch down is less of an issue for me but it may be part of the problem). It almost seems like its playing partially garbage data with the fuzz, but I don't believe there's any variable I'm using that this could happen where I'm not clearing the data first. I've tried to follow some of PortAudios examples with stereo playback but due to the input being from a .wav file rather than generated I've not been able to follow it perfectly. I've also compiled and ran some PortAudio examples (that use stereo sound) and those work fine. I'm not sure where the issue is.
Audio.h
struct AudioFile {
SNDFILE* file = nullptr;
SF_INFO info;
int buffer_size = 512;
int readHead = 0;
sf_count_t count = 1;
};
/*
Class for handling basic audio functions
*/
class Audio {
protected:
public:
/// Constructor
Audio();
/// Destructor
~Audio();
/// Load an audio file
AudioFile loadFile(const char* path);
/// Play an audio file
void playFile(AudioFile* file);
};
Audio.cpp
/// Audio constructor
Audio::Audio() {
PaError err = Pa_Initialize();
if (err != paNoError) std::cerr << "PAError: " << err << std::endl;
#ifdef DEBUG
std::cout << "Initialising PortAudio" << std::endl;
std::cout << "----------------------" << std::endl;
std::cout << "Version: " << Pa_GetVersion << std::endl;
std::cout << "Devices:" << std::endl;
std::cout << "----------------------" << std::endl;
int numDevices = Pa_GetDeviceCount();
for (int i=0; i < numDevices; i++) {
auto deviceInfo = Pa_GetDeviceInfo(i);
std::cout << "Name: " << deviceInfo->name << std::endl;
std::cout << "HostApi: " << deviceInfo->hostApi << std::endl;
std::cout << "SampleRate: " << deviceInfo->defaultSampleRate << std::endl;
std::cout << "InputChannels: " << deviceInfo->maxInputChannels << std::endl;
std::cout << "OutputChannels: " << deviceInfo->maxOutputChannels << std::endl;
std::cout << "----------------------" << std::endl;
}
#endif
}
Audio::~Audio() {
PaError err = Pa_Terminate();
if (err != paNoError) std::cerr << "PAError: " << err << std::endl;
}
/* Loads an audiofile */
AudioFile Audio::loadFile(const char* path) {
AudioFile file;
::memset(&file.info, 0, sizeof(file.info));
file.file = sf_open(path, SFM_READ, &file.info);
return file;
}
static int patestCallback(const void* inputBuffer, void* outputBuffer,
unsigned long framesPerBuffer,
const PaStreamCallbackTimeInfo* timeInfo,
PaStreamCallbackFlags statusFlags, void* userData) {
/// Prevent warnings
(void)inputBuffer;
(void)timeInfo;
(void)statusFlags;
/// an AudioFile gets passed as userData
AudioFile* file = (AudioFile*)userData;
float* out = (float*)outputBuffer;
sf_seek(file->file, file->readHead, SF_SEEK_SET);
auto data = std::make_unique<float[]>(framesPerBuffer);
file->count = sf_read_float(file->file, data.get(), framesPerBuffer);
for (int i = 0; i < framesPerBuffer; i++) {
*out++ = data[i];
}
file->readHead += file->buffer_size;
if (file->count > 0) return paContinue;
else return paComplete;
}
void Audio::playFile(AudioFile* file) {
PaStream* stream = nullptr;
PaStreamParameters params;
params.device = Pa_GetDefaultOutputDevice();
params.channelCount = file->info.channels;
params.sampleFormat = paFloat32;
params.suggestedLatency =
Pa_GetDeviceInfo(params.device)->defaultLowOutputLatency;
params.hostApiSpecificStreamInfo = nullptr;
/// Check if params work
PaError err = Pa_IsFormatSupported(nullptr, &params, file->info.samplerate);
if (err != paFormatIsSupported) {
std::cerr << "PAError: " << Pa_GetErrorText(err) << std::endl;
return;
}
err = Pa_OpenStream(&stream, nullptr, &params, file->info.samplerate,
file->buffer_size * params.channelCount, paClipOff,
&patestCallback, file);
if (err != paNoError) std::cerr << "PAError: " << Pa_GetErrorText(err) << std::endl;
err = Pa_StartStream(stream);
if (err != paNoError)
std::cerr << "PAError: " << Pa_GetErrorText(err) << std::endl;
/// wait until file finishes playing
while (file->count > 0) {}
err = Pa_StopStream(stream);
if (err != paNoError)
std::cerr << "PAError: " << Pa_GetErrorText(err) << std::endl;
err = Pa_CloseStream(stream);
if (err != paNoError)
std::cerr << "PAError: " << Pa_GetErrorText(err) << std::endl;
}
I've also tried without the data pointer (using this does seem to produce a cleaner, but still fuzzy sound) and passing the audio file by value into the playFile function. Any help is appreciated.

Ended up figuring it out, I had one primary issue, here:
err = Pa_OpenStream(&stream, nullptr, &params, file->info.samplerate,
file->buffer_size * params.channelCount, paClipOff,
&patestCallback, file);
I gave the Pa_OpenStream the buffersize * the number of channels, however I should've just been giving it the buffersize, and make the channel adjustment to the framesPerBuffer in the callback function directly:
static int patestCallback(const void* inputBuffer, void* outputBuffer,
unsigned long framesPerBuffer,
const PaStreamCallbackTimeInfo* timeInfo,
PaStreamCallbackFlags statusFlags, void* userData) {
/// Prevent warnings
(void)inputBuffer;
(void)timeInfo;
(void)statusFlags;
/// an AudioFile gets passed as userData
velox::AudioFile* file = (velox::AudioFile*)userData;
float* out = (float*)outputBuffer;
sf_seek(file->file, file->readHead, SF_SEEK_SET);
auto data = std::make_unique<float[]>(framesPerBuffer * file->info.channels);
file->count = sf_read_float(file->file, data.get(),
framesPerBuffer * file->info.channels);
for (int i = 0; i < framesPerBuffer * file->info.channels; i++) {
*out++ = data[i];
}
file->readHead += file->buffer_size;
if (file->count > 0) return paContinue;
else return paComplete;
}
This change fixed both the pitch and the fuzz, which makes sense in hindsight.

libusb failing at getting string descriptor on some devices

I am trying to get information about all attached devices.
This is my code:
libusb_device_descriptor desc;
libusb_config_descriptor *conDesc;
char szBuffer[256] = { 0 };
unsigned char strDesc[256];
libusb_device_handle *devHandle = NULL;
int retVal = 0;
__int64 i64Temp;
DWORD dwProdId;
DWORD dwProdId1;
i64Temp = 13888;
dwProdId = (DWORD)i64Temp;
retVal = libusb_open(dev, &devHandle);
int r = libusb_get_device_descriptor(dev, &desc);
if (r < 0)
{
cout << "failed to get device descriptor" << endl;
return;
}
r = libusb_get_config_descriptor(dev, 0, &conDesc);
printf("Interface Class = %d\n", conDesc->interface->altsetting->bInterfaceClass);
cout << "Number of possible configurations: " << (int)desc.bNumConfigurations << "" << endl;
cout << "Device Class: " << desc.bDeviceClass << endl;
cout << "Device Class: " << desc.bDeviceSubClass << endl;
printf("Class = %d\n", desc.bDeviceClass);
cout << "VendorID: " << desc.idVendor << endl;
cout << "ProductID: " << desc.idProduct << endl;
if (retVal == LIBUSB_SUCCESS)
{
retVal = libusb_get_string_descriptor_ascii(devHandle, desc.iManufacturer, strDesc, 256);
printf("Manufacturer: %s\n", strDesc);
retVal = libusb_get_string_descriptor_ascii(devHandle, desc.iSerialNumber, strDesc, 256);
printf("SerialNumber: %s\n", strDesc);
retVal = libusb_get_string_descriptor_ascii(devHandle, desc.iProduct, strDesc, 256);
printf("Product: %s\n", strDesc);
printf("\n\n");
}
else if (retVal != LIBUSB_SUCCESS)
{
printf("retVal failed");
printf("\n\n");
}
My code knows that there are 8 devices connected but it's getting manufacturer or serial number only on two of them. I would like to get this information for all of the attached devices.

c++ opencl return CL_OUT_OF_RESOURCES

I'm learning OpenCL and trying to apply a black and white on a picture but enqueueNDRangeKernel return CL_OUT_OF_RESOURCES and I don't understand why. OpenCL is running on a GTX 980M, and OpenCL 1.2 .
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <string>
#include <CL/cl.hpp>
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_write.h"
int main() {
unsigned int vectorSize = 1000;
const std::string progCode = "__kernel \n"
"void img_kernel( __read_only image2d_t inputImage, __write_only image2d_t outputImage) \n"
"{ \n"
"const sampler_t sampler=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
" int width = get_image_width(inputImage); \n"
" int height = get_image_height(inputImage); \n"
" int2 pixelcoord = (int2) (get_global_id(0), get_global_id(1)); \n"
" float4 pixel = read_imagef(inputImage, sampler, pixelcoord); \n"
" float color = (pixel.x + pixel.y + pixel.z)/3; \n"
" float4 outColor = (float4)(pixel.x,pixel.y,pixel.z, 1.0); \n"
" write_imagef(outputImage, pixelcoord, outColor); } \n";
int imageX, imageY, imageN;
unsigned char *dataImage = stbi_load("test.jpg", &imageX, &imageY, &imageN, 3);
if (dataImage == nullptr)
{
std::cout << "Unable to load picture" << std::endl;
getchar();
return 1;
}
cl_int error;
std::vector<cl::Platform> platformsList;
error = cl::Platform::get(&platformsList);
if (error != CL_SUCCESS)
{
std::cout << "Unable to find any OpenCL platforms" << std::endl;
getchar();
return 1;
}
std::vector<cl::Device> devicesList;
error = platformsList[0].getDevices(CL_DEVICE_TYPE_DEFAULT, &devicesList);
if (error != CL_SUCCESS)
{
std::cout << "Unable to find any OpenCL device" << std::endl;
getchar();
return 1;
}
cl::Device currentDevice = devicesList[0];
std::string nameDevice, driverDevice;
error = currentDevice.getInfo(CL_DEVICE_NAME, &nameDevice);
error = currentDevice.getInfo(CL_DRIVER_VERSION, &driverDevice);
std::cout << "Device : " << nameDevice << " " << driverDevice << std::endl;;
cl::Context context(currentDevice);
cl::Program::Sources source;
source.push_back({ progCode.c_str(), progCode.size() });
cl::Program program(context, source);
if (program.build({ currentDevice }) != CL_SUCCESS)
{
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(currentDevice) << "\n";
getchar();
exit(1);
}
cl::ImageFormat globalImgFormat;
globalImgFormat.image_channel_data_type = CL_UNSIGNED_INT8;
globalImgFormat.image_channel_order = CL_RGB;
cl::size_t<3> origin;
origin[0] = 0; origin[1] = 0, origin[2] = 0;
cl::size_t<3> region;
region[0] = imageX; region[1] = imageY; region[2] = 1;
cl::Image2D inputImage(context, CL_MEM_READ_ONLY, globalImgFormat, imageX, imageY, 0, dataImage, &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create cl Image for input." << std::endl;
getchar();
return 1;
}
cl::Image2D outputImage(context, CL_MEM_WRITE_ONLY, globalImgFormat, imageX, imageY, 0, nullptr, &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create cl Image for output." << std::endl;
getchar();
return 1;
}
cl::CommandQueue queue(context, currentDevice);
cl::Kernel image_kernel(program, "img_kernel", &error);
if (error != CL_SUCCESS)
{
std::cout << "Unable to create kernel." << std::endl;
getchar();
return 1;
}
error = image_kernel.setArg(0, inputImage);
if (error != CL_SUCCESS)
{
std::cout << "Unable to set param." << std::endl;
getchar();
return 1;
}
error = image_kernel.setArg(1, outputImage);
if (error != CL_SUCCESS)
{
std::cout << "Unable to set param." << std::endl;
getchar();
return 1;
}
cl::NDRange globalSize(imageX, imageY);
error = queue.enqueueNDRangeKernel(image_kernel, cl::NullRange, globalSize, cl::NullRange);
if (error != CL_SUCCESS)
{
std::cout << "Unable to compute Image data." << std::endl;
getchar();
return 1;
}
queue.finish();
unsigned char *resultPros = new unsigned char[imageX * imageY * imageN];
error = queue.enqueueReadImage(outputImage, CL_TRUE, origin, region, 0, 0, resultPros);
stbi_write_bmp("testresul.jpg", imageX, imageY, imageN, resultPros);
stbi_image_free(dataImage);
stbi_image_free(resultPros);
getchar();
return 0;
}

On NVIDIA hardware if you write outside of a buffer or image (which is an undefined operation) then CL_OUT_OF_RESOURCES is a common error to get. It's better than the early hardware or drivers that simply crashed! Double-check your writes.

It seems that the combination of image_channel_data_type and image_channel_order is not correct. Could be your problem related with this?
Please, take a look here:
https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_image_format.html
Regards

Draw Mandelbrot using OpenCl

I want to write a program to draw a Mandelbrot set to an image.
I'm using OpenCl and cl.hpp - wrapper to c++, but I don't know why this isn't working.
I try draw image with different width and height, but mostly I get white image with random colored pixel (when in kernel I try write to output hardcoded values) or black image.
I also wondering about passing image2d_t to the kernel and "write" pixels directly to the image but when I execute my kernel I get -52 error that I set invalid arguments..
Declaration of my kernel(using image2d_t) looks like this:
__kernel syf(__write_only image2d_t img)
and I set arguments like this:
cl_mem image= clCreateImage2D(context, CL_MEM_WRITE_ONLY, &outputFormat, ImageWidth, ImageHeight, 0, 0, &err);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image)
Could you look at the code and help me?
#define __CL_ENABLE_EXCEPTIONS
//#define __NO_STD_VECTOR
#include <ctime>
#include <fstream>
#include <iostream>
#include <exception>
#include "D:/Users/cuda/cuda/bmp.cpp"
#if defined (__APPLE__) || defined (MACOSX)
#include <OpenCl/cl.cpp>
#else
#include <CL/OpenCL.h>
#include <CL/cl.hpp>
#endif
const int N = 1024 * 1024 * 3;
const char * kernelSource =
"__kernel void prepareVector(__global char *output)"
"{"
" size_t xDimension = get_global_id(0); "
" size_t yDimension = get_global_id(1); "
" "
" size_t currentIndex = 3 * 1024.0f*yDimension + 3 * xDimension; "
" float xOriginal = 3.25f*((float) xDimension / 1024.0f) - 2.0f; "
" float yOriginal = 2.5f*((float) yDimension / 1024.0f) - 1.25f;"
" "
" int iteration = 0; "
" int maxIteration = 256; "
" float temp; "
" float x = 0.0f; "
" float y = 0.0f; "
" while (x*x + y*y <= 4.0f && iteration < maxIteration)"
" { "
" temp = x*x - y*y + xOriginal; "
" y = 2.0*x*y + yOriginal; "
" x = temp; "
" ++iteration; "
" } "
" "
" if (iteration == maxIteration) "
" { "
" iteration = 0; "
" } "
" output[currentIndex] = iteration; "
" output[currentIndex] = iteration;"
" output[currentIndex] = iteration;"
"}";
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
std::vector<cl::Kernel> allKernels;
cl::Program program;
cl_int cli_err_msg;
char *host_out_c;
int main(int argc, char* argv [])
{
try
{
size_t dimensionSize = 2;
cl::NDRange *globalSize = new cl::NDRange( 6, 8 );
cl::NDRange *localSize = new cl::NDRange( 3, 4 );
cl::NDRange *offsetSize = new cl::NDRange( 0, 0 );
host_out_c = new char[N];
cl::Platform::get(&platforms);
platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
cl_context_properties properties [] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) (platforms[0])(), 0
};
//cl_context context = clCreateContext(0, devices.size(), devices, NULL, NULL, NULL);
cl::Context context(CL_DEVICE_TYPE_ALL, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program::Sources source(1, std::make_pair(kernelSource, strlen(kernelSource)));
program = cl::Program(context, source);
program.build(devices);
cl::Kernel kernel(program, "prepareVector", &cli_err_msg);
cl::Buffer device_out_c(context, CL_MEM_WRITE_ONLY, N*sizeof(char));
cl::Event event;
cl::CommandQueue queue(context, devices[0], 0, &cli_err_msg);
kernel.setArg(0, device_out_c);
queue.enqueueNDRangeKernel(kernel, *offsetSize, *globalSize, *localSize, NULL, &event);
queue.enqueueReadBuffer(device_out_c, true, 0, N*sizeof(char), (void*) host_out_c);
//printArray("kernel output:\n", host_out_c);
write_bmp("lel.bmp", 1024, 1024, host_out_c);
}
catch (cl::Error e)
{
std::cout << "Status: "<< program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]) << std::endl;
std::cout << "Options: "<< program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]) << std::endl;
std::cout << "Log: "<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
std::cout << e.what() << ": Error code: " << e.err() << std::endl;
}
return 0;
}

Joining Portaudio and Opus

Hi im trying to take sound from an open PortAudio Stream, encode it with opus, decode it and reproduce it again with portaudio.
Im doing this as a prototype just to try and understand the mechanics of this systems so, no real interest on following this concrete flow.
Thing is, portaudio gives buffers where OPUS needs Frames. Mi thought lead me to this in the portaudio side:
err = (Pa_ReadStream(stream, readBuffer, FRAMES_PER_BUFFER));
if (err = paNoError){
qDebug()<<"Fail read";
qDebug()<<Pa_GetErrorText(err);
// blockingRecord = false;
}
while (pos<FRAMES_PER_BUFFER){
memcpy(frameBuffer,readBuffer+(pos*FRAME_SIZE*NUM_CHANNELS),FRAME_SIZE*CHANNELS);
compressedSound = om.encodeOpus(frameBuffer);
unCompressedSound = om.decodeOpus(compressedSound);
memcpy(readBuffer+(pos*FRAME_SIZE*NUM_CHANNELS),unCompressedSound,FRAME_SIZE*CHANNELS);
pos++;
}
pos = 0;
err = (Pa_WriteStream(stream, readBuffer, FRAMES_PER_BUFFER));
if (err != paNoError)
{
qDebug() << "FAIL WRITE";
qDebug()<<Pa_GetErrorText(err);
//blockingRecord = false;
}
And this on the OPUS side:
unsigned char * OpusManager::encodeOpus(unsigned char *frame){
memcpy(encoded, frame, FRAME_SIZE*CHANNELS);
int ret = opus_encode(enc, encoded, FRAME_SIZE, compressed_buffer, encoded_data_size);
if (ret<0){
qDebug()<<"Failure while compressing sound";
return NULL;
}
return (compressed_buffer);
}
unsigned char * OpusManager::decodeOpus(unsigned char *frame){
int ret= opus_decode(dec, frame, encoded_data_size, decoded, FRAME_SIZE, 0);
if (ret<0){
qDebug()<<"Failure while decompressing sound";
return NULL;
}
memcpy(uncompressed_buffer, decoded, FRAME_SIZE*CHANNELS);
return (uncompressed_buffer);
}
No errors without encocing and perfect soud. With encode i get no errors till the PA_Writestream call, where i get a "Output underflowed" PaError. I suppose the way of taking the frames ive implemmented must be waaay wrong, but cant find info to help me with this.

It seems your interpretation of Opus' frame_size parameters to opus_encode and opus_decode is incorrect. If I understand your code correctly you're recording a packet of size FRAMES_PER_BUFFER frames and then try to turn it into N packets of size FRAME_SIZE. Instead, it seems to me that Opus wants to turn your packet of FRAMES_PER_BUFFER into another packet of equal frame count, and in doing so, only uses it's FRAME_SIZE parameter as some sort of quality control parameter for the encoding process. Below you'll find a complete sample that I believe does what you want. Play around with the '480' magic number in encode()/decode() and hear audio quality change.
int opusErr;
PaError paErr;
std::string s;
int const channels = 2;
int const bufferSize = 480;
int const sampleRate = 48000;
int const durationSeconds = 5;
opus_int32 enc_bytes;
opus_int32 dec_bytes;
int framesProcessed = 0;
std::vector<unsigned short> captured(bufferSize * channels);
std::vector<unsigned short> decoded(bufferSize * channels);
// * 2: byte count, 16 bit samples
std::vector<unsigned char> encoded(bufferSize * channels * 2);
// initialize opus
OpusEncoder* enc = opus_encoder_create(
sampleRate, channels, OPUS_APPLICATION_AUDIO, &opusErr);
if (opusErr != OPUS_OK)
{
std::cout << "opus_encoder_create failed: " << opusErr << "\n";
std::getline(std::cin, s);
return 1;
}
OpusDecoder* dec = opus_decoder_create(
sampleRate, channels, &opusErr);
if (opusErr != OPUS_OK)
{
std::cout << "opus_decoder_create failed: " << opusErr << "\n";
std::getline(std::cin, s);
return 1;
}
// initialize portaudio
if ((paErr = Pa_Initialize()) != paNoError)
{
std::cout << "Pa_Initialize failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
PaStream* stream = nullptr;
if ((paErr = Pa_OpenDefaultStream(&stream,
channels, channels, paInt16, sampleRate,
bufferSize, nullptr, nullptr)) != paNoError)
{
std::cout << "Pa_OpenDefaultStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
// start stream
if ((paErr = Pa_StartStream(stream)) != paNoError)
{
std::cout << "Pa_StartStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
// capture, encode, decode & render durationSeconds of audio
while (framesProcessed < sampleRate * durationSeconds)
{
if ((paErr = Pa_ReadStream(stream,
captured.data(), bufferSize)) != paNoError)
{
std::cout << "Pa_ReadStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
if ((enc_bytes = opus_encode(enc, reinterpret_cast<opus_int16 const*>(
captured.data()), 480, encoded.data(), encoded.size())) < 0)
{
std::cout << "opus_encode failed: " << enc_bytes << "\n";
std::getline(std::cin, s);
return 1;
}
if ((dec_bytes = opus_decode(dec, encoded.data(), enc_bytes,
reinterpret_cast<opus_int16*>(decoded.data()), 480, 0)) < 0)
{
std::cout << "opus_decode failed: " << dec_bytes << "\n";
std::getline(std::cin, s);
return 1;
}
if ((paErr = Pa_WriteStream(stream, decoded.data(), bufferSize)) != paNoError)
{
std::cout << "Pa_WriteStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
framesProcessed += bufferSize;
}
// stop stream
if ((paErr = Pa_StopStream(stream)) != paNoError)
{
std::cout << "Pa_StopStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
// cleanup portaudio
if ((paErr = Pa_CloseStream(stream)) != paNoError)
{
std::cout << "Pa_CloseStream failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
if ((paErr = Pa_Terminate()) != paNoError)
{
std::cout << "Pa_Terminate failed: " << Pa_GetErrorText(paErr) << "\n";
std::getline(std::cin, s);
return 1;
}
// cleanup opus
opus_decoder_destroy(dec);
opus_encoder_destroy(enc);

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Convert NV12 to BGR by NVIDIA Performance Primitives - c++

the problem went away when I changed the video card. GeForce 740 to 1080

Related

Playing a stereo .wav file with PortAudio & sndfile, output is fuzzy and pitched down / slowed

libusb failing at getting string descriptor on some devices

c++ opencl return CL_OUT_OF_RESOURCES

Draw Mandelbrot using OpenCl

Joining Portaudio and Opus

Categories

Resources