Debugging CUDA kernels with VSCode - c++

I am trying to debug a CUDA application on VSCode.
Ubuntu 20.04
VSCode 1.56.2
CUDA 11.3
gcc/g++ 9.3
For this, I have the following (up to date) extensions:
Nsight Visual Studio Code Edition
I have no problem compiling/running my program. However, when I'm debugging, I am able to use the debugger correctly on the CPU side but not on the GPU side. Indeed, when I try to add a breakpoint anywhere in the kernel, running the program always moves my breakpoint to the closing bracket and I can't see the variables.
Here are the files:
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays
void add(float *x, float *y)
y[blockIdx.x] = x[blockIdx.x] + y[blockIdx.x];
int main(void)
const int N = 1<<20;
float *x, *y;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
// Run kernel on 1M elements on the GPU
add<<<N, 1>>>(x, y);
// Wait for GPU to finish before accessing on host
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
return 0;
cmake_minimum_required(VERSION 3.10)
# set the project name
project(add CUDA)
# add the executable
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit:
"version": "0.2.0",
"configurations": [
"name": "CUDA C++: Launch",
"type": "cuda-gdb",
"request": "launch",
"program": "${workspaceFolder}/build/add"
I'm compiling this way:
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Debug ..
cmake --build .
Every program I try does the exact same thing at any kernel breakpoint.
What am I missing here?

I finally solved it by forcing the -G flag in debug mode in CMake, adding the following line after add_executable:
target_compile_options(add PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-G>)
With this, debugging on device works.
Thanks @RobertCrovella for the tip.


Am I able to use parallel STL algorithms from C++17/C++20 in Matlab MEX functions?

I am putting together a minimal example leveraging parallelism features in C++17/20 within Matlab MEX functions. I am able to compile and run the mex function from Matlab, but when I set the execution policy of my C++ STL function to "par" instead of "seq", Matlab gives a runtime linkage complaint. Code and error message follows:
test.m (Matlab top-level script):
vec_in = zeros(5);
coeff = 0.05;
vec_out = test_mex_gateway(vec_in, coeff);
test_mex_gateway.cpp (C++ interface to Matlab):
#include "mex.h"
extern void test_execute(float *array_in, float *array_out, const size_t vec_size, const float coeff);
void mexFunction( int nlhs,
mxArray *plhs[],
int nrhs,
const mxArray *prhs[] )
// Check for proper number of input and output arguments
if( nrhs != 2 )
mexErrMsgTxt( "3 input arguments required: input_data, coeff" );
if( nlhs > 2 )
mexErrMsgTxt( "Too many output arguments." );
const mwSize *matlab_data_dims_in;
mwSize matlab_data_dims_out[1];
// Input Parameters
float *input_data = (float *) mxGetData(prhs[0]);
float coeff = mxGetScalar(prhs[1]);
// Get dimensions
matlab_data_dims_in = mxGetDimensions(prhs[0]);
const int vec_len = matlab_data_dims_in[1];
// Set output data dimension
matlab_data_dims_out[0] = vec_len;
// Output data
plhs[0] = mxCreateNumericArray(1, matlab_data_dims_out, mxSINGLE_CLASS, mxREAL);
float *output_data = (float *) mxGetData(plhs[0]);
test_execute(input_data, output_data, vec_len, coeff);
test_execute.cpp (This is where the actual C++ STL call is made):
#include <execution> // std::execution::*
#include <numeric> // std::exclusive_scan()
// Runs a standard-library algorithm over [array_in, array_in + vec_size) with
// the parallel execution policy, combining elements as a + b + coeff.
// NOTE(review): the line that opens the algorithm call (and some of its
// arguments, as well as the braces) is missing from this listing - confirm
// against the original source before relying on it.
void test_execute(float *array_in, float *array_out, const size_t vec_size, const float coeff)
std::execution::par, // std::execution::seq works here for Mex call, par does not
array_in + vec_size,
// Binary operation: sum of both operands plus the captured coefficient.
[coeff](float a, float b)
float ret = a + b + coeff;
return ret;
I also have a stand-alone main function to replace the Mex wrapper to do a pure C++ test, test_standalone.cpp:
#include <vector>
#include <iostream>
size_t VEC_NUM_ELEM = 10;
extern void test_execute(float *array_in, float *array_out, const size_t vec_size, const float coeff);
int main(int argc, char **argv)
if (argc != 2)
std::cout << "Try: " << argv[0] << "<coeff>" << std::endl;
return -1;
const float coeff = std::stof(argv[1]);
std::cout << "Coeff: " << coeff << std::endl;
float __attribute__ ((aligned (64))) *vec1_array = (float *)malloc(VEC_NUM_ELEM * sizeof(float));
float __attribute__ ((aligned (64))) *vec2_array = (float *)malloc(VEC_NUM_ELEM * sizeof(float));
for (unsigned i = 0; i < VEC_NUM_ELEM; i++)
vec1_array[i] = static_cast<float>(i);
test_execute(vec1_array, vec2_array, VEC_NUM_ELEM, coeff);
return 0;
Here is how I am building and linking,
rm *.o
rm *.exe
rm *.mexa64
echo "Building test_execute.cpp..."
$gpp910 -std=$cstd -I/home/m/reqs/tbb/include -L$tbblib -ltbb -Wl,rpath=$tbblib -c test_execute.cpp -fPIC
echo "Building test_standalone.cpp..."
$gpp910 -std=$cstd -L$tbblib test_execute.o test_standalone.cpp -o test_standalone.exe -ltbb
echo "Building test_mex_gateway.cpp..."
mex test_execute.o test_mex_gateway.cpp -L$tbblib -ltbb
The parallel STL calls require linking against Intel TBB (Threading Building Blocks), so before I run Matlab to call test.m OR before I run my test_standalone.exe, I run:
export LD_LIBRARY_PATH=/home/m/reqs/tbb/lib/intel64/gcc4.8:$LD_LIBRARY_PATH
I also make sure that the C++ library associated with the version of GCC we built with is available at runtime:
export LD_LIBRARY_PATH=/home/m/compilers/lib64:$LD_LIBRARY_PATH
When I run test_standalone.exe, everything behaves normally whether I have the execution policy set to "par" or "seq" on std::exclusive_scan. When I run test.m, if "seq" was compiled, I can run with no errors. If "par" was compiled, Matlab complains at runtime about a linkage issue:
Invalid MEX-file 'test_mex_gateway.mexa64': test_mex_gateway.mexa64: undefined symbol: _ZN3tbb10interface78internal20isolate_within_arenaERNS1_13delegate_baseEl
I suspect this was a function that was supposed to be linked from TBB, which I confirmed:
$ nm /home/m/reqs/tbb/lib/intel64/gcc4.8/libtbb.so.2 | grep baseEl
0000000000028a30 T _ZN3tbb10interface78internal20isolate_within_arenaERNS1_13delegate_baseEl
000000000005ed70 r _ZN3tbb10interface78internal20isolate_within_arenaERNS1_13delegate_baseEl$$LSDA
I confirmed Matlab's LD_LIBRARY_PATH has the path I supplied in the above "export .." to this library.
I tried making sure my libraries came before the many Matlab-centric paths Matlab adds to LD_LIBRARY_PATH after it launches from the terminal.
I tried baking the path to the linked libraries via a -Wl,rpath=<> passage to the linker.
After almost two days, I can't figure out why Matlab is having this very specific runtime issue, especially when the pure C++ version is not. Any help would be appreciated.
RHEL 7.9
Matlab R2020a
GCC 9.1.0
TBB (Intel Thread Building Blocks) 2020.3
It appears that Matlab comes with its own version of libtbb included in its installation. From what I can tell, when launching a Mex file, Matlab will use its own libraries first, regardless of your LD_LIBRARY_PATH order. This is what was giving me runtime issues as a Mex file but not as a pure C++ file. Removing the libtbb library from Matlab's installation directory allowed runtime linkage to find my version of libtbb, and I was able to run without errors. Thanks to Cris Luengo for pointing me in the right direction.

Pytorch inference time difference between CUDA 10.0 & 10.2

We have a working library that uses LibTorch 1.5.0, built with CUDA 10.0 which runs as expected.
We are working on upgrading to CUDA 10.2 for various non-PyTorch related reasons. We noticed that when we run LibTorch inference on the newly compiled LibTorch (compiled exactly the same, except changing to CUDA 10.2), the runtime is about 20x slower.
We also checked it using the precompiled binaries.
This was tested on 3 different machines using 3 different GPUs (Tesla T4, GTX980 & P1000), and all give a consistent ~20x slowdown on CUDA 10.2
(Both on Windows 10 & Ubuntu 16.04), all with the latest drivers and on 3 different torch scripts (of the same architecture)
I've simplified the code to be extremely minimal without external dependencies other than Torch
int main(int argc, char** argv)
// Initialize CUDA device 0
std::string networkPath = DEFAULT_TORCH_SCRIPT;
if (argc > 1)
networkPath = argv[1];
auto jitModule = std::make_shared<torch::jit::Module>(torch::jit::load(networkPath, torch::kCUDA));
if (jitModule == nullptr)
std::cerr << "Failed creating module" << std::endl;
// Meaningless data, just something to pass to the module to run on
// PATCH_HEIGHT & WIDTH are defined as 256
uint8_t* data = new uint8_t[PATCH_HEIGHT * PATCH_WIDTH * 3];
memset(data, 0, PATCH_HEIGHT * PATCH_WIDTH * 3);
auto stream = at::cuda::getStreamFromPool(true, 0);
bool res = infer(jitModule, stream, data, PATCH_WIDTH, PATCH_HEIGHT);
std::cout << "Warmed up" << std::endl;
res = infer(jitModule, stream, data, PATCH_WIDTH, PATCH_HEIGHT);
delete[] data;
return 0;
// Inference function
// Prepares the input tensor from inputData, then prints the wall-clock time
// measured between two high_resolution_clock samples.
// Returns false if prepareInput() fails, true otherwise.
// NOTE(review): in this listing no statement sits between the two clock
// samples (the forward call and the synchronize mentioned in the comment
// below appear to be missing, as are the braces) - confirm against the
// original source.
bool infer(std::shared_ptr<JitModule>& jitModule, at::cuda::CUDAStream& stream, const uint8_t* inputData, int width, int height)
std::vector<torch::jit::IValue> tensorInput;
// prepareInput simply uses cudaMemcpy to copy to the device and creates a torch::Tensor from that data.
// I can paste it if it's relevant, but left it out to keep this as clean as possible.
if (!prepareInput(inputData, width, height, tensorInput, stream))
return false;
// Reduce memory usage, without gradients
torch::NoGradGuard noGrad;
// Make `stream` the current CUDA stream for the rest of this scope.
at::cuda::CUDAStreamGuard streamGuard(stream);
auto totalTimeStart = std::chrono::high_resolution_clock::now();
// The synchronize here is just for the sake of timing; it is not used in production.
auto totalTimeStop = std::chrono::high_resolution_clock::now();
printf("forward sync time = %.3f milliseconds\n",
std::chrono::duration<double, std::milli>(totalTimeStop - totalTimeStart).count());
return true;
When compiling this with Torch that was compiled using CUDA 10.0 we get a runtime of 18 ms and when we run it with Torch compiled with CUDA 10.2, we get a runtime of 430 ms
Any thoughts on that?
This issue was also posted on PyTorch Forums.
Issue on GitHub
I profiled this small program using both CUDAs
It seems that both use very different kernels
96.5% of the 10.2 computes are conv2d_grouped_direct_kernel which takes ~60-100ms on my P1000
whereas the top kernels in the 10.0 run are
47.1% - cudnn::detail::implicit_convolve_sgemm (~1.5 ms)
23.1% - maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt (~0.4 ms)
8.5% - maxwell_scudnn_128x32_relu_small_nn (~0.4ms)
so it's easy to see where the time difference comes from. Now the question is, why.

NodeJS: Native c++ module with multi-threading (openmp)

I have a very basic native module for NodeJS which essentially wraps a more complicated c++ program which includes multithreading using OpenMP (it's a Monte Carlo simulation). This c++ program works fine on its own and when called from python using a cython module to wrap the c++ code. However, when I compile it to a native node module (using /OpenMP as the parameter to the MSBuild compiler) it only uses one thread.
The following is my binding.gyp for reference:
{ "targets": [
"target_name": "LossForecast",
"sources": [ "NodeLossForecast.cpp", "src/AutoDiff.cpp" ],
"include_dirs":["src", "<!(node -e \"require('nan')\")"],
'conditions': [
'cflags': ["/EHsc", "/O2", "/openmp"]
} ]}
So, I am writing down my files for running a simple node-addon-api pi program using OpenMP. Although node-addon-api is experimental, it will soon be released as an official API for Node.js. It's very simple.
This is for Windows only, and I can confirm that it runs on multiple processors.
"targets": [
"target_name": "nodeOpenMP",
"defines": [
"sources": [
"include_dirs": [
"<!(echo %cd%/node_modules/node-addon-api)"
"conditions": [
'msvs_settings': {
'VCCLCompilerTool' : {
'AdditionalOptions' : ['/MT','/openmp']
You have to add VCCLCompilerTool and then, under AdditionalOptions, add `/openmp` as shown above.
Here's a simple PI program that I wrote,
#include <napi.h>
#include <omp.h>
#include <iostream>
#include <windows.h> // for GetCurrentProcessorNumber()
// Returns the number of threads that execute an OpenMP parallel region.
// When OpenMP is not enabled the pragma is ignored and the result is 1.
int getThreads(){
    // Get number of threads
    int n = 0;
    // Each thread adds 1 to its private copy of n; the reduction sums the
    // copies when the region ends.
    #pragma omp parallel reduction(+:n)
    {
        n += 1;
    }
    return n;
}
double GetPi(short numOfThreads,long numberOfSteps){
long i;
double pi, sum = 0.0;
double step = 1.0/(double) numberOfSteps;
#pragma omp parallel
std::cout << "This thread is running on processor: " << GetCurrentProcessorNumber() << "\n";
double x;
#pragma omp for reduction(+:sum)
for(i = 0; i < numberOfSteps; i++) {
x = ( i + 0.5 ) * step;
sum += 4.0 / (1 + x*x);
std::cout << "Total no. of threads (not processors)" <<getThreads() << std::endl;
pi = step * (double)sum;
return pi;
// JavaScript binding: pi(numThreads, numSteps) -> number.
// Throws a JavaScript TypeError and returns null when fewer than two
// arguments are supplied or when either argument is not a number.
Napi::Value calculatePi(const Napi::CallbackInfo& info ){
    Napi::Env env = info.Env();

    // check for no. of arguments
    if (info.Length() < 2) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    if (!info[0].IsNumber() || !info[1].IsNumber()) {
        Napi::TypeError::New(env, "Wrong arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    // Convert explicitly to the integer types GetPi expects instead of
    // relying on implicit double-to-integer narrowing.
    const short numThreads = static_cast<short>(info[0].As<Napi::Number>().Int32Value());
    const long numSteps = static_cast<long>(info[1].As<Napi::Number>().Int64Value());

    return Napi::Number::New(env, GetPi(numThreads, numSteps));
}
// Module initialiser: exposes calculatePi to JavaScript under the name "pi"
// and returns the populated exports object.
Napi::Object Init(Napi::Env env, Napi::Object exports){
    // register the functions that are to be exported
    const Napi::String exportName = Napi::String::New(env, "pi");
    const Napi::Function piFunction = Napi::Function::New(env, calculatePi);
    exports.Set(exportName, piFunction);
    return exports;
}
NODE_API_MODULE(nodeOpenMP, Init);
const omp = require("./build/Release/nodeOpenMP");
const numThreads = 4, numSteps = 1000000;
console.log( numThreads, numSteps );
Simply copy paste the files above accordingly. Don't forget to install node-addon-api(locally) and node-gyp(globally) and then run:
node-gyp configure build && node --no-warnings testOMP.js
You should get an output like this:
This thread is running on processor: 3
This thread is running on processor: 3
This thread is running on processor: 0
This thread is running on processor: 3
Total no. of threads (not processors): 3
I am thinking of soon making a cross-platform npm package node-openmp.
Follow this repo and contribute. I am open to any contributions.

Run CGAL c++ program on OS X?

I want to be able to execute programs on my computer. I installed CGAL using Macports, I am not sure how to proceed next. Can anybody tell me how to execute the program, I am desperately trying to run the following program but don't know how to:
#include <iostream>
#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
#include <CGAL/convex_hull_2.h>
typedef CGAL::Exact_predicates_inexact_constructions_kernel K;
typedef K::Point_2 Point_2;
int main()
Point_2 points[5] = { Point_2(0,0), Point_2(10,0), Point_2(10,10), Point_2(6,5), Point_2(4,1) };
Point_2 result[5];
Point_2 *ptr = CGAL::convex_hull_2( points, points+5, result );
std::cout << ptr - result << " points on the convex hull:" << std::endl;
for(int i = 0; i < ptr - result; i++){
std::cout << result[i] << std::endl;
return 0;
CGAL comes with a script called cgal_create_cmake_script that should be run where you saved your example file.
Then run cmake . and make
CGAL gets installed in the directory:
Write your program into a text file and save it as executable.cpp
In the command line go to the directory of the executable( use cd command)
then write the following commands
cgal_create_CMakeLists -s executable //without .cpp!!
cmake -DCGAL_DIR = opt/local/include/cgal
go to the folder where you saved executable.cpp and then click on the executable file(has a black square icon)
and you're done :)
NOTE: this only works if you installed using MacPorts. If you installed using Homebrew, the directories change, but the procedure remains the same :)
You also need command line tools installed.

Weird and unpredictable crash when using libx264 cross-compiled with MinGW

I'm working on a C++ project using Visual Studio 2010 on Windows. I'm linking dynamically against x264 which I built myself as a shared library using MinGW following the guide at
The strange thing is that my x264 code is working perfectly sometimes. Then when I change some line of code (or even change the comments in the file!) and recompile everything crashes on the line
encoder_ = x264_encoder_open(&param);
With the message
Access violation reading location 0x00000000
I'm not doing anything funky at all so it's probably not my code that is wrong but I guess there is something going wrong with the linking or maybe something is wrong with how I compiled x264.
The full initialization code:
// Encoder initialisation (fragment of a larger function; braces are missing
// from this listing).
// Start from a zeroed parameter struct and load the "ultrafast" preset with
// the "zerolatency" tune; a negative return value is treated as failure.
x264_param_t param = { 0 };
if (x264_param_default_preset(&param, "ultrafast", "zerolatency") < 0) {
throw KStreamerException("x264_param_default_preset failed");
// Override the defaults: one thread, 640x480 frames, frame rate 10/1.
param.i_threads = 1;
param.i_width = 640;
param.i_height = 480;
param.i_fps_num = 10;
param.i_fps_den = 1;
// This is the call that crashes with the access violation described above.
encoder_ = x264_encoder_open(&param); // <-----
if (encoder_ == 0) {
throw KStreamerException("x264_encoder_open failed");
// Allocate an I420 picture matching the configured frame size.
x264_picture_alloc(&pic_, X264_CSP_I420, 640, 480);
Edit: It turns out that it always works in Release mode and when using superfast instead of ultrafast it also works in Debug mode 100%. Could it be that the ultrafast mode is doing some crazy optimizations that the debugger doesn't like?
I've met this problem too with libx264-120.
libx264-120 was built on MinGW and configuration option like below.
$ ./configure --disable-cli --enable-shared --extra-ldflags=-Wl,--output-def=libx264-120.def --enable-debug --enable-win32thread
platform: X86
system: WINDOWS
cli: no
libx264: internal
shared: yes
static: no
asm: yes
interlaced: yes
avs: yes
lavf: no
ffms: no
gpac: no
gpl: yes
thread: win32
filters: crop select_every
debug: yes
gprof: no
strip: no
PIC: no
visualize: no
bit depth: 8
chroma format: all
$ make -j8
lib /def:libx264-120.def /machine:x86
#include "stdafx.h"
#include <iostream>
#include <cassert>
using namespace std;
#include <stdint.h>
extern "C"{
#include <x264.h>
int _tmain(int argc, _TCHAR* argv[])
int width(640);
int height(480);
int err(-1);
x264_param_t x264_param = {0};
err =
x264_param_default_preset(&x264_param, "veryfast", "zerolatency");
x264_param.i_threads = 8;
x264_param.i_width = width;
x264_param.i_height = height;
x264_param.i_fps_num = 60;//fps;
x264_param.i_fps_den = 1;
// Intra refres:
x264_param.i_keyint_max = 60;//fps;
x264_param.b_intra_refresh = 1;
//Rate control:
x264_param.rc.i_rc_method = X264_RC_CRF;
x264_param.rc.f_rf_constant = 25;
x264_param.rc.f_rf_constant_max = 35;
//For streaming:
x264_param.b_repeat_headers = 1;
x264_param.b_annexb = 1;
err = x264_param_apply_profile(&x264_param, "baseline");
x264_t *x264_encoder = x264_encoder_open(&x264_param);
x264_encoder = x264_encoder;
x264_encoder_close( x264_encoder );
return 0;
This program succeeds sometimes, but it often fails on x264_encoder_open with the access violation.
I could not find any information about this on Google, and it is unclear how x264_param_t should be initialized and how x264_encoder_open should be used.
The behavior seems to be caused by x264's setting values, but I can't determine them without reading open-source programs that use libx264.
Also, this access violation does not seem to occur on the FIRST EXECUTION, nor when compiling with MinGW's gcc (e.g. gcc -o test test.c -lx264;./test).
Because of this behavior, I think that the DLL version of libx264 built with MinGW's gcc handles some resources in a strange way.
I had the same problem. The only way I was able to fix it was to build the x264 dll without the asm option (ie. specify --disable-asm)