<<< >>> cuda in vscode - c++

Are there any way to suppress "<<< >>>" error with vscode-cpptools.
I associate "*.cu" with "cpp" in setting.json.
// use normal c++ syntax highlighting for CUDA files
"files.associations": {"*.cu": "cpp"},
and work fine except of one problem, kernel execution configuration parameters surrounded by <<< and >>> mistaked as error expected an expression
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, width);
Any suggestion

googling for a few hours, find no perfect solution but some workaround.
I summarize here:
use normal c++ syntax highlighting for CUDA files by edittingsetting.json
include necessary header of CUDA in program
include dummy header to workaround INTELLISENSE
Bellow is a concrete example
setting.json
"files.associations": {
"*.cu": "cpp",
"*.cuh": "cpp"
}
cudaDmy.cuh
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads(); // workaround __syncthreads warning
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#else
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem,
stream >>>
#endif
matrixMul.cu
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include "cudaDmy.cuh"
__global__ void MatrixMulKernel(float *M, float *N, float *P, int width)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (Row < width && Col < width)
{
float Pvalue = 0;
for (int i = 0; i < width; ++i)
{
Pvalue += M[Row * width + i] * N[width * i + Col];
}
P[Row * width + Col] = Pvalue;
}
}
void MatMul(float *M, float *N, float *P, int width)
{
float *d_M;
float *d_N;
float *d_P;
int size = width * width * sizeof(float);
cudaMalloc((void **)&d_M, size);
cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_N, size);
cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_P, size);
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
// <<<>>> will replace macro KERNEL_ARG2 when compiling
MatrixMulKernel KERNEL_ARG2(dimGrid,dimBlock) (d_M, d_M, d_P, width);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
}
int main()
{
int elem = 100;
float *M = new float[elem];
float *N = new float[elem];
float *P = new float[elem];
for (int i = 0; i < elem; ++i)
M[i] = i;
for (int i = 0; i < elem; ++i)
N[i] = i + elem;
time_t t1 = time(NULL);
MatMul(M, N, P, sqrt(elem));
time_t t2 = time(NULL);
double seconds = difftime(t2,t1);
printf ("%.3f seconds total time\n", seconds);
for (int i = 0; i < elem/1000000; ++i)
printf("%.1f\t", P[i]);
printf("\n");
delete[] M;
delete[] N;
delete[] P;
return 0;
}
Let's compile it with NVCC
nvcc matrixMul.cu -Xcudafe "--diag_suppress=unrecognized_pragma" -o runcuda
useful links:
https://devtalk.nvidia.com/default/topic/513485/cuda-programming-and-performance/__syncthreads-is-undefined-need-a-help/post/5189004/#5189004
https://stackoverflow.com/a/6182137/8037585
https://stackoverflow.com/a/27992604/8037585
https://gist.github.com/ruofeidu/df95ba27dfc6b77121b27fd4a6483426

You can just download the vscode-cudacpp extention and than in your workspace(<>.workspace) or user settings(.vscode/settings.json) enable this option:
"settings": {
"files.associations": {
"*.cu": "cuda",
"*.cuh": "cuda"
}
}

As sonulohani pointed out the cuda-cpp extension. It is good and it is the only extension available for CUDA. if you want autocomplete then try the CUDA-C++ package in sublime text editor. That provides excellent autocomplete features.

There is an official extension by NVIDIA named Nsight Visual Studio Code Edition
You could try and install it in your vscode.

Related

CudaErrorUnknown code=30 on any cuda call

I've installed cuda toolkit and I can run the samples without a problem. Now, I want to use cuda on my project and in my project I use cmake. So, in order to demonstrate my problem I created a simple example. I have 3 files, my main, which is "teste.cpp", a cuda file "hello_world.cu" and it's header. The only thing my main has is the call for a function at hello_world.cu, like this:
#include <iostream>
#include "hello_world.h"
using namespace std;
int main(int argc, char** argv)
{
teste(argc, argv);
return 0;
}
My hello_world.cu is a exact copy of the "clock" sample. So, looks like this:
// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>
#define NUM_BLOCKS 64
#define NUM_THREADS 256
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2)
{
__syncthreads();
if (tid < d)
{
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0)
{
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid+gridDim.x] = clock();
}
int teste(int argc, char** argv) {
printf("CUDA Clock sample\n");
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
float *dinput = NULL;
float *doutput = NULL;
clock_t *dtimer = NULL;
clock_t timer[NUM_BLOCKS * 2];
float input[NUM_THREADS * 2];
for (int i = 0; i < NUM_THREADS * 2; i++)
{
input[i] = (float)i;
}
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(dinput));
checkCudaErrors(cudaFree(doutput));
checkCudaErrors(cudaFree(dtimer));
long double avgElapsedClocks = 0;
for (int i = 0; i < NUM_BLOCKS; i++)
{
avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
}
avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
return EXIT_SUCCESS;
}
My CMakeLists.txt looks like this:
cmake_minimum_required(VERSION 2.8)
set(CUDA_HOST_COMPILER /usr/bin/g++-4.9)
find_package(CUDA QUIET REQUIRED)
# Pass options to NVCC
set(
CUDA_NVCC_FLAGS
${CUDA_NVCC_FLAGS};
-O3 -std=c++11 -g
)
# For compilation ...
# Specify target & source files to compile it from
cuda_add_executable(
helloworld
teste.cpp
hello_world.cu
)
The code compiles, and when i run it i get this output:
CUDA Clock sample
GPU Device 0: "GeForce GTX 950M" with compute capability 5.0
CUDA error at /home/cesar/Documents/cuda_teste/hello_world.cu:69 code=30(cudaErrorUnknown) "cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2)"
Why do I get this error here, using cmake? If I go to the samples directory and try the 'clock' example directly everything works fine.. So is it a problem on my CMakeLists.txt?
I managed to find the solution.
On my CMakeLists.txt i needed to add a flag to my NVCC with "-arch=sm_50", in my case it is sm_50 due to my graphic card having a compute capability 5.0, if any one have the same error and want to try this, you have to check your compute capability

only one thread executes the cuda kernel

I am new to GPU programming and specifically CUDA/C++. I have written a simple code just to use atomicAdd to increase all members of an array by 1.
But the result shows just the first element of the array increased and others stay the same. My code is as follows.
Thanks for any help in advance.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <fstream>
using namespace std;
__global__ void Histcount( int *a)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
{
atomicAdd(&a[i], 1);
}
}
int main()
{
int * hostarray = new int[20];
int * devarray;
cudaError_t error;
error=cudaMalloc(&devarray, sizeof(int) * 20);
for (int i = 0; i < 20; i++)
{
hostarray[i] = i ;
}
cudaMemcpy((int *)devarray, (int *)hostarray, sizeof(int) * 20, cudaMemcpyHostToDevice);
dim3 gs = (1, 1);
dim3 bs = (20, 1, 1);
Histcount <<<gs, bs >>> (devarray);
cudaMemcpy((int *)hostarray, (int *)devarray, sizeof(int) * 20, cudaMemcpyDeviceToHost);
for (int i = 0; i < 20; i++)
{
cout << hostarray[i]<<endl;
}
}
This is not a valid way to specify dim3 variables:
dim3 gs = (1, 1);
dim3 bs = (20, 1, 1);
In fact, the compiler may be throwing warnings on those lines, and if so you should not ignore those.
You should do either:
dim3 gs = dim3(1, 1);
dim3 bs = dim3(20, 1, 1);
or:
dim3 gs(1, 1);
dim3 bs(20, 1, 1);
The problem with your implementation is that the compiler doesn't know your actual intent with for example:
(20, 1, 1)
By itself as you have it, the compiler (may issue a warning and in fact) evaluates that expression to be 1, which it then assigns as a scalar to your dim3 variable. So you end up with a block size of 1 and a grid size of 1 (which was not your intent), and your code ran only 1 thread overall.

Generate Random Number Within Cuda Kernel

I am trying to use the CuRand library in CUDA. I'm just simply trying to generate a random integer per thread. Below is my results (Clearly not very random):
84
84
84
84
84
5
Please check my code out and tell me what I'm doing wrong, I'm pulling my hair out trying to figure out why this isn't working...
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
__device__ float generate(curandState* globalState, int ind)
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void addToCount(int N, int *y, curandState* globalState)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
while (id < N)
{
int number = generate(globalState, id) * 1000000;
printf("%i\n", number);
atomicAdd(&(y[0]), number);
id += blockDim.x * gridDim.x;
}
}
int main(void)
{
int N = 5;
int *y, *d_y;
y = (int*)malloc(N*sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);
curandState* devStates;
cudaMalloc (&devStates, N * sizeof(curandState));
addToCount<<<2, 5>>>(N, d_y, devStates);
cudaMemcpy(y, d_y, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("%i\n", *y);
}
AS #Robert Crovella mentioned in his comment, you forgot to setup the kernel. The curand states need to be initialized for every thread before they provide actual random numbers. If you change your main to:
int main(void)
{
int N = 5;
int *y, *d_y;
y = (int*)malloc(N*sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);
curandState* devStates;
cudaMalloc (&devStates, N * sizeof(curandState));
srand(time(0));
/** ADD THESE TWO LINES **/
int seed = rand();
setup_kernel<<<2, 5>>>(devStates,seed);
/** END ADDITION **/
addToCount<<<2, 5>>>(N, d_y, devStates);
cudaMemcpy(y, d_y, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("%i\n", *y);
}
You get nice results with default compilation:
nvcc /tmp/so.cu -o /tmp/so
$ /tmp/so
900981
469952
494161
31968
880329
2777391
$ /tmp/so
525835
742594
750423
117137
66318
2202307
$ /tmp/so
919262
60838
89868
57696
770764
1898428

Numerical error in cuda/cublas simple kernel using particular input

I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing few problems with some particular matrix, for which both methods do no work. I isolated this big matrix and build a MCVE available here (you can compile it with nvcc mcve.cu -lcublas. As you can see the results in both cases are totally wrong: host result is totally different, I am trying to figure out what's going on. I do not see any error in code but maybe i should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}

CBLAS segmenation fault with large array

this is my third post and attempt to solve this problem, which first
showed up using numpy.dot(A, A.T) where A is large, 150,000 x 265 elements.
With numpy, I got back an array with many missing values, that were just zeros.
I've tried to call BLAS thru CBLAS. I'm getting a segmentation fault error
with large arrays.
I'm running this on a machine with about 250 GB free memory.
Thanks for reading...
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
#include <cblas.h> /* C BLAS BLAS */
#include "blaio.h"
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
// row_order transform transform rowsA colsB K alpha a lda b ldb beta c ldc
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, row, row, col, 1.0f, a, col, b, row, 0.0f, c, row);
int num_bad = 0;
for(i=0; i<(row*row); i++)
{
if (c[i] != col)
{
printf("Bad value found: %f, at index: %i\n", c[i], i );
num_bad += 1;
}
}
printf("Number of bad values found: %i \n\n", num_bad);
//printMatrix(CblasRowMajor, row, row, c, 8, 3, NULL, NULL, NULL, NULL, NULL, "c = ");
return 0;
} /* end func main */
UPDATE:
Ray has expertly noticed that the blas I'm using via cblas, must be 32 bit and not able to access the array indices. Therefore, I've installed blas64.x86_64 and blas64-devel.x86_64.
Then, rewrote a few lines of the code above to use the direct call to sgemm without cblas.
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
float alpha = 1.0, beta = 1.0;
sgemm_('N','N', &row, &row, &col, &alpha, &a[0], &col, &b[0], &row, &beta, &c[0], &row);
I compiled with:
gcc sgemm_test_fortran.c -o test -L /usr/lib64 -lblas64
The code compiled and I think it might run.. :)
The problem is that the size of your output matrix (100,000x100,000 = 1e10 elements) can't be stored in an int (2.14e9). You can fix this in your C++ code by switching the types to size_t, but you're going to run into the same problem inside the BLAS library.
What you need to to do is use a BLAS library that is compiled to use 8-byte integers; most BLAS libraries are compiled with 4-byte integers. You don't mention what BLAS library you're linking to, so it's hard to guess what the correct library name is (if it even exists) on your system.