I am having a very weird error while trying to create a CUDA kernel to execute a for loop:
#include <stdlib.h>
#include <stdio.h>
#include <thrust/reduce.h>
#include <cuda.h>
// Entry point: allocates two device buffers and then tries to launch loopKernel on them.
int main(int argc, char** argv)
{
// Two device allocations, each sized for 4096 * 4096 floats.
float *arrayA;
cudaMalloc((void**)&arrayA, 4096 * 4096 * sizeof(float));
float *arrayB;
cudaMalloc((void**)&arrayB, 4096 * 4096 * sizeof(float));
// NOTE(review): this __global__ definition is written inside the body of main.
__global__ void loopKernel(float* arrayA, float* arrayB)
{
// Flat index of this thread across a 1D grid of 1D blocks.
int i = threadIdx.x + blockDim.x*blockIdx.x;
// NOTE(review): m is not declared anywhere in this listing.
if (i < m)
{
//do stuff
}
}
// Launch written with the arguments 8 and 256, using << >> as the delimiters.
loopKernel << 8, 256 >> (arrayA, arrayB);
}
the error is on the opening { for the kernel (line 14):
error: expected a ";"
it seems really odd as I get the same error on Visual Studio and linux terminal, so it is not an OS issue.
The file is also .cu so there is no way it's being sent to the wrong compiler.
Any help will be appreciated.
A __global__ function definition (i.e. kernel definition) is not something you do within the body of another function. We don't typically do this in C or C++ either (a C/C++ function definition is not usually placed within the body of another function definition).
Place your kernel definitions at global scope (i.e. outside the body of any other function definition, including main).
Something like this:
#include <stdlib.h>
#include <stdio.h>
#include <thrust/reduce.h>
#include <cuda.h>
/**
 * Zeroes the first m elements of arrayA and the m elements of row (n - 1) of
 * arrayB, where arrayB is indexed as an n x m row-major matrix.
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
 * each thread walks [0, m) in steps of the total number of launched threads.
 *
 * @param arrayA device pointer to at least m floats
 * @param arrayB device pointer to at least n * m floats
 * @param m      row length; defaults to 4096, the dimension used for the
 *               4096 * 4096 allocations in main
 * @param n      row count; same default
 */
__global__ void loopKernel(float* arrayA, float* arrayB, int m = 4096, int n = 4096)
{
    // Nothing to do for an empty matrix; also keeps the size_t casts below safe.
    if (m <= 0 || n <= 0)
        return;

    // 64-bit index math so blockIdx.x * blockDim.x and (n - 1) * m cannot overflow int.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t lastRow = (size_t)(n - 1) * (size_t)m;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < (size_t)m; i += stride)
    {
        arrayA[i] = 0.f;
        arrayB[lastRow + i] = 0.f;
    }
}
/**
 * Allocates two 4096 x 4096 float buffers on the device, launches loopKernel
 * on them with 8 blocks of 256 threads, waits for completion and releases the
 * buffers.
 *
 * @return EXIT_SUCCESS when every CUDA call succeeded, EXIT_FAILURE otherwise
 *         (the CUDA error string is printed to stderr).
 */
int main(int argc, char** argv)
{
    const size_t bytes = (size_t)4096 * 4096 * sizeof(float);
    float *arrayA = NULL;
    float *arrayB = NULL;

    cudaError_t err = cudaMalloc((void**)&arrayA, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&arrayB, bytes);

    if (err == cudaSuccess)
    {
        // Execution configuration uses triple angle brackets: <<<grid, block>>>.
        loopKernel<<<8, 256>>>(arrayA, arrayB);
        // A launch returns nothing; configuration errors surface here.
        err = cudaGetLastError();
    }

    // Launches are asynchronous; faults inside the kernel surface at the sync.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(arrayA);
    cudaFree(arrayB);

    return err == cudaSuccess ? EXIT_SUCCESS : EXIT_FAILURE;
}
There are various other issues with the posted code:
It provided no definition for m or n.
The kernel calling syntax is wrong, instead of <<...>> it should be <<<...>>>
For these types of basic issues, it's probably better to study a simple (correct) example such as the vectorAdd sample code.
Related
I wrote a simple C++ program. In it, I create two threads and name them TCPCall30003Proc and TCPCall30004Proc, but I cannot find them using the top command with the -H option. My OS is Ubuntu 18.
#include <pthread.h>
#include <thread>
#include <stdio.h>
#include <time.h>
#include <iostream>
#include <stdlib.h>
#include <chrono>
#include <unistd.h>
// Thread body: prints "1" once, then idles forever in one-second sleeps.
// The num argument is accepted but never read.
void f1(int num)
{
    printf("1\n");
    for (;;)
        sleep(1);
}
// Thread body: prints "2" once, then idles forever in one-second sleeps.
// The num argument is accepted but never read.
void f2(int num)
{
    printf("2\n");
    for (;;)
        sleep(1);
}
// Starts two worker threads, asks pthread_setname_np to name each one, then idles forever.
int main(int argc, char **argv)
{
// Each worker is started with the argument 1.
std::thread thd_1 = std::thread(f1, 1);
std::thread thd_2 = std::thread(f2, 1);
// NOTE(review): the value returned by pthread_setname_np is discarded, so a
// rejected name goes unnoticed here.
pthread_setname_np(thd_1.native_handle(), "TCPCall30003Proc");
pthread_setname_np(thd_2.native_handle(), "TCPCall30004Proc");
// Keep the main thread alive; the loop has no exit, so the return below is not reached.
while(1)
{
sleep(1);
}
return 0;
}
From the pthread_setname_np manual page:
The thread name is a meaningful C language string, whose length is restricted to 16 characters, including the terminating null byte ('\0').
[Emphasis mine]
Your names are 17 characters, including the null-terminator.
If you check what pthread_setname_np returns, it should be the error code ERANGE (like other pthread functions, it returns the error number directly rather than returning -1 and setting errno).
You need to shorten your names.
I created 50 threads that read the same file at the same time; each thread then tries to write the content to a new file created with a different name.
The code was supposed to generate 50 different files.
But I got an unexpected result: it generates only 3~5 files.
Since all the threads only read the same file, there should be no race condition, and each thread writes its content to a different file.
Can somebody help me? Thank you!
My code is listed below and it is a modification from Reference
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <thread>
void copy(const char *src_path, const char *dst_path);
/**
 * Starts 50 threads; thread i copies "Original.deb" to "<i>Copy.deb".
 *
 * Each destination name is duplicated on the heap so that it stays valid for
 * the whole lifetime of the thread that reads it. main owns those strings and
 * releases them only after every thread has been joined.
 */
int main(int argc, char **argv)
{
    const int kCopies = 50;
    // String literals are const; binding one to a plain char * is ill-formed in C++11.
    const char *suffix = "Copy.deb";

    std::vector<std::thread> vth;
    std::vector<char *> names; // heap copies of the destination names
    vth.reserve(kCopies);
    names.reserve(kCopies);

    for (int i = 0; i < kCopies; ++i)
    {
        char dst[40];
        // Bounded formatting instead of sprintf + strcat.
        snprintf(dst, sizeof(dst), "%d%s", i, suffix);

        char *owned = strdup(dst);
        if (owned == NULL)
        {
            perror("strdup");
            break;
        }
        names.push_back(owned);

        // std::thread forwards the arguments itself, so std::bind is not needed.
        vth.emplace_back(copy, "Original.deb", static_cast<const char *>(owned));
    }

    for (std::thread &worker : vth)
    {
        worker.join();
    }

    // Safe to release now: no thread can still be reading a name.
    for (char *name : names)
    {
        free(name);
    }
    return 0;
}
/**
 * Copies the file at src_path to dst_path in 8 KiB chunks.
 *
 * If the source cannot be opened nothing is created; if the destination
 * cannot be opened or a write comes up short, the copy stops. In every
 * failure case a message is printed with perror and the function returns
 * normally.
 *
 * @param src_path path of the file to read (opened "rb")
 * @param dst_path path of the file to create/truncate (opened "wb")
 */
void copy(const char *src_path, const char *dst_path)
{
    // Compile-time constant: a runtime-sized array is not standard C++.
    const size_t buffer_size = 8 * 1024;
    char buffer[buffer_size];

    FILE *src = fopen(src_path, "rb");
    if (src == NULL)
    {
        perror(src_path);
        return;
    }

    FILE *dst = fopen(dst_path, "wb");
    if (dst == NULL)
    {
        perror(dst_path);
        fclose(src);
        return;
    }

    // Drive the loop from fread's result: feof() only becomes true after a
    // read has already hit end-of-file, and never becomes true on a read error.
    size_t length;
    while ((length = fread(buffer, 1, buffer_size, src)) > 0)
    {
        if (fwrite(buffer, 1, length, dst) != length)
        {
            perror(dst_path);
            break;
        }
    }

    fclose(src);
    fclose(dst);
}
I believe your problem is that you are passing src (which is a pointer to a local variable on your main thread's stack) to your thread's entry function, but since your copy() function runs asynchronously in a separate thread, the char src[40] array that you are passing a pointer to has already been popped off of the main thread's stack (and likely overwritten by other data) before the copy() function gets a chance to read its contents.
The easy fix would be to make a copy of the string on the heap, so that you can guarantee the string will remain valid until the copy() function executes and reads it:
vth.emplace_back(std::bind(copy, "Original.deb", strdup(src)));
... and be sure to have your copy() function free the heap-allocation when it's done using it:
void copy(const char *src_path, const char *dst_path)
{
FILE *src, *dst;
int buffer_size = 8 * 1024;
char buffer[buffer_size];
size_t length;
src = fopen(src_path, "rb");
dst = fopen(dst_path, "wb");
// dst_path is a const char *, but free() takes a non-const void *, so the
// const has to be cast away for this call to compile as C++.
free(const_cast<char *>(dst_path)); // free the string previously allocated by strdup()
[...]
Note that you don't currently have the same problem with the "Original.deb" argument since "Original.deb" is a string-literal and therefore stored statically in the executable, which means it remains valid for as long as the program is running -- but if/when you change your code to not use a string-literal for that argument, you'd likely need to do something similar for it as well.
I am writing a piece of code to demonstrate the multi-threading share memory writing.
However, my code gets a strange 0xffffffff pointer I can't make out why. I haven't been writing cpp code for a while. please let me know if I get something wrong.
I compile with the command:
g++ --std=c++11 shared_mem_multi_write.cpp -lpthread -g
I get error echoes like:
function base_ptr: 0x5eebff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0xffffffffffffffff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0xbdd7ff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0x23987ff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0x11cc3ff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0x17bafff, src_ptr: 0x7f21a9c4e010, size: 6220800
function base_ptr: 0x1da9bff, src_ptr: 0x7f21a9c4e010, size: 6220800
Segmentation fault (core dumped)
my os is CentOS Linux release 7.6.1810 (Core) gcc version 4.8.5 and the code is posted below:
#include <chrono>
#include <cstdio>
#include <cstring>
#include <functional>
#include <iostream>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/stat.h>
#include <thread>
#include <vector>
#include <memory>
const size_t THREAD_CNT = 40;
const size_t FRAME_SIZE = 1920 * 1080 * 3;
const size_t SEG_SIZE = FRAME_SIZE * THREAD_CNT;
/**
 * Thread body: logs its arguments once, then copies size bytes from src_ptr
 * to base_ptr roughly every 42 ms, forever (the function never returns).
 *
 * @param base_ptr destination of each copy
 * @param src_ptr  source of each copy
 * @param size     number of bytes copied per iteration
 */
void func(char *base_ptr, char *src_ptr, size_t size)
{
    // %p requires a void * argument and size_t requires %zu; the previous %u
    // reads the wrong width where size_t is 64 bits.
    printf("function base_ptr: %p, src_ptr: %p, size: %zu\n",
           static_cast<void *>(base_ptr), static_cast<void *>(src_ptr), size);

    // 24 frame per seconds => 42 ms per frame
    const std::chrono::milliseconds frame_period(42);

    while (true)
    {
        const std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
        memcpy(base_ptr, src_ptr, size);
        // Sleep until one frame period after this iteration started.
        std::this_thread::sleep_until(start + frame_period);
    }
}
// Requests a System V shared-memory segment of SEG_SIZE bytes, attaches it, and creates
// THREAD_CNT threads that each run func on their own FRAME_SIZE-sized slice of the segment.
int main(int argc, char **argv)
{
int shmkey = 666;
int shmid;
// NOTE(review): IPC_CREAT is the only flag passed, and the returned id is not checked.
shmid = shmget(shmkey, SEG_SIZE, IPC_CREAT);
// Source buffer handed to every thread; it is allocated without an initializer
// and is not deleted anywhere in this function.
char *src_ptr = new char[FRAME_SIZE];
// NOTE(review): the result of shmat is used below without being checked.
char *shmpointer = static_cast<char *>(shmat(shmid, nullptr, 0));
std::vector<std::shared_ptr<std::thread>> t_vec;
// NOTE(review): reserve() is called, but no element is added to t_vec before
// t_vec[i] is assigned in the loop below - confirm this is what was intended.
t_vec.reserve(THREAD_CNT);
for (int i = 0; i < THREAD_CNT; ++i)
{
//t_vec[i] = std::thread(func, i * FRAME_SIZE + shmpointer, src_ptr, FRAME_SIZE);
// Thread i is given the address i * FRAME_SIZE bytes past shmpointer as its destination.
t_vec[i] = std::make_shared<std::thread>(func, i * FRAME_SIZE + shmpointer, src_ptr, FRAME_SIZE);
}
// Join every thread that the range-for finds in t_vec.
for (auto &&t : t_vec)
{
t->join();
}
return 0;
}
You forgot to specify access rights for the created SHM segment (http://man7.org/linux/man-pages/man2/shmget.2.html):
The value shmflg is composed of:
...
In addition to the above flags, the least significant 9 bits of shmflg specify the permissions granted to the owner, group, and others. These bits have the same format, and the same meaning, as the mode argument of open(2). Presently, execute permissions are not used by the system.
Change
shmid = shmget(shmkey, SEG_SIZE, IPC_CREAT);
into
shmid = shmget(shmkey, SEG_SIZE, IPC_CREAT | 0666);
It works for me now: https://wandbox.org/permlink/Am4r2GBvM7kSmpdO
Note that I use only a vector of threads (no shared pointers), as others suggested in the comments. You can possibly reserve its space as well.
You forget one very important thing: Error handling!
Both the shmget and shmat functions can fail. If they fail they return the value -1.
Now if you look at the first base_ptr value, it's 0x5eebff. That just happens to be the same as FRAME_SIZE - 1 (FRAME_SIZE is 0x5eec00), i.e. -1 plus one frame offset. That means shmat did return -1, and has failed.
Since you keep on using this erroneous value, all bets are off.
You need to check for errors, and if one happens, print the value of errno to find out what has gone wrong:
void* ptr = shmat(shmid, nullptr, 0);
if (ptr == (void*) -1)
{
std::cout << "Error getting shared memory: " << std::strerror(errno) << '\n';
return EXIT_FAILURE;
}
Do something similar for shmget.
Now it's also easy to understand the 0xffffffffffffffff value. It's the two's complement hexadecimal notation for -1, and it's passed to the first thread that is created.
I am trying to call a cuda function that is defined in a cu file from a cpp file in Visual Studio but I keep receiving the following error.
TomColourCorrectionMain.obj : error LNK2019: unresolved external symbol "public: void __cdecl hwk::TomColourCorrection::brightness(int,int)" (?brightness#TomColourCorrection#hwk##QEAAXHH#Z) referenced in function "public: virtual void __cdecl hwk::TomColourCorrection::processCore(class std::shared_ptr)" (?processCore#TomColourCorrection#hwk##UEAAXV?$shared_ptr#VIImageProcessingContext#hwk###std###Z)
Now, from reading other questions similar to this, I understand it's to do with how the function is defined and that there is something wrong there, but I can't see it from what I have defined in the header and cuda file.
This is the code I have (I am a novice at CUDA but I can compile CUDA fine and the code runs when I don't call this function in C++):
header file
#pragma once
#include "ImageProcessorWithProperties.h"
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
// Image processor that derives from ImageProcessorWithProperties and
// PropertyConsumer<TomColourCorrection>. It declares the brightness member
// function that the question is about.
class TomColourCorrection : public ImageProcessorWithProperties, public PropertyConsumer<TomColourCorrection>{
public: TomColourCorrection(PropNodePtr n, std::function<void()> requestReprocess);
virtual void processCore(IImageProcessingContextPtr context);
static void DeclareSettings(hwk::PropNodePtr n);
virtual ~TomColourCorrection();
void brightness(int iw, int ih); (function I am talking about)
};
}
cpp file with function call //its just segments of the important code as the rest of it isn't necessary for the actual function itself
#include "stdafx.h"
#include "TomColourCorrection.h"
#include <opencv2/imgproc/imgproc.hpp>
#include <cv.h>
#include <highgui.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
namespace hwk{
// Constructor: forwards "sandbox", n and requestReprocess to ImageProcessorWithProperties
// and n to PropertyConsumer<TomColourCorrection>; the body itself is empty.
TomColourCorrection::TomColourCorrection(PropNodePtr n, std::function<void()> requestReprocess) :
ImageProcessorWithProperties("sandbox", n, requestReprocess),
PropertyConsumer<TomColourCorrection>(n)
{
}
// Calls the brightness member with a fixed 16 x 16 size; context is not used.
void TomColourCorrection::processCore(IImageProcessingContextPtr context){
brightness(16, 16); (just generic numbers at the moment as I am trying to resolve this issue etc)
}
}
CUDA File and function definition
#include "TomColourCorrection.h"
#include "device_launch_parameters.h"
/**
 * Placeholder for a box-filter kernel: one thread per pixel of an iw x ih
 * image. The accumulation loop and the final store are still commented out,
 * so the kernel currently has no observable effect.
 *
 * Launch layout: 2D grid of 2D blocks; threads that fall outside the image
 * return immediately.
 *
 * @param iw image width in pixels
 * @param ih image height in pixels
 */
__global__ void brightness_kernel(int iw, int ih)
{
    // Calculate our pixel's location
    const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // The grid may overhang the image; such threads have no pixel to process.
    if (x >= iw || y >= ih)
        return;

    // Variables to store the sum
    int count = 0;
    float sum = 0.0f; // float literal: a bare 0.0 is a double
    // Do the blur operation by summing the surround pixels
    /* for (int j = -(bh / 2); j <= (bh / 2); j++)
    {
    for (int i = -(bw / 2); i <= (bw / 2); i++)
    {
    // Verify that this offset is within the image boundaries
    if ((x + i) < iw && (x + i) >= 0 && (y + j) < ih && (y + j) >= 0)
    {
    sum += (float)source[((y + j) * iw) + (x + i)];
    count++;
    }
    }
    }*/
    // Average the sum; skipped when nothing was accumulated, since 0/0 is NaN.
    if (count > 0)
        sum /= (float)count;
    // dest[(y * iw) + x] = (unsigned char)sum;
}
/**
 * Host-side launcher for brightness_kernel over an iw x ih image.
 *
 * Uses 16 x 16 thread blocks and enough blocks to cover the whole image, then
 * waits for the kernel to finish. CUDA errors are reported on std::cerr.
 *
 * @param iw image width in pixels; nothing is launched when <= 0
 * @param ih image height in pixels; nothing is launched when <= 0
 */
void brightness(int iw, int ih) //, unsigned char *source, unsigned char *dest)
{
    // A grid dimension of zero is an invalid launch configuration.
    if (iw <= 0 || ih <= 0)
        return;

    // cudaHostGetDevicePointer(&dev_source, source, 0);
    // cudaHostGetDevicePointer(&dev_dest, dest, 0);

    // Run the boxfilter kernel
    const dim3 threads(16, 16);
    // Round up so that sizes which are not multiples of 16 are fully covered
    // (plain iw / 16 gives zero blocks for anything smaller than 16). The
    // kernel must therefore ignore threads with x >= iw or y >= ih.
    const dim3 blocks((iw + threads.x - 1) / threads.x,
                      (ih + threads.y - 1) / threads.y);

    // Execute the kernel
    brightness_kernel<<<blocks, threads>>>(iw, ih);

    // Launch-configuration errors show up in cudaGetLastError; faults during
    // execution show up at the synchronize. cudaDeviceSynchronize replaces the
    // deprecated cudaThreadSynchronize.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        std::cerr << "brightness: CUDA error: " << cudaGetErrorString(err) << std::endl;
}
Modify the TomColourCorrection.h like this:
#pragma once
#include "ImageProcessorWithProperties.h"
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
void brightness_wrapper(int, int);
// Image processor that derives from ImageProcessorWithProperties and
// PropertyConsumer<TomColourCorrection>.
class TomColourCorrection : public ImageProcessorWithProperties, public PropertyConsumer<TomColourCorrection>{
public:
TomColourCorrection(PropNodePtr n, std::function<void()> requestReprocess);
virtual void processCore(IImageProcessingContextPtr context);
static void DeclareSettings(hwk::PropNodePtr n);
virtual ~TomColourCorrection();
// Member function taking an image width and height.
void brightness(int iw, int ih);
};
Modify your cpp file like this:
#include "stdafx.h"
#include "TomColourCorrection.h"
#include <opencv2/imgproc/imgproc.hpp>
#include <cv.h>
#include <highgui.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
namespace hwk {

// Constructor: hands "sandbox", n and requestReprocess to the
// ImageProcessorWithProperties base and n to the PropertyConsumer base.
TomColourCorrection::TomColourCorrection(PropNodePtr n, std::function<void()> requestReprocess)
    : ImageProcessorWithProperties("sandbox", n, requestReprocess),
      PropertyConsumer<TomColourCorrection>(n)
{
}

// Member-function shim: forwards its arguments to the free function
// brightness_wrapper.
void TomColourCorrection::brightness(int iw, int ih)
{
    brightness_wrapper(iw, ih);
}

// Runs the brightness step with a fixed 16 x 16 size; context is not used.
void TomColourCorrection::processCore(IImageProcessingContextPtr context)
{
    brightness(16, 16);
}

} // namespace hwk
And in your cuda file change this:
void brightness(int iw, int ih) //, unsigned char *source, unsigned char *dest)
to this:
void brightness_wrapper(int iw, int ih) //, unsigned char *source, unsigned char *dest)
This is mainly just spelling out the details of Ryck's answer.
I think you need to change
void brightness(int iw, int ih)
to
void TomColourCorrection::brightness(int iw, int ih)
and move the implementation to your header file or a .cpp file.
In the console in the image below, you can see that automatic linking probably isn't working correctly. What do I need to do? Below is the code I'm using. I also did a refresh, clean, and rebuild, but the error remains.
/*
* Fibonacci.h
*
* Created on: Apr 2, 2014
* Author: rose
*/
#ifndef FIBONACCI_H_
#define FIBONACCI_H_
unsigned int Fibonacci(unsigned int n);
#endif /* FIBONACCI_H_ */
/*
* Fibonacci.cpp
*
* Created on: Apr 2, 2014
* Author: rose
*/
#include "Fibonacci.h"
/**
 * Returns the n-th Fibonacci number, with Fibonacci(0) == 0 and
 * Fibonacci(1) == 1.
 *
 * Computed with a linear loop instead of the doubly recursive definition,
 * whose running time grows exponentially with n. Results wrap modulo
 * UINT_MAX + 1 once they no longer fit in unsigned int, as unsigned
 * arithmetic always does.
 *
 * @param n index in the sequence
 * @return the n-th Fibonacci number
 */
unsigned int Fibonacci(unsigned int n)
{
    if (n == 0) {
        return 0;
    }

    unsigned int previous = 0; // F(i - 1)
    unsigned int current = 1;  // F(i), starting at i == 1
    for (unsigned int i = 1; i < n; ++i) {
        const unsigned int next = previous + current;
        previous = current;
        current = next;
    }
    return current;
}
/*
* main.cpp
*
* Created on: Apr 2, 2014
* Author: rose
*/
#include <iostream>
#include "Fibonacci.h"
// Prints the 10th Fibonacci number on one line of standard output.
int main(int argc, char *argv[])
{
    const unsigned int tenth = Fibonacci(10);
    std::cout << "Fibonacci(10) = ";
    std::cout << tenth;
    std::cout << std::endl;
}
It's not a linker problem, it's the compiler. You need to include Fibonacci.h in main.cpp.