Hallo,
I am new to CUDA and I'm trying to copy an array of data into a CUDA kernel. I'm not sure what I am doing wrong and could really do with some pointers in the right direction.
My UpdatePixel function works if I don't use the array to set the data. If I set colour.x to 1, my whole screen goes red. If I use m_dataPtr[index] as colour.x, only a few pixels towards the bottom of the screen go red (fewer than 5 pixels). I have attached the CUDA code and the C++ code that I think is relevant. As the code works fine with colour.x = 1, I suspect it's the copy/allocation part of the CUDA code that is broken?
CUDA:
#include <cutil_inline.h>
#include <cutil_math.h>
// Pointer to the per-pixel float data that UpdatePixel reads (one float per pixel,
// indexed as y * imageW + x). Only the pointer variable itself is declared
// __constant__; because it is a device symbol, host code has to set it through
// cudaMemcpyToSymbol rather than by taking its address on the host.
__constant__ float* m_dataPtr;
// Packs a float RGBA colour into a single 32-bit word.
// Each channel is clamped to [0.0, 1.0], scaled by 255 and truncated; the result
// holds alpha in bits 24-31, blue in 16-23, green in 8-15 and red in 0-7.
__device__ unsigned int rgbaFloatToInt_new(float4 rgba)
{
    const uint red   = uint(__saturatef(rgba.x) * 255);
    const uint green = uint(__saturatef(rgba.y) * 255);
    const uint blue  = uint(__saturatef(rgba.z) * 255);
    const uint alpha = uint(__saturatef(rgba.w) * 255);

    return (alpha << 24) | (blue << 16) | (green << 8) | red;
}
// Writes one packed RGBA pixel per thread into outputImage.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y).
// The grid may be larger than the image: threads that fall outside
// imageW x imageH return without touching memory.
//
// outputImage : device buffer of imageW * imageH packed pixels
// imageW/H    : image dimensions in pixels
// Reads the red channel from m_dataPtr[y * imageW + x]; green and blue are 0,
// alpha is 1.
__global__ void UpdatePixel(unsigned int *outputImage, unsigned int imageW, unsigned int imageH)
{
    const uint x = blockIdx.x * blockDim.x + threadIdx.x;
    const uint y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grids rarely divide the image evenly; without this guard the tail
    // threads would read and write past the end of both buffers.
    if (x >= imageW || y >= imageH)
        return;

    const unsigned int index = y * imageW + x;

    float4 colour;
    colour.x = m_dataPtr[index];
    colour.y = 0.0f;
    colour.z = 0.0f;
    colour.w = 1.0f;

    outputImage[index] = rgbaFloatToInt_new(colour);
}
// Host entry point: launches UpdatePixel over the given grid/block configuration.
//
// d_output must be a device buffer of at least imageW * imageH unsigned ints.
// A kernel launch does not return an error code, so a bad launch configuration
// is picked up explicitly via cudaGetLastError().
extern "C" void UpdateImage(dim3 gridSize, dim3 blockSize,uint *d_output, uint imageW, uint imageH)
{
    UpdatePixel<<<gridSize, blockSize>>>(d_output, imageW, imageH);
    cutilSafeCall( cudaGetLastError() );
}
// Host-side copy of the device buffer address. m_dataPtr is a __constant__
// device symbol, so the host cannot pass it to cudaMalloc/cudaFree/cudaMemcpy
// directly; the address is kept here and mirrored into the symbol with
// cudaMemcpyToSymbol.
static float* s_hostDataPtr = NULL;

// (Re)allocates the device buffer that UpdatePixel reads through m_dataPtr.
//
// dataSize : size of the buffer in bytes
// Any buffer from a previous call is released first.
extern "C" void AllocateData(size_t dataSize)
{
    if (s_hostDataPtr != NULL)
    {
        cutilSafeCall( cudaFree(s_hostDataPtr) );
        s_hostDataPtr = NULL;
    }

    cutilSafeCall( cudaMalloc((void**)&s_hostDataPtr, dataSize) );

    // Publish the new address to the device: this copies the pointer value
    // (not the data) into the __constant__ variable.
    cutilSafeCall( cudaMemcpyToSymbol(m_dataPtr, &s_hostDataPtr, sizeof(s_hostDataPtr)) );
}

// Copies dataSize bytes from host memory at dataPtr into the buffer created by
// AllocateData. AllocateData must have been called with at least dataSize bytes.
extern "C" void CopyData(float *dataPtr, size_t dataSize)
{
    cutilSafeCall( cudaMemcpy(s_hostDataPtr, dataPtr, dataSize, cudaMemcpyHostToDevice) );
}
C++:
float *pixelData = new float[imageWidth * imageHeight];
unsigned int pixelDataSize = (sizeof(float) * imageWidth * imageHeight);
for(unsigned int x = 0; x < imageWidth; x++)
{
for(unsigned int y = 0; y < imageHeight; y++)
{
unsigned int idx = imageWidth * y + x;
pixelData[idx] = 1;
}
}
AllocateData(pixelDataSize);
CopyData(pixelData, pixelDataSize);
If you are using constant memory on the gpu, you will need to use cudaMemcpyToSymbol rather than cudaMemcpy.
Related
stb_image.h provides a method to flip an image vertically and it works fine. I tried to implement an horizontal flip aka mirror but it alters the image colors.
On pictures that only have 3 colors you could get bluish or reddish or even magenta colored pictures instead of their actual colors. It's the same if we're talking about JPEG or PNG images, you get the same strange results. Curiously if you flip that very same image vertically, its colors look normal.
I've tried testing pretty much any function you could find here and the code I'm providing you with has been the only one that got me close to my actual goal.
// Function I've been trying to implement to enable Horizontal Flip
static void stbi_horizontal_flip(void *image, int w, int h, int bytes_per_pixel)
{
size_t line_bytes = (size_t)w * bytes_per_pixel;
stbi_uc temp[line_bytes];
stbi_uc *bytes = (stbi_uc *)image;
Debug() << line_bytes;
for (int col = 0; col < h; col++) {
stbi_uc *line = bytes + col * line_bytes;
memcpy(&temp, line, line_bytes);
for (int row = 0; row < line_bytes; row++) {
line[row] = temp[line_bytes - row - bytes_per_pixel];
}
}
stbi_horizontally_flip_on_load = false;
}
// stb_image's function for Vertical Flip
static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
{
int row;
size_t bytes_per_row = (size_t)w * bytes_per_pixel;
stbi_uc temp[2048];
stbi_uc *bytes = (stbi_uc *)image;
for (row = 0; row < (h>>1); row++) {
stbi_uc *row0 = bytes + row * bytes_per_row;
stbi_uc *row1 = bytes + (h - row - 1) * bytes_per_row;
size_t bytes_left = bytes_per_row;
while (bytes_left) {
size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
memcpy(temp, row0, bytes_copy);
memcpy(row0, row1, bytes_copy);
memcpy(row1, temp, bytes_copy);
row0 += bytes_copy;
row1 += bytes_copy;
bytes_left -= bytes_copy;
}
}
}
// Loads an image through stbi__load_main and post-processes it into 8-bit data:
// 16-bit results are converted down to 8 bits per channel, then the optional
// horizontal and vertical flips are applied (in that order) when their
// respective "flip on load" flags are set. Returns NULL when loading fails.
static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
{
   stbi__result_info ri;
   void *pixels = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);

   if (pixels == NULL)
      return NULL;

   if (ri.bits_per_channel != 8) {
      STBI_ASSERT(ri.bits_per_channel == 16);
      pixels = stbi__convert_16_to_8((stbi__uint16 *) pixels, *x, *y, req_comp == 0 ? *comp : req_comp);
      ri.bits_per_channel = 8;
   }

   // #TODO: move stbi__convert_format to here

   // The channel count is worked out inside each branch on purpose: *comp is
   // only read when a flip is actually requested.
   if (stbi_horizontally_flip_on_load)
      stbi_horizontal_flip(pixels, *x, *y, (req_comp ? req_comp : *comp) * sizeof(stbi_uc));

   if (stbi__vertically_flip_on_load)
      stbi__vertical_flip(pixels, *x, *y, (req_comp ? req_comp : *comp) * sizeof(stbi_uc));

   return (unsigned char *) pixels;
}
// Decodes an image from an already-open FILE.
// On success the file position is moved back by the number of bytes that were
// read into the context's buffer but not consumed by the decoder.
STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
{
   stbi__context ctx;
   unsigned char *pixels;

   stbi__start_file(&ctx, f);
   pixels = stbi__load_and_postprocess_8bit(&ctx, x, y, comp, req_comp);

   if (pixels) {
      // need to 'unget' all the characters in the IO buffer
      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
      fseek(f, -unread, SEEK_CUR);
   }

   return pixels;
}
// Opens `filename` for binary reading, decodes it with stbi_load_from_file and
// closes the file again. Reports a failure through stbi__errpuc when the file
// cannot be opened.
STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
{
   unsigned char *pixels;
   FILE *fp = stbi__fopen(filename, "rb");

   if (fp == NULL)
      return stbi__errpuc("can't fopen", "Unable to open file");

   pixels = stbi_load_from_file(fp, x, y, comp, req_comp);
   fclose(fp);
   return pixels;
}
// Decodes an image whose bytes are supplied by user callbacks.
// `clbk` provides the read/skip/eof functions and `user` is passed on to the
// context set-up together with them.
STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
{
   stbi__context ctx;
   unsigned char *pixels;

   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
   pixels = stbi__load_and_postprocess_8bit(&ctx, x, y, comp, req_comp);
   return pixels;
}
// Loads an image from a file on disk: the file is read into a buffer and the
// Reader-based overload does the decoding.
void Gosu::load_image_file(Gosu::Bitmap& bitmap, const string& filename)
{
    Buffer file_contents;
    load_file(file_contents, filename);

    load_image_file(bitmap, file_contents.front_reader());
}
// Decodes an image from `input` into `bitmap`.
//
// The image is always requested as 4-channel RGBA (STBI_rgb_alpha); `n` only
// reports how many channels the source had. Throws std::runtime_error when
// stb_image cannot decode the data. BMP input additionally gets the fuchsia
// colour key applied.
void Gosu::load_image_file(Gosu::Bitmap& bitmap, Reader input)
{
    bool needs_color_key = is_bmp(input);

    stbi_io_callbacks callbacks;
    callbacks.read = read_callback;
    callbacks.skip = skip_callback;
    callbacks.eof = eof_callback;

    int x, y, n;
    stbi_uc* bytes = stbi_load_from_callbacks(&callbacks, &input, &x, &y, &n, STBI_rgb_alpha);
    if (bytes == nullptr) {
        throw runtime_error("Cannot load image: " + string(stbi_failure_reason()));
    }

    bitmap.resize(x, y);

    // sizeof yields size_t; passing it to %d is undefined behaviour, so the
    // values are converted to int to match the format string.
    // Note: sizeof(bytes) is the size of the pointer, not of the pixel array.
    printf("Channels %d, Gosu Color size %d, unsigned char size %d, bytes array size %d",
           n, static_cast<int>(sizeof(Gosu::Color)), static_cast<int>(sizeof(stbi_uc)),
           static_cast<int>(sizeof(bytes)));
    // Output: Channels 3 or 4, Gosu Color size 4, unsigned char size 1, bytes array 8

    // Compute the byte count in size_t so large images cannot overflow int.
    memcpy(bitmap.data(), bytes, static_cast<size_t>(x) * y * sizeof(Gosu::Color));
    stbi_image_free(bytes);

    if (needs_color_key) apply_color_key(bitmap, Gosu::Color::FUCHSIA);
}
// Output: Channels 3 or 4, Gosu Color size 4, unsigned char size 1, bytes array 8
That is what I got back from stb_image, but I'd prefer to get an 8bit array instead. Even so what actually matters is to get rid of that unexpected color change.
Thanks to Igor's comment I could focus on my immediate problem and not long after I came up with the code I've posted below.
What I've been wondering since I finally could flip the images horizontally was why the other methods I found either on the web or as part of image processors' code didn't work as expected. O_o? Sometimes I copied and pasted them only changing some variables' names or types to match stb_image's and they still failed either to compile or display a decent result.
By the way, I tried before to subtract positions to the right value to no avail but it made me think some of them could be used as nice color blend effects. XD
// Horizontal Flip by Kyonides Arkanthes shared under GPLv2 or v3
static void stbi_kyon_horizontal_flip(void *image, int w, int h, int bytes_per_pixel)
{
size_t line_bytes = (size_t)w * bytes_per_pixel;
stbi_uc temp[line_bytes];
stbi_uc *bytes = (stbi_uc *)image;
int lpos, rpos;
for (int col = 0; col < h; col++) {
stbi_uc *line = bytes + col * line_bytes;
memcpy(&temp, line, line_bytes);
for (int row = 0; row < w; row++) {
lpos = row * bytes_per_pixel;
rpos = line_bytes - row * bytes_per_pixel - 1;
line[lpos] = temp[rpos - 3];
line[lpos + 1] = temp[rpos - 2];
line[lpos + 2] = temp[rpos - 1];
line[lpos + 3] = temp[rpos];
}
}
stbi_kyon_horizontally_flip_on_load = false;
}```
Your code just reversed the order of the RGBA channels. Try using the loop below instead; I tested it and the result looks normal.
// Fills one output row from `temp`, the saved copy of that row, mirroring it:
// destination pixel `row` (counted from the left) is taken from the pixel that
// is `row` places in from the right edge.
// NOTE(review): the four explicit byte copies assume bytes_per_pixel == 4 - confirm.
for (int row = 0; row < Qimg2.width(); row++) {
// First byte of the destination pixel.
lpos = row * bytes_per_pixel;
// Last byte of the source pixel; its bytes are temp[rpos - 3] .. temp[rpos].
rpos = line_bytes - row * bytes_per_pixel - 1;
// The first three bytes are not copied straight across: destination byte 0
// takes source byte 1, byte 1 takes source byte 2, byte 2 takes source byte 0.
line[lpos] = temp[rpos - 2];
line[lpos + 1] = temp[rpos - 1];
line[lpos + 2] = temp[rpos - 3];
// The fourth byte keeps its position.
line[lpos + 3] = temp[rpos];
}
I'm trying to create a dynamic array of arrays (of arrays). But for some reason the data gets corrupted. I'm using the data to generate a texture in a OpenGL application.
The following code works fine:
unsigned char imageData[64][64][3];
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < 64; j++)
{
unsigned char r = 0, g = 0, b = 0;
if (i < 32)
{
if (j < 32)
r = 255;
else
b = 255;
}
else
{
if (j < 32)
g = 255;
}
imageData[i][j][0] = r;
imageData[i][j][1] = g;
imageData[i][j][2] = b;
}
std::cout << std::endl;
}
glTexImage2D(target, 0, GL_RGB, 64, 64, 0, GL_RGB, GL_UNSIGNED_BYTE, imageData);
Problem is, I want to be able to create a texture of any size (not just 64*64). So I'm trying this:
unsigned char*** imageData = new unsigned char**[64]();
for (int i = 0; i < 64; i++)
{
imageData[i] = new unsigned char*[64]();
for (int j = 0; j < 64; j++)
{
imageData[i][j] = new unsigned char[3]();
unsigned char r = 0, g = 0, b = 0;
if (i < 32)
{
if (j < 32)
r = 255;
else
b = 255;
}
else
{
if (j < 32)
g = 255;
}
imageData[i][j][0] = r;
imageData[i][j][1] = g;
imageData[i][j][2] = b;
}
std::cout << std::endl;
}
glTexImage2D(target, 0, GL_RGB, 64, 64, 0, GL_RGB, GL_UNSIGNED_BYTE, imageData);
But that doesn't work, the image gets all messed up so I assume I'm creating the array of arrays (of arrays) incorrectly? What am I doing wrong?
Also, I guess I should be using vectors instead. But how can I cast the vector of vectors of vectors data into a (void *) ?
This line contains multiple bugs:
unsigned char* pixel = &(imageData[(y * height) + x]);
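For row-major data (which is what glTexImage2D expects) the index of a pixel is y * width + x, not y * height + x. There's also the fact that each pixel is actually 3 bytes, so the byte offset is that index times 3. Some issues that led to this bug in your code (and will lead to others):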
You should also be using std::vector. You can call std::vector::data to get a pointer to the underlying data to interface to C API's.
You should have a class that represents a pixel. This will handle the offsetting correctly and give things names and made the code clearer.
Whenever you are working with a multi dimensional array that you encode into a single dimensional one, you should try to carefully write an access function that takes care of indexing so you can test it separately.
(end bulleted list... oh SO).
// One RGB pixel. The members are declared in red, green, blue order so that an
// array of Pixel has the byte layout GL_RGB / GL_UNSIGNED_BYTE expects.
struct Pixel {
    unsigned char red;
    unsigned char green;
    unsigned char blue;
};

// A width x height image stored as one contiguous, row-major array of Pixel.
struct TwoDimPixelArray {
    // Creates an image of width * height value-initialised (all zero) pixels.
    TwoDimPixelArray(int width, int height)
        : m_width(width), m_height(height)
    {
        m_vector.resize(static_cast<std::vector<Pixel>::size_type>(m_width) * m_height);
    }

    // Returns the pixel in column x of row y. Rows are m_width pixels long and
    // stored one after another, which is the order glTexImage2D reads them in.
    Pixel& get(int x, int y) {
        return m_vector[static_cast<std::vector<Pixel>::size_type>(y) * m_width + x];
    }

    // Pointer to the first pixel, for passing to C APIs.
    Pixel* data() { return m_vector.data(); }

private:
    int m_width;
    int m_height;
    std::vector<Pixel> m_vector;
};
// Build the image through the accessor, then upload it. The texture size passed
// to OpenGL comes from the same width/height used to create the array, so the
// two cannot drift apart when the size is changed.
int width = 64;
int height = 64;
TwoDimPixelArray imageData(width, height);
for (int x = 0; x != width; ++x) {
    for (int y = 0; y != height; ++y) {
        auto& pixel = imageData.get(x, y);
        // ... pixel.red = something, pixel.blue = something, etc
    }
}
glTexImage2D(target, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, imageData.data());
You need to use continuous memory for it to work with opengl.
My solution is inspired by previous answers, with a different indexing system
// Contiguous RGB buffer, 3 bytes per pixel, rows of `width` pixels stored one
// after another. The byte offset of pixel (x, y) is y * (width * 3) + x * 3.
unsigned char* imageData = new unsigned char[width*height*3];
// Colour written to every pixel; initialised so that no indeterminate value is
// ever stored. Set these per pixel inside the loop as needed.
unsigned char r = 0, g = 0, b = 0;
const unsigned int row_size_bytes = width * 3;
for( unsigned int x = 0; x < width; x++ ) {
    unsigned int current_row_offset_bytes = x * 3;
    for( unsigned int y = 0; y < height; y++ ) {
        unsigned int one_dim_offset = y * row_size_bytes + current_row_offset_bytes;
        unsigned char* pixel = &(imageData[one_dim_offset]);
        pixel[0] = r;
        pixel[1] = g;
        pixel[2] = b;
    }
}
Unfortunately it's untested, but I'm confident it is correct, given that sizeof(char) is 1.
I made a program in C++ which calculates the mandelbrot-set. Now I want to visualize it (save it in a picture). But when I try to save a 64k picture some problems come up. So what is the best way to save a picture of the pixels or at least to visual it?
Edit:
When I want to create a for Example 64K (61440 * 34560) image there will be the error "Access violation while writing at the position 0x0..." (originally on German and translated) and the program stops. This error appears with very high resolution. On lower resolutions the program works as it is supposed to.
#include <SFML\Graphics.hpp>
#include <stdlib.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <limits>
#include <thread>
//4K : 3840 * 2160
//8K : 7680 * 4320
//16K: 15360 * 8640
//32K: 30720 * 17280
//64K: 61440 * 34560
//128K:122880 * 69120
// Output image size in pixels.
const unsigned long width = 61440; //should be dividable by ratioX & numberOfThreads!
const unsigned long height = 34560; //should be dividable by ratioY & numberOfThreads!
// Iteration limit handed to getsGreaterThan2 for every pixel.
const unsigned int maxIterations = 500;
// Number of worker threads; each one renders width / numberOfThreads columns.
const unsigned int numberOfThreads = 6;
// Pixel coordinates are centred on the origin of the complex plane:
// x runs over [minWidth, maxWidth) = [-2/3 * width, 1/3 * width) and
// y runs over [minHeight, maxHeight) = [-height / 2, height / 2).
const int maxWidth = width / 3;
const int maxHeight = height / 2;
const int minWidth = -maxWidth * 2;
const int minHeight = -maxHeight;
// Scale factors from pixel coordinates to plane coordinates: the full width
// spans 3.0 units (x * ratioX lies in [-2, 1)) and the full height spans 2.0
// units (y * ratioY lies in [-1, 1)).
const double ratioX = 3.0 / width;
const double ratioY = 2.0 / height;
// Shared output image; created in main and written by the worker threads.
sf::Image img = sf::Image();
// Iterates z -> z^2 + c, starting from z = c, and reports when |z| exceeds 2.
//
// z            : the point c of the complex plane to test (also the start value)
// noIterations : iteration limit; iterations 1 .. noIterations - 1 are run
//
// Returns the 1-based iteration at which |z| first exceeds 2, or 0 when that
// never happens within the limit, when the sequence reaches a fixed point
// (the next value equals the current one), or when noIterations <= 1.
int getsGreaterThan2(std::complex<double> z, int noIterations) {
    const std::complex<double> c = z;
    // `<` rather than `!=` so that a zero or negative limit terminates at once.
    for (int i = 1; i < noIterations; i++) {
        // Computed once and reused for both the fixed-point test and the step.
        const std::complex<double> next = std::pow(z, 2) + c;
        if (next == z) {
            return 0;
        }
        z = next;
        const double magnitude = std::sqrt(std::pow(z.real(), 2) + std::pow(z.imag(), 2));
        if (magnitude > 2) {
            return i;
        }
    }
    return 0;
}
// Worker: renders one vertical strip of the image into the global `img`.
//
// noThreads : total number of workers the image is divided between
// threadNr  : index of this worker, starting from 0
//
// The strip is width / noThreads columns wide. Each pixel is mapped to a point
// of the complex plane (column * ratioX, row * ratioY) and coloured from the
// value returned by getsGreaterThan2: black for 0, otherwise a shade scaled by
// value / maxIterations.
void fillPixelArrayThreadFunc(int noThreads, int threadNr) { //threadNr ... starts from 0
    const long firstColumn = ((double)width) / noThreads * threadNr + minWidth;
    const long lastColumn = firstColumn + width / noThreads;

    for (long column = firstColumn; column < lastColumn; column++) {
        const double planeX = column * ratioX;
        const long pixelX = column - minWidth;

        for (long row = minHeight; row < maxHeight; row++) {
            const double planeY = row * ratioY;
            const long pixelY = row - minHeight;

            const double escape = getsGreaterThan2(std::complex<double>(planeX, planeY), maxIterations);
            if (escape != 0) {
                const double fraction = escape / maxIterations;
                img.setPixel(pixelX, pixelY, sf::Color(fraction * 128, fraction * 128, fraction * 255, 255));
            }
            else {
                img.setPixel(pixelX, pixelY, sf::Color(0, 0, 0, 255));
            }
        }
    }
}
int main() {
img.create(width, height, sf::Color::Black);
std::thread *threads = new std::thread[numberOfThreads];
for (int i = 0; i < numberOfThreads; i++) {
threads[i] = std::thread(std::bind(fillPixelArrayThreadFunc, numberOfThreads, i));
}
for (int i = 0; i < numberOfThreads; i++) {
threads[i].join();
}
img.saveToFile("filename.png");
return 1;
}
Your program fails during the call img.create(width, height, sf::Color::Black);.
When you step into the sf::Image::create function you end up here where the newPixels vector is created, this simply fails when width * height is too big as in your case:
////////////////////////////////////////////////////////////
void Image::create(unsigned int width, unsigned int height, const Color& color)
{
if (width && height)
{
// Create a new pixel buffer first for exception safety's sake
std::vector<Uint8> newPixels(width * height * 4);
^61440* ^34560 = 8'493'465'600 bytes !!
Conclusion: SFML cannot handle huge images.
I previously asked about the error "calling a __host__ function from a __global__ function is not allowed" and got an answer. I have modified my code accordingly, but I am unable to access d_points[i]. How can I access it?
// Counts, per grid cell, how many of the given points fall into that cell.
//
// Launch layout: 1D grid of 1D blocks, one thread per point; the grid may
// contain more threads than points. No shared memory is used.
//
// d_counters   : width * height cell counters (must be zeroed before launch)
// d_points     : `size` points
// d_x_max/y_max: exclusive upper bounds for the floored point coordinates
// width        : number of cells per row of d_counters
// height       : number of cell rows (not needed for the indexing below)
// min_distance : cell edge length; floored coordinates are divided by it
// size         : number of valid entries in d_points
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width,int height, int min_distance,int size)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    // Valid indices are 0 .. size - 1; `<=` would read one element too far.
    if (i >= size)
        return;

    const float2 point = d_points[i];
    int x = floorf(point.x);
    int y = floorf(point.y);
    printf(" ( %d %d )",x,y);

    // Reject points outside the counted area (negative coordinates included,
    // since they would index before the start of d_counters).
    if (x < 0 || y < 0 || x >= d_x_max || y >= d_y_max)
        return;

    x /= min_distance;
    y /= min_distance;
    // Several points can land in the same cell, so the increment must be
    // atomic. No barrier is needed, and one inside this branch would not be
    // reached by every thread of the block.
    atomicAdd(&d_counters[y*width+x], 1);
}
// Replaces `points` with new sample positions on a regular grid.
//
// The image is divided into cells of min_distance x min_distance pixels. Cells
// that already contain one of the incoming points are skipped; for every other
// cell the centre is added to `points` if the minimum-eigenvalue response there
// exceeds quality * (maximum response).
//
// The per-cell point counts are computed on the GPU by densefun. If any CUDA
// call fails the error is printed and the counts are computed on the CPU
// instead, so the result does not depend on the GPU being usable.
void DenseSample(const Mat& grey, std::vector<Point2f>& points, const double quality, const int min_distance)
{
    int width = grey.cols/min_distance;
    int height = grey.rows/min_distance;
    Mat eig;
    cornerMinEigenVal(grey, eig, 3, 3);
    double maxVal = 0;
    minMaxLoc(eig, 0, &maxVal);
    const double threshold = maxVal*quality;
    std::vector<int> counters(width*height);
    int x_max = min_distance*width;
    int y_max = min_distance*height;

    // size() is a size_t; convert explicitly so the value matches the format.
    printf("in descriptor size:%ld ",(long)points.size());
    printf("in cuda point size:%d ",(int)points.size());
    cout<<"points.size() : "<<points.size()<<endl;
    int blk=(int)(points.size()/1024)+1;
    cout<<"blk : "<<blk<<endl;

    if(!points.empty() && !counters.empty())
    {
        int *d_counters = NULL;
        float2 *d_points = NULL;
        // One int per cell and one float2 per point.
        const size_t countersBytes = counters.size()*sizeof(int);
        const size_t pointsBytes = points.size()*sizeof(float2);

        cudaError_t err = cudaMalloc(&d_counters, countersBytes);
        if (err == cudaSuccess)
            err = cudaMalloc(&d_points, pointsBytes);
        // The kernel only increments, so the counters must start at zero.
        if (err == cudaSuccess)
            err = cudaMemset(d_counters, 0, countersBytes);
        // Copy the vector's elements, not the std::vector object itself.
        // Assumes Point2f is two packed floats like float2 - TODO confirm.
        if (err == cudaSuccess)
            err = cudaMemcpy(d_points, &points[0], pointsBytes, cudaMemcpyHostToDevice);
        if (err == cudaSuccess)
        {
            densefun<<<blk,1024>>>(d_counters,d_points,x_max,y_max,width,height,min_distance, (int)points.size());
            err = cudaGetLastError();
        }
        // This blocking copy also reports errors raised while the kernel ran.
        if (err == cudaSuccess)
            err = cudaMemcpy(&counters[0], d_counters, countersBytes, cudaMemcpyDeviceToHost);

        cudaFree(d_counters);
        cudaFree(d_points);

        if (err != cudaSuccess)
        {
            printf("Error: %s\n", cudaGetErrorString(err));
            // CPU fallback: same counting rule as the kernel.
            counters.assign(counters.size(), 0);
            for(size_t k = 0; k < points.size(); k++)
            {
                int x = cvFloor(points[k].x);
                int y = cvFloor(points[k].y);
                if(x < 0 || y < 0 || x >= x_max || y >= y_max)
                    continue;
                x /= min_distance;
                y /= min_distance;
                counters[y*width+x]++;
            }
        }
    }

    points.clear();
    int index = 0;
    int offset = min_distance/2;
    for(int i = 0; i < height; i++)
        for(int j = 0; j < width; j++, index++)
        {
            if(counters[index] <= 0)
            {
                int x = j*min_distance+offset;
                int y = i*min_distance+offset;
                if(eig.at<float>(y, x) > threshold)
                    points.push_back(Point2f(float(x), float(y)));
            }
        }
}
output is:
in descriptor size:1605 in cuda point size:1605 points.size() : 1605
blk : 2
Error: an illegal memory access was encountered
in descriptor size:918 in cuda point size:918 points.size() : 918
blk : 1
Error: an illegal memory access was encountered
You create a thread grid with block length 1024 and grid length equal to
int blk=cvFloor(points.size()/1024)+1;
Which basically means that the number of threads will be multiple of 1024 greater than points.size(). In this case using:
int i = blockDim.x * blockIdx.x + threadIdx.x;
float2 point = (d_points)[i];
cannot be successful, because you can be almost certain that you will get out of bounds memory access. Add some conditional to ensure that it won't happen.
// Sketch of the kernel with a guard around the body: threads whose global index
// is not below width * height do nothing.
// NOTE(review): the body indexes d_points, whose length is the number of
// points, not width * height - the bound should probably be the point count,
// passed in as an extra parameter; confirm.
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width, int height, int min_distance)
{
// Global 1D thread index.
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < width * height)
{
//rest of the code
}
}
Also, you don't allocate enough memory for d_points:
float2 *d_points;
cudaMalloc(&d_points,points.size()*sizeof(float));
If you want to allocate array of float2 (or copy to it) you need to use sizeof(float2).
I have a set of vectors containing the Y/U/V components of an image. I want to scale these vectors based on an integer zoom multiplier. I can think of the obvious solution which would be something like this:
// Appends a nearest-neighbour enlargement of one image plane to zoomedBuffer.
//
// zoomedBuffer : output; (width * zoom) * (height * zoom) bytes are appended
//                after whatever it already holds (normally it is empty)
// srcBuffer    : source plane, at least width * height bytes, row-major
// width/height : source plane size in pixels
// zoom         : integer scale factor; every source pixel becomes a
//                zoom x zoom square. zoom == 0 appends nothing.
void zoomBuffer(std::vector<unsigned char>& zoomedBuffer, const std::vector<unsigned char>& srcBuffer, const unsigned int width, const unsigned height, const unsigned int zoom)
{
    typedef std::vector<unsigned char>::size_type size_type;

    const size_type zoomedRowLength = static_cast<size_type>(width) * zoom;
    // Reserve on top of the current contents so nothing reallocates mid-way.
    zoomedBuffer.reserve(zoomedBuffer.size() + zoomedRowLength * height * zoom);

    for (unsigned int y = 0; y < height; ++y)
    {
        // Where this source row's first enlarged row starts; taken from the
        // buffer itself so it is right even if the buffer was not empty.
        const size_type rowStart = zoomedBuffer.size();
        const size_type srcRowStart = static_cast<size_type>(y) * width;

        // Stretch horizontally: each source pixel is repeated zoom times.
        for (unsigned int x = 0; x < width; ++x)
        {
            zoomedBuffer.insert(zoomedBuffer.end(), zoom, srcBuffer[srcRowStart + x]);
        }

        // Stretch vertically: repeat the row just built zoom - 1 more times.
        // Indexing (not iterators) is used because the vector grows as we go.
        for (unsigned int yCopy = 1; yCopy < zoom; ++yCopy)
        {
            for (size_type i = 0; i < zoomedRowLength; ++i)
            {
                const unsigned char value = zoomedBuffer[rowStart + i];
                zoomedBuffer.push_back(value);
            }
        }
    }
}
then calling it like so:
// Enlarge each plane by a factor of 2, using that plane's own width and height.
// NOTE(review): the U and V calls pass refBlock.m_yData as the source, the same
// buffer as the Y call - this looks like a copy-paste slip; presumably each
// should read its own plane's data. Confirm against refBlock's members.
zoomBuffer(zoomedY, refBlock.m_yData, refBlock.m_YWidth, refBlock.m_YHeight, 2);
zoomBuffer(zoomedU, refBlock.m_yData, refBlock.m_UWidth, refBlock.m_UHeight, 2);
zoomBuffer(zoomedV, refBlock.m_yData, refBlock.m_VWidth, refBlock.m_VHeight, 2);
This should work but I'm wondering if there's a more efficient/faster way to do this? It feels clunky.