VexCL: set values common to all vector elements - c++

I work with ray tracing and use GPU to calculate pixel colours. I was using NVIDIA CUDA and now want to go to VexCL. I'm trying to use such code:
struct Ray;
vex::Context ctx(...);
unsigned int frame_width, frame_height;
std::array<float, 4> camera_direction, camera_up;
float camera_fov;
// initialize values and store them in GPU memory too
vex::vector<Ray> rays(ctx, frame_width * frame_height);
and something like
rays = some_expression_to_calculate_ray(vex::element_index(), frame_width,
camera_direction, camera_up, camera_fov);
So my question is: how can I explain to VexCL that some values must be common for all vector elements?
I was trying VEX_CONSTANT, vex::raw_pointer but it is not what I need.

If you change the type of camera_direction and camera_up from std::array<float,4> to cl_float4, then you would be able to directly use those in an expression:
#include <vexcl/vexcl.hpp>
// Minimal VexCL example: defines a device function `dummy` and evaluates it
// element-wise into a device vector, passing host-side camera values as arguments.
// NOTE(review): the closing ");" of VEX_FUNCTION and the closing brace of main are
// missing from this listing.
int main() {
vex::Context ctx(vex::Filter::Env);
// dummy(idx, dir, up, fov): the last macro argument is the function body.
VEX_FUNCTION(float, dummy, (size_t, idx)(cl_float4, dir)(cl_float4, up)(float, fov),
// whatever
return idx + length(dir - up) + fov;
// Host-side values that are the same for every vector element; per the accompanying
// answer text they are passed to the generated kernel as parameters.
cl_float4 camera_dir = {1, 2, 3, 4}, camera_up = {1, 0, 0, 0};
float camera_fov = 42;
vex::vector<float> rays(ctx, 1024);
// vex::element_index() supplies the per-element index argument (idx).
rays = dummy(vex::element_index(), camera_dir, camera_up, camera_fov);
(I've changed rays to be a vector of floats for simplicity, see the linked question for how to work with structs in VexCL.) camera_dir, camera_up, and camera_fov are defined host-side, and they are passed to the kernel as parameters. So no unnecessary copies are being made. Here is the generated OpenCL kernel:
float dummy(ulong idx, float4 dir, float4 up, float fov) {
return idx + length(dir - up) + fov;
kernel void vexcl_vector_kernel(
ulong n,
global float * prm_1,
ulong prm_2,
float4 prm_3,
float4 prm_4,
float prm_5
for(ulong idx = get_global_id(0); idx < n; idx += get_global_size(0))
prm_1[idx] = dummy( (prm_2 + idx), prm_3, prm_4, prm_5 );


How to use stbi_write_png to write image pixel data to a png file

So I have been following the Ray tracing in a weekend and initially was writing all the pixel data to a ppm file and was just experimenting with writing to different image file formats. I was able to write to a bmp file with some help on the internet and currently I am trying to write the pixel data to a png. I have been trying to use the stbi_write_png function, but the resulting image is totally different from what was required.
I am detailing the code down below for the bmp part and the png and the resulting image as well for both
This is the code for the bmp write
#include "hitableList.h"
#include "sphere.h"
#include "camera.h"
#include "material.h"
#include <float.h> //for float_MAX
#include "main.h" //contains our typedef declarations, #defines and struct declarations.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
typedef float f32;
#include "stb_image_write.h"
#define CHANNEL_NUM 3
// Returns the size in bytes of the pixel buffer described by Image:
// Width * Height pixels of sizeof(u32) bytes each.
internal u32 GetTotalPixelSize(image_32 Image)//take image_32 Image
{
    u32 Result = Image.Width*Image.Height * sizeof(u32);
    // The listing declared a u32 return type but never returned the value.
    return Result;
}
// Builds an image_32 of the given dimensions and allocates its pixel buffer.
//
// width, height: image dimensions in pixels.
// Returns the initialised image. Image.Pixels is the raw malloc result, so it is
// NULL when the allocation fails; callers must check it before writing pixels.
internal image_32 AllocateImage(u32 width, u32 height)
{
    image_32 Image = {};//create the image object and initialize it.
    Image.Height = height;
    Image.Width = width;
    u32 OutputPixelSize = GetTotalPixelSize(Image);//old version of the code does this line->sizeof(u32)*image.Width*image.Height;
    Image.Pixels = (u32*)malloc(OutputPixelSize);//main source of the initial nullpointer at main *Out.
    // The listing never returned the image it had just built.
    return Image;
}
// Writes Image to OutputFileName as an uncompressed 32-bit-per-pixel BMP file.
//
// Image:          pixel data plus dimensions; Image.Pixels must hold
//                 GetTotalPixelSize(Image) bytes.
// OutputFileName: path of the file to create or overwrite.
//
// Failures (cannot open, short write, failed close) are reported on stderr;
// the function returns nothing.
internal void WriteImage(image_32 Image, const char* OutputFileName)
{
    u32 OutputPixelSize = GetTotalPixelSize(Image);
    bitmap_header Header = {};
    Header.FileType = 0x4D42; // bytes 0x42 0x4D, i.e. "BM" when stored little-endian
    Header.FileSize = sizeof(Header) + OutputPixelSize;//Need to set it later
    //Header.Reserved1;//These are reserved and set by the header itself
    //Header.Reserved2;//These are reserved and set by the header itself
    Header.BitmapOffset = sizeof(Header);
    Header.Size = sizeof(Header) - 14;//also need to set the size of the pixels. Since the header is 50 bytes check wikipedia.
    Header.Width = Image.Width;
    Header.Height = Image.Height;
    Header.Planes = 1;
    Header.BitsPerPixel = 32;
    Header.Compression = 0;
    Header.SizeOfBitmap = OutputPixelSize;//writing bottom part first. Very Important.
    Header.HorzResolution = 0;
    Header.VertResolution = 0;
    Header.ColorsUsed = 0;
    Header.ColorsImportant = 0;

    FILE *OutFile = fopen(OutputFileName, "wb");
    if (OutFile)
    {
        // Count bytes (element size 1) so that a zero-sized pixel buffer is not
        // mistaken for a failed write.
        const bool HeaderWritten = fwrite(&Header, 1, sizeof(Header), OutFile) == sizeof(Header);//we write it into the header
        const bool PixelsWritten = fwrite(Image.Pixels, 1, OutputPixelSize, OutFile) == OutputPixelSize;
        // Always close the handle; the listing leaked it.
        const bool Closed = fclose(OutFile) == 0;
        if (!HeaderWritten || !PixelsWritten || !Closed)
        {
            fprintf(stderr, "[Error]Unable to write output file %s. \n", OutputFileName);
        }
    }
    else
    {
        fprintf(stderr, "[Error]Unable to write output file %s. \n", OutputFileName);
    }
}
// Computes the colour seen along ray r in the scene `world`.
// depth is the number of scatter bounces already taken; scattering is only
// followed while depth < 50.
// NOTE(review): the brace-only and else lines are missing from this listing, so the
// branch structure described below is inferred from the statement order.
vec3 color(const ray& r, hitable *world, int depth)
hit_record rec;
// Hits are searched in the parameter range (0.001, FLT_MAX).
if(world->hit(r, 0.001, FLT_MAX, rec)){
ray scattered;
vec3 attenuation;
if(depth <50 && rec.mat_ptr->scatter(r,rec,attenuation,scattered)){
// Recurse along the scattered ray and scale the result by the material attenuation.
return attenuation*color(scattered, world, depth+1);
// Presumably the else branch: depth limit reached or scatter() returned false.
return vec3(0,0,0);
// Presumably the no-hit branch: blend white and (0.5, 0.7, 1.0) by the
// y component of the normalised ray direction.
vec3 unit_direction = unit_vector(r.direction());
float t = 0.5*(unit_direction.y() + 1.0);
return (1.0-t)*vec3(1.0, 1.0, 1.0) + t*vec3(0.5, 0.7, 1.0);
// Renders the scene with ns samples per pixel and writes it out as a BMP file.
// NOTE(review): brace-only lines are missing from this listing.
int main()
// NOTE(review): nx and ny are not used anywhere in the visible listing.
int nx=1280;
int ny =720;
u32 ns = 10;
u32 width = 1280;
u32 height = 720;
hitable *list[5];
list[0] = new sphere(vec3(0,0,-1), 0.5, new lambertian(vec3(0.8, 0.3, 0.3)));
list[1] = new sphere(vec3(0,-100.5,-1), 100, new lambertian(vec3(0.8, 0.8, 0.0)));
list[2] = new sphere(vec3(1,0,-1), 0.5, new metal(vec3(0.8, 0.6, 0.2), 0.3));
list[3] = new sphere(vec3(-1,0,-1), 0.5, new dielectric(1.5));
list[4] = new sphere(vec3(-1,0,-1), -0.45, new dielectric(1.5));
// NOTE(review): five spheres are created but a count of 4 is passed here, so
// list[4] is presumably not part of the world - confirm this is intended.
hitable *world = new hitable_list(list,4);
camera cam;
// NOTE(review): Out and Image are used below but their declarations are not
// visible (this one is commented out); pixels and index are not used in the
// visible remainder of this listing.
//u32 *Out = Image.Pixels;
u8* pixels = new u8[width * height * CHANNEL_NUM];
u32 index =0;
for(u32 y=0 ; y<height; y++)
for(u32 x=0; x<width; x++)
vec3 col(0, 0, 0);
for(u32 s=0; s < ns; s++)
// Jitter the sample position inside the pixel and map it to [0, 1).
float u = float(x+drand48())/float(width);
float v = float(y+drand48())/float(height);
ray r = cam.get_ray(u, v);
// NOTE(review): p is not used in the visible listing.
vec3 p = r.point_at_parameter(2.0);
col = col + color(r, world, 0); //col returns a vec3
col/=float(ns);//average sampling per pixel
vec3 BMPColor = vec3(255*col); //getting bmp color values from raytraced image.
u32 BMPvalue = BGRPack4x8(BMPColor); //packing the bmp color into an integer to write to the bitmap image.
*Out++ = BMPvalue;
// Progress output every 64 rows.
// NOTE(review): %d is used with a u32 argument.
if((y%64) ==0)
printf("\rRaycasting row %d%%....",100*y / height);
WriteImage(Image, "..\\data\\Hollow_Glass_Sphere.bmp");//getting the raytraced image plane on test.bmp.
return 0;
#include "hitableList.h"
#include "sphere.h"
#include "camera.h"
#include "material.h"
#include <float.h> //for float_MAX
#include "main.h" //contains our typedef declarations, #defines and struct declarations.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
typedef float f32;
#include "stb_image_write.h"
#define CHANNEL_NUM 3
// Computes the colour seen along ray r in the scene `world`.
// depth is the number of scatter bounces already taken; scattering is only
// followed while depth < 50.
// NOTE(review): the brace-only and else lines are missing from this listing, so the
// branch structure described below is inferred from the statement order.
vec3 color(const ray& r, hitable *world, int depth)
hit_record rec;
// Hits are searched in the parameter range (0.001, FLT_MAX).
if(world->hit(r, 0.001, FLT_MAX, rec)){
ray scattered;
vec3 attenuation;
if(depth <50 && rec.mat_ptr->scatter(r,rec,attenuation,scattered)){
// Recurse along the scattered ray and scale the result by the material attenuation.
return attenuation*color(scattered, world, depth+1);
// Presumably the else branch: depth limit reached or scatter() returned false.
return vec3(0,0,0);
// Presumably the no-hit branch: blend white and (0.5, 0.7, 1.0) by the
// y component of the normalised ray direction.
vec3 unit_direction = unit_vector(r.direction());
float t = 0.5*(unit_direction.y() + 1.0);
return (1.0-t)*vec3(1.0, 1.0, 1.0) + t*vec3(0.5, 0.7, 1.0);
int main()
int nx=1280;
int ny =720;
u32 ns = 10;
u32 width = 1280;
u32 height = 720;
hitable *list[5];
list[0] = new sphere(vec3(0,0,-1), 0.5, new lambertian(vec3(0.8, 0.3, 0.3)));
list[1] = new sphere(vec3(0,-100.5,-1), 100, new lambertian(vec3(0.8, 0.8, 0.0)));
list[2] = new sphere(vec3(1,0,-1), 0.5, new metal(vec3(0.8, 0.6, 0.2), 0.3));
list[3] = new sphere(vec3(-1,0,-1), 0.5, new dielectric(1.5));
list[4] = new sphere(vec3(-1,0,-1), -0.45, new dielectric(1.5));
hitable *world = new hitable_list(list,4);
camera cam;
//u32 *Out = Image.Pixels;
u8* pixels = new u8[width * height * CHANNEL_NUM];
u32 index =0;
for(u32 y=0 ; y<height; y++)
for(u32 x=0; x<width; x++)
vec3 col(0, 0, 0);
for(u32 s=0; s < ns; s++)
float u = float(x+drand48())/float(width);
float v = float(y+drand48())/float(height);
ray r = cam.get_ray(u, v);
vec3 p = r.point_at_parameter(2.0);
col = col + color(r, world, 0); //col returns a vec3
col/=float(ns);//average sampling per pixel
col = vec3( sqrt(col[0]), sqrt(col[1]), sqrt(col[2]));
int ir = int(255.99*col[0]);
int ig = int(255.99*col[1]);
int ib = int(255.99*col[2]);
pixels[++index] = ir;
pixels[++index] = ig;
pixels[++index] = ib;
if((y%64) ==0)
printf("\rRaycasting row %d%%....",100*y / height);
stbi_write_png("testpng_4.png", width, height, CHANNEL_NUM, pixels, width*CHANNEL_NUM);
return 0;
This is the BMP image that corresponds to the bmp write code and based off of what I read and implemented in Ray tracing in a weekend book, this is the correct desired image
This is the PNG image that corresponds to the png write code and it seems like the image is flipped in colors and orientation. I have tried and failed in debugging what could be the issue in this image. The only thought that comes to mind is if it is an endianness issue. I would be really happy if anyone could help me solve this problem on the png write and if it is an endianness issue, how could I go about solving it.
Here is your output, when converted from RGB to BRG:
1. The image should be flipped in the y-axis. Instead of looping as:
for(u32 y=0; y < height; y++)
it should be something like:
for (u32 y = height - 1; y >= 0; y--)
The ir, ig, and ib values are of type int; however, the pixel buffer and the input of the stbi_write_png function require unsigned char (u8 in your case). Multiplying by 255.99 yields values above 255 whenever a colour component exceeds 1.0, so you may consider clamping them to [0, 255] before casting them to unsigned char. You may consult a similar discussion here: stb image write issue
Although changing the pixel ordering from RGB to BRG will do the trick, you may also want to consider why the image looks brighter overall. This is most probably because of the sqrt operation over the col values, which will return higher values than the original col values, since cols are fractional numbers. In multisampling, dividing the sum with the number of samples should suffice. In case these do not work, I would check if the stbi_write_png is still writing an alpha channel, or your viewer treats one of the channels as an alpha channel, which is the least likely.
This is the output when the RGB channels are changed to BRG, the image is flipped vertically, and every col is squared, which looks correct:

openCl path tracer creates strange noise patterns

I've made a path tracer using openCl and c++, following the basic structure in this tutorial: As far as I can tell, nothing is wrong with the path tracing algorithm itself, but I get strange stripe patterns in the image that don't match the regular noise of path tracing. striped image
There are distinct vertical stripes and more narrow horizontal ones that make the image look granular regardless of how many samples I take per pixel. Again, pixel by pixel, the path tracer seems to be working (the outlines of objects are correct even where they appear mid-stripe) as seen here: close-up.
The only difference between my code and the one in the tutorial I link is that Sam Lapere appears to be using the c++ wrapper for openCl, and I've added a couple of features like movement. There also are a few differences in how I'm handling light bounces.
I'm new to openCl. What could be causing this? It seems like it doesn't have to do with my ray tracer itself, but somehow in the way I'm implementing openCl. I'm also using an SDL texture and renderer to show the image to the screen
here is the tracer code if it helps:
/* Kernel entry point: each work-item handles the pixel whose linear index is
   get_global_id(0). It traces one sample, adds it to that pixel's running sum and
   writes the averaged, packed colour to output.
   NOTE(review): closing braces are missing from this listing. */
__kernel void render_kernel
(__constant struct Sphere* spheres, const int width, const int height,
const int sphere_count, __global int * output, __global float3*
pixel_buckets, __global int* counter, __constant struct Ray* camera,
__global bool* reset){
int gid = get_global_id(0);
//for movement
/* When *reset is set, the accumulated sum and sample count of this pixel are cleared. */
if (*reset){
pixel_buckets[gid] = (float3)(0,0,0);
counter[gid] = 0;
/* Linear index -> 2D pixel coordinates (row-major, `width` pixels per row). */
int xcoord = gid % width;
int ycoord = gid / width;
struct Ray camray = createCamRay(xcoord, ycoord, width, height, counter[gid], camera);
/* NOTE(review): the two seed arguments are the pixel coordinates, which are the same
   on every invocation for a given pixel. Unless the seeds are varied elsewhere (e.g.
   with counter[gid]), every frame would reuse the same random sequence - worth
   checking as a cause of a noise pattern that does not average out. */
float3 final_color = trace(spheres, &camray, sphere_count, xcoord, ycoord);
counter[gid] ++;
//average colors
pixel_buckets[gid] += final_color;
output[gid] = colorInt(clampColor(pixel_buckets[gid] / counter[gid]));
/* Traces one path of bounce_count bounces starting from *camray and returns the
   colour gathered along it.
   NOTE(review): this listing is damaged - closing braces are missing and two
   expressions are truncated ("struct Sphere sphere1; = ..." and
   "normalize(hit_point -;"). */
float3 trace(__constant struct Sphere* spheres, struct Ray* camray, const int sphere_count,
unsigned int seed0, unsigned int seed1){
struct Ray ray = *camray;
struct Sphere sphere1; = (float3)(0, 0, 3);
sphere1.radius = 0.7;
sphere1.color = (float3)(1,1,0);
const int bounce_count = 8;
/* Per-bounce surface colour and emission; index bounce_count is used as a
   zero terminator below. */
float3 colors[20];
float3 emiss[20];
for (int bounce = 0; bounce < bounce_count; bounce ++){
int sphere_id = 0;
/* NOTE(review): no miss check on hit_distance is visible in this listing. */
float hit_distance = intersectScene(spheres, &ray, &sphere_id, sphere_count);
struct Sphere hit_sphere = spheres[sphere_id];
float3 hit_point = ray.origin + (ray.direction * hit_distance);
float3 normal = normalize(hit_point -;
/* Flip the normal so that it faces against the incoming ray. */
if (dot(normal, -ray.direction) < 0){
normal = -normal;
//random bounce angles
/* NOTE(review): get_random as listed takes pointers to the seeds, but seed0/seed1 are
   passed by value here; if the generator state is not advanced between calls, both
   calls would return the same number - confirm against the real code. */
float rand_theta = get_random(seed0, seed1);
float theta = acos(sqrt(rand_theta));
float rand_phi = get_random(seed0, seed1);
float phi = 2 * PI * rand_phi;
//scales the tnb vectors
float x = sin(theta) * sin(phi);
float y = sin(theta) * cos(phi);
float n = cos(theta);
float3 hemx = normalize(cross(ray.direction, normal)) * x;
float3 hemy = normalize(cross(hemx, normal)) * y;
/* From here on `normal` is scaled by n, i.e. it is no longer unit length. */
normal = normal * n;
float3 new_ray = normalize(hemx + hemy + normal);
ray.origin = hit_point + (normal * EPSILON);
ray.direction = new_ray;
colors[bounce] = hit_sphere.color;
emiss[bounce] = hit_sphere.emmissive;
/* Terminator entry so that colors[i + 1] is defined for the last bounce. */
colors[bounce_count] = (float3)(0,0,0);
emiss[bounce_count] = (float3)(0,0,0);
/* Fold the path back to front: each bounce contributes its own emission plus the
   light arriving from the next bounce, both tinted by the surface colour. */
for (int i = bounce_count - 1; i >= 0; i--){
colors[i] = (colors[i] * emiss[i]) + (colors[i] * colors[i + 1]);
return colors[0];
random number generator:
/* Advances the two generator states *seed0 and *seed1 in place and returns a float
   derived from them. Assuming IEEE-754 single precision, the returned value lies
   in [0, 1).
   NOTE(review): the function's closing brace is missing from this listing. */
float get_random(unsigned int *seed0, unsigned int *seed1) {
/* hash the seeds using bitwise AND operations and bitshifts */
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
/* Combine both states into one 32-bit integer. */
unsigned int ires = ((*seed0) << 16) + (*seed1);
/* use union struct to convert int to float */
union {
float f;
unsigned int ui;
} res;
/* Keep the low 23 bits as mantissa and force the bit pattern of 2.0f (0x40000000)
   for sign/exponent, giving a float in [2, 4). */
res.ui = (ires & 0x007fffff) | 0x40000000; /* bitwise AND, bitwise OR */
/* Map [2, 4) to [0, 1). */
return (res.f - 2.0f) / 2.0f;

Multithreading returns an unhandled exception for storing information

I will try to explain my problem as clear as possible. I have a multithreading framework I have to work on. It's a path tracer renderer. It gives me error when I try to store some information provided by my threads. Trying to avoid posting all the code, I will explain what I mean step by step:
my TileTracer class is a thread
class TileTracer : public Thread{
and I have a certain number of threads:
#define MAXTHREADS 32
TileTracer* worker[MAXTHREADS];
the number of working threads is set in the following initialization code, where the threads are also started:
// Allocates and clears the shared accumulation buffer, creates one pair of events
// per worker, then creates, initialises and starts the worker threads.
// NOTE(review): brace-only lines are missing from this listing.
void Renderer::Init(){
accumulator = (vec3*)MALLOC64(sizeof(vec3)* SCRWIDTH * SCRHEIGHT);
memset(accumulator, 0, SCRWIDTH * SCRHEIGHT * sizeof(vec3));
// NOTE(review): no call that fills systeminfo (e.g. GetSystemInfo) is visible in this
// listing; without one, dwNumberOfProcessors would be read uninitialised.
SYSTEM_INFO systeminfo;
int cores = systeminfo.dwNumberOfProcessors;
// Never use more workers than the worker[] array can hold.
workerCount = MIN(MAXTHREADS, cores);
for (int i = 0; i < workerCount; i++)
// Per the Win32 CreateEvent parameters: auto-reset events, initially non-signalled, unnamed.
goSignal[i] = CreateEvent(NULL, FALSE, FALSE, 0);
doneSignal[i] = CreateEvent(NULL, FALSE, FALSE, 0);
// create and start worker threads
for (int i = 0; i < workerCount; i++)
worker[i] = new TileTracer();
// Every worker receives the same accumulator pointer and its own index.
worker[i]->init(accumulator, i);
worker[i]->start(); //start the thread
samples = 0;
the init() method for my thread is simply defined in my header as the following:
// Stores the accumulation buffer this worker writes to and the worker's index.
void init(vec3* target, int idx)
{
    accumulator = target;
    threadIdx = idx;
}
while the start() is:
// Creates the OS thread for this object and sets its priority to P_NORMAL.
// NOTE(review): brace-only lines are missing from this listing.
void Thread::start()
DWORD tid = 0;
// The new thread runs sthread_proc and receives this object as its argument;
// the returned handle is kept in m_hThread.
m_hThread = (unsigned long*)CreateThread( NULL, 0, (LPTHREAD_START_ROUTINE)sthread_proc, (Thread*)this, 0, &tid );
setPriority( Thread::P_NORMAL );
somehow (I don't get exactly where), each thread calls the following main method which is meant to define the color of a pixel (you don't have to understand it all):
// Estimates the colour arriving along the ray with origin O and direction D.
// depth is the number of bounces already taken; the random walk is only continued
// while depth < 20.
// NOTE(review): this listing is damaged - brace-only lines, the `if (tex)` branch
// body and (per the surrounding question) the statement that appends baseInfo to
// basePathHits are missing.
vec3 TileTracer::Sample(vec3 O, vec3 D, int depth){
vec3 color(0, 0, 0);
// trace path extension ray
float t = 1000.0f, u, v;
Triangle* tri = 0;
Scene::mbvh->pool4[0].TraceEmbree(O, D, t, u, v, tri, false);
// handle intersection, if any
if (tri)
// determine material color at intersection point
Material* mat = Scene::matList[tri->material];
Texture* tex = mat->GetTexture();
vec3 diffuse;
if (tex)
else diffuse = mat->GetColor();
vec3 I = O + t * D; //we get exactly to the intersection point on the object
//we need to store the info of each bounce of the basePath for the offsetPaths
// NOTE(review): per the question text this record goes into the vector basePathHits,
// which is filled from several threads.
basePath baseInfo = { O, D, I, tri };
vec3 L = vec3(-1 + Rand(2.0f), 20, 9 + Rand(2.0f)) - I; //(-1,20,9) is Hard-code of the light position, and I add Rand(2.0f) on X and Z axis
//so that I have an area light instead of a point light
float dist = length(L) * 0.99f; //if I cast a ray towards the light source I don't want to hit the source point or the light source
//otherwise it counts as a shadow even if there is not. So I make the ray a bit shorter by multiplying it for 0.99
L = normalize(L);
float ndotl = dot(tri->N, L);
if (ndotl > 0)
// NOTE(review): this redeclares tri; presumably it lives in the inner scope of the
// ndotl check so that the outer hit triangle is still the one used further down.
Triangle* tri = 0;
Scene::mbvh->pool4[0].TraceEmbree(I + L * EPSILON, L, dist, u, v, tri, true);//it just calculates the distance by throwing a ray
//I am just interested in understanding if I hit something or not
//if I don't hit anything I calculate the light transport (diffuse * ndotL * lightBrightness * 1/dist^2
if (!tri) color += diffuse * ndotl * vec3(1000.0f, 1000.0f, 850.0f) * (1.0f / (dist * dist));
// continue random walk since it is a path tracer (we do it only if we have less than 20 bounces)
if (depth < 20)
// russian roulette
// Survival probability: roughly the mean of the diffuse channels, kept within [0.2, 0.8].
float Psurvival = CLAMP((diffuse.r + diffuse.g + diffuse.b) * 0.33333f, 0.2f, 0.8f);
if (Rand(1.0f) < Psurvival)
vec3 R = DiffuseReflectionCosineWeighted(tri->N);//there is weight
// Dividing by Psurvival compensates for the paths that were terminated.
color += diffuse * Sample(I + R * EPSILON, R, depth + 1) * (1.0f / Psurvival);
return color;
Now, you don't have to understand the whole code for sure because my question is the following: if you notice, in the last method there are the 2 following code lines:
basePath baseInfo = { O, D, I, tri };
I just create a simple struct "basePath" defined as follows:
struct basePath
vec3 O, D, hit;
Triangle* tri;
and I store it in a vector of struct defined at the beginning of my code:
vector<basePath> basePathHits;
The problem is that this seems bringing an exception. Indeed if I try to store these information, that I need later in my code, the program crashes giving me the exception:
Unhandled exception at 0x0FD4FAC1 (msvcr120d.dll) in Template.exe: 0xC0000005: Access violation reading location 0x3F4C1BC1.
Some other times, without changing anything, the error is different and it's the following one:
While, without storing those info, everything works perfectly. Likewise, if I set the number of cores to 1, everything works. So, how come multithreading doesn't allow me to do it? Do not hesitate to ask further info if these are not enough.
Try making the following change to your code:
//we need to store the info of each bounce of the basePath for the offsetPaths
basePath baseInfo = { O, D, I, tri };
static std::mutex myMutex;
If that removes the exceptions then the problem is unsynchronised access to basePathHits (i.e. multiple threads calling push_back simultaneously). You need to think carefully about what the best solution to this will be, to minimise the impact of synchronisation on performance.
Possibly I didn't see it, but there is no protection for the target - no mutex or atomic. And as far as I know, std::vector needs this when it is modified from multiple threads.

Several arithmetic operations parallelized in C++Amp

I am trying to parallelize a convolution filter using C++Amp. I would like the following function to start working (I don't know how to do it properly):
// Question snippet: multiplies a 4x4 block of pixels by a 4x4 tap kernel on the
// accelerator and tries to sum the products into pixel[0].
// NOTE(review): this listing is damaged - the parallel_for_each( call line and the
// brace-only lines are missing, and the first and last lines are not valid C++ as shown.
float* pixel_color[] = new float [16];
concurrency::array_view<float, 2> pixels(4, 4, pixel_array), taps(4, 4, myTap4Kernel_array);
concurrency::array_view<float, 1> pixel(16, pixel_color); // I don't know which data structure to use here
pixels.extent, [=](concurrency::index<2> idx) restrict(amp)
int row=idx[0];
int col=idx[1];
// Overwrites the input element in place with the product.
pixels(row, col) = taps(row, col) * pixels(row, col);
// NOTE(review): every index adds to the same element pixel[0] with a plain +=; the
// accompanying answer points out that the update order between elements is not
// guaranteed, so this sum cannot be relied on.
pixel[0] += pixels(row, col);
pixel_color.synchronize();<Pixel>(j, i) = pixel_color
The main problem is that I don't know how to use the pixel structure properly (which concurrent data structure to use here as I don't need all 16 elements). And I don't know if I can safely add the values this way.
The following code doesn't work, it does not add appropriate values to pixel[0].
I also would like to define
concurrency::array_view<float, 2> pixels(4, 4, pixel_array), taps(4, 4, myTap4Kernel_array);
outside the method (for example in the header file) and initialize it in the constructor or another function (as this is a bottleneck and takes a lot of time copying the data between CPU and GPU). Does anybody know how to do this?
You're on the right track, but doing in-place manipulations of arrays on a GPU is tricky, as you cannot guarantee the order in which different elements are updated.
Here's an example of something very similar. The ApplyColorSimplifierTiledHelper method contains an AMP restricted parallel_for_each that calls SimplifyIndexTiled for each index in the 2D array. SimplifyIndexTiled calculates a new value for each pixel in destFrame based on the value of the pixels surrounding the corresponding pixel in srcFrame. This solves the race condition issue present in your code.
This code comes from the Codeplex site for the C++ AMP book. The Cartoonizer case study includes several examples of these sorts of image processing problems implemented in C++ AMP using; arrays, textures, tiled/untiled and multi-GPU. The C++ AMP book discusses the implementation in some detail.
// Runs the colour simplifier over srcFrame on the accelerator, writing results to
// destFrame. neighborWindow is the width of the neighbourhood considered per pixel
// and must not exceed FrameProcessorAmp::MaxNeighborWindow.
// NOTE(review): brace-only lines, the lambda's restrict clause and the closing of the
// parallel_for_each call are missing from this listing.
void ApplyColorSimplifierTiledHelper(const array<ArgbPackedPixel, 2>& srcFrame,
array<ArgbPackedPixel, 2>& destFrame, UINT neighborWindow)
const float_3 W(ImageUtils::W);
assert(neighborWindow <= FrameProcessorAmp::MaxNeighborWindow);
// Compute domain: srcFrame's extent as passed through GetTiledExtent, tiled TileSize x TileSize.
tiled_extent<FrameProcessorAmp::TileSize, FrameProcessorAmp::TileSize>
computeDomain = GetTiledExtent(srcFrame.extent);
// The frames are captured by reference, everything else by value.
parallel_for_each(computeDomain, [=, &srcFrame, &destFrame]
(tiled_index<FrameProcessorAmp::TileSize, FrameProcessorAmp::TileSize> idx)
SimplifyIndexTiled(srcFrame, destFrame, idx, neighborWindow, W);
// Computes the simplified colour for one pixel: a weighted average of the
// neighbouring pixels of srcFrame, written to destFrame. Reading from one array and
// writing to another means no pixel's input is modified while others still read it.
//
// idx:            index of the pixel, before the border offset is applied.
// neighborWindow: width of the neighbourhood; shift = neighborWindow / 2 pixels are
//                 visited on each side.
// W:              weights forwarded to ImageUtils::GetDistance.
// NOTE(review): brace-only lines are missing from this listing.
void SimplifyIndex(const array<ArgbPackedPixel, 2>& srcFrame, array<ArgbPackedPixel,
2>& destFrame, index<2> idx,
UINT neighborWindow, const float_3& W) restrict(amp)
const int shift = neighborWindow / 2;
float sum = 0;
float_3 partialSum;
const float standardDeviation = 0.025f;
// Exponent factor of the weight below: -1 / (2 * standardDeviation^2).
const float k = -0.5f / (standardDeviation * standardDeviation);
const int idxY = idx[0] + shift; // Corrected index for border offset.
const int idxX = idx[1] + shift;
const int y_start = idxY - shift;
const int y_end = idxY + shift;
const int x_start = idxX - shift;
const int x_end = idxX + shift;
RgbPixel orgClr = UnpackPixel(srcFrame(idxY, idxX));
for (int y = y_start; y <= y_end; ++y)
for (int x = x_start; x <= x_end; ++x)
if (x != idxX || y != idxY) // don't apply filter to the requested index, only to the neighbors
RgbPixel clr = UnpackPixel(srcFrame(y, x));
float distance = ImageUtils::GetDistance(orgClr, clr, W);
// Weight e^(k * distance^2): neighbours with a larger colour distance count less.
float value = concurrency::fast_math::pow(float(M_E), k * distance * distance);
sum += value;
partialSum.r += clr.r * value;
partialSum.g += clr.g * value;
partialSum.b += clr.b * value;
RgbPixel newClr;
// Normalise by the total weight and clamp each channel to [0, 255].
// NOTE(review): with neighborWindow < 2 no neighbour is visited and sum stays 0.
newClr.r = static_cast<UINT>(clamp(partialSum.r / sum, 0.0f, 255.0f));
newClr.g = static_cast<UINT>(clamp(partialSum.g / sum, 0.0f, 255.0f));
newClr.b = static_cast<UINT>(clamp(partialSum.b / sum, 0.0f, 255.0f));
destFrame(idxY, idxX) = PackPixel(newClr);
The code uses ArgbPackedPixel, which is simply a mechanism for packing 8-bit RGB values into an unsigned long as C++ AMP does not support char. If your problem is small enough to fit into a texture then you may want to look at using this instead of an array as the pack/unpack is implemented in hardware on the GPU so is effectively "free", here you have to pay for it with additional compute. There is also an example of this implementation on CodePlex.
// A pixel packed into one integer: blue in bits 0-7, green in bits 8-15, red in
// bits 16-23 and alpha in bits 24-31.
// NOTE(review): brace-only lines are missing from this listing.
typedef unsigned long ArgbPackedPixel;
// Unpacked colour, one unsigned int per channel.
struct RgbPixel
unsigned int r;
unsigned int g;
unsigned int b;
// Alpha value written into every packed pixel.
const int fixedAlpha = 0xFF;
// Packs rgb together with fixedAlpha into an ArgbPackedPixel.
// The channels are not masked, so each is expected to already fit in 8 bits.
// NOTE(review): fixedAlpha is a signed int, so fixedAlpha << 24 shifts into the sign
// bit; an unsigned constant would avoid relying on that.
inline ArgbPackedPixel PackPixel(const RgbPixel& rgb) restrict(amp)
return (rgb.b | (rgb.g << 8) | (rgb.r << 16) | (fixedAlpha << 24));
// Extracts the red, green and blue channels from packedArgb; alpha is discarded.
inline RgbPixel UnpackPixel(const ArgbPackedPixel& packedArgb) restrict(amp)
RgbPixel rgb;
rgb.b = packedArgb & 0xFF;
rgb.g = (packedArgb & 0xFF00) >> 8;
rgb.r = (packedArgb & 0xFF0000) >> 16;
return rgb;

Can normal maps be generated from a texture?

If I have a texture, is it then possible to generate a normal-map for this texture, so it can be used for bump-mapping?
Or how are normal maps usually made?
Yes. Well, sort of. Normal maps can be accurately made from height-maps. Generally, you can also put a regular texture through and get decent results as well. Keep in mind there are other methods of making a normal map, such as taking a high-resolution model, making it low resolution, then doing ray casting to see what the normal should be for the low-resolution model to simulate the higher one.
For height-map to normal-map, you can use the Sobel Operator. This operator can be run in the x-direction, telling you the x-component of the normal, and then the y-direction, telling you the y-component. You can calculate z with 1.0 / strength where strength is the emphasis or "deepness" of the normal map. Then, take that x, y, and z, throw them into a vector, normalize it, and you have your normal at that point. Encode it into the pixel and you're done.
Here's some older incomplete-code that demonstrates this:
// pretend types, something like this
struct pixel
{
    uint8_t red;
    uint8_t green;
    uint8_t blue;
};

struct vector3d; // a 3-vector with doubles
struct texture; // a 2d array of pixels

// Determines the intensity of a pixel as the mean of its three channels,
// scaled from 0..255 to 0..1.
//
// pPixel: the pixel to measure.
// Returns 0.0 for black and 1.0 for white.
const double intensity(const pixel& pPixel)
{
    const double r = static_cast<double>(pPixel.red);
    const double g = static_cast<double>(pPixel.green);
    const double b = static_cast<double>(pPixel.blue);

    const double average = (r + g + b) / 3.0;

    return average / 255.0;
}
// Clamps pX to the inclusive range [0, pMax].
//
// pX:   value to clamp.
// pMax: largest value that may be returned; pass the last valid index
//       (size - 1) when clamping array indices.
// The upper bound is checked first, so a negative pMax is returned as-is for any
// pX above it.
const int clamp(int pX, int pMax)
{
    if (pX > pMax)
    {
        return pMax;
    }
    else if (pX < 0)
    {
        return 0;
    }

    return pX;
}
// Transforms a vector component from -1..1 to 0..255 (truncating towards zero).
//
// pX: component value; inputs outside -1..1 (and NaN) are saturated to the
//     nearest end of the output range instead of being converted out of range,
//     which would be undefined for a floating-point to uint8_t conversion.
const uint8_t map_component(double pX)
{
    const double scaled = (pX + 1.0) * (255.0 / 2.0);

    if (!(scaled > 0.0)) // also catches NaN
    {
        return 0;
    }
    if (scaled > 255.0)
    {
        return 255;
    }

    return static_cast<uint8_t>(scaled);
}
// Builds a normal map from a height map with a 3x3 Sobel filter.
//
// pTexture:  height map; each pixel's height is taken from intensity().
// pStrength: emphasis ("deepness") of the result; the z component is
//            1.0 / pStrength before normalisation, so it must not be zero.
// Returns a texture of the same size whose pixels encode the unit normal,
// each component mapped from -1..1 to 0..255.
texture normal_from_height(const texture& pTexture, double pStrength = 2.0)
{
    // assume square texture, not necessarily true in real code
    texture result(pTexture.size(), pTexture.size());

    const int textureSize = static_cast<int>(pTexture.size());
    // Neighbour indices are clamped to the last valid index. Clamping to
    // textureSize itself would read one row/column past the end at the
    // bottom and right edges.
    const int lastIndex = textureSize - 1;

    // Signed loop counters: row - 1 and column - 1 must be able to go
    // negative so that clamp() can pull them back to 0.
    for (int row = 0; row < textureSize; ++row)
    {
        const int rowAbove = clamp(row - 1, lastIndex);
        const int rowBelow = clamp(row + 1, lastIndex);

        for (int column = 0; column < textureSize; ++column)
        {
            const int columnLeft = clamp(column - 1, lastIndex);
            const int columnRight = clamp(column + 1, lastIndex);

            // surrounding pixels
            const pixel topLeft = pTexture(rowAbove, columnLeft);
            const pixel top = pTexture(rowAbove, column);
            const pixel topRight = pTexture(rowAbove, columnRight);
            const pixel right = pTexture(row, columnRight);
            const pixel bottomRight = pTexture(rowBelow, columnRight);
            const pixel bottom = pTexture(rowBelow, column);
            const pixel bottomLeft = pTexture(rowBelow, columnLeft);
            const pixel left = pTexture(row, columnLeft);

            // their intensities
            const double tl = intensity(topLeft);
            const double t = intensity(top);
            const double tr = intensity(topRight);
            const double r = intensity(right);
            const double br = intensity(bottomRight);
            const double b = intensity(bottom);
            const double bl = intensity(bottomLeft);
            const double l = intensity(left);

            // sobel filter
            const double dX = (tr + 2.0 * r + br) - (tl + 2.0 * l + bl);
            const double dY = (bl + 2.0 * b + br) - (tl + 2.0 * t + tr);
            const double dZ = 1.0 / pStrength;

            math::vector3d v(dX, dY, dZ);
            // dX and dY can reach +/-4, so the vector has to be normalised
            // before its components are mapped from -1..1 to 0..255.
            // NOTE(review): assumes the pretend math::vector3d type offers an
            // in-place normalize(); adapt this to the real vector type.
            v.normalize();

            // convert to rgb
            result(row, column) = pixel{map_component(v.x), map_component(v.y), map_component(v.z)};
        }
    }

    return result;
}
There's probably many ways to generate a Normal map, but like others said, you can do it from a Height Map, and 3d packages like XSI/3dsmax/Blender/any of them can output one for you as an image.
You can then output an RGB image with the Nvidia plugin for Photoshop or with an algorithm that converts it, or you might be able to output it directly from those 3D packages with third-party plugins.
Be aware that in some case, you might need to invert channels (R, G or B) from the generated normal map.
Here are some resource links with examples and a more complete explanation:
I don't think normal maps are generated from a texture. they are generated from a model.
just as texturing allows you to define complex colour detail with minimal polys (as opposed to just using millions of polys and just vertex colours to define the colour on your mesh)
A normal map allows you to define complex normal detail with minimal polys.
I believe normal maps are usually generated from a higher res mesh, and then is used with a low res mesh.
I'm sure 3D tools, such as 3ds max or maya, as well as more specific tools will do this for you. unlike textures, I don't think they are usually done by hand.
but they are generated from the mesh, not the texture.
I suggest starting with OpenCV, due to its richness in algorithms. Here's one I wrote that iteratively blurs the normal map and weights those to the overall value, essentially creating more of a topological map.
// Pointer to the first byte of row y of a cv::Mat-like image.
// NOTE(review): y is not parenthesised in the expansion, so pass a plain variable
// or literal, not an expression such as y + 1.
#define ROW_PTR(img, y) ((uchar*)((img).data + (img).step * y))
// Builds a normal-map-like image from a single-channel texture: Sobel responses in
// x and y, a third plane derived from their magnitude, then Gaussian blurring that
// is blended back into the result.
//
// bwTexture: input texture; the third plane is created as CV_8UC1 with its size.
// pStrength: scale applied to the gradient magnitude of the third plane.
// NOTE(review): this listing is damaged - closing braces are missing, so the extent
// of the blur loop cannot be determined here, and `planes` is used without a visible
// definition (presumably the sobelX / sobelY / sobelZ planes - confirm).
cv::Mat normalMap(const cv::Mat& bwTexture, double pStrength)
// assume square texture, not necessarily true in real code
int scale = 1.0;
// Added to every Sobel result, so a zero gradient is stored as 127 in the 8-bit output.
int delta = 127;
cv::Mat sobelZ, sobelX, sobelY;
// 13x13 Sobel kernels; first derivative in x, then in y.
cv::Sobel(bwTexture, sobelX, CV_8U, 1, 0, 13, scale, delta, cv::BORDER_DEFAULT);
cv::Sobel(bwTexture, sobelY, CV_8U, 0, 1, 13, scale, delta, cv::BORDER_DEFAULT);
sobelZ = cv::Mat(bwTexture.rows, bwTexture.cols, CV_8UC1);
for(int y=0; y<bwTexture.rows; y++) {
const uchar *sobelXPtr = ROW_PTR(sobelX, y);
const uchar *sobelYPtr = ROW_PTR(sobelY, y);
uchar *sobelZPtr = ROW_PTR(sobelZ, y);
for(int x=0; x<bwTexture.cols; x++) {
// Stored 8-bit responses scaled to 0..1 (the delta offset is not removed).
double Gx = double(sobelXPtr[x]) / 255.0;
double Gy = double(sobelYPtr[x]) / 255.0;
double Gz = pStrength * sqrt(Gx * Gx + Gy * Gy);
// NOTE(review): Gz * 255.0 exceeds 255 when Gz > 1; the plain uchar conversion
// does not saturate.
uchar value = uchar(Gz * 255.0);
sobelZPtr[x] = value;
cv::Mat normalMap;
cv::merge(planes, normalMap);
cv::Mat originalNormalMap = normalMap.clone();
cv::Mat normalMapBlurred;
for (int i=0; i<3; i++) {
// 13x13 Gaussian blur with sigma 5 in both directions.
cv::GaussianBlur(normalMap, normalMapBlurred, cv::Size(13, 13), 5, 5);
// normalMap = 0.4 * normalMap + 0.6 * blurred
addWeighted(normalMap, 0.4, normalMapBlurred, 0.6, 0, normalMap);
// normalMap = 0.3 * original + 0.7 * blurred
addWeighted(originalNormalMap, 0.3, normalMapBlurred, 0.7, 0, normalMap);
return normalMap;