How do I create a cudaTextureObject_t from linear memory?

I cannot get bindless textures referencing linear memory to work -- the result is always a zero/black read. My initialization code:
The buffer:
int const num = 4 * 16;
int const size = num * sizeof(float);
cudaMalloc(buffer, size);
auto b = new float[num];
for (int i = 0; i < num; ++i)
{
b[i] = i % 4 == 0 ? 1 : 1;
}
cudaMemcpy(*buffer, b, size, cudaMemcpyHostToDevice);
The texture object:
cudaTextureDesc td;
memset(&td, 0, sizeof(td));
td.normalizedCoords = 0;
td.addressMode[0] = cudaAddressModeClamp;
td.addressMode[1] = cudaAddressModeClamp;
td.addressMode[2] = cudaAddressModeClamp;
td.readMode = cudaReadModeElementType;
td.sRGB = 0;
td.filterMode = cudaFilterModePoint;
td.maxAnisotropy = 16;
td.mipmapFilterMode = cudaFilterModePoint;
td.minMipmapLevelClamp = 0;
td.maxMipmapLevelClamp = 0;
td.mipmapLevelBias = 0;
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = *buffer;
resDesc.res.linear.sizeInBytes = size;
resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
resDesc.res.linear.desc.x = 32;
resDesc.res.linear.desc.y = 32;
resDesc.res.linear.desc.z = 32;
resDesc.res.linear.desc.w = 32;
checkCudaErrors(cudaCreateTextureObject(texture, &resDesc, &td, nullptr));
The kernel:
__global__ void
d_render(uchar4 *d_output, uint imageW, uint imageH, float* buffer, cudaTextureObject_t texture)
{
uint x = blockIdx.x * blockDim.x + threadIdx.x;
uint y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < imageW) && (y < imageH))
{
// write output color
uint i = y * imageW + x;
//auto f = make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
auto f = tex1D<float4>(texture, 0);
d_output[i] = to_uchar4(f * 255);
}
}
The texture object is initialized with something sensible-looking (4099) when passed to the kernel. The direct buffer read (the commented-out line in the kernel) works flawlessly.
Why does the texture object return zero/black?

As per the CUDA programming reference guide, you need to use tex1Dfetch() to read from one-dimensional textures bound to linear memory, and tex1D() to read from one-dimensional textures bound to CUDA arrays. This applies both to CUDA texture references and to CUDA textures accessed through texture objects.
The difference between the two APIs is the coordinate argument. Textures bound to linear memory can only be addressed by integer texel index (hence the integer coordinate argument in tex1Dfetch()), whereas CUDA arrays support both unnormalized and normalized coordinates (thus the float coordinate argument in tex1D()).
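Applied to the question's kernel, the fix is essentially a one-line change. This is a minimal sketch; to_uchar4 and the float4-times-scalar helper are assumed to be the same helpers the question already uses:
__global__ void
d_render(uchar4 *d_output, uint imageW, uint imageH, float* buffer, cudaTextureObject_t texture)
{
    uint x = blockIdx.x * blockDim.x + threadIdx.x;
    uint y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x < imageW) && (y < imageH))
    {
        uint i = y * imageW + x;
        // tex1Dfetch() takes an integer texel index, which is what a
        // cudaResourceTypeLinear texture object requires.
        float4 f = tex1Dfetch<float4>(texture, 0);
        d_output[i] = to_uchar4(f * 255);
    }
}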

Related

Image subtraction with CUDA and textures

My goal is to use C++ with CUDA to subtract a dark frame from a raw image. I want to use textures for acceleration. The input images are cv::Mat of type CV_8UC4 (I use the pointer to the data of the cv::Mat). This is the kernel I came up with, but I have no idea how to eventually subtract the textures from each other:
__global__ void DarkFrameSubtractionKernel(unsigned char* outputImage, size_t pitchOutputImage,
cudaTextureObject_t inputImage, cudaTextureObject_t darkImage, int width, int height)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const float tx = (x + 0.5f);
const float ty = (y + 0.5f);
if (x >= width || y >= height) return;
uchar4 inputImageTemp = tex2D<uchar4>(inputImage, tx, ty);
uchar4 darkImageTemp = tex2D<uchar4>(darkImage, tx, ty);
outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp; // this line will throw an error
}
This is the function that calls the kernel (you can see that I create the textures from unsigned char):
void subtractDarkImage(unsigned char* inputImage, size_t pitchInputImage, unsigned char* outputImage,
size_t pitchOutputImage, unsigned char* darkImage, size_t pitchDarkImage, int width, int height,
cudaStream_t stream)
{
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.devPtr = inputImage;
resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;
texDesc.addressMode[0] = cudaAddressModeBorder;
texDesc.addressMode[1] = cudaAddressModeBorder;
cudaTextureObject_t imageInputTex, imageDarkTex;
CUDA_CHECK(cudaCreateTextureObject(&imageInputTex, &resDesc, &texDesc, 0));
resDesc.res.pitch2D.devPtr = darkImage;
resDesc.res.pitch2D.pitchInBytes = pitchDarkImage;
CUDA_CHECK(cudaCreateTextureObject(&imageDarkTex, &resDesc, &texDesc, 0));
dim3 block(32, 8);
dim3 grid = paddedGrid(block.x, block.y, width, height);
DarkFrameSubtractionKernel<<<grid, block, 0, stream>>>(reinterpret_cast<uchar4*>(outputImage), pitchOutputImage / sizeof(uchar4),
imageInputTex, imageDarkTex, width, height);
CUDA_CHECK(cudaDestroyTextureObject(imageInputTex));
CUDA_CHECK(cudaDestroyTextureObject(imageDarkTex));
}
The code does not compile, as I cannot subtract one uchar4 from another (in the kernel). Is there an easy way to do the subtraction here?
Help is very much appreciated.
Is there an easy way of subtraction here?
There are no arithmetic operators defined for CUDA built-in vector types. If you replace
outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp;
with
uchar4 val;
val.x = inputImageTemp.x - darkImageTemp.x;
val.y = inputImageTemp.y - darkImageTemp.y;
val.z = inputImageTemp.z - darkImageTemp.z;
val.w = inputImageTemp.w - darkImageTemp.w;
outputImage[y * pitchOutputImage + x] = val;
things will work. If this offends you, I suggest writing a small library of helper functions to hide the mess.
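For example, a small helper along these lines (a sketch, not part of the original answer) keeps the kernel readable; note that unsigned char arithmetic wraps on underflow, so clamp first if the dark frame can exceed the input:
__host__ __device__ inline uchar4 operator-(uchar4 a, uchar4 b)
{
    // Componentwise subtraction; each channel wraps modulo 256 on underflow.
    return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
With this operator in scope, the original outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp; line compiles as written.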

Halide JIT compilation

I'm trying to JIT-compile my Halide program so that I can use it later in code several times on different images. But I think I'm doing something wrong; can anyone correct me?
First I create the Halide function to run:
void m_gammaFunctionTMOGenerate()
{
Halide::ImageParam img(Halide::type_of<float>(), 3);
img.set_stride(0, 4);
img.set_stride(2, 1);
Halide::Var x, y, c;
Halide::Param<float> key, sat, clampMax, clampMin;
Halide::Param<bool> cS;
Halide::Func gamma;
// algorithm
//img.width() , img.height();
if (cS.get())
{
float k1 = 1.6774;
float k2 = 0.9925;
sat.set((1 + k1) * pow(key.get(), k2) / (1 + k1 * pow(key.get(), k2)));
}
Halide::Expr luminance = img(x, y, 0) * 0.072186f + img(x, y, 1) * 0.715158f + img(x, y, 2) * 0.212656f;
Halide::Expr ldr_lum = (luminance - clampMin) / (clampMax - clampMin);
Halide::clamp(ldr_lum, 0.f, 1.f);
ldr_lum = Halide::pow(ldr_lum, key);
Halide::Expr imLum = img(x, y, c) / luminance;
imLum = Halide::pow(imLum, sat) * ldr_lum;
Halide::clamp(imLum, 0.f, 1.f);
gamma(x, y, c) = imLum;
// schedule
gamma.vectorize(x, 16).parallel(y);
// compilation
auto & obuff = gamma.output_buffer();
obuff.set_stride(0, 4);
obuff.set_stride(2, 1);
obuff.set_extent(2, 3);
std::vector<Halide::Argument> arguments = { img, key, sat, clampMax, clampMin, cS };
m_gammaFunction = (gammafunction)(gamma.compile_jit());
}
Store it in a pointer:
typedef int(*gammafunction)(buffer_t*, float, float, float, float, bool, buffer_t*);
gammafunction m_gammaFunction;
Then I try to run it:
buffer_t output_buf = { 0 };
//// The host pointers point to the start of the image data:
buffer_t buf = { 0 };
buf.host = (uint8_t *)data; // Might also need const_cast
float * output = new float[width * height * 4];
output_buf.host = (uint8_t*)(output);
// // If the buffer doesn't start at (0, 0), then assign mins
output_buf.extent[0] = buf.extent[0] = width; // In elements, not bytes
output_buf.extent[1] = buf.extent[1] = height; // In elements, not bytes
output_buf.extent[2] = buf.extent[2] = 4; // Assuming RGBA
// // No need to assign additional extents as they were init'ed to zero above
output_buf.stride[0] = buf.stride[0] = 4; // RGBA interleaved
output_buf.stride[1] = buf.stride[1] = width * 4; // Assuming no line padding
output_buf.stride[2] = buf.stride[2] = 1; // Channel interleaved
output_buf.elem_size = buf.elem_size = sizeof(float);
// Run the pipeline
int error = m_photoFunction(&buf, params[0], &output_buf);
But it doesn't work...
Error:
Exception thrown at 0x000002974F552DE0 in Viewer.exe: 0xC0000005: Access violation executing location 0x000002974F552DE0.
If there is a handler for this exception, the program may be safely continued.
Edit:
Here is my code for running the function:
buffer_t output_buf = { 0 };
//// The host pointers point to the start of the image data:
buffer_t buf = { 0 };
buf.host = (uint8_t *)data; // Might also need const_cast
float * output = new float[width * height * 4];
output_buf.host = (uint8_t*)(output);
// // If the buffer doesn't start at (0, 0), then assign mins
output_buf.extent[0] = buf.extent[0] = width; // In elements, not bytes
output_buf.extent[1] = buf.extent[1] = height; // In elements, not bytes
output_buf.extent[2] = buf.extent[2] = 3; // Assuming RGBA
// // No need to assign additional extents as they were init'ed to zero above
output_buf.stride[0] = buf.stride[0] = 4; // RGBA interleaved
output_buf.stride[1] = buf.stride[1] = width * 4; // Assuming no line padding
output_buf.stride[2] = buf.stride[2] = 1; // Channel interleaved
output_buf.elem_size = buf.elem_size = sizeof(float);
// Run the pipeline
int error = m_gammaFunction(&buf, params[0], params[1], params[2], params[3], params[4] > 0.5 ? true : false, &output_buf);
if (error) {
printf("Halide returned an error: %d\n", error);
return -1;
}
memcpy(output, data, size * sizeof(float));
Can anyone help me with it?
Edit:
Thanks to @KhouriGiordano I found out what I was doing wrong. Indeed, I had switched from AOT compiling to this code. So now my code looks like this:
class GammaOperator
{
public:
GammaOperator();
int realize(buffer_t * input, float params[], buffer_t * output, int width);
private:
HalideFloat m_key;
HalideFloat m_sat;
HalideFloat m_clampMax;
HalideFloat m_clampMin;
HalideBool m_cS;
Halide::ImageParam m_img;
Halide::Var x, y, c;
Halide::Func m_gamma;
};
GammaOperator::GammaOperator()
: m_img( Halide::type_of<float>(), 3)
{
Halide::Expr w = (1.f + 1.6774f) * pow(m_key.get(), 0.9925f) / (1.f + 1.6774f * pow(m_key.get(), 0.9925f));
Halide::Expr sat = Halide::select(m_cS, m_sat, w);
Halide::Expr luminance = m_img(x, y, 0) * 0.072186f + m_img(x, y, 1) * 0.715158f + m_img(x, y, 2) * 0.212656f;
Halide::Expr ldr_lum = (luminance - m_clampMin) / (m_clampMax - m_clampMin);
ldr_lum = Halide::clamp(ldr_lum, 0.f, 1.f);
ldr_lum = Halide::pow(ldr_lum, m_key);
Halide::Expr imLum = m_img(x, y, c) / luminance;
imLum = Halide::pow(imLum, sat) * ldr_lum;
imLum = Halide::clamp(imLum, 0.f, 1.f);
m_gamma(x, y, c) = imLum;
}
int GammaOperator::realize(buffer_t * input, float params[], buffer_t * output, int width)
{
m_img.set(Halide::Buffer(Halide::type_of<float>(), input));
m_img.set_stride(0, 4);
m_img.set_stride(1, width * 4);
m_img.set_stride(2, 4);
// algorithm
m_gamma.vectorize(x, 16).parallel(y);
//params[0], params[1], params[2], params[3], params[4] > 0.5 ? true : false
//{ img, key, sat, clampMax, clampMin, cS };
m_key.set(params[0]);
m_sat.set(params[1]);
m_clampMax.set(params[2]);
m_clampMin.set(params[3]);
m_cS.set(params[4] > 0.5f ? true : false);
//// compilation
m_gamma.realize(Halide::Buffer(Halide::type_of<float>(), output));
return 0;
}
And I use it like this:
buffer_t output_buf = { 0 };
//// The host pointers point to the start of the image data:
buffer_t buf = { 0 };
buf.host = (uint8_t *)data; // Might also need const_cast
float * output = new float[width * height * 4];
output_buf.host = (uint8_t*)(output);
// // If the buffer doesn't start at (0, 0), then assign mins
output_buf.extent[0] = buf.extent[0] = width; // In elements, not bytes
output_buf.extent[1] = buf.extent[1] = height; // In elements, not bytes
output_buf.extent[2] = buf.extent[2] = 4; // Assuming RGBA
// // No need to assign additional extents as they were init'ed to zero above
output_buf.stride[0] = buf.stride[0] = 4; // RGBA interleaved
output_buf.stride[1] = buf.stride[1] = width * 4; // Assuming no line padding
output_buf.stride[2] = buf.stride[2] = 1; // Channel interleaved
output_buf.elem_size = buf.elem_size = sizeof(float);
// Run the pipeline
int error = s_gamma->realize(&buf, params, &output_buf, width);
But it is still crashing in the m_gamma.realize function, with this info in the console:
Error: Constraint violated: f0.stride.0 (4) == 1 (1)
By using Halide::Param::get(), you are extracting the (default of 0) value from the Param object at the time you call get(). If you want to use the parameter value given at the time you call the generated function, just use it without calling get and it should be implicitly converted to an Expr.
Since Param is not convertible to a boolean, the Halide way of doing an if is Halide::select().
You aren't using the clamped return value of Halide::clamp().
I don't see cS being used by the Halide code, only the C code.
Now to your JIT problem. It looks like you started doing AOT compilation and switched to JIT.
You make a std::vector<Halide::Argument> but don't pass it anywhere. How can Halide know what Param you want to use? It looks at the Func and finds references to ImageParam and Param objects.
How can you know what order it expects the Params in? You have no control over this. I was able to dump the bitcode by defining HL_GENBITCODE=1 and then view it with llvm-dis to see your function:
int gamma
( buffer_t *img
, float clampMax
, float key
, float clampMin
, float sat
, void *user_context
, buffer_t *result
);
Use gamma.realize(Halide::Buffer(Halide::type_of<float>(), &output_buf)) instead of calling gamma.compile_jit() and trying to invoke the generated function yourself.
For one-time use:
Use Image instead of ImageParam.
Use Expr instead of Param.
For repeated use with a single JIT compile:
Keep the ImageParam and Param around and set them before realizing the Func (see the sketch below).
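Putting that together, the repeated-use pattern looks roughly like this (a minimal sketch against the old buffer_t-based API used above; the pipeline body is a stand-in, not the gamma operator from the question, and input_buf/output_buf are assumed to be filled in as in the question's setup code):
// Build and schedule the Func once, referencing ImageParam/Param objects.
Halide::ImageParam img(Halide::type_of<float>(), 3);
Halide::Param<float> key;
Halide::Var x, y, c;
Halide::Func gamma;
gamma(x, y, c) = Halide::pow(img(x, y, c), key);   // stand-in pipeline
gamma.vectorize(x, 16).parallel(y);
gamma.compile_jit();   // optional: pay the JIT cost once, up front

// Later, once per image: set the params, then realize into the output buffer.
img.set(Halide::Buffer(Halide::type_of<float>(), &input_buf));
key.set(params[0]);
gamma.realize(Halide::Buffer(Halide::type_of<float>(), &output_buf));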

Loading non-power-of-two textures in Vulkan

My 2D texture loader works fine if my texture dimensions are powers of two, but when they are not, the texture data displays as skewed. How do I fix this? I assume the issue has something to do with memory alignment and row pitch. Here are the relevant parts of my loader code:
VkMemoryRequirements memReqs;
vkGetImageMemoryRequirements( GfxDeviceGlobal::device, mappableImage, &memReqs );
VkMemoryAllocateInfo memAllocInfo = {};
memAllocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
memAllocInfo.pNext = nullptr;
memAllocInfo.memoryTypeIndex = 0;
memAllocInfo.allocationSize = memReqs.size;
GetMemoryType( memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, &memAllocInfo.memoryTypeIndex );
VkDeviceMemory mappableMemory;
err = vkAllocateMemory( GfxDeviceGlobal::device, &memAllocInfo, nullptr, &mappableMemory );
CheckVulkanResult( err, "vkAllocateMemory in Texture2D" );
err = vkBindImageMemory( GfxDeviceGlobal::device, mappableImage, mappableMemory, 0 );
CheckVulkanResult( err, "vkBindImageMemory in Texture2D" );
VkImageSubresource subRes = {};
subRes.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
subRes.mipLevel = 0;
subRes.arrayLayer = 0;
VkSubresourceLayout subResLayout;
vkGetImageSubresourceLayout( GfxDeviceGlobal::device, mappableImage, &subRes, &subResLayout );
void* mapped;
err = vkMapMemory( GfxDeviceGlobal::device, mappableMemory, 0, memReqs.size, 0, &mapped );
CheckVulkanResult( err, "vkMapMemory in Texture2D" );
const int bytesPerPixel = 4;
std::size_t dataSize = bytesPerPixel * width * height;
std::memcpy( mapped, data, dataSize );
vkUnmapMemory( GfxDeviceGlobal::device, mappableMemory );
The VkSubresourceLayout you obtained from vkGetImageSubresourceLayout contains the pitch of the texture in its rowPitch member. It is more than likely not equal to width * bytesPerPixel, so when you memcpy the entire data block in one go, you end up copying relevant data into the padding section of the texture.
Instead you will need to memcpy row-by-row, skipping the padding memory in the mapped texture:
const int bytesPerPixel = 4;
std::size_t dataRowSize = bytesPerPixel * width;
char* mappedBytes = (char*)mapped;
for (int i = 0; i < height; ++i)
{
std::memcpy(mappedBytes, data, dataRowSize);
mappedBytes += subResLayout.rowPitch;
data += dataRowSize;
}
(this code assumes data is a char * as well - its declaration wasn't given)

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
// Map threadIdx/blockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = x - blockDim.x * gridDim.x / 2;
float oy = y - blockDim.y * gridDim.y / 2;
float r = 0.2, g = 0.2, b = 0.5;
float maxz = -INF;
for (int i = 0; i < SPHERES_COUNT; i++) {
float n, t = s[i].hit(ox, oy, &n);
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from sphere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
image.permute_axes("yzcx");
image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
I cannot understand why...
I know that if I remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
the error is not reported, so I thought it might be an out-of-bounds index error that gets reported later. But I have an identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
kernel<<<grids, threads>>>(dev_bitmap, s);
......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
......
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.
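Alternatively (a sketch, not part of the original answer), drop the kernel parameter entirely and let the kernel name the __constant__ array directly; that is the usual pattern, and it lets the compiler emit constant-cache loads:
__constant__ Sphere s[SPHERES_COUNT];

__global__ void kernel(unsigned char* bitmap)   // note: no Sphere* parameter
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;
    // ... same body as before; s[i] now refers to the __constant__ array directly
}

// Host side: the cudaMemcpyToSymbol stays the same, only the launch changes.
kernel<<<grids, threads>>>(dev_bitmap);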

A method for indexing triangles from a loaded heightmap?

I am currently writing a method to load in a noisy heightmap, but I'm missing the triangle indices needed to render it. I want to make an algorithm that will take an image, its width, and its height and construct a terrain node out of it.
Here's what I have so far, in somewhat pseudo-code:
Vertex* vertices = new Vertices[image.width * image.height];
Index* indices; // How do I judge how many indices I will have?
float scaleX = 1 / image.width;
float scaleY = 1 / image.height;
float currentYScale = 0;
for(int y = 0; y < image.height; ++y) {
float currentXScale = 0;
for (int x = 0; x < image.width; ++x) {
Vertex* v = vertices[x * y];
v.x = currentXScale;
v.y = currentYScale;
v.z = image[x,y];
currentXScale += scaleX;
}
currentYScale += scaleY;
}
This works well enough for my needs; my only problem is this: how would I calculate the number of indices and their positions for drawing the triangles? I have some familiarity with indices, but not with how to calculate them programmatically; I can only do that statically.
As far as your code above goes, using vertices[x * y] isn't right - if you use that, then e.g. vert(2,3) == vert(3,2). What you want is something like vertices[y * image.width + x], but you can do it more efficiently by incrementing a counter (see below).
Here's the equivalent code I use. It's in C# unfortunately, but hopefully it should illustrate the point:
/// <summary>
/// Constructs the vertex and index buffers for the terrain (for use when rendering the terrain).
/// </summary>
private void ConstructBuffers()
{
int heightmapHeight = Heightmap.GetLength(0);
int heightmapWidth = Heightmap.GetLength(1);
int gridHeight = heightmapHeight - 1;
int gridWidth = heightmapWidth - 1;
// Construct the individual vertices for the terrain.
var vertices = new VertexPositionTexture[heightmapHeight * heightmapWidth];
int vertIndex = 0;
for(int y = 0; y < heightmapHeight; ++y)
{
for(int x = 0; x < heightmapWidth; ++x)
{
var position = new Vector3(x, y, Heightmap[y,x]);
var texCoords = new Vector2(x * 2f / heightmapWidth, y * 2f / heightmapHeight);
vertices[vertIndex++] = new VertexPositionTexture(position, texCoords);
}
}
// Create the vertex buffer and fill it with the constructed vertices.
this.VertexBuffer = new VertexBuffer(Renderer.GraphicsDevice, typeof(VertexPositionTexture), vertices.Length, BufferUsage.WriteOnly);
this.VertexBuffer.SetData(vertices);
// Construct the index array.
var indices = new short[gridHeight * gridWidth * 6]; // 2 triangles per grid square x 3 vertices per triangle
int indicesIndex = 0;
for(int y = 0; y < gridHeight; ++y)
{
for(int x = 0; x < gridWidth; ++x)
{
int start = y * heightmapWidth + x;
indices[indicesIndex++] = (short)start;
indices[indicesIndex++] = (short)(start + 1);
indices[indicesIndex++] = (short)(start + heightmapWidth);
indices[indicesIndex++] = (short)(start + 1);
indices[indicesIndex++] = (short)(start + 1 + heightmapWidth);
indices[indicesIndex++] = (short)(start + heightmapWidth);
}
}
// Create the index buffer.
this.IndexBuffer = new IndexBuffer(Renderer.GraphicsDevice, typeof(short), indices.Length, BufferUsage.WriteOnly);
this.IndexBuffer.SetData(indices);
}
I guess the key point is that given a heightmap of size heightmapHeight * heightmapWidth, you need (heightmapHeight - 1) * (heightmapWidth - 1) * 6 indices, since you're drawing:
2 triangles per grid square
3 vertices per triangle
(heightmapHeight - 1) * (heightmapWidth - 1) grid squares in your terrain.
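For reference, the index construction translates almost directly to C++ (a sketch; the buildIndices helper name and the 32-bit index type are my own choices, not from the question, and the 32-bit type avoids overflowing the 16-bit indices used in the C# version on large heightmaps):
#include <cstdint>
#include <vector>

std::vector<uint32_t> buildIndices(int width, int height)
{
    std::vector<uint32_t> indices;
    indices.reserve((height - 1) * (width - 1) * 6);   // 2 triangles * 3 vertices per grid square
    for (int y = 0; y < height - 1; ++y) {
        for (int x = 0; x < width - 1; ++x) {
            uint32_t start = y * width + x;
            // First triangle of the grid square.
            indices.push_back(start);
            indices.push_back(start + 1);
            indices.push_back(start + width);
            // Second triangle of the grid square.
            indices.push_back(start + 1);
            indices.push_back(start + 1 + width);
            indices.push_back(start + width);
        }
    }
    return indices;
}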