Exception error in CUDA/GPU on calling function

Exception error in CUDA/GPU on calling function - c++

Hi I am having an exception error in CUDA.. I am trying to run the following code as it is..
CUdeviceptr pDecodedFrame[2] = { 0, 0 };
CUdeviceptr pInteropFrame[2] = { 0, 0 };
uint32 n_Width = g_pVideoDecoder->targetWidth();
uint32 n_Height = g_pVideoDecoder->targetHeight();
dim3 block(32,16,1);
dim3 grid((nWidth+(2*block.x-1))/(2*block.x), (nHeight+(block.y-1))/block.y, 1);
NV12ToARGB_drvapi<<<block,grid>>>(&pDecodedFrame[active_field], nDecodedPitch,
&pInteropFrame[active_field], nTexturePitch, nWidth, nHeight,constHueColorSpaceMat, constAlpha);
My other function is as follows :
__global__ void NV12ToARGB_drvapi(uint32 *srcImage, size_t nSourcePitch,
uint32 *dstImage, size_t nDestPitch,
uint32 width, uint32 height, float constHueColorSpaceMat[9] , uint32 constAlpha)
{
int32 x, y;
uint32 yuv101010Pel[2];
uint32 processingPitch = ((width) + 63) & ~63;
uint32 dstImagePitch = nDestPitch >> 2;
uint8 *srcImageU8 = (uint8 *)srcImage;
processingPitch = nSourcePitch;
// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width)
return; //x = width - 1;
if (y >= height)
return; // y = height - 1;
// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
// if we move to texture we could read 4 luminance values
yuv101010Pel[0] = (srcImageU8[y * processingPitch + x ]) << 2;
yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;
uint32 chromaOffset = processingPitch * height;
int32 y_chroma = y >> 1;
if (y & 1) // odd scanline ?
{
uint32 chromaCb;
uint32 chromaCr;
chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x ];
chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
}
yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
// this steps performs the color conversion
uint32 yuvi[6];
float red[2], green[2], blue[2];
yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
// YUV to RGB Transformation conversion
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0], constHueColorSpaceMat);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1], constHueColorSpaceMat);
// Clamp the results to RGBA
dstImage[y * dstImagePitch + x ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
}
__device__ void YUV2RGB(uint32 *yuvi, float *red, float *green, float *blue, float constHueColorSpaceMat[9])
{
float luma, chromaCb, chromaCr;
// Prepare for hue adjustment
luma = (float)yuvi[0];
chromaCb = (float)((int32)yuvi[1] - 512.0f);
chromaCr = (float)((int32)yuvi[2] - 512.0f);
// Convert YUV To RGB with hue adjustment
*red = MUL(luma, constHueColorSpaceMat[0]) +
MUL(chromaCb, constHueColorSpaceMat[1]) +
MUL(chromaCr, constHueColorSpaceMat[2]);
*green= MUL(luma, constHueColorSpaceMat[3]) +
MUL(chromaCb, constHueColorSpaceMat[4]) +
MUL(chromaCr, constHueColorSpaceMat[5]);
*blue = MUL(luma, constHueColorSpaceMat[6]) +
MUL(chromaCb, constHueColorSpaceMat[7]) +
MUL(chromaCr, constHueColorSpaceMat[8]);
}
__device__ uint32 RGBAPACK_8bit(float red, float green, float blue, uint32 alpha)
{
uint32 ARGBpixel = 0;
// Clamp final 10 bit results
red = min(max(red, 0.0f), 255.0f);
green = min(max(green, 0.0f), 255.0f);
blue = min(max(blue, 0.0f), 255.0f);
// Convert to 8 bit unsigned integers per color component
ARGBpixel = (((uint32)blue) |
(((uint32)green) << 8) |
(((uint32)red) << 16) | (uint32)alpha);
return ARGBpixel;
}
This is the error I am having
First-chance exception at 0x7571812f in testing_project.exe: Microsoft C++ exception: cudaError at memory location 0x0015e6b8..
First-chance exception at 0x7571812f in testing_project.exe: Microsoft C++ exception: [rethrow] at memory location 0x00000000..
Everytime I am running the GPU code. Anybody can help me out here please.

If your application is running normally and returning no errors (are you doing proper cuda error checking?) the first chance exception will be seen only when running in the MSVC environment and can safely be ignored.
Some of the CUDA libraries run into exceptions during ordinary processing functions. These exceptions are handled normally, but when you are in the MSVC environment, Visual Studio gives you the option of intercepting the exception and having "first chance" access to it, before it is passed to the ordinary exception handler built into the code/library.
You should not see this if you are running your application from the command line, outside of the MSVC environment.
You may also see a difference in behavior if you switch to CUDA 5.5

Related

Can I use SIMD to speed up my image resize function and maybe avoid the nested loop?

I have a resize function that uses a nested for loop to resize a picture with bilinear interpolation and merge it with a background image.
For an image with the size of 320x240 that would result in 76800 cycles.
Regarding to this article Using SIMD Technologies on Intel® Architecture to Speed Up Game Code it should be possible to speed up the whole process by 4 times.
Does this mean I am able to use only one loop to iterate over the height and fill the whole width at once?
The current implementation took 1ms on an Intel i7 and 3 to 4 ms on an Intel Atom x7. Because the Atom is supporting SSE 4.2 too I was thinking about taking benefits of it.
But to be honest I have no idea how I should write my code to achieve this. Is this even possible for my use case?
Thank you in advance!
C++ code
void Resize24Bpp(BYTE* pd, int* pixels, int widthSource, int heightSource, int width, int height, BYTE* baseImage)
{
float xs = (float)widthSource / width;
float ys = (float)heightSource / height;
float fracx, fracy, ifracx, ifracy, sx, sy, l0, l1, rf, gf, bf;
int c, x0, x1, y0, y1;
BYTE c1a, c1r, c1g, c1b, c2a, c2r, c2g, c2b, c3a, c3r, c3g, c3b, c4a, c4r, c4g, c4b;
BYTE a, r, g, b;
// Bilinear
int srcIdx = 0;
int baseIdx = 0;
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
sx = x * xs;
sy = y * ys;
x0 = (int)sx;
y0 = (int)sy;
// Calculate coordinates of the 4 interpolation points
fracx = sx - x0;
fracy = sy - y0;
ifracx = 1.0f - fracx;
ifracy = 1.0f - fracy;
x1 = x0 + 1;
if (x1 >= widthSource)
{
x1 = x0;
}
y1 = y0 + 1;
if (y1 >= heightSource)
{
y1 = y0;
}
// Read source color
c = pixels[y0 * widthSource + x0];
c1a = (BYTE)(c >> 24);
c1r = (BYTE)(c >> 16);
c1g = (BYTE)(c >> 8);
c1b = (BYTE)(c);
c = pixels[y0 * widthSource + x1];
c2a = (BYTE)(c >> 24);
c2r = (BYTE)(c >> 16);
c2g = (BYTE)(c >> 8);
c2b = (BYTE)(c);
c = pixels[y1 * widthSource + x0];
c3a = (BYTE)(c >> 24);
c3r = (BYTE)(c >> 16);
c3g = (BYTE)(c >> 8);
c3b = (BYTE)(c);
c = pixels[y1 * widthSource + x1];
c4a = (BYTE)(c >> 24);
c4r = (BYTE)(c >> 16);
c4g = (BYTE)(c >> 8);
c4b = (BYTE)(c);
// Calculate colors
// Alpha
l0 = ifracx * c1a + fracx * c2a;
l1 = ifracx * c3a + fracx * c4a;
a = (BYTE)(ifracy * l0 + fracy * l1);
// Write destination
if (a > 0)
{
// Red
l0 = ifracx * c1r + fracx * c2r;
l1 = ifracx * c3r + fracx * c4r;
rf = ifracy * l0 + fracy * l1;
// Green
l0 = ifracx * c1g + fracx * c2g;
l1 = ifracx * c3g + fracx * c4g;
gf = ifracy * l0 + fracy * l1;
// Blue
l0 = ifracx * c1b + fracx * c2b;
l1 = ifracx * c3b + fracx * c4b;
bf = ifracy * l0 + fracy * l1;
// Cast to BYTE
float alpha = a / 255.0f;
r = (BYTE)((rf * alpha) + (baseImage[baseIdx] * (1.0f - alpha)));
g = (BYTE)((gf * alpha) + (baseImage[baseIdx] * (1.0f - alpha)));
b = (BYTE)((bf * alpha) + (baseImage[baseIdx] * (1.0f - alpha)));
pd[srcIdx++] = b;
pd[srcIdx++] = g;
pd[srcIdx++] = r;
}
else
{
// Red, Green, Blue
pd[srcIdx++] = baseImage[baseIdx];
pd[srcIdx++] = baseImage[baseIdx];
pd[srcIdx++] = baseImage[baseIdx];
}
baseIdx++;
}
}
}

sobel filter algorithm (C++) (no libraries)

I am trying to apply the sobel filter algorithm to a given picture (grayscale in this case) given my approach to accessing the pixels of the picture. Since I am accessing them in a way that doesn't use libraries, I am having trouble figuring out how to apply the algorithm given this approach. This first part of the code is just accessing pixel data:
Part 1:
CKingimageDoc* pDoc = GetDocument(); // get picture
int iBitPerPixel = pDoc->_bmp->bitsperpixel; // used to see if grayscale(8 bits) or RGB (24 bits)
int iWidth = pDoc->_bmp->width;
int iHeight = pDoc->_bmp->height;
BYTE *pImg = pDoc->_bmp->point; // pointer used to point at pixels in the image
const int area = iWidth * iHeight;
int Wp = iWidth;
int intensity;
if (iBitPerPixel == 8) ////Grayscale 8 bits image
{
int r = iWidth % 4; // pixels leftover from width (remainder has to be factor of 8 or 24)
int p = (4-r) % 4; // has to be a factor of number of bits in pixel, num leftover to take care of
Wp = iWidth + p;
Part 2 (The actual application of the sobel filter algorithm):
float kernelx[3][3] = { { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
float kernely[3][3] = { { -1, -2, -1 },
{ 0, 0, 0 },
{ 1, 2, 1 } };
double magX = 0.0; // this is your magnitude
for (int a = 0; a < 3; a++) {
for (int b = 0; b < 3; b++) {
magX += pImg[i*Wp + j] * kernelx[a][b]; // where i get confused
}
}
}
Any and all help is greatly appreciated.

You have to use appropriate pixel from neighborhood of center pixel to multiply with kernel entry:
//row, col - coordinates of central pixel for calculation
for (int row = 1; row < height - 1; row++) {
for (int col = 1; col < width - 1; col++) {
double magX = 0.0; // this is your magnitude
for (int a = 0; a < 3; a++) {
for (int b = 0; b < 3; b++) {
magX += pImg[(row - 1 + a) * Wp + col - 1 + b] * kernelx[a][b];
}
}
resultImg[row * Wp + col] = magX;
}
}
I omitted border pixels

CKingimageDoc* pDoc = GetDocument(); // get picture
int iBitPerPixel = pDoc->_bmp->bitsperpixel; // used to see if grayscale(8b) or RGB(24b)
int iWidth = pDoc->_bmp->width;
int iHeight = pDoc->_bmp->height;
BYTE *pImg = pDoc->_bmp->point; // pointer used to point at pixels in the image
const int area = iWidth * iHeight;
BYTE *pImg2 = new BYTE[area];
if (iBitPerPixel == 8) // Grayscale 8bit image
{
int pixel_x;
int pixel_y;
float sobel_x[3][3] =
{ { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
float sobel_y[3][3] =
{ { -1, -2, -1 },
{ 0, 0, 0 },
{ 1, 2, 1 } };
for (int x=1; x < iWidth-1; x++)
{
for (int y=1; y < iHeight-1; y++)
{
pixel_x = (sobel_x[0][0] * pImg[iWidth * (y-1) + (x-1)])
+ (sobel_x[0][1] * pImg[iWidth * (y-1) + x ])
+ (sobel_x[0][2] * pImg[iWidth * (y-1) + (x+1)])
+ (sobel_x[1][0] * pImg[iWidth * y + (x-1)])
+ (sobel_x[1][1] * pImg[iWidth * y + x ])
+ (sobel_x[1][2] * pImg[iWidth * y + (x+1)])
+ (sobel_x[2][0] * pImg[iWidth * (y+1) + (x-1)])
+ (sobel_x[2][1] * pImg[iWidth * (y+1) + x ])
+ (sobel_x[2][2] * pImg[iWidth * (y+1) + (x+1)]);
pixel_y = (sobel_y[0][0] * pImg[iWidth * (y-1) + (x-1)])
+ (sobel_y[0][1] * pImg[iWidth * (y-1) + x ])
+ (sobel_y[0][2] * pImg[iWidth * (y-1) + (x+1)])
+ (sobel_y[1][0] * pImg[iWidth * y + (x-1)])
+ (sobel_y[1][1] * pImg[iWidth * y + x ])
+ (sobel_y[1][2] * pImg[iWidth * y + (x+1)])
+ (sobel_y[2][0] * pImg[iWidth * (y+1) + (x-1)])
+ (sobel_y[2][1] * pImg[iWidth * (y+1) + x ])
+ (sobel_y[2][2] * pImg[iWidth * (y+1) + (x+1)]);
int val = (int)sqrt((pixel_x * pixel_x) + (pixel_y * pixel_y));
if(val < 0) val = 0;
if(val > 255) val = 255;
pImg2[iHeight * y + x] = val;
}
}
}

What is wrong with my perlin noise generator?

I am trying to generate perlin noise for a math essay for school, and i have some difficulties figuring out the math behind it. This is my perlin class. The perlin noise function generates ( should generate) a number between 0 and 1, that i then multiply by 255 to apply color to every pixel on the screen, please help!
#include "perlinnoise.h"
perlinnoise::perlinnoise()
{
srand(time(NULL));
double random = rand() % 1000;
for (int i = 0; i < (651 * 2); i = i + 2)
{
random = (rand() % 1000);
vecGrad[i] = random / 1000;
vecGrad[i + 1] = vecGrad[i];
vecGrad[i] = cos(vecGrad[i] * 2 * 3.1416);
vecGrad[i + 1] = sin(vecGrad[i + 1] * 2 * 3.1416);
}
}
int perlinnoise::perlinNoise(int x, int y)
{
//20 pixel in each case
//30 boxes in width and 20 boxes in height
//651 vectors to create
sf::Vector2i boxXY;
boxXY.x = ((x / 20));
boxXY.y = ((y / 20));
sf::Vector2i displacement1; displacement1.x = x - boxXY.x * 20; displacement1.y = y - boxXY.y * 20;
sf::Vector2i displacement2; displacement2.x = x - (boxXY.x * 20 + 20); displacement2.y = y - boxXY.y * 20;
sf::Vector2i displacement3; displacement3.x = x - boxXY.x * 20; displacement3.y = y - (boxXY.y * 20 + 20);
sf::Vector2i displacement4; displacement4.x = x - (boxXY.x * 20 + 20); displacement4.y = y - (boxXY.y * 20 + 20);
/*std::cout << displacement1.x << std::endl; std::cout << displacement1.y << std::endl;
std::cout << displacement2.x << std::endl; std::cout << displacement2.y << std::endl;
std::cout << displacement3.x << std::endl; std::cout << displacement3.y << std::endl;
std::cout << displacement4.x << std::endl; std::cout << displacement4.y << std::endl;*/
double dotP1 = (vecGrad[((boxXY.y * 30) + boxXY.x)] * displacement1.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 1] * displacement1.y);
double dotP2 = (vecGrad[((boxXY.y * 30) + boxXY.x + 3)] * displacement2.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 4] * displacement2.y);
double dotP3 = (vecGrad[((boxXY.y * 30 + 1) + boxXY.x)] * displacement3.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 1] * displacement3.y);
double dotP4 = (vecGrad[((boxXY.y * 30 + 1) + boxXY.x + 3)] * displacement4.x) + (vecGrad[(boxXY.y * 30) + boxXY.x + 4] * displacement4.y);
This is where i have some troubles ( I think)
int intensity = 0;
double Sx = (3 * (x - boxXY.x * 20) * (x - boxXY.x * 20)) - (2 * (x - boxXY.x * 20) * (x - boxXY.x * 20) * (x - boxXY.x * 20));
double Sy = (3 * (y - boxXY.y * 20) * (y - boxXY.y * 20)) - (2 * (y - boxXY.y * 20) * (y - boxXY.y * 20) * (y - boxXY.y * 20));
double a = dotP1 + (Sx * (dotP2 - dotP1));
double b = dotP3 + (Sx * (dotP4 - dotP3));
double aa = dotP1 + (Sy * (dotP2 - dotP1));
double bb = dotP3 + (Sy * (dotP4 - dotP3));
intensity = (a+b+aa+bb)/4;
//Should generate number between 0 and 1, but doesn't :/
return intensity;
}
perlinnoise::~perlinnoise()
{
}
I've been reading lots of articles, and they are all very unclear about the math used.I ended up generating a grid with 20*20 pixels in each, with each cross section in the grid having a randomly generated gradient vector. I then calculate the displacement vectors and then do the dot product on the four corners with displacement and gradient vectors. This first part is a bit messy as i am not very experienced, but the last part is a bit more straightforward. I use a smoothing function on the x and y axis and use that number to generate a, b, aa and bb, and i then take the average of that. This is what i thought i understood from the articles i read, but apparently it's wrong :/ Any help please?
Thanks in advance!

1555 DXT1 decompression giving incorrect image output

I am trying, in C++, to decompress 1555 DXT1 textures into RGBA 8888, storing the output into a std::string.
I have successfully decompressed 565 DXT1 to RGBA 8888 using the squish lib, but just can't seem to get 1555 working.
The program isn't crashing, and the output image looks almost correct, but there are several pixels in random places that are strange colours, as you can see in the output image below.
Here's the code.
using namespace std;
string CTexture::extractRGBAData(void)
{
string strPixels;
strPixels.resize(m_usImageSize[0] * m_usImageSize[1] * 4);
for (unsigned long i = 0, j = m_usImageSize[0] * m_usImageSize[1] * 4; i < j; i++)
{
strPixels[i] = 0;
}
if (m_strImageData.length() == 0)
{
return strPixels;
}
unsigned long uiDXTCompressionType;
if (m_uiPlatformId == 8) // GTA III, VC
{
uiDXTCompressionType = m_ucDXTCompressionType;
}
else if (m_uiPlatformId == 9) // SA
{
//uiDXTCompressionType = m_uiAlpha;
uiDXTCompressionType = m_ucDXTCompressionType;
}
else if (m_uiPlatformId == 5) // XBOX, Android
{
uiDXTCompressionType = m_uiAlpha;
}
if (uiDXTCompressionType == DXT1)
{
unsigned long uiWidth = m_usImageSize[0];
unsigned long uiHeight = m_usImageSize[1];
if (m_uiRasterFormat == FORMAT_1555)
{
unsigned long
uiPixelKey = 0,
uiTexelSeek = 0;
for (unsigned long y = 0; y < uiHeight; y += 4)
{
for (unsigned long x = 0; x < uiWidth; x += 4)
{
string strTexel = m_strImageData.substr(uiTexelSeek, 8);
unsigned char *pPixels = new unsigned char[16 * 4];
unsigned char *pBlock = new unsigned char[8];
memcpy(pBlock, strTexel.c_str(), 8);
decompress_DXT1_1555(pPixels, pBlock);
for (unsigned long yOffset = 0; yOffset < 4; yOffset++)
{
for (unsigned long xOffset = 0; xOffset < 4; xOffset++)
{
unsigned long uiPixelKey = (y * uiWidth) + x + (yOffset * uiWidth) + xOffset;
//CDebugger::log("uiPixelKey: " + CStringUtility::toString(uiPixelKey) + ", x: " + CStringUtility::toString(x) + ", y: " + CStringUtility::toString(y) + ", xOffset: " + CStringUtility::toString(xOffset) + ", yOffset: " + CStringUtility::toString(yOffset));
uiPixelKey *= 4;
if (uiPixelKey < strPixels.size()) // this checks if the height has a remainder when dividing by 4 (as the iteration does 4x4 block of pixels)
{
strPixels[uiPixelKey + 0] = pPixels[(((yOffset * 4) + xOffset) * 4) + 2] & 0xFF;
strPixels[uiPixelKey + 1] = pPixels[(((yOffset * 4) + xOffset) * 4) + 1] & 0xFF;
strPixels[uiPixelKey + 2] = pPixels[(((yOffset * 4) + xOffset) * 4) + 0] & 0xFF;
strPixels[uiPixelKey + 3] = 255;// pPixels[(((yOffset * 4) + xOffset) * 4) + 3] & 0xFF;
}
}
}
delete[] pPixels;
delete[] pBlock;
uiTexelSeek += 8;
}
}
}
}
}
void CTexture::decompress_DXT1_1555(unsigned char *pixels, unsigned char *block)
{
string strArea = string((char*)block, 8);
string strPaletteStr = strArea.substr(0, 4);
unsigned long uiIndexes = CStringUtility::unpackULong(strArea.substr(4, 4), false);
unsigned char ucPalette[4][4];
double fPalette[4][4];
unsigned short usPaletteInt[2];
usPaletteInt[0] = CStringUtility::unpackUShort(strPaletteStr.substr(0, 2), false); // 1555
usPaletteInt[1] = CStringUtility::unpackUShort(strPaletteStr.substr(2, 2), false); // 1555
// based on: http://www.glassechidna.com.au/2009/devblogs/s3tc-dxt1dxt5-texture-decompression/
float red, green, blue, alpha;
alpha = (usPaletteInt[0] >> 15) & 1;
red = ((float)((usPaletteInt[0] >> 10) & 0x1F) * 255.0 + 16.0);
red = ((red / 32.0) + red) / 32.0;
green = ((float)((usPaletteInt[0] >> 5) & 0x1F) * 255.0 + 16.0);
green = ((green / 32.0) + green) / 32.0;
blue = ((float)(usPaletteInt[0] & 0x1F)) * 255.0 + 16.0;
blue = ((blue / 32.0) + blue) / 32.0;
fPalette[0][0] = red;
fPalette[0][1] = green;
fPalette[0][2] = blue;
fPalette[0][3] = alpha;
alpha = (usPaletteInt[1] >> 15) & 1;
red = ((float)((usPaletteInt[1] >> 10) & 0x1F) * 255.0 + 16.0);
red = ((red / 32.0) + red) / 32.0;
green = ((float)((usPaletteInt[1] >> 5) & 0x1F) * 255.0 + 16.0);
green = ((green / 32.0) + green) / 32.0;
blue = ((float)(usPaletteInt[1] & 0x1F)) * 255.0 + 16.0;
blue = ((blue / 32.0) + blue) / 32.0;
fPalette[1][0] = red;
fPalette[1][1] = green;
fPalette[1][2] = blue;
fPalette[1][3] = alpha;
// fetch other 2 colours in palette, interpolated between min/max colours
if (usPaletteInt[0] > usPaletteInt[1])
{
fPalette[2][0] = (2.0 * fPalette[0][0] + fPalette[1][0]) / 3.0;
fPalette[2][1] = (2.0 * fPalette[0][1] + fPalette[1][1]) / 3.0;
fPalette[2][2] = (2.0 * fPalette[0][2] + fPalette[1][2]) / 3.0;
fPalette[2][3] = 255;
fPalette[3][0] = (fPalette[0][0] + 2.0 * fPalette[1][0]) / 3.0;
fPalette[3][1] = (fPalette[0][1] + 2.0 * fPalette[1][1]) / 3.0;
fPalette[3][2] = (fPalette[0][2] + 2.0 * fPalette[1][2]) / 3.0;
fPalette[3][3] = 255;
}
else
{
fPalette[2][0] = (fPalette[0][0] + fPalette[1][0]) / 2.0;
fPalette[2][1] = (fPalette[0][1] + fPalette[1][1]) / 2.0;
fPalette[2][2] = (fPalette[0][2] + fPalette[1][2]) / 2.0;
fPalette[2][3] = 255;
fPalette[3][0] = 0;
fPalette[3][1] = 0;
fPalette[3][2] = 0;
fPalette[3][3] = 255; // transparent black
}
for (unsigned long i5 = 0; i5 < 4; i5++)
{
ucPalette[i5][0] = fPalette[i5][0];
ucPalette[i5][1] = fPalette[i5][1];
ucPalette[i5][2] = fPalette[i5][2];
ucPalette[i5][3] = fPalette[i5][3];
}
for (unsigned long i2 = 0; i2<16; i2++)
{
unsigned char index = (uiIndexes >> (i2 * 2)) & 3;
unsigned char colour[4];
colour[0] = ((unsigned char)ucPalette[index][0]) & 0xFF;
colour[1] = ((unsigned char)ucPalette[index][1]) & 0xFF;
colour[2] = ((unsigned char)ucPalette[index][2]) & 0xFF;
colour[3] = ((unsigned char)ucPalette[index][3]) & 0xFF;
// store colour
pixels[(i2 * 4) + 0] = colour[0] & 0xFF;
pixels[(i2 * 4) + 1] = colour[1] & 0xFF;
pixels[(i2 * 4) + 2] = colour[2] & 0xFF;
pixels[(i2 * 4) + 3] = colour[3] & 0xFF;
}
}

I think you're misunderstanding how DXT1 works a bit.
There isn't any alpha in the 2 base colors. They're both in 5:6:5.
The "alpha" is only coming from the case where c0 <= c1. If the block fits this condition, then any pixel with the index 3 will be fully transparent (the 1 bit of alpha is inferred from that).
So... read 5:6:5 (and set alpha=255 for those) instead of 1:5:5:5 in the base colors, and change your alpha on the "transparent black" case from 0,0,0,255 to 0,0,0,0 (actually transparent black instead of opaque black), and you should get better results.

DirectX 9 Terrain Engine Problem C++

I'm Having a problem with my DirectX 9 Terrain Engine.. It's working fine, except for one thing, it doesn't load the heightmap in a proper way.
You can see a screenshot of the problem here: alt text http://img682.imageshack.us/img682/240/problemc.png
as you can see there is a diagonal crack through the entire map.. one side should be mirrored to render the map properly.
I'm almost sure the problem is not inside the file, as other programs don't seem to have a problem with it.
I'm loading my heightmap in this way (class header first):
class Terrain
{
public:
Terrain(const char* fileName);
~Terrain();
void Update(int x, int y);
void Render(LPDIRECT3DDEVICE9 Device);
private:
float* Data;
int Width;
int TileWidth;
bool isRendering;
bool isSwapping;
std::vector<Chunk*> RenderChunks;
};
and the constructor:
Terrain::Terrain(const char* fileName)
{
std::fstream File(fileName, std::ios::in | std::ios::binary);
File.seekg(0, std::ios::end);
int Length = File.tellg();
File.seekg(0, std::ios::beg);
int w = (int)sqrt((float)Length/4.0)-1;
Data = new float[Length / 4];
File.read((char*)Data, Length);
File.close();
Width = w;
int dataWidth = w+1;
TileWidth = w/16;
for (int y=0; y<TileWidth; y++)
{
for (int x=0; x<TileWidth; x++)
{
Chunk* c = new Chunk(x*16, y*16, 16, 512, Data);
RenderChunks.push_back(c);
}
}
}
Whenever I'm calling a height on the heightmap, i use it like this: Data[x + y*dataWidth] (just the usual way)
the Chunk class is a class that just renders a part of the heightmap, just so the detail decreases as the distance to the camera increaes.
So my question is: what could cause my problem?
EDIT: Rendering code:
void Terrain::Render(LPDIRECT3DDEVICE9 Device)
{
for (unsigned int i=0; i<RenderChunks.size(); ++i)
{
RenderChunks[i]->Render(Device);
}
}
Chunk::Chunk(int cX, int cY, int cW, int dW, float* Data):
Pos(cX, 0, cY)
{
Heights = new float[(cW + 1) * (cW + 1)];
ParentH = Data;
ParentOffset = cX + cY*dW;
ParentW = dW;
Width = cW + 1;
for (int y=0; y<Width; ++y)
{
memcpy(Heights + y*Width, Data + cX + (y+cY)*dW, sizeof(float)*Width);
}
Vertices = NULL;
Calculate(16, 16, 16, 16, 16);
}
void Chunk::Calculate(int L, int lod_L, int lod_R, int lod_U, int lod_D)
{
Detail = L;
if (Vertices) delete[] Vertices;
Vertices = new Vertex[(Width-1)*(Width-1)*6/(L*L)];
Count = (Width-1)*(Width-1)*2/(L*L);
float Height = 100.0f;
for (int y=0; y<Width-1; y += L)
{
for (int x=0; x<Width-1; x += L)
{
Vertex* thisQuad = Vertices + (y/L)*((Width-1)/L)*6 + (x/L)*6;
float heights[4] = {
Heights[(x ) + (y )*Width] * Height,
Heights[(x ) + (y + L)*Width] * Height,
Heights[(x + L) + (y )*Width] * Height,
Heights[(x + L) + (y + L)*Width] * Height};
float bonus[8] = {
heights[0],
heights[2],
heights[0],
heights[2],
heights[1],
heights[3],
heights[1],
heights[3]};
if (Pos.z + y > 0)
{
bonus[0] = ParentH[((int)Pos.x + x ) + ((int)Pos.z + y - L)*ParentW] * Height;
bonus[1] = ParentH[((int)Pos.x + x + L) + ((int)Pos.z + y - L)*ParentW] * Height;
}
if (Pos.x + x > 0)
{
bonus[2] = ParentH[((int)Pos.x + x - L) + ((int)Pos.z + y )*ParentW] * Height;
bonus[4] = ParentH[((int)Pos.x + x - L) + ((int)Pos.z + y + L)*ParentW] * Height;
}
if (Pos.x + x < ParentW-L-L)
{
bonus[3] = ParentH[((int)Pos.x + x+L+L) + ((int)Pos.z + y )*ParentW] * Height;
bonus[5] = ParentH[((int)Pos.x + x+L+L) + ((int)Pos.z + y + L)*ParentW] * Height;
}
if (Pos.z + y < ParentW-L-L)
{
bonus[6] = ParentH[((int)Pos.x + x ) + ((int)Pos.z + y+L+L)*ParentW] * Height;
bonus[7] = ParentH[((int)Pos.x + x + L) + ((int)Pos.z + y+L+L)*ParentW] * Height;
}
if (x == 0 && lod_L>L)
{
heights[0] = lerp(
Heights[(x ) + (((y )/lod_L)*lod_L )*Width],
Heights[(x ) + (((y )/lod_L)*lod_L + lod_L)*Width],
(float)((y ) % lod_L) / (float)lod_L) * Height;
heights[1] = lerp(
Heights[(x ) + (((y + L)/lod_L)*lod_L )*Width],
Heights[(x ) + (((y + L)/lod_L)*lod_L + lod_L)*Width],
(float)((y+L) % lod_L) / (float)lod_L) * Height;
}
if (x >= Width-2 && lod_R>L)
{
heights[2] = lerp(
Heights[(x + L) + (((y )/lod_R)*lod_R )*Width],
Heights[(x + L) + (((y )/lod_R)*lod_R + lod_R)*Width],
(float)((y ) % lod_R) / (float)lod_R) * Height;
heights[3] = lerp(
Heights[(x + L) + (((y + L)/lod_R)*lod_R )*Width],
Heights[(x + L) + (((y + L)/lod_R)*lod_R + lod_R)*Width],
(float)((y+L) % lod_R) / (float)lod_R) * Height;
}//*/
if (y == 0 && lod_U>L)
{
heights[0] = lerp(
Heights[(((x )/lod_U)*lod_U ) + (y )*Width],
Heights[(((x )/lod_U)*lod_U + lod_U) + (y )*Width],
(float)((x ) % lod_U) / (float)lod_U) * Height;
heights[2] = lerp(
Heights[(((x + L)/lod_U)*lod_U ) + (y )*Width],
Heights[(((x + L)/lod_U)*lod_U + lod_U) + (y )*Width],
(float)((x+L) % lod_U) / (float)lod_U) * Height;
}
if (y >= Width-2 && lod_D>L)
{
heights[1] = lerp(
Heights[(((x )/lod_D)*lod_D ) + (y + L)*Width],
Heights[(((x )/lod_D)*lod_D + lod_D) + (y + L)*Width],
(float)((x ) % lod_D) / (float)lod_D) * Height;
heights[3] = lerp(
Heights[(((x + L)/lod_D)*lod_D ) + (y + L)*Width],
Heights[(((x + L)/lod_D)*lod_D + lod_D) + (y + L)*Width],
(float)((x+L) % lod_D) / (float)lod_D) * Height;
}//*/
D3DXVECTOR3 fake(0,0,0);
Vertex p1(D3DXVECTOR3(x, heights[0], y ) + Pos, CalcNormal(bonus[2], heights[2], bonus[0], heights[1]));
Vertex p2(D3DXVECTOR3(x, heights[1], y + L) + Pos, CalcNormal(bonus[4], heights[3], heights[0], bonus[6]));
Vertex p3(D3DXVECTOR3(x + L, heights[2], y ) + Pos, CalcNormal(heights[0], bonus[3], bonus[1], heights[3]));
Vertex p4(D3DXVECTOR3(x + L, heights[3], y + L) + Pos, CalcNormal(heights[1], bonus[5], heights[2], bonus[7]));
thisQuad[0] = p1;
thisQuad[1] = p2;
thisQuad[2] = p3;
thisQuad[3] = p3;
thisQuad[4] = p2;
thisQuad[5] = p4;
}
}
}
void Chunk::Render(LPDIRECT3DDEVICE9 Device)
{
Device->SetFVF(D3DFVF_XYZ | D3DFVF_NORMAL);
Device->DrawPrimitiveUP(
D3DPT_TRIANGLELIST,
Count,
Vertices,
sizeof(Vertex));
}

I'm guessing your problem is that your chunk class takes a width (cW) and then you assign that value + 1 to the width. Im further assuming the cW is the number of texels in the heightmap(ie in a 1024x1024 heightmap cW is 1024). If thats right then by adding 1 each subseqent line will be offset to the left by 1. As you go on you make the problems worse so by 512 lines you'll be 512 to the left (or starting halfway across the texture). This would give you the diagonal shear you are seeing.

It seemed to be an "off by 1" error, but the post saying that seems to be deleted somehow..
it was the right solution anyway.

It looks a lot like the order in which you are building your triangle-strip(s) (or are you using another type of primitive?) has an issue. Can you post the relevant part of your rendering loop?
Edit: My intuition is that when you're mirroring your terrain data, you're creating criss-crossed geometry down the diagonal because the corners of your terrain quads (if you imagine them as such) are connecting to the corners diagonally across rather than directly across. I hope that some DirectX / rendering guru can give you a more precise answer based on the code you've posted.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Exception error in CUDA/GPU on calling function - c++

Related

Can I use SIMD to speed up my image resize function and maybe avoid the nested loop?

sobel filter algorithm (C++) (no libraries)

What is wrong with my perlin noise generator?

1555 DXT1 decompression giving incorrect image output

DirectX 9 Terrain Engine Problem C++

Categories

Resources