CUDA error when handling large input - C++

So I have a rather strange error. I have a kernel that is supposed to alter the value of every element in an array. For now I am only testing by launching a single thread.
__global__ void kernel(int* data) {
    for (int var = 0; var < SIZE; ++var) {
        data[var] = data[var] + 1;
    }
}
Here is the whole code:
#include "stdint.h"
#include "stdio.h"
#include "kernelLauncher.cuh"
#include <cuda_runtime.h>
#define SIZE 10485760
typedef uint64_t POLY_64;
typedef unsigned char BYTE;
__global__ void kernel(int* data) {
for (int var = 0; var < SIZE; ++var) {
data[var] = data[var] + 1;
}
}
int main() {
int* data = (int*) malloc(sizeof(int) * SIZE);
int* data_d;
for (int var = 0; var < SIZE; ++var) {
data[var] = 1;
}
//allocate device memory for the fingerprinting data
cudaMalloc((void**) &data_d, sizeof(int) * SIZE);
//copy the data to device
CUDA_CHECK_RETURN(
cudaMemcpy(data_d, data, sizeof(int) * SIZE, cudaMemcpyHostToDevice));
kernel<<<1, 1>>>(data_d);
cudaThreadSynchronize();
CUDA_CHECK_RETURN(cudaMemcpy(data, data_d, sizeof(int) * SIZE, cudaMemcpyDeviceToHost));
//try to print the result
for (int var = 0; var < SIZE; ++var) {
printf("%d\n", data[var]);
}
CUDA_CHECK_RETURN(cudaFree(data_d));
return 0;
}
When my SIZE is defined as 1048576, I get my data back just fine. Unfortunately, when I define it as 10485760 (10 times more), I get:
Error unspecified launch failure at line 40 in file ../src/runTest.cu
Can somebody point me in the right direction? Why is this problem happening? Thank you in advance.
EDIT: So yes, it is the size definition. I changed my code so that there is no discrepancy between the hard-coded loop bound in the kernel and the defined constant. However, if I have 10485760 instead of 1048576, it simply does not work. Why is that? This is not too much allocation in one go. My card is a Quadro FX 770M with compute capability 1.1.

So, here is what actually seemed to be happening. As some of you suggested, the kernel was indeed taking too long and timing out (although I had read from various sources that this does not happen on Linux systems). Separating the work like this in fact solves the issue and avoids the watchdog killing the kernel:
kernel<<<1, 1>>>(data_d, 0, 1048576);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 1048576, 2097152);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 2097152, 3145728);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 3145728, 4194304);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 4194304, 5242880);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 5242880, 6291456);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 6291456, 7340032);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 7340032, 8388608);
cudaDeviceSynchronize();
Now I wonder: what is the way to avoid hitting this threshold? I tried adding
Section "Device"
Identifier "Device0"
Driver "nvidia"
VendorName "NVIDIA Corporation"
Option "Interactive" "0" #<<--- added to avoid kernel time-out
EndSection
to the Device section of my xorg.conf, but this did not really help.
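For reference, the more idiomatic fix is to spread the work across many threads so that each launch finishes long before the watchdog limit, instead of looping over all ten million elements in a single thread. Here is a minimal grid-stride sketch (kernelParallel is a hypothetical name, not from the original post):

// Each thread handles a strided subset of the array, so a launch
// completes quickly and never trips the display watchdog.
__global__ void kernelParallel(int* data, int size) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
            i += blockDim.x * gridDim.x) {
        data[i] = data[i] + 1;
    }
}

// 40960 blocks of 256 threads for SIZE = 10485760, which stays within
// the grid and block limits of a compute capability 1.1 device:
kernelParallel<<<(SIZE + 255) / 256, 256>>>(data_d, SIZE);
cudaDeviceSynchronize();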

Related

CUDA deep copy with other data

I'm trying to copy my struct Test to the GPU, change the data, and copy it back to the CPU. This is what I've tried so far; note that my code crashes on the last, commented-out line:
struct Test {
    int x, y;
    int* data;
};

// Test kernel
static __global__ void TestKernel(Test* d) {
    const uint32_t index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    // increment some values
    ++d->data[0];
    ++d->data[1];
    ++d->data[2];
    ++d->x;
    ++d->y;
}
// Test snippet:
Test* host = new Test{ 10, 20, new int[3]{ 1, 2, 3 } };
Test* device = nullptr;
int* deviceData;
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));
TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));
printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]); // crashes
Note that I've seen multiple related questions, but none of them also copies additional data apart from the deep-copied data pointer.
My error message:
Exception thrown at 0x00007FF7A2C5297D in VFD.exe: 0xC0000005: Access violation reading location 0x0000000B01600208.
Note that I'm probably copying the memory incorrectly, or something along those lines. If I remove the COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost)); line, I'm able to access the host->data array, but then the x and y values stay unincremented, for obvious reasons.
When you cudaMemcpy the struct back from the GPU into the host struct, you overwrite its data pointer with the device data pointer, which is invalid on the host.
To fix it, you need to restore the original data pointer (and only then copy back the actual data).
Working version:
struct Test
{
    int x, y;
    int* data;
};

static __global__ void TestKernel(Test* d)
{
    ++(d->data[0]);
    ++(d->data[1]);
    ++(d->data[2]);
    ++(d->x);
    ++(d->y);
}
int main()
{
    int* hostData = new int[3]{ 1, 2, 3 };
    Test* host = new Test{ 10, 20, hostData };
    int* deviceData = nullptr;
    Test* device = nullptr;
    COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
    COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
    COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(int*), cudaMemcpyHostToDevice));
    TestKernel<<<1, 1>>>(device);
    COMPUTE_SAFE(cudaDeviceSynchronize());
    COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
    host->data = hostData; // Restore host data pointer
    COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));
    printf("\nhost:\n");
    printf("%d %d\n", host->x, host->y);
    printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
    return 0;
}
Output:
host:
11 21
2 3 4
Some notes:
For clarity I added () in the kernel increment statements.
You used sizeof(float*) for the data pointer, although it is an int* (of course the size is the same).
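An alternative, by the way (a sketch of my own, not part of the original answer), is to copy the struct back into a host-side temporary so that host->data is never clobbered in the first place:

Test tmp; // staging copy on the host
COMPUTE_SAFE(cudaMemcpy(&tmp, device, sizeof(Test), cudaMemcpyDeviceToHost));
host->x = tmp.x; // take over only the scalar members;
host->y = tmp.y; // host->data keeps pointing at hostData
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));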

cudaMemcpy throws InvalidValue error when copying from device to host

I've been trying to implement a one-dimensional FFT using cuFFT. An InvalidValue error is thrown and no meaningful results are produced.
I've tried to ensure that each error is caught, and I believe that the cudaMemcpy from DeviceToHost causes the issue, though I am not sure why, nor how to fix it. The data size parameter in cudaMemcpy follows the same relation as given in the cuFFT documentation.
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
// cuda macros
#define NX 100 // number of points
#define BATCH 1 // number of ffts to perform
#define RANK 1 //
#define IDIST 1 // distance between 1st elements of batches
#define ISTRIDE 1 // do every ISTRIDEth index
#define ODIST 1 // distance between 1st elements of output
#define OSTRIDE 1 // distance between output elements
void fft1d() {
    // create plan for performing fft
    cufftHandle plan;
    if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
        printf("Failed to create 1D plan\n");
        return;
    }
    // assemble data
double temp_data[] = {2.598076211353316, 3.2402830701637395, 3.8494572900049224, 4.419388724529261, 4.944267282795252, 5.41874215947433, 5.837976382931011, 6.197696125093141, 6.494234270429254, 6.724567799874842, 6.886348608602047, 6.97792744346504, 6.998370716093996, 6.9474700202387565, 6.8257442563389, 6.6344343416615565, 6.37549055993378, 6.051552679431957, 5.665923042211819, 5.222532898817316, 4.725902331664744, 4.181094175657916, 3.5936624057845576, 2.9695955178498603, 2.315255479544737, 1.6373128742041732, 0.9426788984240022, 0.23843490677753865, -0.46823977812093664, -1.1701410542749289, -1.8601134815746807, -2.531123226988873, -3.176329770049035, -3.7891556376344524, -4.363353457155562, -4.893069644570959, -5.3729040779788875, -5.797965148448726, -6.163919626883915, -6.467036838555256, -6.704226694973039, -6.873071195387157, -6.971849076777267, -6.999553361041935, -6.955901620504255, -6.84133885708361, -6.657032965782207, -6.404862828733319, -6.0873991611848375, -5.707878304681281, -5.270169234606201, -4.778734118422206, -4.23858282669252, -3.6552218606153755, -3.0345982167228436, -2.383038761007964, -1.707185730522749, -1.0139290199674, -0.31033594356630245, 0.39642081173600463, 1.0991363072871054, 1.7906468025248339, 2.463902784786862, 3.1120408346390414, 3.728453594100783, 4.306857124485735, 4.841354967187034, 5.326498254347925, 5.757341256627454, 6.129491801786784, 6.439156050110601, 6.683177170206378, 6.859067520906216, 6.965034011197066, 6.999996379650895, 6.963598207007518, 6.85621054964381, 6.678928156888352, 6.433558310743566, 6.122602401787424, 5.749230429076629, 5.317248684008804, 4.831060947586139, 4.295623596650021, 3.7163950767501706, 3.0992802567403803, 2.4505702323708074, 1.7768781925409076, 1.0850720020162676, 0.3822041878858906, -0.3245599564963766, -1.0280154171511335, -1.7209909100394047, -2.3964219877733033, -3.0474230571943477, -3.667357573646071, -4.249905696354359, -4.78912871521179, -5.279529592175676, -5.716109000098287};
    cufftReal *idata;
    cudaMalloc((void**) &idata, sizeof(cufftComplex)*NX);
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to allocate memory space for input data.\n");
        return;
    }
    cudaMemcpy(idata, temp_data, sizeof(temp_data)/sizeof(double), cudaMemcpyHostToDevice);
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to load time data to memory.\n");
        return;
    }
    // prepare memory for return data
    cufftComplex *odata;
    cudaMalloc((void**) &odata, sizeof(cufftComplex)*(NX/2 + 1));
    if (cudaGetLastError() != cudaSuccess) {
        printf("Failed to allocate memory for output data.\n");
    }
    // perform fft
    if (cufftExecR2C(plan, idata, odata) != CUFFT_SUCCESS) {
        printf("Failed to perform fft.\n");
        return;
    }
I think the error is thrown here, at the cudaMemcpy.
    // grab data from graphics and print (memcpy waits until complete)
    // can return errors from previous cuda calls if they haven't been caught
    cufftComplex *out_temp_data;
    size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
    cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
    int error_value = cudaGetLastError();
    printf("cudaMemcpy from device state: %i\n", error_value);
    if (error_value != cudaSuccess) {
        printf("Failed to pull data from device.\n");
        return;
    }
    for (size_t i = 0; i < (NX/2 + 1); i++) {
        printf("%lu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
    }
    // clean up
    cufftDestroy(plan);
    cudaFree(idata);
}
int main() {
    fft1d();
    return 0;
}
Memory must be allocated before cudaMemcpy can write the data. Thanks to generic-opto-guy for pointing this out.
In this case:
out_temp_data = new cufftComplex[NX/2 + 1];
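In context, the corrected section would look like this (a sketch; the delete[] cleanup at the end is my own addition, not part of the original answer):

// Allocate the host buffer before asking cudaMemcpy to write into it.
cufftComplex *out_temp_data = new cufftComplex[NX/2 + 1];
size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
// ... print or process the results ...
delete[] out_temp_data; // matching cleanup once done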

cudaMemcpyAsync from page-locked host memory to device memory returns reading access violation error

I am coding a memory-heavy multi-GPU CUDA program. I found that my cudaMemcpyAsync calls weren't actually performed asynchronously. After some research, I found out that I would have to copy from page-locked host memory to the device. So what I now attempt to do is copy a part of the whole host input data array into a chunk of page-locked host memory, and then copy that to the device. The H2H cudaMemcpyAsync works fine without outputting any errors; the H2D afterwards gives me this error: Access violation reading address 0xWHATEVER. Additionally, in the sample code I am providing, there is an identical error with the cudaMallocHost call (this works fine in the main project).
I have tried to make a much simpler sample project (the one below). It still gives me errors, so I don't really know what to do.
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#ifndef KERNEL_H
#define KERNEL_H
typedef struct
{
    int device = 0;
    double *d_array,    //device array ptr
           *h_array_pl; //page-locked array ptr
} IOdataPtr;
#endif

//error printing function to reduce line count
void printCudaError(cudaError_t error, char err_src[]) {
    if (error != cudaSuccess) {
        printf("Error: %i while performing %s \n", error, err_src);
    }
}
int main() {
    const int GPU_N = 2;
    const int CALC_N = 1024*1024*1024;
    cudaError_t error;
    cudaStream_t stream[GPU_N];
    double *h_array;
    h_array = (double*)malloc(sizeof(double) * CALC_N);
    for (int i = 0; i < CALC_N; i++) {
        h_array[i] = 2;
    }
    IOdataPtr ptr[GPU_N];
    for (int i = 0; i < GPU_N; i++) {
        //normal host alloc
        ptr[i].device = i;
        error = cudaSetDevice(ptr[i].device); //select device
        printCudaError(error, "cudaSetDevice");
        cudaStreamCreate(&stream[i]);
        printCudaError(error, "cudaStreamCreate");
        error = cudaMalloc((void**)&(ptr[i].d_array),
                           CALC_N / GPU_N * sizeof(double));
        printCudaError(error, "cudaMalloc");
        error = cudaMallocHost((void**)&ptr[i].h_array_pl,
                               CALC_N / GPU_N * sizeof(double));
        printCudaError(error, "cudaMallocHost");
        //data -> pl
        error = cudaMemcpyAsync(ptr[i].h_array_pl,            //dst
                                &h_array[i * CALC_N / GPU_N], //src
                                CALC_N / GPU_N * sizeof(double), //amt
                                cudaMemcpyHostToHost,         //kind
                                stream[i]);                   //stream
        printCudaError(error, "cudaMemcpyAsync H2H");
        //pl -> dev
        error = cudaMemcpyAsync(ptr[i].d_array,               //dst
                                ptr[i].h_array_pl,            //src
                                CALC_N / GPU_N * sizeof(double), //amt
                                cudaMemcpyHostToDevice,       //kind
                                stream[i]);                   //stream
        printCudaError(error, "cudaMemcpyAsync H2D");
        cudaStreamDestroy(stream[i]);
        error = cudaFree(ptr[i].d_array);
        printCudaError(error, "cudaFree");
    }
    printf("Well it worked");
    free(h_array);
    getchar();
}
The output my code gives me:
Error: 2 while performing cudaMallocHost
Error: 2 while performing cudaMemcpyAsync H2H
Error: 2 while performing cudaMemcpyAsync H2D
Error: 2 while performing cudaFree
Well it worked
Error 2 is cudaErrorMemoryAllocation
In the code you currently have posted, this line of code is wrong:

error = cudaMemcpyAsync(ptr[i].d_array, &ptr[i].h_array_pl, CALC_N / GPU_N * sizeof(double), cudaMemcpyHostToDevice, stream[i]);
                                        ^

That ampersand doesn't belong there. ptr[i].h_array_pl is already a pointer to the source of the data transfer; you should not be taking the address of that pointer.
Using the address of this pointer as the data source of the copy operation would result in incorrect and illegal host memory accesses, given the size of the transfer indicated in this code. Whether or not this would be detected depends on a number of factors, but it is possibly or probably the reason for the Access violation reading location... report, which generally refers to an illegal access to host memory.
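For clarity, the corrected call (a sketch; nothing changed except removing the ampersand) would be:

error = cudaMemcpyAsync(ptr[i].d_array,                  //dst: device buffer
                        ptr[i].h_array_pl,               //src: the page-locked pointer itself
                        CALC_N / GPU_N * sizeof(double), //amt
                        cudaMemcpyHostToDevice,          //kind
                        stream[i]);                      //stream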

Problem feeding Thrust vector into getrf/getri

Continuing on my CUDA beginner's adventure, I've been introduced to Thrust, which seems a convenient library that saves me the hassle of explicit memory (de-)allocation.
I've already tried combining it with a few cuBLAS routines, e.g. gemv, by generating a raw pointer to the underlying storage with thrust::raw_pointer_cast(array.data()) and then feeding this to the routines, and it works just fine.
The current task is to get the inverse of a matrix, and for that I'm using getrfBatched and getriBatched. From the documentation:
cublasStatus_t cublasDgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   double *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);
where
Aarray - device - array of pointers to <type> array
Naturally I thought I could use another layer of Thrust vector to express this array of pointers and again feed its raw pointer to cuBLAS, so here's what I did:
void test()
{
    thrust::device_vector<double> in(4);
    in[0] = 1;
    in[1] = 3;
    in[2] = 2;
    in[3] = 4;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    thrust::device_vector<double> out(4, 0);
    thrust::device_vector<int> pivot(2, 0);
    int info = 0;
    thrust::device_vector<double*> in_array(1);
    in_array[0] = thrust::raw_pointer_cast(in.data());
    thrust::device_vector<double*> out_array(1);
    out_array[0] = thrust::raw_pointer_cast(out.data());
    stat = cublasDgetrfBatched(handle, 2,
        (double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()), &info, 1);
    stat = cublasDgetriBatched(handle, 2,
        (const double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()),
        (double**)thrust::raw_pointer_cast(out_array.data()), 2, &info, 1);
}
When executed, stat says CUBLAS_STATUS_SUCCESS (0) and info says 0 (execution successful), yet if I try to access the elements of in, pivot, or out with standard bracket notation, I hit a thrust::system::system_error. It seems to me that the corresponding memory got corrupted somehow.
Anything obvious that I'm missing here?
The documentation for cublas<t>getrfBatched indicates that the infoArray parameter is expected to be a pointer to device memory.
Instead you have passed a pointer to host memory:
int info = 0;
...
stat = cublasDgetrfBatched(handle, 2,
    (double**)thrust::raw_pointer_cast(in_array.data()), 2,
    thrust::raw_pointer_cast(pivot.data()), &info, 1);
                                            ^^^^^
If you run your code with cuda-memcheck (always a good practice, in my opinion, any time you are having trouble with a CUDA code, before asking others for help) you will receive an error of "invalid global write of size 4". This is due to the fact that a kernel launched by cublasDgetrfBatched() is attempting to write the info data to device memory using an ordinary host pointer that you provided, which is always illegal in CUDA.
CUBLAS itself does not trap errors like this for performance reasons. However the thrust API uses more rigorous synchronization and error checking, in some cases. Therefore, the use of thrust code after this error reports the error, even though the error had nothing to do with thrust (it was an asynchronously reported error from a previous kernel launch).
The solution is straightforward; provide device storage for info:
$ cat t329.cu
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
void test()
{
    thrust::device_vector<double> in(4);
    in[0] = 1;
    in[1] = 3;
    in[2] = 2;
    in[3] = 4;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    thrust::device_vector<double> out(4, 0);
    thrust::device_vector<int> pivot(2, 0);
    thrust::device_vector<int> info(1, 0);
    thrust::device_vector<double*> in_array(1);
    in_array[0] = thrust::raw_pointer_cast(in.data());
    thrust::device_vector<double*> out_array(1);
    out_array[0] = thrust::raw_pointer_cast(out.data());
    stat = cublasDgetrfBatched(handle, 2,
        (double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()), thrust::raw_pointer_cast(info.data()), 1);
    stat = cublasDgetriBatched(handle, 2,
        (const double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()),
        (double**)thrust::raw_pointer_cast(out_array.data()), 2, thrust::raw_pointer_cast(info.data()), 1);
    for (int i = 0; i < 4; i++) {
        double test = in[i];
        std::cout << test << std::endl;
    }
}
int main(){
    test();
}
$ nvcc -o t329 t329.cu -lcublas
t329.cu(12): warning: variable "stat" was set but never used
$ cuda-memcheck ./t329
========= CUDA-MEMCHECK
3
0.333333
4
0.666667
========= ERROR SUMMARY: 0 errors
$
You'll note that this change in the above code is applied to the usage for both cublas calls, as the infoArray parameter has the same expectations for both.
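As a follow-up (a sketch of my own, not part of the original answer), the info value can then be inspected on the host after the calls; indexing a thrust::device_vector performs the device-to-host copy for you:

int h_info = info[0]; // thrust copies the element back to the host
if (h_info != 0) {
    std::cout << "getrf/getri reported a failure, info = " << h_info << std::endl;
}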

tex1Dfetch unexpectedly returning 0

I don't believe this is the same issue as reported here:
Bound CUDA texture reads zero
CUDA 1D texture fetch always return 0
In my CUDA application I noticed that tex1Dfetch is not returning the expected value past a certain index in the buffer. An initial observation in the application was that a value at index 0 could be read correctly, but at 12705625 the value read was 0. I made a small test program to investigate this, given below. The results are a little bit baffling to me. I'm trying to probe at what index the values are no longer read correctly, but as the value of arraySize is changed, so does the firstBadIndex. Even with arraySize = 2, the second value is read incorrectly! As arraySize is made bigger, the firstBadIndex gets bigger. This happens when binding to arrays of float, float2, or float4. If the data are read from the device buffer instead (switch around the commented lines in FetchTextureData), then everything is fine. This is using CUDA 6.5 on a Tesla C2075.
Thanks for any insights or advice you might have.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define FLOATTYPE float4
texture<FLOATTYPE,cudaTextureType1D,cudaReadModeElementType> texture1D;
const unsigned int arraySize = 1000;
FLOATTYPE* host;
FLOATTYPE* device;
FLOATTYPE* dTemp;
FLOATTYPE hTemp[1];
__global__ void FetchTextureData(FLOATTYPE* data, FLOATTYPE* arr, int idx)
{
    data[0] = tex1Dfetch(texture1D, idx);
    //data[0] = arr[idx];
}

bool GetTextureValues(int idx) {
    FetchTextureData<<<1,1>>>(dTemp, device, idx);
    // copy to the host
    cudaError_t err = cudaMemcpy(hTemp, dTemp, sizeof(FLOATTYPE), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        throw "cudaMemcpy failed!";
    }
    if (cudaDeviceSynchronize() != cudaSuccess) {
        throw "cudaDeviceSynchronize failed!";
    }
    return hTemp[0].x == 1.0f;
}
int main()
{
    try {
        host = new FLOATTYPE[arraySize];
        cudaError_t err = cudaMalloc((void**)&device, sizeof(FLOATTYPE) * arraySize);
        cudaError_t err1 = cudaMalloc((void**)&dTemp, sizeof(FLOATTYPE));
        if (err != cudaSuccess || err1 != cudaSuccess) {
            throw "cudaMalloc failed!";
        }
        // make some host data
        for (unsigned int i = 0; i < arraySize; i++) {
            FLOATTYPE data = {1.0f, 0.0f, 0.0f, 0.0f};
            host[i] = data;
        }
        // and copy it to the device
        err = cudaMemcpy(device, host, sizeof(FLOATTYPE) * arraySize, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            throw "cudaMemcpy failed!";
        }
        // set up the textures
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<FLOATTYPE>();
        texture1D.addressMode[0] = cudaAddressModeClamp;
        texture1D.filterMode = cudaFilterModePoint;
        texture1D.normalized = false;
        cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
        // do a texture fetch and find where the fetches stop working
        int lastGoodValue = -1, firstBadValue = -1;
        float4 badValue = {-1.0f, 0.0f, 0.0f, 0.0f};
        for (unsigned int i = 0; i < arraySize; i++) {
            if (i % 100000 == 0) printf("%d\n", i);
            bool isGood = GetTextureValues(i);
            if (firstBadValue == -1 && !isGood)
                firstBadValue = i;
            if (isGood)
                lastGoodValue = i;
            else
                badValue = hTemp[0];
        }
        printf("lastGoodValue %d, firstBadValue %d\n", lastGoodValue, firstBadValue);
        printf("Bad value is (%.2f)\n", badValue.x);
    } catch (const char* err) {
        printf("\nCaught an error : %s\n", err);
    }
    return 0;
}
The problem lies in the texture setup. This:
cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);
should be:
cudaBindTexture(NULL, texture1D, device, channelDesc,
arraySize * sizeof(FLOATTYPE));
As per the documentation, the size argument is the size of the memory area in bytes, not the number of elements. I would have expected that with the clamped addressing mode the code would still work as expected. With border mode, you should get a zero value, which looks like it would trigger your bad-value detection. I haven't actually run your code, so perhaps there is a subtlety I'm missing somewhere. For such a simple repro case, your code structure is rather convoluted and hard to follow (at least on the mobile phone screen I am reading it on).
EDIT to add that between the time I started writing this and finished, @njuffa pointed out the same mistake in the comments.
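As a concrete check (my own arithmetic, assuming out-of-range linear texture fetches return zero here): with FLOATTYPE = float4 (16 bytes per element) and arraySize = 1000, the original call binds only 1000 bytes, i.e. 1000 / 16 = 62 complete elements, so fetches from roughly index 62 onward fall outside the bound range. That would also explain why firstBadIndex grows proportionally as arraySize is increased.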