I am trying to do a power spectrum analysis with FFTW using this Code below:
#define ALSA_PCM_NEW_HW_PARAMS_API
#include <iostream>
using namespace std;
#include <alsa/asoundlib.h>
#include <fftw3.h>
#include <math.h>
float map(long x, long in_min, long in_max, float out_min, float out_max)
{
return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min;
}
float windowFunction(int n, int N)
{
return 0.5f * (1.0f - cosf(2.0f *M_PI * n / (N - 1.0f)));
}
int main() {
//FFTW
int N=8000;
float window[N];
double *in = (double*)fftw_malloc(sizeof(double) * N);
fftw_complex *out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N);
fftw_plan p = fftw_plan_dft_r2c_1d(N, in, out, FFTW_MEASURE);
for(int n = 0; n < N; n++)
window[n] = windowFunction(n, N);
//ALSA
long loops;
int rc;
int size;
snd_pcm_t *handle;
snd_pcm_hw_params_t *params;
unsigned int val;
int dir=0;
snd_pcm_uframes_t frames;
char *buffer;
/* Open PCM device for recording (capture). */
rc = snd_pcm_open(&handle, "default",
SND_PCM_STREAM_CAPTURE, 0);
if (rc < 0) {
fprintf(stderr,
"unable to open pcm device: %s\n",
snd_strerror(rc));
exit(1);
}
/* Allocate a hardware parameters object. */
snd_pcm_hw_params_alloca(¶ms);
/* Fill it in with default values. */
snd_pcm_hw_params_any(handle, params);
/* Set the desired hardware parameters. */
/* Interleaved mode */
snd_pcm_hw_params_set_access(handle, params,
SND_PCM_ACCESS_RW_INTERLEAVED);
/* Signed 16-bit little-endian format */
snd_pcm_hw_params_set_format(handle, params,
SND_PCM_FORMAT_S16_LE);
/* One channel (mono) */
snd_pcm_hw_params_set_channels(handle, params, 1);
/* 8000 bits/second sampling rate */
val = 8000;
snd_pcm_hw_params_set_rate_near(handle, params,
&val, &dir);
/* Set period size to 16 frames. */
frames = 16;
snd_pcm_hw_params_set_period_size_near(handle,
params, &frames, &dir);
/* Write the parameters to the driver */
rc = snd_pcm_hw_params(handle, params);
if (rc < 0) {
fprintf(stderr,
"unable to set hw parameters: %s\n",
snd_strerror(rc));
exit(1);
}
/* Use a buffer large enough to hold one period */
snd_pcm_hw_params_get_period_size(params,
&frames, &dir);
size = frames * 2; /* 2 bytes/sample, 1 channel */
buffer = (char *) malloc(size);
/* We want to loop for 5 seconds */
snd_pcm_hw_params_get_period_time(params,
&val, &dir);
loops = 1000000 / val + 25; //added this, because the first values seem to be useless
int count=0;
while (loops > 0) {
loops--;
rc = snd_pcm_readi(handle, buffer, frames);
int i;
short *samples = (short*)buffer;
for (i=0;i < 16;i++)
{
if(count>24){
//cout << (float)map(*samples, -32768, 32768, -1, 1) << endl;
in[i*count]= /*window[i]*/*(double)map(*samples, -32768, 32768, -1, 1);
}
samples++;
}
count++;
if (rc == -EPIPE) {
/* EPIPE means overrun */
fprintf(stderr, "overrun occurred\n");
snd_pcm_prepare(handle);
} else if (rc < 0) {
fprintf(stderr,
"error from read: %s\n",
snd_strerror(rc));
} else if (rc != (int)frames) {
fprintf(stderr, "short read, read %d frames\n", rc);
}
// rc = write(1, buffer, size);
// if (rc != size)
// fprintf(stderr,
// "short write: wrote %d bytes\n", rc);
}
snd_pcm_drain(handle);
snd_pcm_close(handle);
free(buffer);
//FFTW
fftw_execute(p);
for(int j=0;j<N/2;j++){
//cout << in[j] << endl;
cout << sqrt(out[j][0]*out[j][0]+out[j][1]*out[j][1])/N << endl;
/*if(out[j][1]<0.0){
cout << out[j][0] << out[j][1] << "i" << endl;
}else{
cout << out[j][0] << "+" << out[j][1] << "i" << endl;
}*/
}
fftw_destroy_plan(p);
fftw_free(in);
fftw_free(out);
fftw_cleanup();
return 0;
}
I am using 8000 Samples for the FFTW, so I get 4000 values back, which should be the power spectrum. If I now plot the data in MATLAB, the plot does not look like a power spectrum. The input must be right, because if I uncomment this
//cout << (float)map(*samples, -32768, 32768, -1, 1) << endl;
and comment that
cout << sqrt(out[j][0]*out[j][0]+out[j][1]*out[j][1])/N << endl;
and now load the output of the program (which is the input for the FFT) into MATLAB and do a FFT, the plotted data seems to be correct. I tested it with various frequencies, but I always get a weird spectrum when using my own program. As you can see, I also tried to add a hanning window before the FFT, but still no success. So what am I doing wrong here?
Thanks a lot!
The usual suspect with FFT routines is a representation mismatch issue. That is to say, the FFT function fills in an array using one type, and you interpret that array as another type.
You debug this by creating a sine input. You know that this should give a single non-zero input, and you have a reasonable expectation where the zero should be. Because your implementation is wrong, the actual FFT of your sine will differ, and it's this difference that will help troubleshoot the problem.
If you can't figure it out from just the FFT of a sine, next try a cosine, sine of different frequencies, and combinations of two such simple inputs. A cosine is merely a phase shifted sine, so that should just change the phase of that single non-zero value. And the FF should be linear, so the FF of a sum of sines has two sharp peaks.
Related
I have a homework about WAV files and FIR filters for a Digital Signal Processing class.
My program must read a WAV file, apply a filter to the data and write the output data to another WAV file again.
I have completed reading and applying filters but I can't write the WAV file. The program doesn't give any errors while compiling but the WAV file doesn't play.
If I write "temp" to the WAV, it runs properly. But if I write "data", it doesn't.
How can I write a WAV file properly?
#define _CRT_SECURE_NO_WARNINGS
#define PI 3.14f
#define WAV_HEADER_LENGTH 44
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <fstream>
char* read_wav(const char* filename, short*, short*, int*);
void write_wav(const char* filename, const char*, int);
using namespace std;
int main()
{
short nchannel, ssample;
int csample;
//Reading WAV file and returning the data.
char* temp = read_wav("sum.wav", &nchannel, &ssample, &csample);
short* data = (short*)&temp[WAV_HEADER_LENGTH];
cout << "How many coefficients are there in filter ?" << endl;
int N;
cin >> N ;
float filter[N];
cout << "Type coefficients in filter." << endl;
for(int i=0; i<N;i++){
cin >> filter[i];
}
short* output = (short*)&temp[WAV_HEADER_LENGTH];
for(int i=0; i < csample; i++){
double sum = 0;
for(int j=0; j < N; j++){
if((i - j) >= 0)
sum += filter[j] * data[i-j];
}
output[i] = (short) sum;
}
write_wav("test.wav", out, csample * ssample + WAV_HEADER_LENGTH);
}
char* read_wav(const char* filename, short* nchannel, short* ssample, int* csample) {
//Reading the file.
FILE* fp = fopen(filename, "rb");
if (!fp) {
fprintf(stderr, "Couldn't open the file \"%s\"\n", filename);
exit(0);
}
fseek(fp, 0, SEEK_END);
int file_size = ftell(fp);
fseek(fp, 0, SEEK_SET);
printf("The file \"%s\" has %d bytes\n\n", filename, file_size);
char* buffer = (char*)malloc(sizeof(char) * file_size);
fread(buffer, file_size, 1, fp);
// Dump the buffer info.
*nchannel = *(short*)&buffer[22];
*ssample = *(short*)&buffer[34] / 8;
*csample = *(int*)&buffer[40] / *ssample;
printf("ChunkSize :\t %u\n", *(int*)&buffer[4]);
printf("Format :\t %u\n", *(short*)&buffer[20]);
printf("NumChannels :\t %u\n", *(short*)&buffer[22]);
printf("SampleRate :\t %u\n", *(int*)&buffer[24]); // number of samples per second
printf("ByteRate :\t %u\n", *(int*)&buffer[28]); // number of bytes per second
printf("BitsPerSample :\t %u\n", *(short*)&buffer[34]);
printf("Subchunk2ID :\t \"%c%c%c%c\"\n", buffer[36], buffer[37], buffer[38], buffer[39]); // marks beginning of the data section
printf("Subchunk2Size :\t %u\n", *(int*)&buffer[40]); // size of data (byte)
printf("Duration :\t %fs\n\n", (float)(*(int*)&buffer[40]) / *(int*)&buffer[28]);
fclose(fp);
return buffer;
}
void write_wav(const char* filename, const char* data, int len) {
FILE* fp = fopen(filename, "wb");
if (!fp) {
fprintf(stderr, "Couldn't open the file \"%s\"\n", filename);
exit(0);
}
fwrite(data, len, 1, fp);
fclose(fp);
}
This works for me:
int main()
{
short nchannel, ssample;
int csample;
// Reading WAV file and returning the data.
char* temp = read_wav("sum.wav", &nchannel, &ssample, &csample);
short* data = (short*)&temp[WAV_HEADER_LENGTH];
// cout << "How many coefficients are there in filter ?" << endl;
const int N = 2;
// cin >> N;
float filter[N] = {0.5, 0.75};
// cout << "Type coefficients in filter." << endl;
// for (int i = 0; i < N; i++)
// {
// cin >> filter[i];
// }
short* output = (short*)&temp[WAV_HEADER_LENGTH];
for (int i = 0; i < csample; i++)
{
double sum = 0;
for (int j = 0; j < N; j++)
{
if ((i - j) >= 0) sum += filter[j] * data[i - j];
}
output[i] = (short)sum;
}
write_wav("test.wav", (char*)temp, csample * ssample + WAV_HEADER_LENGTH);
}
My changes:
The major change is to use the full buffer, with extremely misleading name: temp, instead of your out that does not compile, as the argument of write_wav.
I applied "my" filter coefficients (the sound from the output file is really distorted),
I applied my favorite indentation
If the code is to be portable, you need to check the endiannes and act accordingly.
I would expect the input and output files to be of the same length, but they're not. Please check it yourself why this is not the case.
Example:
-rw-r--r-- 1 zkoza zkoza 787306 06-23 14:09 sum.wav
-rw-r--r-- 1 zkoza zkoza 787176 06-23 14:16 test.wav
It looks like 130 bytes are missing in the output file.
Your float filter[N] with N not known at compile time is a C++ extension: please use std::vector in your final code instead.
Next time please provide also a link for any input files. For my tests, I used https://freewavesamples.com/alesis-fusion-clean-guitar-c3 , but all these little things, like finding an input file (WAV format has several flavors, I could have missed the correct one), guessing filter parameters etc. take time and effort.
Your condition if ((i - j) >= 0) can be written in a way easier to understand; preferably by changing the inner loop "header".
I'm wanting to find a way to dynamically calculate the necessary grid and block size for a calculation. I have run into the issue that the problem that I am wanting to handle is simply too large to handle in a single run of the GPU from a thread limit perspective. Here is a sample kernel setup which runs into the error that I am having:
__global__ void populateMatrixKernel(char * outMatrix, const int pointsToPopulate)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < pointsToPopulate)
{
outMatrix[i] = 'A';
}
}
cudaError_t populateMatrixCUDA(char * outMatrix, const int pointsToPopulate, cudaDeviceProp &deviceProp)
{
//Device arrays to be used
char * dev_outMatrix = 0;
cudaError_t cudaStatus;
//THIS IS THE CODE HERE I'M WANTING TO REPLACE
//Calculate the block and grid parameters
auto gridDiv = div(pointsToPopulate, deviceProp.maxThreadsPerBlock);
auto gridX = gridDiv.quot;
if (gridDiv.rem != 0)
gridX++; //Round up if we have stragling points to populate
auto blockSize = deviceProp.maxThreadsPerBlock;
int gridSize = min(16 * deviceProp.multiProcessorCount, gridX);
//END REPLACE CODE
//Allocate GPU buffers
cudaStatus = cudaMalloc((void**)&dev_outMatrix, pointsToPopulate * sizeof(char));
if (cudaStatus != cudaSuccess)
{
cerr << "cudaMalloc failed!" << endl;
goto Error;
}
populateMatrixKernel << <gridSize, blockSize >> > (dev_outMatrix, pointsToPopulate);
//Check for errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
cerr << "Population launch failed: " << cudaGetErrorString(cudaStatus) << endl;
goto Error;
}
//Wait for threads to finish
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching visit and bridger analysis kernel!" << endl;
cout << "Cuda failure " << __FILE__ << ":" << __LINE__ << " '" << cudaGetErrorString(cudaStatus);
goto Error;
}
//Copy output to host memory
cudaStatus = cudaMemcpy(outMatrix, dev_outMatrix, pointsToPopulate * sizeof(char), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
cerr << "cudaMemcpy failed!" << endl;
goto Error;
}
Error:
cudaFree(dev_outMatrix);
return cudaStatus;
}
Now, when I test this code using the following testing setup:
//Make sure we can use the graphics card (This calculation would be unresonable otherwise)
if (cudaSetDevice(0) != cudaSuccess) {
cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" << endl;
}
cudaDeviceProp deviceProp;
cudaError_t cudaResult;
cudaResult = cudaGetDeviceProperties(&deviceProp, 0);
if (cudaResult != cudaSuccess)
{
cerr << "cudaGetDeviceProperties failed!" << endl;
}
int pointsToPopulate = 250000 * 300;
auto gpuMatrix = new char[pointsToPopulate];
fill(gpuMatrix, gpuMatrix + pointsToPopulate, 'B');
populateMatrixCUDA(gpuMatrix, pointsToPopulate, deviceProp);
for (int i = 0; i < pointsToPopulate; ++i)
{
if (gpuMatrix[i] != 'A')
{
cout << "ERROR: " << i << endl;
cin.get();
}
}
I get an error at i=81920. Moreover, if I check the memory before and after the execution, all of the memory values after 81920 go from 'B' to null. It seems that this error is originating from this line in the kernel execution parameter code:
int gridSize = min(16 * deviceProp.multiProcessorCount, gridX);
For my graphics card (GTX 980M) I get out a value for deviceProp.multiProcessorCount of 5, and if I multiply this by 16 and 1024 (for max blocks per grid) I get out the 81920. It seems that, while I am fine on the memory space side of things, I am getting choked by how many threads I can run. Now, this 16 is just being set as an arbitrary value (after looking at some example code my friend made), I was wondering if there was a way to actually calculate "What 16 should be" based on the GPUs properties instead of setting it arbitrarily. I'm wanting to write an iterative code that is able to determine the maximum amount of calculations that are able to be performed at one point in time, and then fill the matrix piece by piece accordingly, but I need to know the maximum calculation value to do this. Does anyone know of a way to calculate these parameters? If any more information is needed, I'm happy to oblige. Thank you!
There is fundamentally nothing wrong with the code you have posted. It is probably close to best practice. But it isn't compatible with the design idiom of your kernel.
As you can see here, your GPU is capable of running 2^31 - 1 or 2147483647 blocks. So you could change the code in question to this:
unsigned int gridSize = min(2147483647u, gridX);
and it should probably work. Better still, don't change that code at all, but change your kernel to something like this:
__global__ void populateMatrixKernel(char * outMatrix, const int pointsToPopulate)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
for(; i < pointsToPopulate; i += blockDim.x * gridDim.x)
{
outMatrix[i] = 'A';
}
}
That way your kernel will emit multiple outputs per thread and everything should should just work as it is intended.
I have implemented the following CUDA code but i am a little bit confused about the behavior.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
void PrintMatrix(float* a, int n)
{
int j, i;
for (j = 1; j <= n; j++)
{
for (i = 1; i <= n; i++)
{
printf("%7.0f", a[IDX2F(i, j, n)]);
}
printf("\n");
}
}
float* CreateMatrix(int n)
{
float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
if (!matrix)
{
printf("host memory allocation failed");
return nullptr;
}
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
matrix[IDX2F(i, j, n)] = 2;
}
}
return matrix;
}
long CudaMatrixMultiply(float* matrix, int n)
{
cudaError_t cudaStat;
cublasStatus_t status;
cublasHandle_t handle;
float* deviceMatrix;
cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
if (cudaStat != cudaSuccess)
{
printf("device memory allocation failed");
return EXIT_FAILURE;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data download failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
float alpha = 1;
float beta = 0;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);
status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data upload failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_SUCCESS;
}
float* CpuMatrixMultiply(float* matrix, int size)
{
float* result = new float[size * size]();
// Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
for (int row = 1; row <= size; row++)
{
for (int col = 1; col <= size; col++)
{
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 1; inner <= size; inner++)
{
// result[row][col] += matrix[row][inner] * matrix[inner][col];
result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
}
}
}
free(matrix);
return result;
}
int main(void)
{
// printf("Matrix * Matrix Test\n");
int size = 1000;
int runs = 10;
for (int run = 0; run != runs; run++)
{
printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);
float* cpuMatrix = CreateMatrix(size);
cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);
PrintMatrix(cpuMatrix, 5);
float* gpuMatrix = CreateMatrix(size);
CudaMatrixMultiply(gpuMatrix, size);
PrintMatrix(gpuMatrix, 5);
free(cpuMatrix);
free(gpuMatrix);
}
getchar();
return EXIT_SUCCESS;
}
The ouput of the CPU version of the MatrixMultiplication is the following as expected:
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
but the result of the GPU computed is sometimes the right one (see above) or a wrong random(?) one. When the loop is executed the first time then the result was always the right one.
I am not able to find a mistake in my code and it would be great if you could help me.
Additionally if i set size (int the main method) to e.g. 16000 then my driver is crashing and i get an error message. For this i have written a bug report to NVidea because my pc crashed twice. But maybe it is a programming fault by me?
Driver: 364.72 (newest one)
SDK: CUDA Toolkit 7.5
Graphics Card: NVidia GeForce GTX 960 (4GB)
Windows 10 64Bit
Driver Error
Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.
Edit: With the help of the community i found out that this is a problem with the watchdog timer. See answer below.
Regarding the second part of the question, following njuffa's remark, you may change the settings for driver behavior to avoid the error when increasing size. Open NSIGHT Monitor and in Options, General, Microsoft Display Driver, change to False the WDDM TDR enabled field.
From spec, the 32bits FPU flops should be around 2.4 TFLOPS in single precision, hence your operation for a 16000 sized matrix should take at the minimum 3.5 seconds. Hence the Driver Recovery after 2 seconds.
Is there a way to read .pfm files in OpenCV?
Thank you very much for any suggestions!
PFM is an uncommon image format and I don't know why the Middlebury dataset chose to use that, probably because it uses floating point values.
Anyway I was able to read the images with OpenCV:
import numpy as np
import cv2
groundtruth = cv2.imread('disp0.pfm', cv2.IMREAD_UNCHANGED)
Note the IMREAD_UNCHANGED flag. Somehow it is able to read all the correct values even if OpenCV does not support it.
But wait a minute: inf values are commonly used to set INVALID pixel disparity, so to properly display the image you should do:
# Remove infinite value to display
groundtruth[groundtruth==np.inf] = 0
# Normalize and convert to uint8
groundtruth = cv2.normalize(groundtruth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
# Show
cv2.imshow("groundtruth", groundtruth)
cv2.waitKey(0)
cv2.destroyAllWindows()
Based on the description of the ".pfm" file formate (see http://netpbm.sourceforge.net/doc/pfm.html), I wrote the following read/write functions, which only depend standard C/C++ library. It is proved to work well on reading/writing the pfm file, like, the ground truth disparity ".pfm" files from MiddleBury Computer Vision (see http://vision.middlebury.edu/stereo/submit3/).
#ifndef _PGM_H_
#define _PGM_H_
#include <fstream>
#include <iostream>
#include <algorithm>
#include <string>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <bitset> /*std::bitset<32>*/
#include <cstdio>
enum PFM_endianness { BIG, LITTLE, ERROR};
class PFM {
public:
PFM();
inline bool is_little_big_endianness_swap(){
if (this->endianess == 0.f) {
std::cerr << "this-> endianness is not assigned yet!\n";
exit(0);
}
else {
uint32_t endianness = 0xdeadbeef;
//std::cout << "\n" << std::bitset<32>(endianness) << std::endl;
unsigned char * temp = (unsigned char *)&endianness;
//std::cout << std::bitset<8>(*temp) << std::endl;
PFM_endianness endianType_ = ((*temp) ^ 0xef == 0 ?
LITTLE : (*temp) ^ (0xde) == 0 ? BIG : ERROR);
// ".pfm" format file specifies that:
// positive scale means big endianess;
// negative scale means little endianess.
return ((BIG == endianType_) && (this->endianess < 0.f))
|| ((LITTLE == endianType_) && (this->endianess > 0.f));
}
}
template<typename T>
T * read_pfm(const std::string & filename) {
FILE * pFile;
pFile = fopen(filename.c_str(), "rb");
char c[100];
if (pFile != NULL) {
fscanf(pFile, "%s", c);
// strcmp() returns 0 if they are equal.
if (!strcmp(c, "Pf")) {
fscanf(pFile, "%s", c);
// atoi: ASCII to integer.
// itoa: integer to ASCII.
this->width = atoi(c);
fscanf(pFile, "%s", c);
this->height = atoi(c);
int length_ = this->width * this->height;
fscanf(pFile, "%s", c);
this->endianess = atof(c);
fseek(pFile, 0, SEEK_END);
long lSize = ftell(pFile);
long pos = lSize - this->width*this->height * sizeof(T);
fseek(pFile, pos, SEEK_SET);
T* img = new T[length_];
//cout << "sizeof(T) = " << sizeof(T);
fread(img, sizeof(T), length_, pFile);
fclose(pFile);
/* The raster is a sequence of pixels, packed one after another,
* with no delimiters of any kind. They are grouped by row,
* with the pixels in each row ordered left to right and
* the rows ordered bottom to top.
*/
T* tbimg = (T *)malloc(length_ * sizeof(T));// top-to-bottom.
//PFM SPEC image stored bottom -> top reversing image
for (int i = 0; i < this->height; i++) {
memcpy(&tbimg[(this->height - i - 1)*(this->width)],
&img[(i*(this->width))],
(this->width) * sizeof(T));
}
if (this->is_little_big_endianness_swap()){
std::cout << "little-big endianness transformation is needed.\n";
// little-big endianness transformation is needed.
union {
T f;
unsigned char u8[sizeof(T)];
} source, dest;
for (int i = 0; i < length_; ++i) {
source.f = tbimg[i];
for (unsigned int k = 0, s_T = sizeof(T); k < s_T; k++)
dest.u8[k] = source.u8[s_T - k - 1];
tbimg[i] = dest.f;
//cout << dest.f << ", ";
}
}
delete[] img;
return tbimg;
}
else {
std::cout << "Invalid magic number!"
<< " No Pf (meaning grayscale pfm) is missing!!\n";
fclose(pFile);
exit(0);
}
}
else {
std::cout << "Cannot open file " << filename
<< ", or it does not exist!\n";
fclose(pFile);
exit(0);
}
}
template<typename T>
void write_pfm(const std::string & filename, const T* imgbuffer,
const float & endianess_) {
std::ofstream ofs(filename.c_str(), std::ifstream::binary);
// ** 1) Identifier Line: The identifier line contains the characters
// "PF" or "Pf". PF means it's a color PFM.
// Pf means it's a grayscale PFM.
// ** 2) Dimensions Line:
// The dimensions line contains two positive decimal integers,
// separated by a blank. The first is the width of the image;
// the second is the height. Both are in pixels.
// ** 3) Scale Factor / Endianness:
// The Scale Factor / Endianness line is a queer line that jams
// endianness information into an otherwise sane description
// of a scale. The line consists of a nonzero decimal number,
// not necessarily an integer. If the number is negative, that
// means the PFM raster is little endian. Otherwise, it is big
// endian. The absolute value of the number is the scale
// factor for the image.
// The scale factor tells the units of the samples in the raster.
// You use somehow it along with some separately understood unit
// information to turn a sample value into something meaningful,
// such as watts per square meter.
ofs << "Pf\n"
<< this->width << " " << this->height << "\n"
<< endianess_ << "\n";
/* PFM raster:
* The raster is a sequence of pixels, packed one after another,
* with no delimiters of any kind. They are grouped by row,
* with the pixels in each row ordered left to right and
* the rows ordered bottom to top.
* Each pixel consists of 1 or 3 samples, packed one after another,
* with no delimiters of any kind. 1 sample for a grayscale PFM
* and 3 for a color PFM (see the Identifier Line of the PFM header).
* Each sample consists of 4 consecutive bytes. The bytes represent
* a 32 bit string, in either big endian or little endian format,
* as determined by the Scale Factor / Endianness line of the PFM
* header. That string is an IEEE 32 bit floating point number code.
* Since that's the same format that most CPUs and compiler use,
* you can usually just make a program use the bytes directly
* as a floating point number, after taking care of the
* endianness variation.
*/
int length_ = this->width*this->height;
this->endianess = endianess_;
T* tbimg = (T *)malloc(length_ * sizeof(T));
// PFM SPEC image stored bottom -> top reversing image
for (int i = 0; i < this->height; i++) {
memcpy(&tbimg[(this->height - i - 1)*this->width],
&imgbuffer[(i*this->width)],
this->width * sizeof(T));
}
if (this->is_little_big_endianness_swap()) {
std::cout << "little-big endianness transformation is needed.\n";
// little-big endianness transformation is needed.
union {
T f;
unsigned char u8[sizeof(T)];
} source, dest;
for (int i = 0; i < length_; ++i) {
source.f = tbimg[i];
for (size_t k = 0, s_T = sizeof(T); k < s_T; k++)
dest.u8[k] = source.u8[s_T - k - 1];
tbimg[i] = dest.f;
//cout << dest.f << ", ";
}
}
ofs.write((char *)tbimg, this->width*this->height * sizeof(T));
ofs.close();
free(tbimg);
}
inline float getEndianess(){return endianess;}
inline int getHeight(void){return height;}
inline int getWidth(void){return width;}
inline void setHeight(const int & h){height = h;}
inline void setWidth(const int & w){width = w;}
private:
int height;
int width;
float endianess;
};
#endif /* PGM_H_ */
Forgive me to leave lots of useless comments in the code.
A simple example shows the write/read:
int main(){
PFM pfm_rw;
string temp = "img/Motorcycle/disp0GT.pfm";
float * p_disp_gt = pfm_rw.read_pfm<float>(temp);
//int imgH = pfm_rw.getHeight();
//int imgW = pfm_rw.getWidth();
//float scale = pfm_rw.getEndianess();
string temp2 = "result/Motorcycle/disp0GT_n1.pfm";
pfm_rw.write_pfm<float>(temp2, p_disp_gt, -1.0f);
return 1;
}
As far as I know, OpenCV doesn't support to read PFM files directly.
You can refer to the code snippet here for a simple PFM reader, which will enable you to read PFM files into COLOR *data with COLOR defined as follows:
typedef struct {
float r;
float g;
float b;
} COLOR;
I'm trying to find the fft of a dynamically allocated array. The input array is copied from host to device using cudaMemcpy2D. Then the fft is taken (cufftExecR2C) and the results are copied back from device to host.
So my initial problem was how to use the pitch information in the fft. Then I found an answer here - CUFFT: How to calculate fft of pitched pointer?
But unfortunately it doesn't work. The results I get are garbage values. Given below is my code.
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>
using namespace std;
const size_t NX = 4;
const size_t NY = 6;
int main()
{
// Input array (static) - host side
float h_in_data_static[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961 , 0.9261},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782 , 0.1782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427, 0.3842},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067, 0.1712}
};
// --------------------------------
// Input array (dynamic) - host side
float *h_in_data_dynamic = new float[NX*NY];
// Set the values
size_t h_ipitch;
for (int r = 0; r < NX; ++r) // this can be also done on GPU
{
for (int c = 0; c < NY; ++c)
{ h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c]; }
}
// --------------------------------
// Output array - host side
float2 *h_out_data_temp = new float2[NX*(NY/2+1)] ;
// Input and Output array - device side
cufftHandle plan;
cufftReal *d_in_data;
cufftComplex * d_out_data;
int n[NRANK] = {NX, NY};
// Copy input array from Host to Device
size_t ipitch;
cudaError cudaStat1 = cudaMallocPitch((void**)&d_in_data,&ipitch,NY*sizeof(cufftReal),NX);
cout << cudaGetErrorString(cudaStat1) << endl;
cudaError cudaStat2 = cudaMemcpy2D(d_in_data,ipitch,h_in_data_dynamic,NY*sizeof(float),NY*sizeof(float),NX,cudaMemcpyHostToDevice);
cout << cudaGetErrorString(cudaStat2) << endl;
// Allocate memory for output array - device side
size_t opitch;
cudaError cudaStat3 = cudaMallocPitch((void**)&d_out_data,&opitch,(NY/2+1)*sizeof(cufftComplex),NX);
cout << cudaGetErrorString(cudaStat3) << endl;
// Performe the fft
int rank = 2; // 2D fft
int istride = 1, ostride = 1; // Stride lengths
int idist = 1, odist = 1; // Distance between batches
int inembed[] = {ipitch, NX}; // Input size with pitch
int onembed[] = {opitch, NX}; // Output size with pitch
int batch = 1;
cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch);
//cufftPlan2d(&plan, NX, NY , CUFFT_R2C);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecR2C(plan, d_in_data, d_out_data);
cudaThreadSynchronize();
// Copy d_in_data back from device to host
cudaError cudaStat4 = cudaMemcpy2D(h_out_data_temp,(NY/2+1)*sizeof(float2), d_out_data, opitch, (NY/2+1)*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost);
cout << cudaGetErrorString(cudaStat4) << endl;
// Print the results
for (int i = 0; i < NX; i++)
{
for (int j =0 ; j< NY/2 + 1; j++)
printf(" %f + %fi",h_out_data_temp[i*(NY/2+1) + j].x ,h_out_data_temp[i*(NY/2+1) + j].y);
printf("\n");
}
cudaFree(d_in_data);
return 0;
}
I think the problem is in cufftPlanMany. How can I solve this issue ?
You may want to study the advanced data layout section of the documentation carefully.
I think the previous question that was linked is somewhat confusing because that question is passing the width and height parameters in reverse order for what I would expect for a cufft 2D plan. However the answer then mimics that order so it is at least consistent.
Secondly, you missed in the previous question that the "pitch" parameters that are being passed in inembed and onembed are not the same as the pitch parameters that you would receive from a cudaMallocPitch operation. They have to be scaled by the number of bytes per data element in the input and output data sets. I'm actually not entirely sure this is the intended use of the inembed and onembed parameters, but it seems to work.
When I adjust your code to account for the above two changes, I seem to get valid results, at least they appear to be in a reasonable range. You've posted several questions now about 2D FFTs, where you've said the results are not correct. I can't do these 2D FFT's in my head, so I suggest in the future you indicate what data you are expecting.
This has the changes I made:
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>
using namespace std;
const size_t NX = 4;
const size_t NY = 6;
int main()
{
// Input array (static) - host side
float h_in_data_static[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961 , 0.9261},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782 , 0.1782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427, 0.3842},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067, 0.1712}
};
// --------------------------------
// Input array (dynamic) - host side
float *h_in_data_dynamic = new float[NX*NY];
// Set the values
size_t h_ipitch;
for (int r = 0; r < NX; ++r) // this can be also done on GPU
{
for (int c = 0; c < NY; ++c)
{ h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c]; }
}
// --------------------------------
int owidth = (NY/2)+1;
// Output array - host side
float2 *h_out_data_temp = new float2[NX*owidth] ;
// Input and Output array - device side
cufftHandle plan;
cufftReal *d_in_data;
cufftComplex * d_out_data;
int n[NRANK] = {NX, NY};
// Copy input array from Host to Device
size_t ipitch;
cudaError cudaStat1 = cudaMallocPitch((void**)&d_in_data,&ipitch,NY*sizeof(cufftReal),NX);
cout << cudaGetErrorString(cudaStat1) << endl;
cudaError cudaStat2 = cudaMemcpy2D(d_in_data,ipitch,h_in_data_dynamic,NY*sizeof(float),NY*sizeof(float),NX,cudaMemcpyHostToDevice);
cout << cudaGetErrorString(cudaStat2) << endl;
// Allocate memory for output array - device side
size_t opitch;
cudaError cudaStat3 = cudaMallocPitch((void**)&d_out_data,&opitch,owidth*sizeof(cufftComplex),NX);
cout << cudaGetErrorString(cudaStat3) << endl;
// Performe the fft
int rank = 2; // 2D fft
int istride = 1, ostride = 1; // Stride lengths
int idist = 1, odist = 1; // Distance between batches
int inembed[] = {NX, ipitch/sizeof(cufftReal)}; // Input size with pitch
int onembed[] = {NX, opitch/sizeof(cufftComplex)}; // Output size with pitch
int batch = 1;
if ((cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch)) != CUFFT_SUCCESS) cout<< "cufft error 1" << endl;
//cufftPlan2d(&plan, NX, NY , CUFFT_R2C);
if ((cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)) != CUFFT_SUCCESS) cout << "cufft error 2" << endl;
if ((cufftExecR2C(plan, d_in_data, d_out_data)) != CUFFT_SUCCESS) cout << "cufft error 3" << endl;
cudaDeviceSynchronize();
// Copy d_in_data back from device to host
cudaError cudaStat4 = cudaMemcpy2D(h_out_data_temp,owidth*sizeof(float2), d_out_data, opitch, owidth*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost);
cout << cudaGetErrorString(cudaStat4) << endl;
// Print the results
for (int i = 0; i < NX; i++)
{
for (int j =0 ; j< owidth; j++)
printf(" %f + %fi",h_out_data_temp[i*owidth + j].x ,h_out_data_temp[i*owidth + j].y);
printf("\n");
}
cudaFree(d_in_data);
return 0;
}