What is the risk of using struct dataContent decimate(struct dataContent)? - c++

I have written the following code to retrieve the content from the audio file. This is just part of the full project. I just want to know Would there be any risk using struct dataContent decimate(struct dataContent)? If yes, what are these and how can I improve this code to reduce the risk?
struct dataContent
{
DoubleArrayPtr data;
DoubleArrayPtr memorydata;
int numberofvalues;
int datasize;
long int sizeoffile;
};
struct dataContent decimate(struct dataContent dataprocess)
{
int i = 0, j = 0, k = 0, l = 0, m = 0, n = 0, p = 0, q = 0, r = 0, s = 0, t = 0;
cout << "Total number of blocks is: " << dataprocess.datasize << endl;
size_t size = dataprocess.datasize;
vector<double> sum(size);
vector<double> mean(size); //The mean is the arithmetic average of a set of given numbers
vector<double> secondmoment(size); //
vector<double> fourthmoment(size);
vector<double> kurtosis(size);
sum[0] = 0.0;
secondmoment[0] = 0.0;
fourthmoment[0] = 0.0;
kurtosis[0] = 0.0;
// Finding statistical moments for the data: mean, second- and fourth-order moments, and kurtosis
for(j = 0 ; j < size ; ++j)
{
// Mean Value
for(i = k ; i < (k + BUFFER_SIZE) ; i++)
{
sum[j] = sum[j] + abs(dataprocess.memorydata[i]);
//sum[j] = sum[j] + dataprocess.memorydata[i];
}
mean[j] = sum[j] / BUFFER_SIZE;
cout << "The mean of the absolute value of data in block " << (j + 1) << " is: " << mean[j] << endl;
k = k + BUFFER_SIZE;
}
return dataprocess;
} // End of decimate()
Thanks for your time.

the obvious issue is that you have BUFFER_SIZE and stride along the data array in those size pieces with no concern about running off the end.
I assume DoubleArrayPtr is double *, why not vector<double> given that you use vector elsewhere.
Also dont do this
int i = 0, j = 0, k = 0, l = 0, m = 0, n = 0, p = 0, q = 0, r = 0, s = 0, t = 0;
create an initialize at the point of use
like this
for(int j = 0 ; j < size ; ++j)
in c++ structs are types so you can do this
dataContent decimate(dataContent dataprocess)
pass in a reference to the struct thos, at the moment you are copying it
dataContent decimate(dataContent &dataprocess)
all this needs to be in codereview really tho

Related

Half-precision PyTorch float tensors have same performance as single precision float tensors?

Background: I've implemented the antiobject/"field AI" pattern (https://home.cs.colorado.edu/~ralex/papers/PDF/OOPSLA06antiobjects.pdf) for single diffusion using LibTorch/PyTorch.
This works fine, but in the process of running it on the GPU and optimizing it, I've run into a problem. I have a Titan V, which I believe excels at half-precision float math. However, when I make the tensors torch::kHalf, the performance is the same. (I've also tried torch::kFloat16). Any ideas?
The code that I timed is in update():
#define SDL_MAIN_HANDLED
#include <simple2d.h>
#include <torch/torch.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAEvent.h>
#include <math.h>
#include <chrono>
#define DEBUG_NO_DRAW
torch::Device gpu(torch::kCUDA);
torch::Device cpu(torch::kCPU);
torch::Device device = gpu;
const int windowLength = 1000;
const int64_t length = 500;
const float diffusionRate = 0.25;
const int obstacleCount = 4000;
const int entityCount = 1000;
float cellLength = windowLength / length;
torch::Tensor scent = torch::zeros({ length, length }, device).to(torch::kHalf);
torch::Tensor up, down, left, right;
torch::Tensor topWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor bottomWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor leftWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor rightWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor obstaclesMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor entities = torch::zeros({ length, length }, device).to(torch::kHalf);
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
std::time_t *lastFpsUpdate = NULL;
std::time_t *currentTime = new std::time_t();
int frameAccumulator = 0;
std::vector<long> updateDurations;
void update() {
torch::NoGradGuard no_grad;
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto startTime = std::chrono::high_resolution_clock::now();
down = scent.roll(1, 0) * obstaclesMask * topWallMask;
up = scent.roll(-1, 0) * obstaclesMask * bottomWallMask;
right = scent.roll(1, 1) * obstaclesMask * leftWallMask;
left = scent.roll(-1, 1) * obstaclesMask * rightWallMask;
scent = scent + ((down - scent) + (up - scent) + (right - scent) + (left - scent)) * diffusionRate;
scent = torch::max(scent, entities);
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
updateDurations.push_back(duration.count());
}
void render() {
if (lastFpsUpdate == NULL) {
lastFpsUpdate = new std::time_t();
std::time(lastFpsUpdate);
}
torch::Tensor sqrtedScent = scent.sqrt().to(torch::kFloat).to(cpu); // just to make darker scents a little brighter for display
auto obstaclesMaskCPU = obstaclesMask.to(torch::kFloat).to(cpu);
auto sqrtedScentAccessor = sqrtedScent.accessor<float, 2>();
auto obstaclesMaskAccessor = obstaclesMaskCPU.accessor<float, 2>();
float r = 0, g = 0, b = 0, a = 0;
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(
0, 0, 0, 0, 0, 1,
windowLength, 0, 0, 0, 0, 1,
windowLength, windowLength, 0, 0, 0, 1,
0, windowLength, 0, 0, 0, 1);
#endif
for (int i = 0; i < length; i++) {
for(int j = 0; j < length; j++) {
if (obstaclesMaskAccessor[i][j] == 0) {
r = 1; g = 1; b = 1; a = 1;
}
else {
r = 1; g = 0; b = 0; a = sqrtedScentAccessor[i][j];
}
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(cellLength * j, cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * (i + 1), r, g, b, a,
cellLength * j, cellLength * (i + 1), r, g, b, a);
#endif
}
}
frameAccumulator++;
std::time(currentTime);
if (std::difftime(*currentTime, *lastFpsUpdate) > 1.0) {
std::cout << "FPS: " << frameAccumulator << std::endl;
frameAccumulator = 0;
*lastFpsUpdate = *currentTime;
int updateCount = updateDurations.size();
long totalUpdateTime = 0;
for (int i = 0; i < updateCount; i++) {
totalUpdateTime += updateDurations[i];
}
long averageUpdateTime = totalUpdateTime / updateCount;
std::cout << "AverageUpdateTime: " << averageUpdateTime << "us" << std::endl;
updateDurations.clear();
}
}
int main() {
if (torch::cuda::is_available()) {
std::cout << "CUDA is available!" << std::endl;
}
std::cout << "Using " << (device == cpu ? "CPU" : "GPU") << std::endl;
for (int i = 0; i < length; i++) {
topWallMask[0][i] = 0;
bottomWallMask[length - 1][i] = 0;
leftWallMask[i][0] = 0;
rightWallMask[i][length - 1] = 0;
}
for (int i = 0; i < obstacleCount; i++) {
int x = rand() % length;
int y = rand() % length;
obstaclesMask[x][y] = 0;
}
//std::cout << obstaclesMask << std::endl;
for (int i = 0; i < entityCount; i++) {
int x = rand() % length;
int y = rand() % length;
if (obstaclesMask[x][y].item() == 0)
continue;
entities[x][y] = 1;
}
S2D_Window* window = S2D_CreateWindow(
"Collab Diffuse", windowLength, windowLength, update, render, 0
);
S2D_Show(window);
return 0;
}
In both single precision and half precision versions of the code, update() takes about 2700 microseconds.
I'm using PyTorch/LibTorch 1.7.1.
Any other performance tips would be appreciated. (I'm aware drawing pixel by pixel is very slow, so I plan to switch from Simple2D to something else that can draw bitmaps from memory).

OpenCV subtract row/column mean from every pixel on every nth row/column

I have a cv::Mat and I want to subtract the mean value of every 8th row (starting at 0, so 0, 8, 16 etc) from every pixel on that row.
Then I want to do the same with the columns.
For some reason, it's saying that every value on the first row is 0, when it clearly isn't.
Here's what I've tried:
int N = 8;
//cv::Mat row_mean, col_mean;
//cv::reduce(myMat, row_mean, 0, CV_REDUCE_AVG); - not giving me what I want somehow?
//cv::reduce(myMat, col_mean, 1, CV_REDUCE_AVG);
for(int r = 0; r < myMat.rows; r = r + N)
{
float rowAvg = 0.0;
for(int c = 0; c < myMat.cols; c++)
{
float val = myMat.at<float>(c, r);
if (r == 0) {
std::cout << "Row 0 val at " << c << " is " << val << std::endl;
}
rowAvg += val;
}
rowAvg /= myMat.rows;
std::cout << "Actual mean at r " << r << " is " << rowAvg << std::endl;
myMat.row(r) = myMat.row(r) - rowAvg;
}
for(int c = 0; c < myMat.cols; c = c + N)
{
float colAvg = 0.0;
for(int r = 0; r < myMat.rows; r++)
{
colAvg += myMat.at<float>(c, r);
}
colAvg /= myMat.cols;
std::cout << "Actual mean at c " << c << " is " << colAvg << std::endl;
myMat.col(c) = myMat.col(c) - colAvg;
}
As you can see, I started by attempting to use reduce, but the values I got didn't seem right.
myMat has only two dimensions, it's worth clarifying.
Update
I worked out that when I pulled the average using reduce I was looking at the wrong dimensions. So I've fixed that, I think. So now it looks like this:
cv::Mat row_mean, col_mean;
cv::reduce(myMat, row_mean, 1, CV_REDUCE_AVG);
cv::reduce(myMat, col_mean, 0, CV_REDUCE_AVG);
for(int r = 0; r < myMat.rows; r = r + N)
{
for(int c = 0; c < myMat.cols; c++)
{
myMat.at<cv::Vec3f>(r, c) = myMat.at<cv::Vec3f>(r, c) - row_mean.at<cv::Vec3f>(r, 0);
}
}
for(int c = 0; c < myMat.cols; c = c + N)
{
for(int r = 0; r < myMat.rows; r++)
{
myMat.at<cv::Vec3f>(r, c) = myMat.at<cv::Vec3f>(r, c) - col_mean.at<cv::Vec3f>(0, c);
}
}
When I stdout the row_mean Mat it starts like this:
[0, 0.020882325, 0;
0, 0.0043014521, 0;
0, 0.0077573429, 0;
0, -0.003039235, 0;
0, 0.0094852885, 0;
...
And the col_mean one looks like this:
[0, 0.011529977, 0, 0, 0.017861091, 0, 0, -0.00044186518, 0, 0, 0.0062413695, 0, 0, 0.0060342439, 0, 0, 0.0068696449,...
So it seems as if (as Miki commented) there are three channels, but two are full of 0s.
The result still doesn't seem right. When viewing the values in a TIFF container it looks like adjacent rows and columns are somehow being affected, although that might be the container attempting to render it.
I just want to make sure that what I'm doing is now correct.
Disclaimer
I'm no c++ programmer!

Distance transform :

Kindly help me with the working of Distance transform and rectify the errors. I have tried Borgefors' method which has defined values for Eucledian measure. I get all zeros as output.
Below is the code which i have tried.
int _tmain(int argc, _TCHAR* argv[])
{
Mat v = imread("ref.png", 0);
imshow("input", v);
Mat forward = (Mat_<uchar>(5, 5) << 0, 11, 0, 11, 0, 11, 7, 5, 7, 11, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
Mat backward = (Mat_<uchar>(5, 5) << 0,0,0,0,0, 0,0,0,0,0, 0, 0, 0, 5, 0, 11, 7, 5, 7, 11, 0, 11, 0, 11, 0);
Mat op = cv::Mat::zeros(v.size(), CV_32FC1);
cout << forward;
cout << backward;
int r = v.rows;
int c = v.cols;
float min=100, x = 0;
int size = 3;
int lim = size / 2;
int a, b;
for (int i = lim; i <= r-1-lim; i++)
{
for (int j = lim; j <= c-1-lim; j++)
{
for (int k = -lim; k <= lim; k++)
{
for (int l = -lim; l <= lim; l++)
{
a = (v.at<uchar>(i + k, j + l));
b=(forward.at<uchar>(k + lim, l + lim));
x = a + b;
if (x>0 && min> x)
min = x;
}
}
op.at<float>(i, j) = min;
}
}
cout << min;
for (int i = (r-1-lim); i >lim; i--)
{
for (int j = (c-1-lim); j >lim; j--)
{
for (int k = -lim; k <= lim; k++)
{
for (int l = -lim; l <= lim; l++)
{
a = (v.at<uchar>(i + k, j + l));
b = (forward.at<uchar>(k + lim, l + lim));
x = a + b;
if (x >0 && min> x) min = x;
}
}
op.at<float>(i, j) = min;
}
}
cout << op;
Mat res = cv::Mat::ones(v.size(), CV_8UC1);
normalize(op, res, 0, 255, NORM_MINMAX);
imshow("output",res);
waitKey(0);
return 0;
}
Which is the best method and why it is the best way to implement Distance Transform?
Here is how to fix your code:
Apply the backward mask in the backward loop, you apply the same mask there as in the forward loop.
Use only the defined weights, the values in the mask where you wrote 0 are not part of the mask. Those pixels don't have a distance of 0!
As for your second question, it's probably out of scope for SO. But what the best method is depends very much on the goal. You have a fast and relatively accurate method here, there are other methods that are exact but more expensive.

APPCRASH (not when debugging) and Segmentation fault using QtCreator (C/C++)

I'm using QtCreator to code an algorithm that I have already coded on Matlab.
When coding this program, I have two errors. The firts one (APPCRASH) appears just when I build and execute the program normally, but not when I try to debug it (Heisenbug) and it appears on the function 'matriceA'. I tried to make the variables volatile and to write the matrix A term formulas on other function, hoping that that will stop the compiler optimization (I think that the compiler optimization might cause the problem), but I have not been able to solve the problem. I have not tried to to compile the project using the option -o0 because my professor (it's an university project) has to be able to compile it normally (without specific options).
The second one is a SISSEGV segmentation fault. It happens when the code arrives to "DestroyFloatArray(&b, width);" on InpaintingColor.
And here the codes:
clanu_process.cpp (it's little messy because I've tried a lot of things...)
#include "clanu_process.h"
#include "iomanip"
void InpaintingColor(float **Rout, float **Gout, float **Bout, float **Rin, float **Gin, float **Bin, float **Mask, int width, int height, double param)
{
cout << "1" << endl;
float alphak = 0, bethak = 0, res = 0;
float **b = 0, **xk = 0, **dk = 0, **rk = 0, **Ark = 0, **tmp1 = 0,**tmp2 = 0,**tmp3 = 0;
Ark = AllocateFloatArray( width, height);
tmp1 = AllocateFloatArray( width, height);
tmp2 = AllocateFloatArray( width, height);
tmp3 = AllocateFloatArray( width, height);
xk = AllocateFloatArray( width, height);
dk = AllocateFloatArray( width, height);
rk = AllocateFloatArray( width, height);
b = AllocateFloatArray( width, height);
cout << "2" << endl;
res = 1e8;
matrixProductByScalar(b,1.0/(3.0*256),Rin,width,height);
matrixDuplicate(xk, b, width, height);
// APPCRASH error
matriceA(Ark,xk,Mask,width,height);
//More code
// SIGSEGV error
DestroyFloatArray(&b, width);
DestroyFloatArray(&xk, width);
DestroyFloatArray(&dk, width);
DestroyFloatArray(&rk, width);
DestroyFloatArray(&Ark, width);
DestroyFloatArray(&tmp1, width);
DestroyFloatArray(&tmp2, width);
DestroyFloatArray(&tmp3, width);
}
float** matriceA(float **A, float **I, float **Masque, int N2, int N1){
volatile bool bool_iplus = false, bool_imoins = false, bool_jmoins = false, bool_jplus = false;
volatile int iplus = 0, imoins = 0, jplus = 0, jmoins = 0;
for(int i = 1; i <= N1; i++){
bool_iplus = i<N1;
iplus = i+1 < N1 ? i+1 : N1;
bool_imoins = i>1;
imoins = i-1 > 1 ? i-1 : 1;
for(int j = 1; j <= N2; j++){
bool_jplus = j<N2;
jplus = j+1 < N2 ? j+1 : N2;
bool_jmoins = j>1;
jmoins = j -1 > 1 ? j-1 : 1;
if(Masque[i-1][j-1]!=0){
//cout << "if - " << i << ", " << j<< endl;
A[i-1][j-1] = (1.0/36)*(16*I[i-1][j-1]
+ 4*(
(bool_iplus?I[iplus-1][j-1]:0)
+ (bool_imoins?I[imoins-1][j-1]:0)
+ (bool_jplus?I[i-1][jplus-1]:0)
+ (bool_jmoins?I[i-1][jmoins-1]:0)
)+(
(bool_iplus&&bool_jplus?I[iplus-1][jplus-1]:0)
+ (bool_imoins&&bool_jplus?I[imoins-1][jplus-1]:0)
+ (bool_imoins&&bool_jmoins?I[imoins-1][jmoins-1]:0))
+ (bool_iplus&&bool_jmoins?I[iplus-1][jmoins-1]:0));
}else{
//cout << "else - " << i << ", " << j << endl;
A[i-1][j-1]=
-(1.0*N1*N2)*(
-8.0*I[i-1][j-1]
+ I[iplus-1][j-1]
+ I[imoins-1][j-1]
+ I[i-1][jplus-1]
+ I[i-1][jmoins-1]
+ I[iplus-1][jplus-1]
+ I[imoins-1][jplus-1]
+ I[imoins-1][jmoins-1]
+ I[iplus-1][jmoins-1]);
}
}
}
return A;
}
The functions AllocateFloatArray and DestroyFloatArray
float ** AllocateFloatArray(int width, int height)
{
float ** r = new float*[width];
for(int i=0; i<width; i++)
r[i] = new float[height];
return r;
}
void DestroyFloatArray(float ***a, int width)
{
if( *a == 0 ) return;
for(int i=0; i<width; i++)
delete[] a[0][i];
delete[] *a;
*a = 0;
}
Thank you for your time.
I'm no sure that it's the cause of your problem but...
Your function "Matrix operations" (sum(), matrixSubstraction(), matrixAddition(), matrixProductByElement(), matrixProductByScalar(), and matrixDuplicate()) are ranging the first index from zero to width and the second one from zero to height.
If I'm not wrong, this is correct and is consistent with allocation/deallocation (AllocateFloatArray() and DestroyFloatArray()).
But look at the two matriceA() functions; they are defined as
float** matriceA(float **A, float **I, int N2, int N1)
float** matriceA(float **A, float **I, float **Masque, int N2, int N1)
In both functions the first index range from zero to N1 and the second one from zero to N2; by example
for(int i = 1; i <= N1; i++){
// ...
for(int j = 1; j <= N2; j++){
// ...
A[i-1][j-1] = (1.0/36)*(16*I[i-1][j-1] // ...
Good. But you call matriceA() in this way
matriceA(Ark,rk,Mask,width,height);
Briefly: you allocate your matrices as width * height matrices; your "matrix operations" are using they as width * height matrices but your matriceA() function are using they as height * width.
Wonderful way to devastate the memory.
I suppose the solution could be
1) switch N1 and N2 in matriceA() definition
2) or switch width and height in matriceA() calling
p.s.: sorry for my bad English.

keep the signed value that has minimal absolute value in two matrix in OpenCV

In OpenCV, I have two matrix One and Two which are the same size. I want to find the signed value that has minimal absolute value in both matrix and keep it in matrix One. For this, I use following code:
for (int i = 0; i < One.rows; ++i)
{
p=One.ptr<float>(i);
p_two = Two.ptr<float>(i);
for (int j = 0; j < One.cols; ++j)
{
if(fabsf(p_two[j])<fabsf(p[j]))
p[j] = p_two[j];
}
}
This code seems to be the bottleneck in my program. Does anyone know how to improve the performance? Thanks a lot!
Your code is not the bottleneck of your program. It's indeed very fast. You need to profile your code to see where the actual bottleneck is.
You can optimize it a little in case your matrices are continuous (which is very often in practice), like:
int rows = one.rows;
int cols = one.cols;
if (one.isContinuous() && two.isContinuous())
{
cols = rows * cols;
rows = 1;
}
for (int r = 0; r < rows; ++r)
{
float* pone = one.ptr<float>(r);
float* ptwo = two.ptr<float>(r);
for (int c = 0; c < cols; ++c)
{
if (fabs(ptwo[c]) < fabs(pone[c]))
{
pone[c] = ptwo[c];
}
}
}
Here a small evaluation also against the good alternative method proposed by #s1h in the comments:
two.copyTo(one, abs(two) < abs(one));
Time (in ms)
Size: Yuanhao s1h Miki
[3 x 3] 0.000366543 0.117294 0.000366543
[10 x 10] 0.00109963 0.0157614 0.00109963
[100 x 100] 0.0964009 0.139653 0.112529
[1280 x 720] 8.70577 11.0267 8.65372
[1000 x 1000] 9.66538 13.5068 9.02026
[1920 x 1080] 16.5681 26.9706 15.7412
[4096 x 3112] 104.423 135.629 102.595
[5000 x 5000] 196.124 277.457 187.203
You see that your method is very fast. Mine is a little bit faster. #s1h is slower, but more concise and easy to read.
Code
You can evaulate the results on your PC with this:
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace std;
using namespace cv;
int main()
{
vector<Size> sizes{ Size(3, 3), Size(10, 10), Size(100, 100), Size(1280, 720), Size(1000, 1000), Size(1920, 1080), Size(4096, 3112), Size(5000, 5000) };
cout << "Size: \t\tYuanhao \ts1h \t\tMiki" << endl;
for (int is = 0; is < sizes.size(); ++is)
{
Size sz = sizes[is];
cout << sz << "\t";
Mat1f img1(sz);
randu(img1, Scalar(-100), Scalar(100));
Mat1f img2(sz);
randu(img2, Scalar(-100), Scalar(100));
{
Mat1f one = img1.clone();
Mat1f two = img2.clone();
double tic = double(getTickCount());
for (int r = 0; r < one.rows; ++r)
{
float* pone = one.ptr<float>(r);
float* ptwo = two.ptr<float>(r);
for (int c = 0; c < one.cols; ++c)
{
if (fabs(ptwo[c]) < fabs(pone[c]))
{
pone[c] = ptwo[c];
}
}
}
double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
cout << toc << " \t";
}
{
Mat1f one = img1.clone();
Mat1f two = img2.clone();
double tic = double(getTickCount());
two.copyTo(one, abs(two) < abs(one));
double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
cout << toc << " \t";
}
{
Mat1f one = img1.clone();
Mat1f two = img2.clone();
double tic = double(getTickCount());
int rows = one.rows;
int cols = one.cols;
if (one.isContinuous() && two.isContinuous())
{
cols = rows * cols;
rows = 1;
}
for (int r = 0; r < rows; ++r)
{
float* pone = one.ptr<float>(r);
float* ptwo = two.ptr<float>(r);
for (int c = 0; c < cols; ++c)
{
if (fabs(ptwo[c]) < fabs(pone[c]))
{
pone[c] = ptwo[c];
}
}
}
double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
cout << toc << " \t";
}
cout << endl;
}
getchar();
return 0;
}