I have implemented this code from this to vectorizing it:
int c=0;
for (int j=-halfHeight; j<=halfHeight; ++j)
{
#pragma omp simd
for(int i=-halfWidth; i<=halfWidth; ++i){
wx_[c] = ofsx + j * a12 + i * a11;
wy_[c] = ofsy + j * a22 + i * a21;
x_[c] = (int) floor(wx_[c]);
y_[c] = (int) floor(wy_[c]);
++c;
}
}
std::cout<<"First size="<<size<<std::endl;
float imat_1[size];
std::cout<<"imat1"<<std::endl;
float imat_2[size];
std::cout<<"imat2"<<std::endl;
float imat_3[size];
std::cout<<"imat3"<<std::endl;
float imat_4[size];`
std::cout<<"imat4"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
if (x_[c] >= 0 && y_[c] >= 0 && x_[c] < width && y_[c] < height){
wx_[c] -= x_[c];
wy_[c] -= y_[c];
imat_1[c] = im.at<float>(y_[c],x_[c]);
imat_2[c] = im.at<float>(y_[c],x_[c]+1);
imat_3[c] = im.at<float>(y_[c]+1,x_[c]);
imat_4[c] = im.at<float>(y_[c]+1,x_[c]+1);
}
else{
wx_[c] = 0;
wy_[c] = 0;
imat_1[c] = 0;
imat_2[c] = 0;
imat_3[c] = 0;
imat_4[c] = 0;
ret = true;
}
}
std::cout<<"Second"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
out[c] =
(1.0f - wy_[c]) * ((1.0f - wx_[c]) * imat_1[c] + wx_[c] * imat_2[c]) +
( wy_[c]) * ((1.0f - wx_[c]) * imat_3[c] + wx_[c] * imat_4[c]);
}
In particular, size can reach up to 275625. When I run this code, it goes in segmentation fault at line float imat_4[size];. In fact, this is solved by using float *imat_4 = (float*)malloc(sizeof(float)*size);
I think that this is because of this, so we run out of memory on the stack... But then, how can I solve this? I don't see much other possibilities for vectorizing this code.
notice that performance are crucial here, so allocating on the stack is less efficient (right?)
Related
I'm attempting to compare values of execution time when detecting edges of an image in a linear way and in a parallel way. Everything works fine in a linear way, but in a parallel way the image written has too much white pixels in a part of the image. To better show what I'm saying, see image below:
The left image is the output of the code executed linearly, and in the right is using parallelism. You can see the edges of the buildings in both images, and the bottom part of the right image close to its border doesen't have the same issue has the rest of it.
I cropped the "critical" part of the code that does this tasks, in hope that someone may know what may be causing this.
omp_set_nested(1);
#pragma omp parallel
while(col<cols-1) {
line = 1;
#pragma omp parallel
while(line<lines-1) {
gradient_x = 0;
gradient_y = 0;
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
#pragma omp critical
line++;
}
#pragma omp critical
col++;
}
I defined line and col variables as critical because they are the ones used for both reading and writing data, and I believe everything else is working propperly.
Without more context, is hard to tell. Nonetheless, those two nested parallel regions do not make sense, because you are not distributing tasks among threads; instead you are just executing the same code by multiple threads, with possible race-conditions on the updates of the variables gradient_x and gradient_y among others. Start with the following simpler parallel code:
omp_set_nested(0);
while(col<cols-1) {
line = 1;
while(line<lines-1) {
gradient_x = 0;
gradient_y = 0;
#pragma omp parallel for reduction(+:gradient_x,gradient_y)
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
line++;
}
col++;
}
You can try the following:
#pragma omp parallel for collapse(2)
for(int col = 0; col<cols-1; col++) {
for(int line = 1; line<lines-1; line++) {
float gradient_x = 0;
float gradient_y = 0;
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) +
(gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
}
}
Of course, you need to check the race-condition in the code that you have cropped.
It's a follow-up question to this one
Now I have the code:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp barrier
//#pragma omp single
//std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
But why it works 2-3 times slower (32sec vs 18sec) than serial analog:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a,b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
int main() {
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
for (int i = 0; i <= n + 1; ++i) {
x[i] = y[i] = i * h;
_new[i][0]=v[i][0] = 6 - 2 * x[i];
_new[n + 1][i]=v[n + 1][i] = 4 - 2 * y[i];
_new[i][n + 1]=v[i][n + 1] = 3 - x[i];
_new[0][i]=v[0][i] = 6 - 3 * y[i];
}
const double eps=0.01;
while(true){ //for [iters=1 to maxiters by 2]
double maxdiff=0.0;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
_new[i][j]=(v[i-1][j]+v[i+1][j]+v[i][j-1]+v[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
v[i][j]=(_new[i-1][j]+_new[i+1][j]+_new[i][j-1]+_new[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
maxdiff=max(maxdiff, fabs(_new[i][j]-v[i][j]));
if(maxdiff<eps) break;
std::cout << maxdiff<<std::endl;
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
Also interesting that it works SAME TIME as version (I can post it here if you say so) which looks like so
while(true){ //106 iteratins here!!!
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
}
But I thought that what making omp code slow is spawning threads inside while loop 106 times... But no! Then probably threads simultaneously write to the same array cells.. But where does it happen? I don't see it could you show me please?
Maybe it's because too much barriers? But Lecturer told me to implement the code like so and "analyse it" Maybe the answer is "Jacobi algorithm isn't meant to run well in parallel"? Or it's just my lame coding?
So the root of evel was
max(maxdiffA[w],fabs(_new[icol][jrow] - v[icol][jrow]))
because it's
#define max(a, b) (a)>(b)?(a):(b)
It's probably creating TOO much branching ('if's ) Without this thing parallel version works 8 times faster and loading CPU 68% instead of 99%..
The starange thing: same "max" doesn't affect serioal version
I am writing to make you aware of a few situations. It is not short to write in a comment, so I decided to write as an answer.
every time a thread is made, it takes some time for its creation. if your program's running time in a single core is short, then the creation of threads will make this time longer for multi-core.
plus using a barrier makes all your threads wait for others, which could somehow be slowed down in cpu. this way, even if all threads finish the job very fast, that last one will make the total run time longer.
try to run your program with bigger sized arrays where time is around 2 minutes for single threading. then make your way to multi-core.
then try to wrap your main code in a normal loop to run it a few times and prints the timings for each. the first run of the loop might be slow because of loading libraries, but the next runs should be faster to prove the increasing speed.
if above suggestions do not give a result, then it means your coding needs more editing.
EDIT:
To downvoters, If you don't like a post, please at least be polite and leave a comment. Or better, give your own answer so be helpful to community.
I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
for (int i = 0; i < n; i++)
{
int l = 0;
#pragma omp parallel for private(l)
for (int j = 1; j < n; j++)
{
l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is, that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible, this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
for (int i = 0; i < n; i++)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting parallel and for directives as such:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
#pragma omp parallel for
for (int i = 0; i < n; i++)
{
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
}
return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep-copy and thus make the code more difficult to reason about.
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.
I'm trying to implement procedural generation in my game. I want to really grasp and understand all of the algorithms nessecary rather than simply copying/pasting existing code. In order to do this I've attempted to implement 1D midpoint displacement on my own. I've used the information here to write and guide my code. Below is my completed code, it doesn't throw an error but that results don't appear correct.
srand(time(NULL));
const int lineLength = 65;
float range = 1.0;
float displacedLine[lineLength];
for (int i = 0; i < lineLength; i++)
{
displacedLine[i] = 0.0;
}
for (int p = 0; p < 100; p++)
{
int segments = 1;
for (int i = 0; i < (lineLength / pow(2, 2)); i++)
{
int segs = segments;
for (int j = 0; j < segs; j++)
{
int x = floor(lineLength / segs);
int start = (j * x) + 1;
int end = start + x;
if (i == 0)
{
end--;
}
float lo = -range;
float hi = +range;
float change = lo + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (hi - lo)));
int center = ((end - start) / 2) + start;
displacedLine[center - 1] += change;
segments++;
}
range /= 2;
}
}
Where exactly have I made mistakes and how might I correct them?
I'm getting results like this:
But I was expecting results like this:
The answer is very simple and by the way I'm impressed you managed to debug all the potential off-by-one errors in your code. The following line is wrong:
displacedLine[center - 1] += change;
You correctly compute the center index and change amount but you missed that the change should be applied to the midpoint in terms of height. That is:
displacedLine[center - 1] = (displacedLine[start] + displacedLine[end]) / 2;
displacedLine[center - 1] += change;
I'm sure you get the idea.
The problem seems to be that you are changing only the midpoint of each line segment, rather than changing the rest of the line segment in proportion to its distance from each end to the midpoint. The following code appears to give you something more like what you're looking for:
#include <iostream>
#include <cstdlib>
#include <math.h>
#include <algorithm>
using namespace std;
void displaceMidPt (float dline[], int len, float disp) {
int midPt = len/2;
float fmidPt = float(midPt);
for (int i = 1; i <= midPt; i++) {
float ptDisp = disp * float(i)/fmidPt;
dline[i] += ptDisp;
dline[len-i] += ptDisp;
}
}
void displace (float displacedLine[], int lineLength, float range) {
for (int p = 0; p < 100; p++) {
int segs = pow(p, 2);
for (int j = 0; j < segs; j++) {
float lo = -range;
float hi = +range;
float change = lo + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (hi - lo)));
int start = int(float(j)/float(segs)*float(lineLength));
int end = int(float(j+1)/float(segs)*float(lineLength));
displaceMidPt (displacedLine+start,end-start,change);
}
range /= 2;
}
}
void plot1D (float x[], int len, int ht = 10) {
float minX = *min_element(x,x+len);
float maxX = *max_element(x,x+len);
int xi[len];
for (int i = 0; i < len; i++) {
xi[i] = int(ht*(x[i] - minX)/(maxX - minX) + 0.5);
}
char s[len+1];
s[len] = '\0';
for (int j = ht; j >= 0; j--) {
for (int i = 0; i < len; i++) {
if (xi[i] == j) {
s[i] = '*';
} else {
s[i] = ' ';
}
}
cout << s << endl;
}
}
int main () {
srand(time(NULL));
const int lineLength = 65;
float range = 1.0;
float displacedLine[lineLength];
for (int i = 0; i < lineLength; i++) {
displacedLine[i] = 0.0;
}
displace (displacedLine,lineLength,range);
plot1D (displacedLine,lineLength);
return 0;
}
When run this way, it produces the following result:
$ c++ -lm displace.cpp
$ ./a
*
* *
* ***
* * * *
* ** **** * **
* *** **** * * * ** *
* * ** ** *** * * * *
** ** *
* * * ***
** ***
*
I have written a global version of Particle Swarm Optimization algorithm in C++.
I tried to write it exactly as same as my MATLAB PSO code that have written before, but this code generates different and so worst answers.
The MATLAB code is:
clear all;
numofdims = 30;
numofparticles = 50;
c1 = 2;
c2 = 2;
numofiterations = 1000;
V = zeros(50, 30);
initialpop = V;
Vmin = zeros(30, 1);
Vmax = Vmin;
Xmax = ones(30, 1) * 100;
Xmin = -Xmax;
pbestfits = zeros(50, 1);
worsts = zeros(50, 1);
bests = zeros(50, 1);
meanfits = zeros(50, 1);
pbests = zeros(50, 30);
initialpop = Xmin + (Xmax - Xmin) .* rand(numofparticles, numofdims);
X = initialpop;
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
gbestfit = minfit;
gbest = X(minfitidx, :);
for i = 1:numofdims
Vmax(i) = 0.2 * (Xmax(i) - Xmin(i));
Vmin(i) = -Vmax(i);
end
for t = 1:1000
w = 0.9 - 0.7 * (t / numofiterations);
for i = 1:numofparticles
if(fitnesses(i) < pbestfits(i))
pbestfits(i) = fitnesses(i);
pbests(i, :) = X(i, :);
end
end
for i = 1:numofparticles
for j = 1:numofdims
V(i, j) = min(max((w * V(i, j) + rand * c1 * (pbests(i, j) - X(i, j))...
+ rand * c2 * (gbest(j) - X(i, j))), Vmin(j)), Vmax(j));
X(i, j) = min(max((X(i, j) + V(i, j)), Xmin(j)), Xmax(j));
end
end
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
if(minfit < gbestfit)
gbestfit = minfit;
gbest = X(minfitidx, :);
end
worsts(t) = max(fitnesses);
bests(t) = gbestfit;
meanfits(t) = mean(fitnesses);
end
In which, testfunc1 is:
function [out] = testfunc1(R)
out = sum(R .^ 2, 2);
end
The C++ code is:
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#define rand_01 ((float)rand() / (float)RAND_MAX)
const int numofdims = 30;
const int numofparticles = 50;
using namespace std;
void fitnessfunc(float X[numofparticles][numofdims], float fitnesses[numofparticles])
{
memset(fitnesses, 0, sizeof (float) * numofparticles);
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
fitnesses[i] += (pow(X[i][j], 2));
}
}
}
float mean(float inputval[], int vallength)
{
int addvalue = 0;
for(int i = 0; i < vallength; i++)
{
addvalue += inputval[i];
}
return (float)(addvalue / vallength);
}
void PSO(int numofiterations, float c1, float c2,
float Xmin[numofdims], float Xmax[numofdims], float initialpop[numofparticles][numofdims],
float worsts[], float meanfits[], float bests[], float *gbestfit, float gbest[numofdims])
{
float V[numofparticles][numofdims] = {0};
float X[numofparticles][numofdims];
float Vmax[numofdims];
float Vmin[numofdims];
float pbests[numofparticles][numofdims];
float pbestfits[numofparticles];
float fitnesses[numofparticles];
float w;
float minfit;
int minfitidx;
memcpy(X, initialpop, sizeof(float) * numofparticles * numofdims);
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
for(int i = 0; i < numofdims; i++)
{
Vmax[i] = 0.2 * (Xmax[i] - Xmin[i]);
Vmin[i] = -Vmax[i];
}
for(int t = 0; t < 1000; t++)
{
w = 0.9 - 0.7 * (float) (t / numofiterations);
for(int i = 0; i < numofparticles; i++)
{
if(fitnesses[i] < pbestfits[i])
{
pbestfits[i] = fitnesses[i];
memcpy(pbests[i], X[i], sizeof(float) * numofdims);
}
}
for(int i = 0; i < numofparticles; i++)
{
for(int j = 0; j < numofdims; j++)
{
V[i][j] = min(max((w * V[i][j] + rand_01 * c1 * (pbests[i][j] - X[i][j])
+ rand_01 * c2 * (gbest[j] - X[i][j])), Vmin[j]), Vmax[j]);
X[i][j] = min(max((X[i][j] + V[i][j]), Xmin[j]), Xmax[j]);
}
}
fitnessfunc(X, fitnesses);
minfit = *min_element(fitnesses, fitnesses + numofparticles);
minfitidx = min_element(fitnesses, fitnesses + numofparticles) - fitnesses;
if(minfit < *gbestfit)
{
*gbestfit = minfit;
memcpy(gbest, X[minfitidx], sizeof(float) * numofdims);
}
worsts[t] = *max_element(fitnesses, fitnesses + numofparticles);
bests[t] = *gbestfit;
meanfits[t] = mean(fitnesses, numofparticles);
}
}
int main()
{
time_t t;
srand((unsigned) time(&t));
float xmin[30], xmax[30];
float initpop[50][30];
float worsts[1000], bests[1000];
float meanfits[1000];
float gbestfit;
float gbest[30];
for(int i = 0; i < 30; i++)
{
xmax[i] = 100;
xmin[i] = -100;
}
for(int i = 0; i < 50; i++)
for(int j = 0; j < 30; j++)
{
initpop[i][j] = rand() % (100 + 100 + 1) - 100;
}
PSO(1000, 2, 2, xmin, xmax, initpop, worsts, meanfits, bests, &gbestfit, gbest);
cout<<"fitness: "<<gbestfit<<endl;
return 0;
}
I have debugged two codes many times but can not find the difference which makes answers different.
It is making me crazy!
May you help me please?
Update:
Please consider that, the function mean is just used for reporting some information and is not used in the optimization procedure.
You've got integer division in the following line
w = 0.9 - 0.7 * (float) (t / numofiterations);
w will be 0.2 for every iteration, change it to
w = 0.9 - 0.7 * t / numofiterations;
The first multiplication will automatically promote t to a double the division should then promote numof iterations to a double.
The parenthesis means it will be done first and therefore not be promoted as wo integers is involved in the division.
This could be a mistake in function mean:
return (float)(addvalue / vallength);
This is integer division, so the result is truncated down, then cast to float. It is unlikely this is what you want.