I am currently implementing Clenshaw's algorithm with Rcpp to speed up my previous implementation in R. My current implementation is as follows (note that I am using RcppParallel for other functions defined in the same source file; RcppParalell is not used in this specific function, but I've left the headers in case this is somehow relevant):
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace Rcpp;
using namespace RcppParallel;
// [[Rcpp::plugins("cpp11")]]
// [[Rcpp::export]]
NumericVector clenshawAllDerivatives(double t, int N, double Ta, double Tb, NumericVector Coeffs, int derivativesOrder) {
double tau = (2*t-Ta-Tb)/(Tb-Ta);
double helperValues[derivativesOrder + 1][3];
double scale;
for(double i = N; i > 1; i--) {
helperValues[0][2] = helperValues[0][1];
helperValues[0][1] = helperValues[0][0];
helperValues[0][0] = 2*tau*helperValues[0][1]-helperValues[0][2] + Coeffs[i - 1];
scale=2.0;
for(int j = 1; j <= derivativesOrder; j++) {
helperValues[j][2] = helperValues[j][1];
helperValues[j][1] = helperValues[j][0];
helperValues[j][0] = scale*helperValues[j-1][1] + 2*tau*helperValues[j][1] - helperValues[j][2];
scale += 2.0;
}
}
NumericVector output(derivativesOrder + 1);
output[0] = tau*helperValues[0][0] - helperValues[0][1] + Coeffs[0];
scale = 1.0;
double scale2initial = ((Tb-Ta)/2 * 86400.0), scale2 = scale2initial;
for(int j = 1; j <= derivativesOrder; j++) {
output[j] = (scale*helperValues[j-1][0] + tau*helperValues[j][0] - helperValues[j][1]) / scale2;
scale += 1.0;
scale2 = scale2 * scale2initial;
}
return output;
}
An example of application of the function, with example input values:
clenshawAllDerivatives(59568.5, 11, 59568, 59584, c(-1.281626e+06, -4.069960e+03, 2.725817e+01, -9.715712e-02, -1.115373e-03, -5.121949e-04, -9.068147e-05, -6.829206e-06, 1.657523e-07 , 1.406006e-07, 2.273966e-08), 1)
When run multiple times, this returns most often the expected correct output of c(-1.277790e+06, -6.037188e-03). However, sometimes it returns instead wrong values, typically very high numbers.
Any help to identify the cause of this unexpected behavior would be greatly appreciated!
As I run the program, it crashes with segmentation fault. Also, when I debug the code in codeblocks IDE, I am unable to debug it as well. The program crashes even before debugging begins. I am not able to understand the problem. Any help would be appreciated. Thanks!!
#include <iostream>
#include <math.h>
#include <string>
using namespace std;
// Method to make strings of equal length
int makeEqualLength(string& fnum,string& snum){
int l1 = fnum.length();
int l2 = snum.length();
if(l1>l2){
int d = l1-l2;
while(d>0){
snum = '0' + snum;
d--;
}
return l1;
}
else if(l2>l1){
int d = l2-l1;
while(d>0){
fnum = '0' + fnum;
d--;
}
return l2;
}
else
return l1;
}
int singleDigitMultiplication(string& fnum,string& snum){
return ((fnum[0] -'0')*(snum[0] -'0'));
}
string addStrings(string& s1,string& s2){
int length = makeEqualLength(s1,s2);
int carry = 0;
string result;
for(int i=length-1;i>=0;i--){
int fd = s1[i]-'0';
int sd = s2[i]-'0';
int sum = (fd+sd+carry)%10+'0';
carry = (fd+sd+carry)/10;
result = (char)sum + result;
}
result = (char)carry + result;
return result;
}
long int multiplyByKaratsubaMethod(string fnum,string snum){
int length = makeEqualLength(fnum,snum);
if(length==0) return 0;
if(length==1) return singleDigitMultiplication(fnum,snum);
int fh = length/2;
int sh = length - fh;
string Xl = fnum.substr(0,fh);
string Xr = fnum.substr(fh,sh);
string Yl = snum.substr(0,fh);
string Yr = snum.substr(fh,sh);
long int P1 = multiplyByKaratsubaMethod(Xl,Yl);
long int P3 = multiplyByKaratsubaMethod(Xr,Yr);
long int P2 = multiplyByKaratsubaMethod(addStrings(Xl,Xr),addStrings(Yl,Yr)) - P1-P3;
return (P1*pow(10,length) + P2*pow(10,length/2) + P3);
}
int main()
{
string firstNum = "62";
string secondNum = "465";
long int result = multiplyByKaratsubaMethod(firstNum,secondNum);
cout << result << endl;
return 0;
}
There are three serious issues in your code:
result = (char)carry + result; does not work.The carry has a value between 0 (0 * 0) and 8 (9 * 9). It has to be converted to the corresponding ASCII value:result = (char)(carry + '0') + result;.
This leads to the next issue: The carry is even inserted if it is 0. There is an if statement missing:if (carry/* != 0*/) result = (char)(carry + '0') + result;.
After fixing the first two issues and testing again, the stack overflow still occurs. So, I compared your algorithm with another I found by google:Divide and Conquer | Set 4 (Karatsuba algorithm for fast multiplication)(and possibly was your origin because it's looking very similar). Without digging deeper, I fixed what looked like a simple transfer mistake:return P1 * pow(10, 2 * sh) + P2 * pow(10, sh) + P3;(I replaced length by 2 * sh and length/2 by sh like I saw it in the googled code.) This became obvious for me seeing in the debugger that length can have odd values so that sh and length/2 are distinct values.
Afterwards, your program became working.
I changed the main() function to test it a little bit harder:
#include <cmath>
#include <iostream>
#include <string>
using namespace std;
string intToStr(int i)
{
string text;
do {
text.insert(0, 1, i % 10 + '0');
i /= 10;
} while (i);
return text;
}
// Method to make strings of equal length
int makeEqualLength(string &fnum, string &snum)
{
int l1 = (int)fnum.length();
int l2 = (int)snum.length();
return l1 < l2
? (fnum.insert(0, l2 - l1, '0'), l2)
: (snum.insert(0, l1 - l2, '0'), l1);
}
int singleDigitMultiplication(const string& fnum, const string& snum)
{
return ((fnum[0] - '0') * (snum[0] - '0'));
}
string addStrings(string& s1, string& s2)
{
int length = makeEqualLength(s1, s2);
int carry = 0;
string result;
for (int i = length - 1; i >= 0; --i) {
int fd = s1[i] - '0';
int sd = s2[i] - '0';
int sum = (fd + sd + carry) % 10 + '0';
carry = (fd + sd + carry) / 10;
result.insert(0, 1, (char)sum);
}
if (carry) result.insert(0, 1, (char)(carry + '0'));
return result;
}
long int multiplyByKaratsubaMethod(string fnum, string snum)
{
int length = makeEqualLength(fnum, snum);
if (length == 0) return 0;
if (length == 1) return singleDigitMultiplication(fnum, snum);
int fh = length / 2;
int sh = length - fh;
string Xl = fnum.substr(0, fh);
string Xr = fnum.substr(fh, sh);
string Yl = snum.substr(0, fh);
string Yr = snum.substr(fh, sh);
long int P1 = multiplyByKaratsubaMethod(Xl, Yl);
long int P3 = multiplyByKaratsubaMethod(Xr, Yr);
long int P2
= multiplyByKaratsubaMethod(addStrings(Xl, Xr), addStrings(Yl, Yr))
- P1 - P3;
return P1 * pow(10, 2 * sh) + P2 * pow(10, sh) + P3;
}
int main()
{
int nErrors = 0;
for (int i = 0; i < 1000; i += 3) {
for (int j = 0; j < 1000; j += 3) {
long int result
= multiplyByKaratsubaMethod(intToStr(i), intToStr(j));
bool ok = result == i * j;
cout << i << " * " << j << " = " << result
<< (ok ? " OK." : " ERROR!") << endl;
nErrors += !ok;
}
}
cout << nErrors << " error(s)." << endl;
return 0;
}
Notes about changes I've made:
Concerning std library: Please, don't mix headers with ".h" and without. Every header of std library is available in "non-suffix-flavor". (The header with ".h" are either C header or old-fashioned.) Headers of C library have been adapted to C++. They have the old name with prefix "c" and without suffix ".h".
Thus, I replaced #include <math.h> by #include <cmath>.
I couldn't resist to make makeEqualLength() a little bit shorter.
Please, note, that a lot of methods in std use std::size_t instead of int or unsigned. std::size_t has appropriate width to do array subscript and pointer arithmetic i.e it has "machine word width". I believed a long time that int and unsigned should have "machine word width" also and didn't care about size_t. When we changed in Visual Studio from x86 (32 bits) to x64 (64 bits), I learnt the hard way that I had been very wrong: std::size_t is 64 bits now but int and unsigned are still 32 bits. (MS VC++ is not an exception. Other compiler vendors (but not all) do it the same way.)I inserted some C type casts to remove the warnings from compiler output. Such casts to remove warnings (regardless you use C casts or better the C++ casts) should always be used with care and should be understood as confirmation: Dear compiler. I see you have concerns but I (believe to) know and assure you that it should work fine.
I'm not sure about your intention to use long int in some places. (Probably, you transferred this code from original source without caring about.) As your surely know, the actual size of all int types may differ to match best performance of the target platform. I'm working on a Intel-PC with Windows 10, using Visual Studio. sizeof (int) == sizeof (long int) (32 bits). This is independent whether I compile x86 code (32 bits) or x64 code (64 bits). The same is true for gcc (on cygwin in my case) as well as on any Intel-PC with Linux (AFAIK). For a granted larger type than int you have to choose long long int.
I did the sample session in cygwin on Windows 10 (64 bit):
$ g++ -std=c++11 -o karatsuba karatsuba.cc
$ ./karatsuba
0 * 0 = 0 OK.
0 * 3 = 0 OK.
0 * 6 = 0 OK.
etc. etc.
999 * 993 = 992007 OK.
999 * 996 = 995004 OK.
999 * 999 = 998001 OK.
0 error(s).
$
Well, here is my code and I am having a problem because my n is not increasing:
#define N 100
#define N_EQUATIONS 18 + 2
//initial values
int v = 1;
int cai = 2;
int caSR = 3;
int nai = 4;
int ki = 5;
int dvdt = 18;
double V_init = -87.5;
double Cai_init=1.0e-4;
double cansr=1.2;
double cajsr=cansr;
double CaSR_init = cansr + cajsr;
double Nai_init = 7;
double Ki_init = 145;
double u[N + 1][N_EQUATIONS + 1];
double Im[N + 1];
int main () {
int n = 0;
for ( n = 0; n <= N; n++) {
printf("n=%.18f\n", n);
u[n][v] = V_init;
//printf("t=%.18f\n", u[n][v]);
u[n][cai] = Cai_init;
//printf("cai=%.18f\n", u[n][cai]);
u[n][caSR] = CaSR_init;
u[n][nai] = Nai_init;
u[n][ki] = Ki_init;
u[n][dvdt] = 0.0;//check it
tapend[n] = 0.0;
tapstart[n] = 0.0;
}
}
Sorry if it is a stupid question and the answer is staring me at the eyes..
P.S. see the new revised code
You are probably just confused because your printf is incorrect:
printf("n=%.18f\n", n);
should be, e.g.
printf("n=%18d\n", n);
Currently you just print garbage in your loop (0 in your case, it seems, but it could be anything), so this may give the incorrect impression that n is not incrementing correctly.
Note that if you enable compiler warnings (and compiler warnings should always be enabled), then the compiler would have pointed out this mistake to you. Always enable compiler warnings and always take notice of any warnings, understand them, and fix them.
I see in OpenCV it uses #if CV_ENABLE_UNROLLED in many places. for example
#if CV_ENABLE_UNROLLED
for( ; i <= width - 4; i += 4 )
{
ST f = ky[0];
const ST* S = (const ST*)src[0] + i, *S2;
ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
for( k = 1; k <= ksize2; k++ )
{
S = (const ST*)src[k] + i;
S2 = (const ST*)src[-k] + i;
f = ky[k];
s0 += f*(S[0] + S2[0]);
s1 += f*(S[1] + S2[1]);
s2 += f*(S[2] + S2[2]);
s3 += f*(S[3] + S2[3]);
}
D[i] = castOp(s0); D[i+1] = castOp(s1);
D[i+2] = castOp(s2); D[i+3] = castOp(s3);
}
#endif
in filter.cpp
Does this feature have any other advantage than doing 4 float operations in a loop means less number of condition check? If yes then what is it. If no then when put it in a macro, why can't we use it always?
From OpenCV's revisions:
CV_ENABLE_UNROLLED (?? default)default) - Experimental feature, that activates some 4- or 8- unroll loops. It theoretically helps compiler optimize the code, and improves memory access. However, the coverage of this feature in code is still very limited.
I'm writing a sparse matrix solver using the Gauss-Seidel method. By profiling, I've determined that about half of my program's time is spent inside the solver. The performance-critical part is as follows:
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
for (size_t x = 1; x < d_nx - 1; ++x) {
d_x[ic] = d_b[ic]
- d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
- d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
++ic; ++iw; ++ie; ++is; ++in;
}
ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
All arrays involved are of float type. Actually, they are not arrays but objects with an overloaded [] operator, which (I think) should be optimized away, but is defined as follows:
inline float &operator[](size_t i) { return d_cells[i]; }
inline float const &operator[](size_t i) const { return d_cells[i]; }
For d_nx = d_ny = 128, this can be run about 3500 times per second on an Intel i7 920. This means that the inner loop body runs 3500 * 128 * 128 = 57 million times per second. Since only some simple arithmetic is involved, that strikes me as a low number for a 2.66 GHz processor.
Maybe it's not limited by CPU power, but by memory bandwidth? Well, one 128 * 128 float array eats 65 kB, so all 6 arrays should easily fit into the CPU's L3 cache (which is 8 MB). Assuming that nothing is cached in registers, I count 15 memory accesses in the inner loop body. On a 64-bits system this is 120 bytes per iteration, so 57 million * 120 bytes = 6.8 GB/s. The L3 cache runs at 2.66 GHz, so it's the same order of magnitude. My guess is that memory is indeed the bottleneck.
To speed this up, I've attempted the following:
Compile with g++ -O3. (Well, I'd been doing this from the beginning.)
Parallelizing over 4 cores using OpenMP pragmas. I have to change to the Jacobi algorithm to avoid reads from and writes to the same array. This requires that I do twice as many iterations, leading to a net result of about the same speed.
Fiddling with implementation details of the loop body, such as using pointers instead of indices. No effect.
What's the best approach to speed this guy up? Would it help to rewrite the inner body in assembly (I'd have to learn that first)? Should I run this on the GPU instead (which I know how to do, but it's such a hassle)? Any other bright ideas?
(N.B. I do take "no" for an answer, as in: "it can't be done significantly faster, because...")
Update: as requested, here's a full program:
#include <iostream>
#include <cstdlib>
#include <cstring>
using namespace std;
size_t d_nx = 128, d_ny = 128;
float *d_x, *d_b, *d_w, *d_e, *d_s, *d_n;
void step() {
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
for (size_t x = 1; x < d_nx - 1; ++x) {
d_x[ic] = d_b[ic]
- d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
- d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
++ic; ++iw; ++ie; ++is; ++in;
}
ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
}
void solve(size_t iters) {
for (size_t i = 0; i < iters; ++i) {
step();
}
}
void clear(float *a) {
memset(a, 0, d_nx * d_ny * sizeof(float));
}
int main(int argc, char **argv) {
size_t n = d_nx * d_ny;
d_x = new float[n]; clear(d_x);
d_b = new float[n]; clear(d_b);
d_w = new float[n]; clear(d_w);
d_e = new float[n]; clear(d_e);
d_s = new float[n]; clear(d_s);
d_n = new float[n]; clear(d_n);
solve(atoi(argv[1]));
cout << d_x[0] << endl; // prevent the thing from being optimized away
}
I compile and run it as follows:
$ g++ -o gstest -O3 gstest.cpp
$ time ./gstest 8000
0
real 0m1.052s
user 0m1.050s
sys 0m0.010s
(It does 8000 instead of 3500 iterations per second because my "real" program does a lot of other stuff too. But it's representative.)
Update 2: I've been told that unititialized values may not be representative because NaN and Inf values may slow things down. Now clearing the memory in the example code. It makes no difference for me in execution speed, though.
Couple of ideas:
Use SIMD. You could load 4 floats at a time from each array into a SIMD register (e.g. SSE on Intel, VMX on PowerPC). The disadvantage of this is that some of the d_x values will be "stale" so your convergence rate will suffer (but not as bad as a jacobi iteration); it's hard to say whether the speedup offsets it.
Use SOR. It's simple, doesn't add much computation, and can improve your convergence rate quite well, even for a relatively conservative relaxation value (say 1.5).
Use conjugate gradient. If this is for the projection step of a fluid simulation (i.e. enforcing non-compressability), you should be able to apply CG and get a much better convergence rate. A good preconditioner helps even more.
Use a specialized solver. If the linear system arises from the Poisson equation, you can do even better than conjugate gradient using an FFT-based methods.
If you can explain more about what the system you're trying to solve looks like, I can probably give some more advice on #3 and #4.
I think I've managed to optimize it, here's a code, create a new project in VC++, add this code and simply compile under "Release".
#include <iostream>
#include <cstdlib>
#include <cstring>
#define _WIN32_WINNT 0x0400
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <conio.h>
using namespace std;
size_t d_nx = 128, d_ny = 128;
float *d_x, *d_b, *d_w, *d_e, *d_s, *d_n;
void step_original() {
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
for (size_t x = 1; x < d_nx - 1; ++x) {
d_x[ic] = d_b[ic]
- d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
- d_s[ic] * d_x[is] - d_n[ic] * d_x[in];
++ic; ++iw; ++ie; ++is; ++in;
}
ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
}
void step_new() {
//size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
float
*d_b_ic,
*d_w_ic,
*d_e_ic,
*d_x_ic,
*d_x_iw,
*d_x_ie,
*d_x_is,
*d_x_in,
*d_n_ic,
*d_s_ic;
d_b_ic = d_b;
d_w_ic = d_w;
d_e_ic = d_e;
d_x_ic = d_x;
d_x_iw = d_x;
d_x_ie = d_x;
d_x_is = d_x;
d_x_in = d_x;
d_n_ic = d_n;
d_s_ic = d_s;
for (size_t y = 1; y < d_ny - 1; ++y)
{
for (size_t x = 1; x < d_nx - 1; ++x)
{
/*d_x[ic] = d_b[ic]
- d_w[ic] * d_x[iw] - d_e[ic] * d_x[ie]
- d_s[ic] * d_x[is] - d_n[ic] * d_x[in];*/
*d_x_ic = *d_b_ic
- *d_w_ic * *d_x_iw - *d_e_ic * *d_x_ie
- *d_s_ic * *d_x_is - *d_n_ic * *d_x_in;
//++ic; ++iw; ++ie; ++is; ++in;
d_b_ic++;
d_w_ic++;
d_e_ic++;
d_x_ic++;
d_x_iw++;
d_x_ie++;
d_x_is++;
d_x_in++;
d_n_ic++;
d_s_ic++;
}
//ic += 2; iw += 2; ie += 2; is += 2; in += 2;
d_b_ic += 2;
d_w_ic += 2;
d_e_ic += 2;
d_x_ic += 2;
d_x_iw += 2;
d_x_ie += 2;
d_x_is += 2;
d_x_in += 2;
d_n_ic += 2;
d_s_ic += 2;
}
}
void solve_original(size_t iters) {
for (size_t i = 0; i < iters; ++i) {
step_original();
}
}
void solve_new(size_t iters) {
for (size_t i = 0; i < iters; ++i) {
step_new();
}
}
void clear(float *a) {
memset(a, 0, d_nx * d_ny * sizeof(float));
}
int main(int argc, char **argv) {
size_t n = d_nx * d_ny;
d_x = new float[n]; clear(d_x);
d_b = new float[n]; clear(d_b);
d_w = new float[n]; clear(d_w);
d_e = new float[n]; clear(d_e);
d_s = new float[n]; clear(d_s);
d_n = new float[n]; clear(d_n);
if(argc < 3)
printf("app.exe (x)iters (o/n)algo\n");
bool bOriginalStep = (argv[2][0] == 'o');
size_t iters = atoi(argv[1]);
/*printf("Press any key to start!");
_getch();
printf(" Running speed test..\n");*/
__int64 freq, start, end, diff;
if(!::QueryPerformanceFrequency((LARGE_INTEGER*)&freq))
throw "Not supported!";
freq /= 1000000; // microseconds!
{
::QueryPerformanceCounter((LARGE_INTEGER*)&start);
if(bOriginalStep)
solve_original(iters);
else
solve_new(iters);
::QueryPerformanceCounter((LARGE_INTEGER*)&end);
diff = (end - start) / freq;
}
printf("Speed (%s)\t\t: %u\n", (bOriginalStep ? "original" : "new"), diff);
//_getch();
//cout << d_x[0] << endl; // prevent the thing from being optimized away
}
Run it like this:
app.exe 10000 o
app.exe 10000 n
"o" means old code, yours.
"n" is mine, the new one.
My results:
Speed (original):
1515028
1523171
1495988
Speed (new):
966012
984110
1006045
Improvement of about 30%.
The logic behind:
You've been using index counters to access/manipulate.
I use pointers.
While running, breakpoint at a certain calculation code line in VC++'s debugger, and press F8. You'll get the disassembler window.
The you'll see the produced opcodes (assembly code).
Anyway, look:
int *x = ...;
x[3] = 123;
This tells the PC to put the pointer x at a register (say EAX).
The add it (3 * sizeof(int)).
Only then, set the value to 123.
The pointers approach is much better as you can understand, because we cut the adding process, actually we handle it ourselves, thus able to optimize as needed.
I hope this helps.
Sidenote to stackoverflow.com's staff:
Great website, I hope I've heard of it long ago!
For one thing, there seems to be a pipelining issue here. The loop reads from the value in d_x that has just been written to, but apparently it has to wait for that write to complete. Just rearranging the order of the computation, doing something useful while it's waiting, makes it almost twice as fast:
d_x[ic] = d_b[ic]
- d_e[ic] * d_x[ie]
- d_s[ic] * d_x[is] - d_n[ic] * d_x[in]
- d_w[ic] * d_x[iw] /* d_x[iw] has just been written to, process this last */;
It was Eamon Nerbonne who figured this out. Many upvotes to him! I would never have guessed.
Poni's answer looks like the right one to me.
I just want to point out that in this type of problem, you often gain benefits from memory locality. Right now, the b,w,e,s,n arrays are all at separate locations in memory. If you could not fit the problem in L3 cache (mostly in L2), then this would be bad, and a solution of this sort would be helpful:
size_t d_nx = 128, d_ny = 128;
float *d_x;
struct D { float b,w,e,s,n; };
D *d;
void step() {
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
for (size_t y = 1; y < d_ny - 1; ++y) {
for (size_t x = 1; x < d_nx - 1; ++x) {
d_x[ic] = d[ic].b
- d[ic].w * d_x[iw] - d[ic].e * d_x[ie]
- d[ic].s * d_x[is] - d[ic].n * d_x[in];
++ic; ++iw; ++ie; ++is; ++in;
}
ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
}
void solve(size_t iters) { for (size_t i = 0; i < iters; ++i) step(); }
void clear(float *a) { memset(a, 0, d_nx * d_ny * sizeof(float)); }
int main(int argc, char **argv) {
size_t n = d_nx * d_ny;
d_x = new float[n]; clear(d_x);
d = new D[n]; memset(d,0,n * sizeof(D));
solve(atoi(argv[1]));
cout << d_x[0] << endl; // prevent the thing from being optimized away
}
For example, this solution at 1280x1280 is a little less than 2x faster than Poni's solution (13s vs 23s in my test--your original implementation is then 22s), while at 128x128 it's 30% slower (7s vs. 10s--your original is 10s).
(Iterations were scaled up to 80000 for the base case, and 800 for the 100x larger case of 1280x1280.)
I think you're right about memory being a bottleneck. It's a pretty simple loop with just some simple arithmetic per iteration. the ic, iw, ie, is, and in indices seem to be on opposite sides of the matrix so i'm guessing that there's a bunch of cache misses there.
I'm no expert on the subject, but I've seen that there are several academic papers on improving the cache usage of the Gauss-Seidel method.
Another possible optimization is the use of the red-black variant, where points are updated in two sweeps in a chessboard-like pattern. In this way, all updates in a sweep are independent and can be parallelized.
I suggest putting in some prefetch statements and also researching "data oriented design":
void step_original() {
size_t ic = d_ny + 1, iw = d_ny, ie = d_ny + 2, is = 1, in = 2 * d_ny + 1;
float dw_ic, dx_ic, db_ic, de_ic, dn_ic, ds_ic;
float dx_iw, dx_is, dx_ie, dx_in, de_ic, db_ic;
for (size_t y = 1; y < d_ny - 1; ++y) {
for (size_t x = 1; x < d_nx - 1; ++x) {
// Perform the prefetch
// Sorting these statements by array may increase speed;
// although sorting by index name may increase speed too.
db_ic = d_b[ic];
dw_ic = d_w[ic];
dx_iw = d_x[iw];
de_ic = d_e[ic];
dx_ie = d_x[ie];
ds_ic = d_s[ic];
dx_is = d_x[is];
dn_ic = d_n[ic];
dx_in = d_x[in];
// Calculate
d_x[ic] = db_ic
- dw_ic * dx_iw - de_ic * dx_ie
- ds_ic * dx_is - dn_ic * dx_in;
++ic; ++iw; ++ie; ++is; ++in;
}
ic += 2; iw += 2; ie += 2; is += 2; in += 2;
}
}
This differs from your second method since the values are copied to local temporary variables before the calculation is performed.