Hi, I am taking my first steps with CUDA, but I don't think I am getting it right.
I am trying to multiply a two-dimensional array by a vector, but something is not working.
Here is the code I am trying to figure out:
#include <stdio.h>
#include <stdlib.h>
#define N 2
__global__ void Multiply(int A[N][N], int B[N], int C[N]){
int i = threadIdx.x;
int j = threadIdx.y;
int sum = A[i][j] * B[j];
C[i]= sum;
printf("%d,%d ", sum, C[i]);
}
int main(){
int A[N][N] ={ {1,1},
{1,1}
};
int B[N] = {4,6};
int C[N] = {0,0};
int (*aA)[N], (*aB), (*aC);
cudaMalloc((void**)&aA, (N*N)*sizeof(int));
cudaMalloc((void**)&aB, (N)*sizeof(int));
cudaMalloc((void**)&aC, (N)*sizeof(int));
cudaMemcpy(aA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aB, B, (N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aC, C, (N)*sizeof(int), cudaMemcpyHostToDevice);
int numBlocks = 1;
dim3 threadsPerBlock(N,N);
Multiply<<<numBlocks,threadsPerBlock>>>(aA,aB,aC);
cudaMemcpy(C, aC, (N)*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(aA);
cudaFree(aB);
cudaFree(aC);
printf("\n");
system("pause");
}
In this case the output is: 4,6 4,6 6,6 6,6. So basically sum is giving the right values, but C[i] always comes back as 6, even though sum is assigned to it.
What am I doing wrong?
Any time you are having trouble with a CUDA code, it's a good idea to use proper CUDA error checking and run your code with cuda-memcheck. That's just a boilerplate statement I make; it wouldn't actually turn up the issue in the code you have shown in this case.
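If you want a minimal version of that error-checking boilerplate, a macro along these lines is typical (the macro name is my own, not a CUDA API):

#include <stdio.h>
#include <stdlib.h>

// Sketch of a standard error-checking macro: wrap every CUDA runtime call.
#define cudaCheck(call)                                                   \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// usage: cudaCheck(cudaMemcpy(aA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice));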
As was already pointed out in a now-deleted answer, you are not actually summing anything together. Even though you have a variable named sum, it is not the sum of anything: there is no + or summation operation anywhere in your kernel code, so the kernel cannot sum anything together.
To produce a correct result, your kernel depends on multiple threads cooperatively updating a single location (C[i]), and that requires some coordination between threads. Without any coordination, the threads are in a race condition with each other, and the results will be unpredictable. We could sort this out with a parallel reduction, summing together the partial products from the individual threads, or, for simplicity, with an atomicAdd operation, which forces the threads to update (add to) C[i] one by one, so they don't step on each other. Using atomicAdd therefore also supplies the addition (+) operation that is missing from your kernel.
Here's a worked code with those last two items addressed. You can run it with cuda-memcheck to verify behavioral correctness even though it has no explicit error checking:
$ cat t1037.cu
#include <stdio.h>
#include <stdlib.h>
#define N 2
__global__ void Multiply(int A[N][N], int B[N], int C[N]){
int i = threadIdx.x;
int j = threadIdx.y;
int product = A[i][j] * B[j];
atomicAdd(C+i, product);
// printf("%d,%d ", product, C[i]);
}
int main(){
int A[N][N] ={ {1,1},
{1,1}
};
int B[N] = {4,6};
int C[N] = {0,0};
int (*aA)[N], (*aB), (*aC), i;
cudaMalloc((void**)&aA, (N*N)*sizeof(int));
cudaMalloc((void**)&aB, (N)*sizeof(int));
cudaMalloc((void**)&aC, (N)*sizeof(int));
cudaMemcpy(aA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aB, B, (N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aC, C, (N)*sizeof(int), cudaMemcpyHostToDevice);
int numBlocks = 1;
dim3 threadsPerBlock(N,N);
Multiply<<<numBlocks,threadsPerBlock>>>(aA,aB,aC);
cudaMemcpy(C, aC, (N)*sizeof(int), cudaMemcpyDeviceToHost);
for (i=0; i<N; i++){
printf("C[%d] = %d\n",i,C[i]);
}
cudaFree(aA);
cudaFree(aB);
cudaFree(aC);
printf("\n");
}
$ nvcc -o t1037 t1037.cu
$ cuda-memcheck ./t1037
========= CUDA-MEMCHECK
C[0] = 10
C[1] = 10
========= ERROR SUMMARY: 0 errors
$
I am trying to solve about 1,200,000 linear systems (3x3, Ax=B) with CUDA 10.1, in particular using the cuBLAS library. I took a cue from this (useful!) post and rewrote the suggested code in a unified-memory version. The algorithm first performs an LU factorization using cublasDgetrfBatched(), followed by two consecutive invocations of cublasDtrsmBatched(), which solves upper or lower triangular linear systems. The code is attached below. It works correctly up to about 10,000 matrices, in which case it takes ~570 ms to perform the LU factorization (on an NVIDIA GeForce 930MX) and ~311 ms to solve the systems.
My issues/questions are:
Overload issue: it crashes while allocating memory for more than 10k matrices. Why? How can I improve my code so it can solve the whole batch of 1.2 million matrices?
Time issue: my goal is to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?
Would it be possible and/or useful, and if so how, to use 'streams' of batches of 10k matrices?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
#include <ctime>
#include <ratio>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cusolverDn.h>
//#include "Utilities.cuh"
using namespace std;
using namespace std::chrono;
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double** vec, int* pivotArray, int N, int numMatrices) {
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < N; i++) {
double temp = vec[nm][i];
vec[nm][i] = vec[nm][pivotArray[N*i + nm] - 1];
vec[nm][pivotArray[N * i + nm] - 1] = temp;
}
}
}
/************************************/
/* MAIN */
/************************************/
int main() {
const int N = 3;
const int numMatrices = 10000; // I want 1200000
// random generator to fill matrices and coefficients
random_device device;
mt19937 generator(device());
uniform_real_distribution<double> distribution(1., 5.);
//ALLOCATE MEMORY - using unified memory
double** h_A;
cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
}
double** h_b;
cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
cudaMallocManaged(&(h_b[nm]), sizeof(double) * N );
}
cout << " memory allocated" << endl;
// FILL MATRICES
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
h_A[nm][j * N + i] = distribution(generator);
}
}
}
cout << " Matrix filled " << endl;
// FILL COEFFICIENTS
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < N; i++) {
h_b[nm][i] = distribution(generator);
}
}
cout << " Coeff. vector filled " << endl;
cout << endl;
// --- CUDA solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
int* PivotArray;
cudaMallocManaged(&PivotArray, N * numMatrices * sizeof(int));
int* infoArray;
cudaMallocManaged(&infoArray, numMatrices * sizeof(int));
//CUBLAS LU SOLVER
high_resolution_clock::time_point t1 = high_resolution_clock::now();
cublasDgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numMatrices);
cudaDeviceSynchronize();
high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
cout << "It took " << time_span.count() * 1000. << " milliseconds." << endl;
for (int i = 0; i < numMatrices; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}
// rearrange coefficient
// (temporarily on CPU, this step will be on a GPU Kernel as well)
high_resolution_clock::time_point tA = high_resolution_clock::now();
rearrange(h_b, PivotArray, N, numMatrices);
high_resolution_clock::time_point tB = high_resolution_clock::now();
duration<double> time_spanA = duration_cast<duration<double>>(tB - tA);
cout << "rearrangement took " << time_spanA.count() * 1000. << " milliseconds." << endl;
//INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple right-hand sides
// --- Function overrides b as a result
const double alpha = 1.f;
high_resolution_clock::time_point t3 = high_resolution_clock::now();
cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
cudaDeviceSynchronize();
high_resolution_clock::time_point t4 = high_resolution_clock::now();
duration<double> time_span2 = duration_cast<duration<double>>(t4 - t3);
cout << "second step took " << time_span2.count() * 1000. << " milliseconds." << endl;
// --- Free resources
if (h_A) cudaFree(h_A);
if (h_b) cudaFree(h_b);
cudaDeviceReset();
return 0;
}
Overload issue: it crashes while allocating memory for more than 10k matrices. Why? How can I improve my code so it can solve the whole batch of 1.2 million matrices?
In my opinion, the biggest problem in your code is that you are making horribly inefficient use of managed memory in these key allocation loops:
//ALLOCATE MEMORY - using unified memory
double** h_A;
cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
}
double** h_b;
cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
for (int nm = 0; nm < numMatrices; nm++) {
cudaMallocManaged(&(h_b[nm]), sizeof(double) * N );
}
The problem is that each call to cudaMallocManaged has a minimum granularity. That means that if you request to allocate 1 byte, it will actually use up something like 4 KB of memory (I believe that is the Linux allocation granularity; it looks like you are on Windows, and I believe the Windows allocation granularity may be larger). In addition, this creates a huge, inefficient data-transfer load on the managed-memory subsystem when you launch a kernel (and kernels will be launched by your cuBLAS calls).
A much better way to do this is to do a single large allocation, rather than the allocation-in-a-loop, and then just subdivide that allocation using pointer arithmetic. The code could look like this:
//ALLOCATE MEMORY - using unified memory
double** h_A;
cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
cudaMallocManaged(&(h_A[0]), sizeof(double)*numMatrices*N*N);
for (int nm = 1; nm < numMatrices; nm++) {
h_A[nm] = h_A[nm-1]+ N * N;
}
double** h_b;
cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
cudaMallocManaged(&(h_b[0]), sizeof(double) * numMatrices * N);
for (int nm = 1; nm < numMatrices; nm++) {
h_b[nm] = h_b[nm-1] + N;
}
Another benefit of this is that the allocation process runs quite a bit faster.
Time issue: my goal is to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?
With that change to your code, I am able to run successfully on a 1GB GPU (GeForce GT640), with:
const int numMatrices = 1200000;
with output like this:
$ ./t81
memory allocated
Matrix filled
Coeff. vector filled
It took 70.3032 milliseconds.
rearrangement took 60.02 milliseconds.
second step took 156.067 milliseconds.
Your GPU may be somewhat slower, but I think the overall timing should easily come in at less than 1 second.
Would it be possible and/or useful, and if so how, to use 'streams' of batches of 10k matrices?
With the above change, I don't think you need to worry about this. Streams won't help here with overlap of compute operations. They could help with copy/compute overlap (although maybe not much on your GPU), but this would be hard to architect on Windows with managed memory. For Windows usage, I would suggest switching to the ordinary CUDA separation of host and device memory, if you want to explore copy/compute overlap.
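If you do explore that, the usual pattern is double buffering with pinned host memory. Here is a rough sketch (the batch size, variable names, and the compute step are placeholders of mine, not code from the question):

const int BATCH = 10000;                    // assumed batch size
const int NSTREAMS = 2;                     // double buffering
const int numBatches = 1200000 / BATCH;
cudaStream_t streams[NSTREAMS];
double *h_buf[NSTREAMS], *d_buf[NSTREAMS];
for (int s = 0; s < NSTREAMS; s++) {
    cudaStreamCreate(&streams[s]);
    cudaMallocHost(&h_buf[s], BATCH * 3 * 3 * sizeof(double)); // pinned host memory
    cudaMalloc(&d_buf[s], BATCH * 3 * 3 * sizeof(double));
}
for (int b = 0; b < numBatches; b++) {
    int s = b % NSTREAMS;
    // ... fill h_buf[s] with the next BATCH matrices, then:
    cudaMemcpyAsync(d_buf[s], h_buf[s], BATCH * 3 * 3 * sizeof(double),
                    cudaMemcpyHostToDevice, streams[s]);
    // issue the batched factor/solve on streams[s] (via cublasSetStream),
    // then cudaMemcpyAsync the results back on the same stream
}
cudaDeviceSynchronize();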
As an aside, you may be able to get a set of cuBLAS calls that will do the work even more quickly by using direct inversion. cuBLAS has a batched direct-inversion method. I normally wouldn't suggest this for the solution of linear equations, but it may be something to consider for a set of 3x3 or 4x4 inversions, where you could easily check for singularity with the determinant method. Here is an example.
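For reference, the batched inversion entry point is cublas<t>matinvBatched(), which is restricted to small matrices (n <= 32). A minimal sketch of the call, where h_Ainv is an assumed pointer array allocated analogously to the question's h_A:

// Sketch: direct batched inversion; valid only for n <= 32, which is fine for 3x3.
// infoArray[i] != 0 flags a singular (non-invertible) matrix i.
cublasDmatinvBatched(cublas_handle, N,
                     h_A, N,            // input matrices, leading dimension N
                     h_Ainv, N,         // output inverses, leading dimension N
                     infoArray, numMatrices);
cudaDeviceSynchronize();
// each x = A^-1 * b can then be formed with a small batched multiply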
I have this code:
#include <iostream>
#include <vector>
#include <ctime>
using namespace std;
void foo(int n, double* a, double* b, double *c, double*d, double* e, double* f, double* g)
{
for (int i = 0; i < n; ++i)
a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}
int main()
{
int m = 1001001;
vector<double> a(m), b(m), c(m), d(m), f(m);
clock_t start = std::clock();
for (int i = 0; i < 1000; ++i)
foo(1000000, &a[0], &b[0], &c[0], &d[0], &d[1], &f[0], &f[1000] );
double duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout << "Finished in " << duration << " seconds [CPU Clock] " << endl;
}
Can you give me a workable example of how to optimize it for better performance? Any compiler is fine, e.g. the Intel C++ compiler or the Visual C++ compiler. Please also suggest a CPU well suited to this kind of job.
The code in question is useless: it does lots of calculations with uninitialised variables and then ignores the results. Compilers are getting more and more clever at figuring out that kind of thing and removing all of the code for it, so don't be surprised if code like this doesn't take any time at all.
In C, you would declare the pointers as const double* restrict (except a, which would be double* restrict), telling the compiler that all pointers except the first one point to data that isn't going to be modified during the loop; this allows the compiler to vectorise. Unfortunately, restrict is not a standard C++ feature, afaik.
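Most compilers do offer it as an extension in C++, though (GCC and Clang spell it __restrict__, MSVC __restrict). A sketch of the signature under that assumption:

// Sketch using the non-standard __restrict__ extension. It is safe for the
// call in the question: a is only written through a, and the overlapping
// read-only pointers (d/e, f/g) are never written at all.
void foo(int n, double* __restrict__ a, const double* __restrict__ b,
         const double* __restrict__ c, const double* __restrict__ d,
         const double* __restrict__ e, const double* __restrict__ f,
         const double* __restrict__ g)
{
    for (int i = 0; i < n; ++i)
        a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}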
If this was your real problem, you would just swap the inner and outer loop, and remove loop invariants like this:
void foo(int iter, int n, double* a, double* b, double *c, double*d, double* e, double* f, double* g)
{
for (int i = 0; i < n; ++i) {
double xa = a [i];
double xb = b [i];
double xr = c[i] * (d[i] + e[i] + f[i] + g[i]);
for (int j = 0; j < iter; ++j)
xa = xb * xa + xr;
a [i] = xa;
}
}
You'd probably do four iterations in parallel to avoid the latency.
But in a real-life situation, you would observe that each call reads about 40 MB, which is way beyond any cache, so you are limited by RAM speed. The usual solution is to split the work into smaller parts, for example 500 elements at a time, so that everything fits into L1 cache, and then perform the operation on the same data 1000 times.
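Since each a[i] here evolves independently of the others, the iteration and chunk loops can simply be interchanged. A sketch of such a driver, reusing foo() and the vectors from the question:

// Sketch: run all 1000 iterations on one L1-sized chunk before moving on.
const int CHUNK = 500;                         // tune to your L1 size
for (int base = 0; base < 1000000; base += CHUNK)
    for (int it = 0; it < 1000; ++it)
        foo(CHUNK, &a[base], &b[base], &c[base],
            &d[base], &d[base + 1], &f[base], &f[base + 1000]);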
On Apple clang, I tried:
using __restrict__ on the arguments to convince the compiler that there was no aliasing.
result: no change
distributing the computation over 8 threads in foo()
result: computation time increased from ~3 seconds to ~18 seconds!
using #pragma omp parallel for
result: the compiler ignored me and stayed with the original solution. ~3 seconds.
setting the command line option -march=native to allow the CPU's full awesomeness to shine
result: different assembler output (vectorisation applied), but the run time was still unchanged at ~3 s
initial conclusions:
This problem is bound by memory access and not by the CPU.
You could experiment with prefetching the vectors into cache lines and then operating on them in lumps of 8 (8 doubles fill one cache line), as sketched below.
Make sure that while you are operating on x[i] to x[i+7] you are prefetching x[i+8] to x[i+15].
This might not help, as you are using additions and multiplications which are so fast that your RAM may not be able to keep up anyway.
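Still, a sketch of the prefetch idea using GCC/Clang's __builtin_prefetch (an assumption; the helper name is mine):

// Sketch: process one cache line (8 doubles) while prefetching the next.
void foo_prefetch(int n, double* a, const double* b, const double* c,
                  const double* d, const double* e, const double* f,
                  const double* g)
{
    for (int i = 0; i < n; i += 8) {
        __builtin_prefetch(a + i + 8);   // next line of each stream
        __builtin_prefetch(b + i + 8);
        __builtin_prefetch(c + i + 8);
        __builtin_prefetch(d + i + 8);
        __builtin_prefetch(f + i + 8);
        for (int j = i; j < i + 8 && j < n; ++j)
            a[j] = b[j] * a[j] + c[j] * (d[j] + e[j] + f[j] + g[j]);
    }
}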
I think you should use multithreading. Change foo to take fromIndex and toIndex instead of n, and distribute the vectors over threads:
void foo(int fromIndex, int toIndex, double* a, double* b, double *c, double*d, double* e, double* f, double* g)
{
for (int i = fromIndex; i < toIndex; ++i)
a[i] = b[i] * a[i] + c[i] * (d[i] + e[i] + f[i] + g[i]);
}
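A sketch of the distribution step with std::thread (the driver name and layout are mine):

#include <algorithm>
#include <thread>
#include <vector>

// Sketch: split [0, n) across the available hardware threads.
void foo_parallel(int n, double* a, double* b, double* c, double* d,
                  double* e, double* f, double* g)
{
    unsigned nthreads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    int chunk = n / (int)nthreads;
    for (unsigned t = 0; t < nthreads; ++t) {
        int from = (int)t * chunk;
        int to = (t == nthreads - 1) ? n : from + chunk;
        workers.emplace_back(foo, from, to, a, b, c, d, e, f, g);
    }
    for (auto& w : workers)
        w.join();
}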
I'd like to parallelize the following code, especially these for loops, since they are the most expensive part.
for (i = 0; i < d1; i++)
for (j = 0; j < d3; j++)
for (k = 0; k < d2; k++)
C[i][j] = C[i][j] + A[i][k] * B[k][j];
This is the first time I have tried parallelizing code using OpenMP. I have tried several things, but I always end up with a worse runtime than the serial version.
It would be great if you could tell me whether there is something wrong with the code or the pragmas...
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
//#include <stdint.h>
// ---------------------------------------------------------------------------
// allocate space for empty matrix A[row][col]
// access to matrix elements possible with:
// - A[row][col]
// - A[0][row*col]
float **alloc_mat(int row, int col)
{
float **A1, *A2;
A1 = (float **)calloc(row, sizeof(float *)); // pointer on rows
A2 = (float *)calloc(row*col, sizeof(float)); // all matrix elements
//#pragma omp parallel for
for (int i=0; i<row; i++)
A1[i] = A2 + i*col;
return A1;
}
// ---------------------------------------------------------------------------
// random initialisation of matrix with values [0..9]
void init_mat(float **A, int row, int col)
{
//#pragma omp parallel for
for (int i = 0; i < row*col; i++)
A[0][i] = (float)(rand() % 10);
}
// ---------------------------------------------------------------------------
// DEBUG FUNCTION: printout of all matrix elements
void print_mat(float **A, int row, int col, char *tag)
{
int i, j;
printf("Matrix %s:\n", tag);
for (i = 0; i < row; i++)
{
//#pragma omp parallel for
for (j=0; j<col; j++)
printf("%6.1f ", A[i][j]);
printf("\n");
}
}
// ---------------------------------------------------------------------------
int main(int argc, char *argv[])
{
float **A, **B, **C; // matrices
int d1, d2, d3; // dimensions of matrices
int i, j, k; // loop variables
double start, end;
start = omp_get_wtime();
/* print user instruction */
if (argc != 4)
{
printf ("Matrix multiplication: C = A x B\n");
printf ("Usage: %s <NumRowA>; <NumColA> <NumColB>\n",argv[0]);
return 0;
}
/* read user input */
d1 = atoi(argv[1]); // rows of A and C
d2 = atoi(argv[2]); // cols of A and rows of B
d3 = atoi(argv[3]); // cols of B and C
printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n",
d1, d3, d1, d2, d2, d3);
/* prepare matrices */
A = alloc_mat(d1, d2);
init_mat(A, d1, d2);
B = alloc_mat(d2, d3);
init_mat(B, d2, d3);
C = alloc_mat(d1, d3); // no initialisation of C,
//because it gets filled by matmult
/* serial version of matmult */
printf("Perform matrix multiplication...\n");
int sum;
//#pragma omp parallel
//{
#pragma omp parallel for collapse(3) schedule(guided)
for (i = 0; i < d1; i++)
for (j = 0; j < d3; j++)
for (k = 0; k < d2; k++){
C[i][j] = C[i][j] + A[i][k] * B[k][j];
}
//}
end = omp_get_wtime();
/* test output */
print_mat(A, d1, d2, "A");
print_mat(B, d2, d3, "B");
print_mat(C, d1, d3, "C");
printf("This task took %f seconds\n", end-start);
printf ("\nDone.\n");
return 0;
}
As @genisage suggested in the comments, the matrices are likely small enough that the overhead of initializing the additional threads is greater than the time saved by computing the multiplication in parallel. Consider the following plot, however, with data that I obtained by running your code with and without OpenMP.
I used square matrices ranging from n=10 to n=1000. Notice how somewhere between n=50 and n=100 the parallel version becomes faster.
There are other issues to consider, however, when trying to write fast matrix multiplication, and they mostly have to do with using the cache effectively. First, you allocate your entire matrix contiguously (which is good), but you still use two pointer indirections to access the data, which is unnecessary. Also, your matrices are stored in row-major format, which means you access the data in A and C contiguously, but not in B. Instead of explicitly storing B and multiplying a row of A by a column of B, you would get a faster multiplication by storing B transposed and multiplying a row of A elementwise with a row of B transposed.
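A sketch of that variant with the question's float** matrices (Bt is an assumed, pre-filled transpose of B, i.e. Bt[j][k] = B[k][j]):

// Sketch: both operands are now traversed contiguously (row-major).
void matmult_transposed(float **A, float **Bt, float **C,
                        int d1, int d2, int d3)
{
    for (int i = 0; i < d1; i++)
        for (int j = 0; j < d3; j++) {
            float sum = 0.0f;
            for (int k = 0; k < d2; k++)
                sum += A[i][k] * Bt[j][k];
            C[i][j] = sum;
        }
}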
This is an optimization focused only on A*B, however, and there may be other places in your code where storing B is better than storing B transposed; in those cases, doing the matrix multiplication with blocking can often lead to better cache utilization.
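A sketch of a blocked version (the block size BS is a tuning guess of mine, not a value from the answer):

// Sketch: compute C in BS x BS tiles so each tile's working set stays in
// cache. C starts zeroed (alloc_mat uses calloc), so += accumulates correctly.
#define BS 64
void matmult_blocked(float **A, float **B, float **C, int d1, int d2, int d3)
{
    for (int ii = 0; ii < d1; ii += BS)
        for (int kk = 0; kk < d2; kk += BS)
            for (int jj = 0; jj < d3; jj += BS)
                for (int i = ii; i < ii + BS && i < d1; i++)
                    for (int k = kk; k < kk + BS && k < d2; k++)
                        for (int j = jj; j < jj + BS && j < d3; j++)
                            C[i][j] += A[i][k] * B[k][j];
}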
I am having trouble understanding a bug in a simple CUDA kernel. I have shrunk my kernel down to the minimum that still shows the error.
I have a "Polygon" class that just stores a number of points. I have a function that "adds a point" (it just increments a counter), and I add 4 points to every polygon in my array of polygons. Finally, I call a function that updates the number of points in a loop. If, in this loop, I call new_nbpts++ once, I obtain the expected answer: all polygons have 4 points. If in the same loop I call new_nbpts++ a second time, then my polygons have a garbage number of points (4194304) instead of the 8 I would expect.
I assume there is something I have misunderstood.
Complete kernel:
#include <stdio.h>
#include <cuda.h>
class Polygon {
public:
__device__ Polygon():nbpts(0){};
__device__ void addPt() {
nbpts++;
};
__device__ void update() {
int new_nbpts = 0;
for (int i=0; i<nbpts; i++) {
new_nbpts++;
new_nbpts++; // calling that a second time screws up my result
}
nbpts = new_nbpts;
}
int nbpts;
};
__global__ void cut_poly(Polygon* polygons, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx>=N) return;
Polygon pol;
pol.addPt();
pol.addPt();
pol.addPt();
pol.addPt();
for (int i=0; i<N; i++) {
pol.update();
}
polygons[idx] = pol;
}
int main(int argc, char* argv[])
{
const int N = 20;
Polygon p_h[N], *p_d;
cudaError_t err = cudaMalloc((void **) &p_d, N * sizeof(Polygon));
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cut_poly <<< n_blocks, block_size >>> (p_d, N);
cudaMemcpy(p_h, p_d, sizeof(Polygon)*N, cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++)
printf("%d\n", p_h[i].nbpts);
cudaFree(p_d);
return 0;
}
Why are you doing this at the end of your kernel:
for (int i=0; i<N; i++) {
pol.update();
}
?
Remember that each thread has its own instance of:
Polygon pol;
If you want to update each thread's instance of pol at the end of the kernel, you only need to do:
pol.update();
Now, what happens in your case?
Suppose your update() code only has one:
new_nbpts++;
in it.
Your for loop of 0 to N-1 calling pol.update() will, on each iteration:
set new_nbpts to zero
increment new_nbpts a total of nbpts times.
replace the value of nbpts with new_nbpts
Hopefully you can see this has the effect of leaving nbpts unchanged.
Even after N iterations of the for loop that is calling pol.update(), the value of nbpts is unchanged.
Now what happens if I have:
new_nbpts++;
new_nbpts++;
in my update() method? Then on each call of pol.update(), I will:
set new_nbpts to zero
increase new_nbpts by two a total of nbpts times
replace the value of nbpts with new nbpts
Hopefully you can see this has the effect of doubling nbpts on each call of pol.update()
Now, since you are calling pol.update() N times in each thread, you are doubling the starting value of nbpts N times, i.e. nbpts * 2^N. Since nbpts starts out (in this case) as 4, we have 4 * 2^20 = 4194304.
I'm not really sure what you're after with all this, but my guess is that you were running that for loop at the end of the kernel thinking you would update all the different instances of Polygon pol that way. But that's not how to do it; all you need is a single
pol.update();
at the end of the kernel, if that was your intention.
I'm performing matrix multiplication with this simple algorithm. To be more flexible, I used objects for the matrices, which contain dynamically created arrays.
Compared to my first solution with static arrays, this is 4 times slower. What can I do to speed up the data access? I don't want to change the algorithm.
matrix mult_std(matrix a, matrix b) {
matrix c(a.dim(), false, false);
for (int i = 0; i < a.dim(); i++)
for (int j = 0; j < a.dim(); j++) {
int sum = 0;
for (int k = 0; k < a.dim(); k++)
sum += a(i,k) * b(k,j);
c(i,j) = sum;
}
return c;
}
EDIT
I corrected my question above! I added the full source code below and tried some of your advice:
swapped the k and j loop iterations -> performance improvement
declared dim() and operator()() as inline -> performance improvement
passing arguments by const reference -> performance loss! Why? So I don't use it.
The performance is now nearly the same as it was in the old program. Maybe there should be a bit more improvement.
But I have another problem: I get a memory error in the function mult_strassen(...). Why?
terminate called after throwing an instance of 'std::bad_alloc'
what(): std::bad_alloc
OLD PROGRAM
main.c http://pastebin.com/qPgDWGpW
c99 main.c -o matrix -O3
NEW PROGRAM
matrix.h http://pastebin.com/TYFYCTY7
matrix.cpp http://pastebin.com/wYADLJ8Y
main.cpp http://pastebin.com/48BSqGJr
g++ main.cpp matrix.cpp -o matrix -O3
EDIT
Here are some results: a comparison between the standard algorithm (std), the version with swapped j and k loops (swap), and the blocked algorithm with block size 13 (block).
Speaking of speed-up, your function will be more cache-friendly if you swap the order of the k and j loop iterations:
matrix mult_std(matrix a, matrix b) {
matrix c(a.dim(), false, false);
for (int i = 0; i < a.dim(); i++)
for (int k = 0; k < a.dim(); k++)
for (int j = 0; j < a.dim(); j++) // swapped order
c(i,j) += a(i,k) * b(k,j);
return c;
}
That's because a k index on the inner-most loop will cause a cache miss in b on every iteration. With j as the inner-most index, both c and b are accessed contiguously, while a stays put.
Make sure that the members dim() and operator()() are declared inline, and that compiler optimization is turned on. Then play with options like -funroll-loops (on gcc).
How big is a.dim(), anyway? If a row of the matrix doesn't fit in just a couple of cache lines, you'd be better off with a block access pattern instead of a full row at a time, as sketched below.
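A sketch of such a blocked variant over the question's matrix class (BS is a tuning guess of mine, and I'm assuming c starts zeroed, as the swapped version above already requires):

matrix mult_blocked(matrix a, matrix b) {
    matrix c(a.dim(), false, false);
    const int n = a.dim(), BS = 32;    // BS: tune to your cache
    for (int ii = 0; ii < n; ii += BS)
        for (int kk = 0; kk < n; kk += BS)
            for (int jj = 0; jj < n; jj += BS)
                for (int i = ii; i < ii + BS && i < n; i++)
                    for (int k = kk; k < kk + BS && k < n; k++)
                        for (int j = jj; j < jj + BS && j < n; j++)
                            c(i,j) += a(i,k) * b(k,j);
    return c;
}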
You say you don't want to modify the algorithm, but what does that mean exactly?
Does unrolling the loop count as "modifying the algorithm"? What about using SSE/VMX whichever SIMD instructions are available on your CPU? What about employing some form of blocking to improve cache locality?
If you don't want to restructure your code at all, I doubt there's more you can do than the changes you've already made. Everything else becomes a trade-off of minor changes to the algorithm to achieve a performance boost.
Of course, you should still take a look at the asm generated by the compiler. That'll tell you much more about what can be done to speed up the code.
Use SIMD if you can. You absolutely have to use something like VMX registers if you do extensive vector math assuming you are using a platform that is capable of doing so, otherwise you will incur a huge performance hit.
Don't pass complex types like matrix by value - use a const reference.
Don't call a function in each iteration - cache dim() outside your loops.
Although compilers typically optimize this efficiently, it's often a good idea to have the caller provide a matrix reference for your function to fill out, rather than returning a matrix by value, which in some cases may result in an expensive copy operation. The three suggestions combined might look like the sketch below.
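A sketch (the signature is mine, assuming const-qualified accessors like those in the test code further down and a zero-initialized c):

// Sketch: const-reference inputs, caller-provided output, dim() cached.
void mult_std(matrix const& a, matrix const& b, matrix& c) {
    const int n = a.dim();             // cached outside the loops
    for (int i = 0; i < n; i++)
        for (int k = 0; k < n; k++)
            for (int j = 0; j < n; j++)
                c(i,j) += a(i,k) * b(k,j);
}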
Here is my implementation of the fast simple multiplication algorithm for square float matrices (2D arrays). It should be a little faster than chrisaycock's code since it spares some increments.
static void fastMatrixMultiply(const int dim, float* dest, const float* srcA, const float* srcB)
{
memset( dest, 0x0, dim * dim * sizeof(float) );
for( int i = 0; i < dim; i++ ) {
for( int k = 0; k < dim; k++ )
{
const float* a = srcA + i * dim + k;
const float* b = srcB + k * dim;
float* c = dest + i * dim;
float* cMax = c + dim;
while( c < cMax )
{
*c++ += (*a) * (*b++);
}
}
}
}
Pass the parameters by const reference to start with:
matrix mult_std(matrix const& a, matrix const& b) {
To give you more details we need to know the details of the other methods used.
And to answer why the original method is 4 times faster we would need to see the original method.
The problem is undoubtedly yours as this problem has been solved a million times before.
Also when asking this type of question ALWAYS provide compilable source with appropriate inputs so we can actually build and run the code and see what is happening.
Without the code we are just guessing.
Edit
After fixing the main bug in the original C code (a buffer overrun), I have updated the code to run the tests side by side for a fair comparison:
// INCLUDES -------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
// DEFINES -------------------------------------------------------------------
// The original problem was here. MAXDIM was 500, but we were using arrays
// that had a size of 512 in each dimension. This caused a buffer overrun that
// clobbered the dim variable and reset it to 0. As a result, the
// multiplication loop fell out before it had finished (the loop was
// controlled by this global variable).
//
// Everything now uses the MAXDIM variable directly.
// This of course gives the C code an advantage, as the compiler can optimize
// the loops explicitly for the fixed-size arrays and thus unroll them more
// efficiently.
#define MAXDIM 512
#define RUNS 10
// MATRIX FUNCTIONS ----------------------------------------------------------
class matrix
{
public:
matrix(int dim)
: dim_(dim)
{
data_ = new int[dim_ * dim_];
}
inline int dim() const {
return dim_;
}
inline int& operator()(unsigned row, unsigned col) {
return data_[dim_*row + col];
}
inline int operator()(unsigned row, unsigned col) const {
return data_[dim_*row + col];
}
private:
int dim_;
int* data_;
};
// ---------------------------------------------------
void random_matrix(int (&matrix)[MAXDIM][MAXDIM]) {
for (int r = 0; r < MAXDIM; r++)
for (int c = 0; c < MAXDIM; c++)
matrix[r][c] = rand() % 100;
}
void random_matrix_class(matrix& matrix) {
for (int r = 0; r < matrix.dim(); r++)
for (int c = 0; c < matrix.dim(); c++)
matrix(r, c) = rand() % 100;
}
template<typename T, typename M>
float run(T f, M const& a, M const& b, M& c)
{
float time = 0;
for (int i = 0; i < RUNS; i++) {
struct timeval start, end;
gettimeofday(&start, NULL);
f(a,b,c);
gettimeofday(&end, NULL);
long s = start.tv_sec * 1000 + start.tv_usec / 1000;
long e = end.tv_sec * 1000 + end.tv_usec / 1000;
time += e - s;
}
return time / RUNS;
}
// SEQ MULTIPLICATION ----------------------------------------------------------
void mult_seq(int const(&a)[MAXDIM][MAXDIM], int const(&b)[MAXDIM][MAXDIM], int (&z)[MAXDIM][MAXDIM]) {
for (int r = 0; r < MAXDIM; r++) {
for (int c = 0; c < MAXDIM; c++) {
z[r][c] = 0;
for (int i = 0; i < MAXDIM; i++)
z[r][c] += a[r][i] * b[i][c];
}
}
}
void mult_std(matrix const& a, matrix const& b, matrix& z) {
for (int r = 0; r < a.dim(); r++) {
for (int c = 0; c < a.dim(); c++) {
z(r,c) = 0;
for (int i = 0; i < a.dim(); i++)
z(r,c) += a(r,i) * b(i,c);
}
}
}
// MAIN ------------------------------------------------------------------------
using namespace std;
int main(int argc, char* argv[]) {
srand(time(NULL));
int matrix_a[MAXDIM][MAXDIM];
int matrix_b[MAXDIM][MAXDIM];
int matrix_c[MAXDIM][MAXDIM];
random_matrix(matrix_a);
random_matrix(matrix_b);
printf("%d ", MAXDIM);
printf("%f \n", run(mult_seq, matrix_a, matrix_b, matrix_c));
matrix a(MAXDIM);
matrix b(MAXDIM);
matrix c(MAXDIM);
random_matrix_class(a);
random_matrix_class(b);
printf("%d ", MAXDIM);
printf("%f \n", run(mult_std, a, b, c));
return 0;
}
The results now:
$ g++ t1.cpp
$ ./a.exe
512 1270.900000
512 3308.800000
$ g++ -O3 t1.cpp
$ ./a.exe
512 284.900000
512 622.000000
From this we see that the C code is about twice as fast as the C++ code when fully optimized. I cannot see the reason for it in the code.
I'm taking a wild guess here, but if dynamically allocating the matrices makes such a huge difference, maybe the problem is fragmentation. Then again, I have no idea how the underlying matrix is implemented.
Why don't you allocate the memory for the matrices by hand, ensuring it's contiguous, and build the pointer structure yourself?
Also, does the dim() method have any extra complexity? I would declare it inline, too.