Strange CUDA bug - C++

I am having trouble understanding a bug in a simple CUDA kernel. I have shrunk the kernel down to the minimum that still shows the error.
I have a "Polygon" class that just stores a number of points. I have a function that "adds a point" (it just increments the counter), and I add 4 points to every polygon in my array of polygons. Finally, I call a function that updates the number of points using a loop. If, in this loop, I call new_nbpts++ once, I obtain the expected answer: all polygons have 4 points. If, in the same loop, I call new_nbpts++ a second time, then my polygons end up with a garbage number of points (4194304), which is not correct (I should get 8).
I expect there is something I have misunderstood, though.
Complete kernel:
#include <stdio.h>
#include <cuda.h>

class Polygon {
public:
    __device__ Polygon() : nbpts(0) {};
    __device__ void addPt() {
        nbpts++;
    };
    __device__ void update() {
        int new_nbpts = 0;
        for (int i = 0; i < nbpts; i++) {
            new_nbpts++;
            new_nbpts++; // calling that a second time screws up my result
        }
        nbpts = new_nbpts;
    }
    int nbpts;
};
__global__ void cut_poly(Polygon* polygons, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    Polygon pol;
    pol.addPt();
    pol.addPt();
    pol.addPt();
    pol.addPt();

    for (int i = 0; i < N; i++) {
        pol.update();
    }

    polygons[idx] = pol;
}
int main(int argc, char* argv[])
{
    const int N = 20;
    Polygon p_h[N], *p_d;

    cudaError_t err = cudaMalloc((void **) &p_d, N * sizeof(Polygon));

    int block_size = 4;
    int n_blocks = N/block_size + (N % block_size == 0 ? 0 : 1);
    cut_poly <<< n_blocks, block_size >>> (p_d, N);

    cudaMemcpy(p_h, p_d, sizeof(Polygon) * N, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++)
        printf("%d\n", p_h[i].nbpts);

    cudaFree(p_d);
    return 0;
}

Why are you doing this at the end of your kernel:
for (int i=0; i<N; i++) {
    pol.update();
}
?
Remember each thread has its own instance of:
Polygon pol;
If you want to update each thread's instance of pol at the end of the kernel, you only need to do:
pol.update();
Now, what happens in your case?
Suppose your update() code only has one:
new_nbpts++;
in it.
Your for loop of 0 to N-1 calling pol.update() will, on each iteration:
1. set new_nbpts to zero
2. increment new_nbpts a total of nbpts times
3. replace the value of nbpts with new_nbpts
Hopefully you can see this has the effect of leaving nbpts unchanged.
Even after N iterations of the for loop that is calling pol.update(), the value of nbpts is unchanged.
Now what happens if I have:
new_nbpts++;
new_nbpts++;
in my update() method? Then on each call of pol.update(), I will:
1. set new_nbpts to zero
2. increase new_nbpts by two a total of nbpts times
3. replace the value of nbpts with new_nbpts
Hopefully you can see this has the effect of doubling nbpts on each call of pol.update().
Now, since you are calling pol.update() N times in each thread, you are doubling the starting value of nbpts N times, i.e. nbpts * 2^N. Since nbpts starts out (in this case) as 4, we have 4 * 2^20 = 4194304.
I'm not really sure what you're after with all this, but my guess is you were running that for loop at the end of the kernel thinking you would update all the different instances of Polygon pol that way. But that's not how to do it; all you need is a single
pol.update();
at the end of the kernel, if that was your intention.
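For reference, a minimal sketch of the kernel with the loop removed, assuming the intent was indeed to update each thread's own polygon once (with the doubled increment in update(), this prints 8 for every polygon):

__global__ void cut_poly(Polygon* polygons, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    Polygon pol;          // each thread has its own local instance
    pol.addPt();
    pol.addPt();
    pol.addPt();
    pol.addPt();

    pol.update();         // update this thread's polygon exactly once

    polygons[idx] = pol;  // write the result back to global memory
}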


Need Help Understanding OpenMP Matrix Multiplication C++ code

Here is my matrix multiplication C++ OpenMP code. I am trying to use OpenMP to optimize the program. The sequential version took 7 seconds, but when I added the OpenMP statements it only got faster by 3 seconds. I thought it was going to get much faster and I don't understand if I'm doing it right.
The OpenMP statements are in the fill_random function and in the matrix multiplication triple for loop section in main.
I would appreciate any help or advice you can give to understand this!
#include <iostream>
#include <cassert>
#include <omp.h>
#include <chrono>
using namespace std::chrono;
double** fill_random(int rows, int cols)
{
    double** mat = new double* [rows]; //Allocate rows.
    #pragma omp parallell collapse(2)
    for (int i = 0; i < rows; ++i)
    {
        mat[i] = new double[cols]; // added
        for (int j = 0; j < cols; ++j)
        {
            mat[i][j] = rand() % 10;
        }
    }
    return mat;
}
double** create_matrix(int rows, int cols)
{
    double** mat = new double* [rows]; //Allocate rows.
    for (int i = 0; i < rows; ++i)
    {
        mat[i] = new double[cols](); //Allocate each row and zero initialize..
    }
    return mat;
}

void destroy_matrix(double** &mat, int rows)
{
    if (mat)
    {
        for (int i = 0; i < rows; ++i)
        {
            delete[] mat[i]; //delete each row..
        }
        delete[] mat; //delete the rows..
        mat = nullptr;
    }
}
int main()
{
    int rowsA = 1000; // number of rows
    int colsA = 1000; // number of columns
    double** matA = fill_random(rowsA, colsA);

    int rowsB = 1000; // number of rows
    int colsB = 1000; // number of columns
    double** matB = fill_random(rowsB, colsB);

    //Checking matrix multiplication qualification
    assert(colsA == rowsB);

    double** matC = create_matrix(rowsA, colsB);

    //measure the multiply only
    const auto start = high_resolution_clock::now();

    //Multiplication
    #pragma omp parallel for
    for (int i = 0; i < rowsA; ++i)
    {
        for (int j = 0; j < colsB; ++j)
        {
            for (int k = 0; k < colsA; ++k) //ColsA..
            {
                matC[i][j] += matA[i][k] * matB[k][j];
            }
        }
    }

    const auto stop = high_resolution_clock::now();
    const auto duration = duration_cast<seconds>(stop - start);
    std::cout << "Time taken by function: " << duration.count() << " seconds" << std::endl;

    //Clean up..
    destroy_matrix(matA, rowsA);
    destroy_matrix(matB, rowsB);
    destroy_matrix(matC, rowsA);
    return 0;
}
Your problem is rather small.
The collapse in the matrix creation does nothing because the loops are not perfectly nested. On the other hand, in the multiplication routine you should add a collapse(2) directive.
Creating a matrix with an array of pointers means that the expression matB[k][j] dances all over memory. Allocate your matrices as a single array and then use i*N+j as an indexing expression. (Of course I would put that in a macro or so.)
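For illustration, a minimal sketch of that layout (alloc_flat and at are illustrative names, not part of the question's code):

#include <cstddef>

// One contiguous block replaces the array of row pointers.
double* alloc_flat(std::size_t rows, std::size_t cols)
{
    return new double[rows * cols](); // value-initialized to zero
}

// Element (i, j) lives at a single row-major offset.
inline double& at(double* m, std::size_t cols, std::size_t i, std::size_t j)
{
    return m[i * cols + j];
}

An access like matB[k][j] then becomes at(matB, colsB, k, j), and consecutive j values touch consecutive memory.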
A 1000x1000 matrix with a double (64-bit) element type requires 8 MB of data. When you multiply two matrices, you read 16 MB; when you also write the third matrix, you access 24 MB in total.
If the L3 cache is smaller than 24 MB, then RAM is the bottleneck. Maybe a single thread did not fully use its bandwidth, but once OpenMP is used the RAM bandwidth is fully used. In your case it had only 50% headroom for bandwidth.
The naive version is not using the cache well. You need to swap the order of the two inner loops to gain more caching:

loop i
    loop k
        loop j
            C[i][j] += A[i][k] * B[k][j]
Although incrementing C[i][j] no longer re-uses a register in this version, it re-uses the cache, which is more important here. If you do it, you should see roughly 100-200 milliseconds of computation time even single-threaded.
Also, if you need performance, don't allocate each row separately ("Allocate each row and zero initialize"); allocate the whole matrix at once so that it is not scattered in memory.
To add more threads efficiently, you can do sub-matrix multiplications to compute full matrix multiplication. Scan-line multiplication is not good for load-balancing between threads. When sub-matrices are multiplied, they give better load distribution due to caching and higher floating-point operations per element fetched from memory.
Edit:
Swapping order of loops also makes compiler able to vectorize the innermost loop because one of the input matrices becomes a constant during the innermost loop.
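Putting that together, here is a sketch of the multiply loop with the k/j swap applied (same variable names as the question). Note that a collapse(2) over i and k would be wrong in this form, because two threads could then update the same matC[i][j]; so this version parallelizes over rows only:

#pragma omp parallel for
for (int i = 0; i < rowsA; ++i)
{
    for (int k = 0; k < colsA; ++k)
    {
        const double aik = matA[i][k]; // constant across the innermost loop
        for (int j = 0; j < colsB; ++j)
        {
            matC[i][j] += aik * matB[k][j]; // contiguous access in B and C
        }
    }
}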

Why does the function return 0?

I have this function that should take a matrix, compare the diagonal elements of the matrix and find the smallest one.
Here it should compare y[0][1] and y[1][0].
#include <iostream>

int Min(int, int [][2]);

using namespace std;

int main() {
    int min, y[2][2];
    y[0][0] = 5;
    y[0][1] = 4;
    y[1][0] = -9;
    y[1][1] = 0;
    min = Min(2, y);
    cout << min;
    return 0;
}

int Min(int k, int x[][2]) {
    int min = x[0][0];
    for (int i = 0; i < k; i++) {
        if (x[i][i] < min) {
            min = x[i][i];
        }
    }
    return min;
}
It always returns 0. Why?
Here it should compare y[0][1] and y[1][0].
Your function goes through the diagonal of the matrix; it hence checks y[0][0] and y[1][1]. They are 5 and 0, so the result is zero, which is to be expected.
Here it should compare y[0][1] and y[1][0].
But that's not what you say here:
int min=x[0][0];
or here:
if (x[i][i] < min) {
    min = x[i][i];
}
Since i cannot be both 0 and 1 at the same time, this accesses x[0][0] and x[1][1].
And for those elements, 0 is the correct minimum.
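If, on the other hand, you really did mean to compare y[0][1] and y[1][0] (the anti-diagonal), a minimal sketch would flip the column index (MinAntiDiagonal is a hypothetical name; for the y in the question it returns -9):

int MinAntiDiagonal(int k, const int x[][2]) {
    int min = x[0][k - 1];            // x[0][1] when k == 2
    for (int i = 1; i < k; i++) {
        if (x[i][k - 1 - i] < min) {  // x[1][0] when k == 2
            min = x[i][k - 1 - i];
        }
    }
    return min;
}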
As a side note, your Min function can only work with matrices of size 2, so the k parameter is unnecessary (see the template sketch below). Multi-dimensional arrays are annoying like that.
To iterate over all items instead of just [0][0] and [1][1] you need to do:
for (int i = 0; i < k; i++)
{
    for (int j = 0; j < k; j++)
    {
        if (x[i][j] < min)
        {
            min = x[i][j];
        }
    }
}
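As for the side note that the k parameter is unnecessary: one way around the fixed-size restriction, as a sketch, is to let a template deduce the dimension from the array argument:

template <int K>
int Min(const int (&x)[K][K]) { // K is deduced at the call site
    int min = x[0][0];
    for (int i = 0; i < K; i++) {
        for (int j = 0; j < K; j++) {
            if (x[i][j] < min) {
                min = x[i][j];
            }
        }
    }
    return min;
}

With this version, min = Min(y); deduces K as 2, and the same code works for any square size known at compile time.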
int Min(int k, const int x[][2])
{
    int min = x[0][0];
    for (int i = 0; i < k; i++)
    {
        for (int j = 0; j < k; j++)
        {
            if (x[i][j] < min)
            {
                min = x[i][j];
            }
        }
    }
    return min;
}
Earlier it always showed zero because the column index was handled wrongly. The outer for loop iterates k times (first iteration at i=0, second at i=1), and on both iterations it assigns the same index to both row and column (i.e., x[0][0] and x[1][1]). It should instead visit x[0][0], x[0][1], x[1][0], x[1][1]. Earlier there were only two iterations (the outer for loop alone); with the nested loop there are now four iterations, assigning the appropriate index to both rows and columns the correct number of times.

Why am I getting the error message, expected unqualified-id before 'for'

I have to write code that does this for a university assignment. I have written the average function, but I'm getting an error message for the other function.
The DEVBOARD_readAccelerometer function will read x,y,z components of acceleration
Firstly, you will need to write a function:
int average(int *array, int nLen);
which returns the average value of an array of integer values. (To prevent overflow, it is suggested to use a long internal variable).
The Spirit Level function will run a loop. For each iteration the Z-component of gravitational acceleration will be sampled four times at 50ms intervals and stored in a suitably sized array.
Using the average() function you have written, determine the average value and analyze
int average(int* array, int nlen) { // assuming array is int
    long sum = 0L; // sum will be larger than an item, long for safety.
    for (int i = 0; i < nlen; i++) {
        sum += array[i];
    }
    return ((long)sum) / nlen;
}

float sum[3];
int j;
for (int j = 0; j < 3; j++) {
    i = DEVBOARD_readAccelerometer(int* xAccel, int* yAccel, int* zAccel);
    sum[j] = int * zAccel;
}
average(float* sum[j], float nlen);
printf("The average zcomponent is %f\n");
}
unqualified-id before for
You declared the variable j twice: once before the for loop and once inside the for definition; the first one is redundant. More importantly, the for loop and the calls around it are not inside any function. Statements at file scope are exactly what makes the compiler say "expected unqualified-id before 'for'", so wrap that code in a function.
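A sketch of what the fragment could look like once it lives inside a function. It assumes DEVBOARD_readAccelerometer takes pointers to the three outputs (as your own call suggests); spiritLevel and delay_ms are hypothetical names for the wrapper and for whatever board-specific delay routine you have:

void spiritLevel(void) {
    int zSamples[4];
    int xAccel, yAccel, zAccel;

    // Sample the Z component four times at 50 ms intervals.
    for (int j = 0; j < 4; j++) {
        DEVBOARD_readAccelerometer(&xAccel, &yAccel, &zAccel);
        zSamples[j] = zAccel;  // keep only the Z component
        delay_ms(50);          // hypothetical board delay routine
    }

    int zAverage = average(zSamples, 4);
    printf("The average z component is %d\n", zAverage); // pass the value
}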

C++ - Efficiently computing a vector-matrix product

I need to compute a product vector-matrix as efficiently as possible. Specifically, given a vector s and a matrix A, I need to compute s * A. I have a class Vector which wraps a std::vector and a class Matrix which also wraps a std::vector (for efficiency).
The naive approach (the one that I am using at the moment) is to have something like
Vector<T> timesMatrix(Matrix<T>& matrix)
{
    Vector<T> result(matrix.columns());
    // constructor that does a resize on the underlying std::vector
    for (unsigned int i = 0; i < vector.size(); ++i)
    {
        for (unsigned int j = 0; j < matrix.columns(); ++j)
        {
            result[j] += (vector[i] * matrix.getElementAt(i, j));
            // getElementAt accesses the appropriate entry
            // of the underlying std::vector
        }
    }
    return result;
}
It works fine and takes nearly 12000 microseconds. Note that the vector s has 499 elements, while A is 499 x 15500.
The next step was trying to parallelize the computation: if I have N threads then I can give each thread a part of the vector s and the "corresponding" rows of the matrix A. Each thread will compute a partial result (a Vector with as many entries as A has columns, 15500 here) and the final result will be their entry-wise sum.
First of all, in the class Matrix I added a method to extract some rows from a Matrix and build a smaller one:
Matrix<T> extractSomeRows(unsigned int start, unsigned int end)
{
    unsigned int rowsToExtract = end - start + 1;
    std::vector<T> tmp;
    tmp.reserve(rowsToExtract * numColumns);

    for (unsigned int i = start * numColumns; i < (end + 1) * numColumns; ++i)
    {
        tmp.push_back(matrix[i]);
    }

    return Matrix<T>(rowsToExtract, numColumns, tmp);
}
Then I defined a thread routine
void timesMatrixThreadRoutine
    (Matrix<T>& matrix, unsigned int start, unsigned int end, Vector<T>& newRow)
{
    // newRow is supposed to contain the partial result
    // computed by a thread
    newRow.resize(matrix.columns());
    for (unsigned int i = start; i < end + 1; ++i)
    {
        for (unsigned int j = 0; j < matrix.columns(); ++j)
        {
            newRow[j] += vector[i] * matrix.getElementAt(i - start, j);
        }
    }
}
And finally I modified the code of the timesMatrix method that I showed above:
Vector<T> timesMatrix(Matrix<T>& matrix)
{
    static const unsigned int NUM_THREADS = 4;
    unsigned int matRows = matrix.rows();
    unsigned int matColumns = matrix.columns();
    unsigned int rowsEachThread = vector.size() / NUM_THREADS;

    std::thread threads[NUM_THREADS];
    Vector<T> tmp[NUM_THREADS];
    unsigned int start, end;

    // all but the last thread
    for (unsigned int i = 0; i < NUM_THREADS - 1; ++i)
    {
        start = i * rowsEachThread;
        end = (i + 1) * rowsEachThread - 1;
        threads[i] = std::thread(&Vector<T>::timesMatrixThreadRoutine, this,
            matrix.extractSomeRows(start, end), start, end, std::ref(tmp[i]));
    }

    // last thread
    start = (NUM_THREADS - 1) * rowsEachThread;
    end = matRows - 1;
    threads[NUM_THREADS - 1] = std::thread(&Vector<T>::timesMatrixThreadRoutine, this,
        matrix.extractSomeRows(start, end), start, end, std::ref(tmp[NUM_THREADS - 1]));

    for (unsigned int i = 0; i < NUM_THREADS; ++i)
    {
        threads[i].join();
    }

    Vector<T> result(matColumns);
    for (unsigned int i = 0; i < NUM_THREADS; ++i)
    {
        result = result + tmp[i]; // the operator+ is overloaded
    }
    return result;
}
It still works but now it takes nearly 30000 microseconds, which is almost three times as much as before.
Am I doing something wrong? Do you think there is a better approach?
EDIT - using a "lightweight" VirtualMatrix
Following Ilya Ovodov's suggestion, I defined a class VirtualMatrix that wraps a T* matrixData, which is initialized in the constructor as
VirtualMatrix(Matrix<T>& m)
{
    numRows = m.rows();
    numColumns = m.columns();
    matrixData = m.pointerToData();
    // pointerToData() returns underlyingVector.data();
}
Then there is a method to retrieve a specific entry of the matrix:
inline T getElementAt(unsigned int row, unsigned int column)
{
    return *(matrixData + row * numColumns + column);
}
Now the execution time is better (approximately 8000 microseconds) but maybe there are some improvements to be made. In particular the thread routine is now
void timesMatrixThreadRoutine
    (VirtualMatrix<T>& matrix, unsigned int startRow, unsigned int endRow, Vector<T>& newRow)
{
    unsigned int matColumns = matrix.columns();
    newRow.resize(matColumns);
    for (unsigned int i = startRow; i < endRow + 1; ++i)
    {
        for (unsigned int j = 0; j < matColumns; ++j)
        {
            newRow[j] += (vector[i] * matrix.getElementAt(i, j));
        }
    }
}
and the really slow part is the one with the nested for loops. If I remove it, the result is obviously wrong, but it is "computed" in less than 500 microseconds. That is to say, passing the arguments now takes almost no time and the heavy part is really the computation.
According to you, is there any way to make it even faster?
Actually you make a partial copy of the matrix for each thread in extractSomeRows. It takes a lot of time.
Redesign it so that "some rows" becomes a virtual matrix pointing at data located in the original matrix.
Use vectorized instructions for your architecture by making it more explicit that you want to multiply in groups of four, i.e. SSE2+ on x86-64 and possibly NEON on ARM.
C++ compilers can often turn the loop into vectorized code if you explicitly make the operation happen on contiguous elements:
Simple and fast matrix-vector multiplication in C / C++
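As a sketch of that idea (illustrative names, float elements assumed), the s * A product can be written over raw pointers so that the innermost loop walks the result and one matrix row contiguously; most compilers can auto-vectorize this shape at -O3:

#include <cstddef>

void timesMatrixFlat(const float* s, const float* A, float* result,
                     std::size_t rows, std::size_t cols)
{
    for (std::size_t j = 0; j < cols; ++j)
        result[j] = 0.0f;

    for (std::size_t i = 0; i < rows; ++i) {
        const float si = s[i];           // one scalar reused for the whole row
        const float* row = A + i * cols; // contiguous row of A
        for (std::size_t j = 0; j < cols; ++j)
            result[j] += si * row[j];    // SIMD-friendly accumulation
    }
}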
There is also the option of using libraries made specifically for matrix multiplication. For larger matrices, it may be more efficient to use special implementations based on the Fast Fourier Transform, alternate algorithms like Strassen's algorithm, etc. In fact, your best bet would be to use a C library like this, and then wrap it in an interface that looks similar to a C++ vector.

How to speed up matrix multiplication in C++?

I'm performing matrix multiplication with this simple algorithm. To be more flexible, I used objects for the matrices which contain dynamically created arrays.
Comparing this solution to my first one with static arrays, it is 4 times slower. What can I do to speed up the data access? I don't want to change the algorithm.
matrix mult_std(matrix a, matrix b) {
    matrix c(a.dim(), false, false);
    for (int i = 0; i < a.dim(); i++)
        for (int j = 0; j < a.dim(); j++) {
            int sum = 0;
            for (int k = 0; k < a.dim(); k++)
                sum += a(i,k) * b(k,j);
            c(i,j) = sum;
        }
    return c;
}
EDIT
I corrected my question above! I added the full source code below and tried some of your advice:
swapped the k and j loop iterations -> performance improvement
declared dim() and operator()() as inline -> performance improvement
passing arguments by const reference -> performance loss! Why? So I don't use it.
The performance is now nearly the same as it was in the old program. Maybe there should be a bit more improvement.
But I have another problem: I get a memory error in the function mult_strassen(...). Why?
terminate called after throwing an instance of 'std::bad_alloc'
what(): std::bad_alloc
OLD PROGRAM
main.c http://pastebin.com/qPgDWGpW
c99 main.c -o matrix -O3
NEW PROGRAM
matrix.h http://pastebin.com/TYFYCTY7
matrix.cpp http://pastebin.com/wYADLJ8Y
main.cpp http://pastebin.com/48BSqGJr
g++ main.cpp matrix.cpp -o matrix -O3
EDIT
Here are some results: a comparison between the standard algorithm (std), the swapped order of the j and k loops (swap), and the blocked algorithm with block size 13 (block).
Speaking of speed-up, your function will be more cache-friendly if you swap the order of the k and j loop iterations:
matrix mult_std(matrix a, matrix b) {
    matrix c(a.dim(), false, false);
    for (int i = 0; i < a.dim(); i++)
        for (int k = 0; k < a.dim(); k++)
            for (int j = 0; j < a.dim(); j++) // swapped order
                c(i,j) += a(i,k) * b(k,j);
    return c;
}
That's because a k index on the inner-most loop will cause a cache miss in b on every iteration. With j as the inner-most index, both c and b are accessed contiguously, while a stays put.
Make sure that the members dim() and operator()() are declared inline, and that compiler optimization is turned on. Then play with options like -funroll-loops (on gcc).
How big is a.dim() anyway? If a row of the matrix doesn't fit in just a couple of cache lines, you'd be better off with a block access pattern instead of a full row at a time, as sketched below.
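For illustration, a sketch of such a blocked pattern on top of the same matrix interface; the block size is a tuning parameter (64 is only a guess), and c is assumed to be zero-initialized:

#include <algorithm> // std::min

const int n = a.dim();
const int B = 64; // tune so three B x B tiles fit in the cache
for (int ii = 0; ii < n; ii += B)
    for (int kk = 0; kk < n; kk += B)
        for (int jj = 0; jj < n; jj += B)
            // multiply one B x B tile; std::min handles the ragged edges
            for (int i = ii; i < std::min(ii + B, n); i++)
                for (int k = kk; k < std::min(kk + B, n); k++)
                    for (int j = jj; j < std::min(jj + B, n); j++)
                        c(i,j) += a(i,k) * b(k,j);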
You say you don't want to modify the algorithm, but what does that mean exactly?
Does unrolling the loop count as "modifying the algorithm"? What about using SSE/VMX whichever SIMD instructions are available on your CPU? What about employing some form of blocking to improve cache locality?
If you don't want to restructure your code at all, I doubt there's more you can do than the changes you've already made. Everything else becomes a trade-off of minor changes to the algorithm to achieve a performance boost.
Of course, you should still take a look at the asm generated by the compiler. That'll tell you much more about what can be done to speed up the code.
Use SIMD if you can. You absolutely have to use something like VMX registers if you do extensive vector math, assuming you are using a platform that is capable of doing so; otherwise you will incur a huge performance hit.
Don't pass complex types like matrix by value - use a const reference.
Don't call a function in each iteration - cache dim() outside your loops.
Although compilers typically optimize this efficiently, it's often a good idea to have the caller provide a matrix reference for your function to fill in, rather than returning a matrix by value. In some cases, this may avoid an expensive copy operation.
Here is my implementation of the fast simple multiplication algorithm for square float matrices (2D arrays). It should be a little faster than chrisaycock's code since it spares some increments.
static void fastMatrixMultiply(const int dim, float* dest, const float* srcA, const float* srcB)
{
    memset(dest, 0x0, dim * dim * sizeof(float));

    for (int i = 0; i < dim; i++) {
        for (int k = 0; k < dim; k++)
        {
            const float* a = srcA + i * dim + k;
            const float* b = srcB + k * dim;
            float* c = dest + i * dim;
            float* cMax = c + dim;
            while (c < cMax)
            {
                *c++ += (*a) * (*b++);
            }
        }
    }
}
Pass the parameters by const reference to start with:
matrix mult_std(matrix const& a, matrix const& b) {
To give you more details we need to know the details of the other methods used.
And to answer why the original method is 4 times faster we would need to see the original method.
The problem is undoubtedly yours as this problem has been solved a million times before.
Also when asking this type of question ALWAYS provide compilable source with appropriate inputs so we can actually build and run the code and see what is happening.
Without the code we are just guessing.
Edit
After fixing the main bug in the original C code (a buffer overrun),
I have updated the code to run the tests side by side in a fair comparison:
// INCLUDES -------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

// DEFINES -------------------------------------------------------------------
// The original problem was here. MAXDIM was 500, but we were using arrays
// that had a size of 512 in each dimension. This caused a buffer overrun that
// clobbered the dim variable and reset it to 0. As a result, the
// multiplication loop fell out before it had finished (as the loop was
// controlled by this global variable).
//
// Everything now uses the MAXDIM variable directly.
// This of course gives the C code an advantage, as the compiler can optimize
// the loop explicitly for the fixed-size arrays and thus unroll loops more
// efficiently.
#define MAXDIM 512
#define RUNS 10

// MATRIX FUNCTIONS ----------------------------------------------------------
class matrix
{
    public:
        matrix(int dim)
            : dim_(dim)
        {
            data_ = new int[dim_ * dim_];
        }

        inline int dim() const {
            return dim_;
        }
        inline int& operator()(unsigned row, unsigned col) {
            return data_[dim_ * row + col];
        }
        inline int operator()(unsigned row, unsigned col) const {
            return data_[dim_ * row + col];
        }

    private:
        int dim_;
        int* data_;
};

// ---------------------------------------------------
void random_matrix(int (&matrix)[MAXDIM][MAXDIM]) {
    for (int r = 0; r < MAXDIM; r++)
        for (int c = 0; c < MAXDIM; c++)
            matrix[r][c] = rand() % 100;
}

void random_matrix_class(matrix& matrix) {
    for (int r = 0; r < matrix.dim(); r++)
        for (int c = 0; c < matrix.dim(); c++)
            matrix(r, c) = rand() % 100;
}

template<typename T, typename M>
float run(T f, M const& a, M const& b, M& c)
{
    float time = 0;
    for (int i = 0; i < RUNS; i++) {
        struct timeval start, end;
        gettimeofday(&start, NULL);
        f(a, b, c);
        gettimeofday(&end, NULL);

        long s = start.tv_sec * 1000 + start.tv_usec / 1000;
        long e = end.tv_sec * 1000 + end.tv_usec / 1000;
        time += e - s;
    }
    return time / RUNS;
}

// SEQ MULTIPLICATION ----------------------------------------------------------
void mult_seq(int const(&a)[MAXDIM][MAXDIM], int const(&b)[MAXDIM][MAXDIM], int (&z)[MAXDIM][MAXDIM]) {
    for (int r = 0; r < MAXDIM; r++) {
        for (int c = 0; c < MAXDIM; c++) {
            z[r][c] = 0;
            for (int i = 0; i < MAXDIM; i++)
                z[r][c] += a[r][i] * b[i][c];
        }
    }
}

void mult_std(matrix const& a, matrix const& b, matrix& z) {
    for (int r = 0; r < a.dim(); r++) {
        for (int c = 0; c < a.dim(); c++) {
            z(r, c) = 0;
            for (int i = 0; i < a.dim(); i++)
                z(r, c) += a(r, i) * b(i, c);
        }
    }
}

// MAIN ------------------------------------------------------------------------
using namespace std;
int main(int argc, char* argv[]) {
    srand(time(NULL));

    int matrix_a[MAXDIM][MAXDIM];
    int matrix_b[MAXDIM][MAXDIM];
    int matrix_c[MAXDIM][MAXDIM];
    random_matrix(matrix_a);
    random_matrix(matrix_b);
    printf("%d ", MAXDIM);
    printf("%f \n", run(mult_seq, matrix_a, matrix_b, matrix_c));

    matrix a(MAXDIM);
    matrix b(MAXDIM);
    matrix c(MAXDIM);
    random_matrix_class(a);
    random_matrix_class(b);
    printf("%d ", MAXDIM);
    printf("%f \n", run(mult_std, a, b, c));

    return 0;
}
The results now:
$ g++ t1.cpp
$ ./a.exe
512 1270.900000
512 3308.800000
$ g++ -O3 t1.cpp
$ ./a.exe
512 284.900000
512 622.000000
From this we see the C code is about twice as fast as the C++ code when fully optimized. I cannot see the reason in the code.
I'm taking a wild guess here, but if dynamically allocating the matrices makes such a huge difference, maybe the problem is fragmentation. Again, I have no idea how the underlying matrix is implemented.
Why don't you allocate the memory for the matrices by hand, ensuring it's contiguous, and build the pointer structure yourself?
Also, does the dim() method have any extra complexity? I would declare it inline, too.
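A minimal sketch of that suggestion (make_matrix and free_matrix are illustrative names): one contiguous block holds all the elements, and a separately built array of row pointers keeps the mat[i][j] syntax working:

#include <cstddef>

double** make_matrix(std::size_t rows, std::size_t cols)
{
    double* block = new double[rows * cols](); // contiguous, zero-initialized
    double** mat = new double*[rows];
    for (std::size_t i = 0; i < rows; ++i)
        mat[i] = block + i * cols; // row i points into the block
    return mat;
}

void free_matrix(double** mat)
{
    delete[] mat[0]; // releases the whole contiguous block
    delete[] mat;    // releases the row-pointer array
}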