I'm looking to sort a large 3D array along the z-axis.
Example array is X x Y x Z (1000x1000x5)
I'd like to sort along the z-axis so I'd perform 1000x1000 sorts for 5 element along the z-axis.
Edit Update: Tried an attempt to use thrust below. It's functional and I'd store the output back, but this is very slow since I'm sorting 5 elements at a time per (x,y) location:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
int main(){
int x = 1000, y = 1000, z = 5;
float*** unsorted_cube = new float** [x];
for (int i = 0; i < x; i++)
{
// Allocate memory blocks for
// rows of each 2D array
unsorted_cube[i] = new float* [y];
for (int j = 0; j < y; j++)
{
// Allocate memory blocks for
// columns of each 2D array
unsorted_cube[i][j] = new float[z];
}
}
for (int i = 0; i < x; i++)
{
for (int j = 0; j < y; j++)
{
unsorted_cube[i][j][0] = 4.0f;
unsorted_cube[i][j][1] = 3.0f;
unsorted_cube[i][j][2] = 1.0f;
unsorted_cube[i][j][3] = 5.0f;
unsorted_cube[i][j][4] = 2.0f;
}
}
for (int i = 0; i < 5; i++)
{
printf("unsorted_cube first 5 elements to sort at (0,0): %f\n", unsorted_cube[0][0][i]);
}
float* temp_input;
float* temp_output;
float* raw_ptr;
float raw_ptr_out[5];
cudaMalloc((void**)&raw_ptr, N_Size * sizeof(float));
for (int i = 0; i < x; i++)
{
for (int j = 0; j < y; j++)
{
temp_input[0] = unsorted_cube[i][j][0];
temp_input[1] = unsorted_cube[i][j][1];
temp_input[2] = unsorted_cube[i][j][2];
temp_input[3] = unsorted_cube[i][j][3];
temp_input[4] = unsorted_cube[i][j][4];
cudaMemcpy(raw_ptr, temp_input, 5 * sizeof(float), cudaMemcpyHostToDevice);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(raw_ptr);
thrust::sort(dev_ptr, dev_ptr + 5);
thrust::host_vector<float> host_vec(5);
thrust::copy(dev_ptr, dev_ptr + 5, raw_ptr_out);
if (i == 0 && j == 0)
{
for (int i = 0; i < 5; i++)
{
temp_output[i] = raw_ptr_out[i];
}
printf("sorted_cube[0,0,0] : %f\n", temp_output[0]);
printf("sorted_cube[0,0,1] : %f\n", temp_output[1]);
printf("sorted_cube[0,0,2] : %f\n", temp_output[2]);
printf("sorted_cube[0,0,3] : %f\n", temp_output[3]);
printf("sorted_cube[0,0,4] : %f\n", temp_output[4]);
}
}
}
}
Assuming that the data is in a format where the values in each xy-plane are consecutive in memory: data[((z * y_length) + y) * x_length + x] (which is be best for coalescing memory accesses on the GPU, as well)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/zip_iterator.h>
void sort_in_z_dir(thrust::device_vector<float> &data,
int x_length, int y_length) { // z_length == 5
auto z_stride = x_length * y_length;
thrust::for_each(
thrust::make_zip_iterator(thrust::make_tuple(
data.begin(),
data.begin() + z_stride,
data.begin() + 2 * z_stride,
data.begin() + 3 * z_stride,
data.begin() + 4 * z_stride)),
thrust::make_zip_iterator(thrust::make_tuple(
data.begin() + z_stride,
data.begin() + 2 * z_stride,
data.begin() + 3 * z_stride,
data.begin() + 4 * z_stride,
data.begin() + 5 * z_stride)),
[] __host__ __device__
(thrust::tuple<float, float, float, float, float> &values) {
float local_data[5] = {thrust::get<0>(values),
thrust::get<1>(values),
thrust::get<2>(values),
thrust::get<3>(values),
thrust::get<4>(values)};
thrust::sort(thrust::seq, local_data, local_data + 5);
thrust::get<0>(values) = local_data[0];
thrust::get<1>(values) = local_data[1];
thrust::get<2>(values) = local_data[2];
thrust::get<3>(values) = local_data[3];
thrust::get<4>(values) = local_data[4];
});
}
This solution is certainly very ugly in terms of hardcoding z_length. One can use some C++ template-"magic" to make z_length into a template parameter, but this seemed to be overkill for this answer about Thrust.
See Convert std::tuple to std::array C++11 and How to convert std::array to std::tuple? for examples on interfacing between arrays and tuples.
The good thing about this solution that up to the sorting algorithm itself it should be pretty much optimal performance-wise. I don't know if thrust::sort is optimized for such small input arrays, but you can replace it by any self written sorting algorithm as I proposed in the comments.
If you want to be able to use different z_length without all this hassle, you might prefer this solution, which sorts in global memory, which is far from optimal, and feels a bit hacky because it uses Thrust pretty much only to launch a kernel. Here you want to have the data ordered the other way around: data[((x * y_length) + y) * z_length + z]
#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
void sort_in_z_dir_alternative(thrust::device_vector<float> &data,
int x_length, int y_length, int z_length) {
int n_threads = x_length * y_length;
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n_threads),
[ddata = thrust::raw_pointer_cast(data.data()), z_length] __host__ __device__ (int idx) {
thrust::sort(thrust::seq,
ddata + z_length * idx,
ddata + z_length * (idx + 1));
});
}
If you are ok with z_length being a template parameter, this might be a solution that combines the best from both worlds (data format like in the first example):
#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
template <int z_length>
void sort_in_z_dir_middle_ground(thrust::device_vector<float> &data,
int x_length, int y_length) {
int n_threads = x_length * y_length; // == z_stride
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n_threads),
[ddata = thrust::raw_pointer_cast(data.data()),
z_length, n_threads] __host__ __device__ (int idx) {
float local_data[z_length];
#pragma unroll
for (int i = 0; i < z_length; ++i) {
local_data[i] = ddata[idx + i * n_threads];
}
thrust::sort(thrust::seq,
local_data,
local_data + z_length);
#pragma unroll
for (int i = 0; i < z_length; ++i) {
ddata[idx + i * n_threads] = local_data[i];
}
});
}
Related
I have a pointer that points to the beginning of a 1000+ elements array that is initialized as below:
int numElements = 1200;
auto data = std::unique_ptr<float>{new float[numElements]};
Now I want to 'reshape' it into something like a (20,30,20) tensor, so I can access it the way I want (I can still read while it's 1-D as well but it feels weird). I want to access like this:
data[1][10][12] = 1337.0f;
Is there an efficient way of doing this (fast and short code)?
In the meantime, this is how I do it...
#include <iostream>
using std::cout;
using std::endl;
#include <vector>
using std::vector;
size_t get_index(const size_t x, const size_t y, const size_t z, const size_t x_res, const size_t y_res, const size_t z_res)
{
return z * y_res * x_res + y * x_res + x;
}
int main(void)
{
const size_t x_res = 10;
const size_t y_res = 10;
const size_t z_res = 10;
// Use new[] to allocate, and memset to clear
//float* vf = new float[x_res * y_res * z_res];
//memset(vf, 0, sizeof(float) * x_res * y_res * z_res);
// Better yet, use a vector
vector<float> vf(x_res*y_res*z_res, 0.0f);
for (size_t x = 0; x < x_res; x++)
{
for (size_t y = 0; y < y_res; y++)
{
for (size_t z = 0; z < z_res; z++)
{
size_t index = get_index(x, y, z, x_res, y_res, z_res);
// Do stuff with vf[index] here...
}
}
}
// Make sure to deallocate memory
// delete[] vf;
return 0;
}
I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N) and dtype=float32, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy broadcasting, this simply writes as:
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
#define X(i, j) _X[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
Compiling using gcc 7.3.0 with -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
float* pairwise_sub_better( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
float* pairwise_sub_simd( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficials:
float* pairwise_sub_blocks( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some efforts with some optimized C++ code, I can't come anywhere close the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>
#define X(i, j) _x[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
float* pairwise_sub_better( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
float* pairwise_sub_simd( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
float* pairwise_sub_blocks( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
int main()
{
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
{
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
}
printf("Best of 5 = %f ms\n", best / 1000);
free( _x );
return 0;
}
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine:
Implementation
Time
numpy
88 ms
C++ naive
194 ms
C++ better
195 ms
C++ SIMD
178 ms
C++ blocked
258 ms
C++ blocked (gcc 8.3.1)
217 ms
As pointed out by some of the comments numpy uses SIMD in its implementation and it does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation then I get a better time compared to numpy even with the scaler version(that is the one without any optimizations).
Also in terms of SIMD and why your implementation does not perform much better than the scaler is because your memory access patterns are not ideal for SIMD usage - you do memcopy and you load into SIMD registers from locations that are far apart from each other - e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the SIMD prefetcher.
There is also a mistake in how you load the SIMD registers(if I understood correctly what you're trying to compute): a 256 bit SIMD register can load 8 single-precision floating-point numbers 8 * 32 = 256, but in your loop you jump k by "256/sizeof(float)" which is 256/4 = 64; _x and _res are float pointers and the SIMD intrinsics expect also float pointers as arguments so instead of reading all elements from those lines every 8 floats you read them every 64 floats.
The computation can be optimized further by changing the access patterns but also by observing that you repeat some computations: e.g. when iterating with line0 as a base you compute line0 - line1 but at some future time, when iterating with line1 as a base, you need to compute line1 - line0 which is basically -(line0 - line1), that is for each line after line0 a lot of results could be reused from previous computations.
A lot of times SIMD usage or parallelization requires one to change how data is accessed or reasoned about in order to provide meaningful improvements.
Here is what I have done as a first step based on your initial implementation and it is faster than the numpy(don't mind the OpenMP stuff as it's not how its supposed to be done, I just wanted to see how it behaves trying the naive way).
C++
Time scaler version: 55 ms
Time SIMD version: 53 ms
**Time SIMD 2 version: 33 ms**
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms
Python numpy
>> best of 5 = 88.794 ms
#include <cstdlib>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>
using namespace std;
float* pairwise_sub_naive (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++)
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
return output;
}
float* pairwise_sub_simd (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_simd_2 (const float* input, float* output, int n)
{
float* line_buffer = (float*) aligned_alloc(32, n * sizeof(float));
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
}
memcpy(output + outidx * n, line_buffer, n);
}
}
return output;
}
float* pairwise_sub_simd_3 (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = (idxi + j) * n;
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_openmp (const float* input, float* output, int n)
{
int i, j;
#pragma omp parallel for private(j)
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
const int idxi = i * n;
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
/*for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
for (int k = 0; k < n; k++)
{
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
}*/
return output;
}
int main ()
{
constexpr size_t n = 512;
constexpr size_t input_size = n * n;
constexpr size_t output_size = n * n * n;
float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));
iota(input, input + input_size, float(0.0));
fill(output, output + output_size, float(0.0));
iota(input_simd, input_simd + input_size, float(0.0));
fill(output_simd, output_simd + output_size, float(0.0));
iota(input_par, input_par + input_size, float(0.0));
fill(output_par, output_par + output_size, float(0.0));
std::chrono::milliseconds best_scaler{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_naive(input, output, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_scaler)
{
best_scaler = duration;
}
}
cout << "Time scaler version: " << best_scaler.count() << " ms\n";
std::chrono::milliseconds best_simd{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd)
{
best_simd = duration;
}
}
cout << "Time SIMD version: " << best_simd.count() << " ms\n";
std::chrono::milliseconds best_simd_2{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_2(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_2)
{
best_simd_2 = duration;
}
}
cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";
std::chrono::milliseconds best_simd_3{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_3(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_3)
{
best_simd_3 = duration;
}
}
cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";
std::chrono::milliseconds best_par{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_openmp(input_par, output_par, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_par)
{
best_par = duration;
}
}
cout << "Time OpenMP version: " << best_par.count() << " ms\n";
cout << "Verification\n";
if (equal(output, output + output_size, output_simd))
{
cout << "PASSED\n";
}
else
{
cout << "FAILED\n";
}
return 0;
}
Edit: Small correction as there was a wrong call related to the second version of SIMD implementation.
As you can see now, the second implementation is the fastest as it behaves the best from the point of view of the locality of reference of the cache. Examples 2 and 3 of SIMD implementations are there to illustrate for you how changing memory access patterns to influence the performance of your SIMD optimizations.
To summarize(knowing that I'm far from being complete in my advice) be mindful of your memory access patterns and of the loads and stores to\from the SIMD unit; the SIMD is a different hardware unit inside the processor's core so there is a penalty in shuffling data back and forth, hence when you load a register from memory try to do as many operations as possible with that data and do not be too eager to store it back(of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available and if you load too many then they will "spill", that is they will be stored back to temporary locations in main memory behind the scenes killing all your gains. SIMD optimization, it's a true balance act!
There is some effort to put a cross-platform intrinsics wrapper into the standard(I developed myself a closed source one in my glorious past) and even it's far from being complete, it's worth taking a look at(read the accompanying papers if you're truly interested to learn how SIMD works).
https://github.com/VcDevel/std-simd
This is a complement to the answer posted by #celakev .
I think I finally got to understand what exactly was the issue. The issue was not about allocating the memory in the main function that does the computation.
What was actually taking time is to access new (fresh) memory. I believe that the malloc call returns pages of memory which are virtual, i.e. that does not corresponds to actual physical memory -- until it is explicitly accessed. What actually takes time is the process of allocating physical memory on the fly (which I think is OS-level) when it is accessed in the function code.
Here is a proof. Consider the two following trivial functions:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
float* just_alloc( size_t N )
{
return (float*) aligned_alloc( 32, sizeof(float)*N );
}
void just_fill( float* _arr, size_t N )
{
for (size_t i = 0; i < N; i++)
_arr[i] = 1;
}
#define Time( code_to_benchmark, cleanup_code ) \
do { \
double best = 9e9; \
for( int i = 0; i < 5; i++) { \
struct timespec start, stop; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
code_to_benchmark; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
printf("Time[%d] = %f ms\n", i, t); \
if (t < best) best = t; \
cleanup_code; \
} \
printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
} while(0)
int main()
{
const size_t N = 512;
Time( float* arr = just_alloc(N*N*N), free(arr) );
float* arr = just_alloc(N*N*N);
Time( just_fill(arr, N*N*N), ; );
free(arr);
return 0;
}
I get the following timings, which I now detail for each of the calls:
Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms
Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms
As you can see, allocating memory is blazingly fast, but the first time that the memory is accessed, it is 5 times slower than the other times. So, basically the reason that my code was slow was because i was each time reallocating fresh memory that had no physical address yet. (Correct me if I'm wrong but I think that's the gist of it!)
A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ a high-level algebra manipulation capability and use SIMD under the hood. Just like numpy.
Here is the implementation
#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
#include <Eigen/Dense>
auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
for (int k = 0; k < input.cols(); ++k)
output[k] = input
// subtract matrix with repeated k-th column
- input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
}
int main() {
constexpr size_t n = 512;
// allocate input and output
Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
std::vector<Eigen::MatrixXf> output(n);
std::chrono::milliseconds best_eigen{100000};
for (int i = 0; i < 5; ++i) {
auto start = std::chrono::high_resolution_clock::now();
pairwise_eigen(input, output);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
if (duration < best_eigen)
best_eigen = duration;
}
std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";
return 0;
}
The full benchmark tests suggested by #celavek on my system are
Time scaler version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms
Time Eigen version: 76 ms
Numpy >> best of 5 = 118.489 ms
Whit Eigen there is still a noticeable improvement with respect to Numpy, but not so impressive compared to the "raw" implementations (there is certainly some overhead).
An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines
// inside the pairwise method
for (int k = 0; k < input.cols(); ++k)
output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
// at allocation time
std::vector<Eigen::MatrixXf> output(n, input);
This pushes the best of 5 down to 60 ms.
I have a big m-by-n chunk of memory that can be thought of m vectors of length n. I would now like to loop over all vectors and modify them. A double for loop would do, but I found that Eigen::VectorXds are faster than hand-written loops. The following works (with dummy data a):
#include <Eigen/Core>
#include <vector>
int main() {
const int m = 100;
const int n = 70;
std::vector<double> a(m*n);
auto data = a.data();
for (int i = 1; i < m; i++) {
auto r_i1 = Eigen::Map<Eigen::VectorXd>(&data[(i-1)*n], n);
auto r_i = Eigen::Map<Eigen::VectorXd>(&data[i*n], n);
auto x = r_i + r_i1;
auto z = x - r_i;
auto y = (r_i - (x-z)) + (r_i1 - z);
r_i = x;
r_i1 = y;
}
return EXIT_SUCCESS;
}
Now, I'm wondering if it's possible to speed this up even further. It seems less than ideal to me, for example, to recreate Eigen::Maps from data chunks in each individual step.
Is it possible create a bunch of Eigen::VectorXds from a large chunk of memory at once?
Following #AviGinsburg's suggestion, treating the block as an Eigen::MatrixXd works. Watch out for the column-major ordering in Eigen!
#include <Eigen/Core>
#include <vector>
#include <iostream>
int main() {
const int m = 100;
const int n = 70;
std::vector<double> a(m*n);
auto data = a.data();
auto r = Eigen::Map<Eigen::MatrixXd>(&data[0], n, m);
for (int i = 1; i < m; i++) {
auto x = r.col(i) + r.col(i-1);
auto z = x - r.col(i);
auto y = (r.col(i) - (x-z)) + (r.col(i-1) - z);
r.col(i) = x;
r.col(i-1) = y;
}
return EXIT_SUCCESS;
}
I want to speed up this nested for loop, just start learn CUDA, how could I use CUDA to parallel this c++ code ?
#define PI 3.14159265
using namespace std;
int main()
{
int nbint = 2;
int hits = 20;
int nbinp = 2;
float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
float x[20],y[20],z[20],a[20],t[20];
for (int i = 0; i < hits; ++i)
{
x[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
y[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
z[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
a[i] = rand() / (float)(RAND_MAX / 100);
}
float maxforall = 1e-6;
float theta0;
float phi0;
for (int i = 0; i < nbint; i++)
{
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
_phi = (0.5 + j)*delta / _theta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
{
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
}
qsort(t, 0, hits - 1);
float max = t[0];
for (int k = 0; k < hits; k++)
{
if (max < t[k])
max = t[k];
}
if (max > maxforall)
{
maxforall = max;
}
}
}
return 0;
}
I want to put innermost for loop and the sort part(maybe the whole nested loop) into parallel. After sort those array I found the maximum of all arrays. I use maximum to simplify the code. The reason I need sort is that maximum represent
here is a continuous time information(all arrays contain time information). The sort part make those time from lowest to highest. Then I compare the a specific time interval(not a single value). The compare process almost like I choose maximum but with a continuous interval not a single value.
Your 3 nested loops calculate nbint*nbinp*hits values. Since each of those values is independent from each other, all values can be calculated in parallel.
You stated in your comments that you have a commutative and associative "filter condition" which reduces the output to a single scalar value. This can be exploited to avoid sorting and storing the temporary values. Instead, we can calculate the values on-the-fly and then apply a parallel reduction to determine the end result.
This can be done in "raw" CUDA, below I implemented this idea using thrust. The main idea is to run grid_op nbint*nbinp*hits times in parallel. In order to find out the three original "loop indices" from the single scalar index which is passed to grid_op the algorithm from this SO question is used.
thrust::transform_reduce performs the on-the-fly transformation and the subsequent parallel reduction (here thrust::maximum is used as a substitute).
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>
thrust::host_vector<float> random_vector(const size_t N)
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for(size_t i = 0; i < N; i++) {
temp[i] = u01(rng);
}
return temp;
}
// ### END utility for demo ####
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
return thrust::make_zip_iterator(thrust::make_tuple(its...));
}
template <typename ZipIterator>
class grid_op
{
public:
grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}
__host__ __device__
float operator()(std::size_t index) const
{
const auto coords = unflatten_3d_index(index, dim1, dim2);
const auto values = zipIt[thrust::get<2>(coords)];
const float delta = 5;
const float _theta = (0.5f + thrust::get<0>(coords))*delta;
const float _phi = (0.5f + thrust::get<1>(coords))*delta / _theta;
const float _l = sin(_theta* M_PI / 180.0)*cos(_phi* M_PI / 180.0);
const float _m = sin(_theta* M_PI / 180.0)*sin(_phi* M_PI / 180.0);
const float _n = cos(_theta* M_PI / 180.0);
const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values));
return (thrust::get<3>(values) - _k);
}
private:
__host__ __device__
thrust::tuple<std::size_t, std::size_t, std::size_t>
unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
{
// taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index
std::size_t x = index % dim1;
std::size_t y = ( ( index - x ) / dim1 ) % dim2;
std::size_t z = ( ( index - y * dim1 - x ) / (dim1 * dim2) );
return thrust::make_tuple(x,y,z);
}
ZipIterator zipIt;
std::size_t dim1;
std::size_t dim2;
};
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
{
return grid_op<ZipIterator>(zipIt, dim1, dim2);
}
int main()
{
const int nbint = 3;
const int nbinp = 4;
const int hits = 20;
const std::size_t N = nbint * nbinp * hits;
thrust::device_vector<float> d_x = random_vector(hits);
thrust::device_vector<float> d_y = random_vector(hits);
thrust::device_vector<float> d_z = random_vector(hits);
thrust::device_vector<float> d_a = random_vector(hits);
auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
auto countingIt = thrust::counting_iterator<std::size_t>(0);
auto unary_op = make_grid_op(zipIt, nbint, nbinp);
auto binary_op = thrust::maximum<float>();
const float init = 0;
float max = thrust::transform_reduce(
countingIt, countingIt+N,
unary_op,
init,
binary_op
);
std::cout << "max = " << max << std::endl;
}
Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
We don’t allow questions seeking recommendations for books, tools, software libraries, and more. You can edit the question so it can be answered with facts and citations.
Closed 6 years ago.
Improve this question
Does anyone know if there is any free and open source library that has implemented these two functions the way they are defined in matlab?
Thanks
FFTHIFT / IFFTSHIFT is a fancy way of doing CIRCSHIFT.
You can verify that FFTSHIFT can be rewritten as CIRCSHIFT as following.
You can define macros in C/C++ to punt FFTSHIFT to CIRCSHIFT.
A = rand(m, n);
mm = floor(m / 2);
nn = floor(n / 2);
% All three of the following should provide zeros.
circshift(A,[mm, nn]) - fftshift(A)
circshift(A,[mm, 0]) - fftshift(A, 1)
circshift(A,[ 0, nn]) - fftshift(A, 2)
Similar equivalents can be found for IFFTSHIFT.
Circular shift can be implemented very simply with the following code (Can be improved with parallel versions ofcourse).
template<class ty>
void circshift(ty *out, const ty *in, int xdim, int ydim, int xshift, int yshift)
{
for (int i = 0; i < xdim; i++) {
int ii = (i + xshift) % xdim;
for (int j = 0; j < ydim; j++) {
int jj = (j + yshift) % ydim;
out[ii * ydim + jj] = in[i * ydim + j];
}
}
}
And then
#define fftshift(out, in, x, y) circshift(out, in, x, y, (x/2), (y/2))
#define ifftshift(out, in, x, y) circshift(out, in, x, y, ((x+1)/2), ((y+1)/2))
This was done a bit impromptu. Bear with me if there are any formatting / syntactical problems.
Possible this code may help. It perform fftshift/ifftshift only for 1D array within one buffer. Algorithm of forward and backward fftshift for even number of elements is fully identical.
void swap(complex *v1, complex *v2)
{
complex tmp = *v1;
*v1 = *v2;
*v2 = tmp;
}
void fftshift(complex *data, int count)
{
int k = 0;
int c = (int) floor((float)count/2);
// For odd and for even numbers of element use different algorithm
if (count % 2 == 0)
{
for (k = 0; k < c; k++)
swap(&data[k], &data[k+c]);
}
else
{
complex tmp = data[0];
for (k = 0; k < c; k++)
{
data[k] = data[c + k + 1];
data[c + k + 1] = data[k + 1];
}
data[c] = tmp;
}
}
void ifftshift(complex *data, int count)
{
int k = 0;
int c = (int) floor((float)count/2);
if (count % 2 == 0)
{
for (k = 0; k < c; k++)
swap(&data[k], &data[k+c]);
}
else
{
complex tmp = data[count - 1];
for (k = c-1; k >= 0; k--)
{
data[c + k + 1] = data[k];
data[k] = data[c + k];
}
data[c] = tmp;
}
}
UPDATED:
Also FFT library (including fftshift operations) for arbitrary points number could be found in Optolithium (under the OptolithiumC/libs/fourier)
Normally, centering the FFT is done with v(k)=v(k)*(-1)**k in
the time domain. Shifting in the frequency domain is a poor substitute, for
mathematical reasons and for computational efficiency.
See pp 27 of:
http://show.docjava.com/pub/document/jot/v8n6.pdf
I am not sure why Matlab documentation does it the way they do,
they give no technical reference.
Or you can do it yourself by typing type fftshift and recoding that in C++. It's not that complicated of Matlab code.
Edit: I've noticed that this answer has been down-voted a few times recently and commented on in a negative way. I recall a time when type fftshift was more revealing than the current implementation, but I could be wrong. If I could delete the answer, I would as it seems no longer relevant.
Here is a version (courtesy of Octave) that implements it without
circshift.
I tested the code provided here and made an example project to test them. For 1D code one can simply use std::rotate
template <typename _Real>
static inline
void rotshift(complex<_Real> * complexVector, const size_t count)
{
int center = (int) floor((float)count/2);
if (count % 2 != 0) {
center++;
}
// odd: 012 34 changes to 34 012
std::rotate(complexVector,complexVector + center,complexVector + count);
}
template <typename _Real>
static inline
void irotshift(complex<_Real> * complexVector, const size_t count)
{
int center = (int) floor((float)count/2);
// odd: 01 234 changes to 234 01
std::rotate(complexVector,complexVector +center,complexVector + count);
}
I prefer using std::rotate over the code from Alexei due to its simplicity.
For 2D it gets more complicated. For even numbers it is basically a flip left right and flip upside down. For odd it is the circshift algorithm:
// A =
// 1 2 3
// 4 5 6
// 7 8 9
// fftshift2D(A)
// 9 | 7 8
// --------------
// 3 | 1 2
// 6 | 4 5
// ifftshift2D(A)
// 5 6 | 4
// 8 9 | 7
// --------------
// 2 3 | 1
Here I implemented the circshift code with an interface using only one array for in and output. For even numbers only a single array is required, for odd numbers a second array is temporarily created and copied back to the input array. This causes a performance decrease because of the additional time for copying the array.
template<class _Real>
static inline
void fftshift2D(complex<_Real> *data, size_t xdim, size_t ydim)
{
size_t xshift = xdim / 2;
size_t yshift = ydim / 2;
if ((xdim*ydim) % 2 != 0) {
// temp output array
std::vector<complex<_Real> > out;
out.resize(xdim * ydim);
for (size_t x = 0; x < xdim; x++) {
size_t outX = (x + xshift) % xdim;
for (size_t y = 0; y < ydim; y++) {
size_t outY = (y + yshift) % ydim;
// row-major order
out[outX + xdim * outY] = data[x + xdim * y];
}
}
// copy out back to data
copy(out.begin(), out.end(), &data[0]);
}
else {
// in and output array are the same,
// values are exchanged using swap
for (size_t x = 0; x < xdim; x++) {
size_t outX = (x + xshift) % xdim;
for (size_t y = 0; y < yshift; y++) {
size_t outY = (y + yshift) % ydim;
// row-major order
swap(data[outX + xdim * outY], data[x + xdim * y]);
}
}
}
}
template<class _Real>
static inline
void ifftshift2D(complex<_Real> *data, size_t xdim, size_t ydim)
{
size_t xshift = xdim / 2;
if (xdim % 2 != 0) {
xshift++;
}
size_t yshift = ydim / 2;
if (ydim % 2 != 0) {
yshift++;
}
if ((xdim*ydim) % 2 != 0) {
// temp output array
std::vector<complex<_Real> > out;
out.resize(xdim * ydim);
for (size_t x = 0; x < xdim; x++) {
size_t outX = (x + xshift) % xdim;
for (size_t y = 0; y < ydim; y++) {
size_t outY = (y + yshift) % ydim;
// row-major order
out[outX + xdim * outY] = data[x + xdim * y];
}
}
// copy out back to data
copy(out.begin(), out.end(), &data[0]);
}
else {
// in and output array are the same,
// values are exchanged using swap
for (size_t x = 0; x < xdim; x++) {
size_t outX = (x + xshift) % xdim;
for (size_t y = 0; y < yshift; y++) {
size_t outY = (y + yshift) % ydim;
// row-major order
swap(data[outX + xdim * outY], data[x + xdim * y]);
}
}
}
}
Notice: There are better answers provided, I just keep this here for a while for... I do not know what.
Try this:
template<class T> void ifftShift(T *out, const T* in, size_t nx, size_t ny)
{
const size_t hlen1 = (ny+1)/2;
const size_t hlen2 = ny/2;
const size_t shft1 = ((nx+1)/2)*ny + hlen1;
const size_t shft2 = (nx/2)*ny + hlen2;
const T* src = in;
for(T* tgt = out; tgt < out + shft1 - hlen1; tgt += ny, src += ny) { // (nx+1)/2 times
copy(src, src+hlen1, tgt + shft2); //1->4
copy(src+hlen1, src+ny, tgt+shft2-hlen2); } //2->3
src = in;
for(T* tgt = out; tgt < out + shft2 - hlen2; tgt += ny, src += ny ){ // nx/2 times
copy(src+shft1, src+shft1+hlen2, tgt); //4->1
copy(src+shft1-hlen1, src+shft1, tgt+hlen2); } //3->2
};
For matrices with even dimensions you can do it in-place, just passing the same pointer into in and out parameters.
Also note that for 1D arrays fftshift is just std::rotate.
You could also use arrayfire's shift function as replacement for Matlab's circshift and re-implement the rest of the code. This could be useful if you are interested in any of the other features of AF anyway (such as portability to GPU by simply changing a linker flag).
However if all your code is meant to be run on the CPU and is quite sophisticated or you don't want to use any other data format (AF requires af::arrays) stick with one of the other options.
I ended up changing to AF because I would have had to re-implement fftshift as an OpenCL kernel otherwise back in the time.
It will give equivalent result to ifftshift in matlab
ifftshift(vector< vector <double> > Hlow,int RowLineSpace, int ColumnLineSpace)
{
int pivotRow=floor(RowLineSpace/2);
int pivotCol=floor(ColumnLineSpace/2);
for(int i=pivotRow;i<RowLineSpace;i++){
for(int j=0;j<ColumnLineSpace;j++){
double temp=Hlow.at(i).at(j);
second.push_back(temp);
}
ifftShiftRow.push_back(second);
second.clear();
}
for(int i=0;i<pivotRow;i++){
for(int j=0;j<ColumnLineSpace;j++){
double temp=Hlow.at(i).at(j);
first.push_back(temp);
}
ifftShiftRow.push_back(first);
first.clear();
}
double** arr = new double*[RowLineSpace];
for(int i = 0; i < RowLineSpace; ++i)
arr[i] = new double[ColumnLineSpace];
int i1=0,j1=0;
for(int j=pivotCol;j<ColumnLineSpace;j++){
for(int i=0;i<RowLineSpace;i++){
double temp2=ifftShiftRow.at(i).at(j);
arr[i1][j1]=temp2;
i1++;
}
j1++;
i1=0;
}
for(int j=0;j<pivotCol;j++){
for(int i=0;i<RowLineSpace;i++){
double temp1=ifftShiftRow.at(i).at(j);
arr[i1][j1]=temp1;
i1++;
}
j1++;
i1=0;
}
for(int i=0;i<RowLineSpace;i++){
for(int j=0;j<ColumnLineSpace;j++){
double value=arr[i][j];
temp.push_back(value);
}
ifftShiftLow.push_back(temp);
temp.clear();
}
return ifftShiftLow;
}
Octave uses fftw to implement (i)fftshift.
You can use kissfft. It's reasonable fast, extremely simple to use, and free. Arranging the output like you want it requires only to:
a) shift by (-dim_x/2, -dim_y/2, ...), with periodic boundary conditions
b) FFT or IFFT
c) shift back by (dim_x/2, dim_y/2, ...) , with periodic boundary conditions
d) scale ? (according to your needs IFFT*FFT will scale the function by dim_x*dim_y*... by default)