Alignment in android NDK project - casting

I have an existing C++ project which I would like to port to Android. Unfortunately, the program causes a "Fatal signal 7 (SIGBUS)" error on Android. It works fine on other platforms (32-bit/64-bit Linux and Windows). Here is the part of the code that causes the problem:
// Serializes one packet into a single malloc'ed buffer (mem_). The fields are
// written back-to-back with no padding, in this order:
//   packet_version (1 byte) | size_ | flags | packet_id | packet_nr |
//   timestamp | nr_channels[] | samples_per_channel[] | data[] (as float)
// NOTE(review): this excerpt ends before the constructor's closing brace.
RawMem3::RawMem3(uint8_t packet_version, uint32_t flags, uint64_t packet_id,
uint64_t packet_nr, uint64_t timestamp, vector<uint16_t>& nr_channels,
vector<uint16_t>& samples_per_channel, vector<double>& data) :
size_(0) {
// Total byte count of the packed layout. The samples arrive as double but are
// stored as float, hence sizeof(float) for the data section.
size_ = sizeof(packet_version) + sizeof(size_) + sizeof(flags)
+ sizeof(packet_id) + sizeof(packet_nr) + sizeof(timestamp)
+ nr_channels.size() * sizeof(boost::uint16_t)
+ samples_per_channel.size() * sizeof(boost::uint16_t)
+ data.size() * sizeof(float); // FIXXXXXME ... hardcoded sizeof() !!!!
// NOTE(review): the malloc result is used without a NULL check.
mem_ = malloc(size_);
uint8_t* ui8_ptr = reinterpret_cast<uint8_t*>(mem_);
*ui8_ptr++ = packet_version;
// After the single version byte the cursor sits at offset 1 of the buffer, so
// the uint32_t* below is not 4-byte aligned. Storing through it is a
// misaligned access, which CPUs that require natural alignment report as
// SIGBUS. Every later typed store (uint64_t, uint16_t, float) inherits the
// same odd offset.
uint32_t* ui32_ptr = reinterpret_cast<uint32_t*>(ui8_ptr);
*ui32_ptr++ = size_;
*ui32_ptr++ = flags;
uint64_t* ui64_ptr = reinterpret_cast<uint64_t*>(ui32_ptr);
*ui64_ptr++ = packet_id;
*ui64_ptr++ = packet_nr;
uint64_t* time_ptr = reinterpret_cast<uint64_t*>(ui64_ptr);
*time_ptr++ = timestamp;
// Both channel tables are appended as consecutive 16-bit values.
uint16_t* ui16_ptr = reinterpret_cast<uint16_t*>(time_ptr);
for (unsigned int n = 0; n < nr_channels.size(); n++)
*ui16_ptr++ = nr_channels[n];
for (unsigned int n = 0; n < samples_per_channel.size(); n++)
*ui16_ptr++ = samples_per_channel[n];
// numeric_cast throws when a double does not fit into a float. The handlers
// only log to cerr, so the samples after the failing one are left unwritten.
try {
float* flt_ptr = reinterpret_cast<float*>(ui16_ptr);
for (unsigned int n = 0; n < data.size(); n++)
*flt_ptr++ = numeric_cast<float>(data[n]);
} catch (negative_overflow& e) {
cerr << "RawMem -- Constructor: " << e.what();
} catch (positive_overflow& e) {
cerr << "RawMem -- Constructor: " << e.what();
} catch (bad_numeric_cast& e) {
cerr << "RawMem -- Constructor: " << e.what();
}
Strictly speaking, it's this part:
uint32_t* ui32_ptr = reinterpret_cast<uint32_t*>(ui8_ptr);
*ui32_ptr++ = size_;
*ui32_ptr++ = flags
I guess it is an alignment problem. I am not a specialist in this area, so any ideas on how I can fix this? I have already tried changing packet_version to a uint32_t. After that the SIGBUS error is gone; however, my client software assumes that packet_version is a uint8_t. I can't change the client code, so I need to fix this in a different way.

Use memcpy() instead of assignments:
#define APPEND(x) memcpy(ui8_ptr, &(x), sizeof((x))); ui8_ptr += sizeof((x));
uint8_t* ui8_ptr = reinterpret_cast<uint8_t*>(mem_);
APPEND(packet_version);
APPEND(size_);
APPEND(flags);
APPEND(packet_id);
APPEND(packet_nr);
APPEND(timestamp);
for (unsigned int n = 0; n < nr_channels.size(); n++)
APPEND(nr_channels[n]);
for (unsigned int n = 0; n < samples_per_channel.size(); n++)
APPEND(samples_per_channel[n]);
for (unsigned int n = 0; n < data.size(); n++) {
float qq = numeric_cast<float>(data[n]);
APPEND(qq);
}

Related

Copy 80 bit hex number from char array to uint16_t vector or array

Say I have a text file containing the 80bit hex number
0xabcdef0123456789abcd
My C++ program reads that using fstream into a char array called buffer.
But then I want to store it in a uint16_t array such that:
uint16_t * key = {0xabcd, 0xef01, 0x2345, 0x6789, 0xabcd}
I have tried several approaches, but I continue to get decimal integers, for instance:
// Packs the text two characters per 16-bit element.
const std::size_t strLength = strlen(buffer);
std::vector<uint16_t> arr16bit((strLength / 2) + 1);
for (std::size_t i = 0; i < strLength; ++i)
{
// Shift the previous character up one byte, then OR in the next one.
arr16bit[i / 2] <<= 8;
// buffer[i] is the character's code, not the value of the hex digit it
// spells: 'a','b' becomes 0x6162 (24930) instead of contributing 0xab.
arr16bit[i / 2] |= buffer[i];
}
Yields:
arr16bit = {24930, 25444, 25958, 12337, 12851}
There must be an easy way to do this that I'm just not seeing.
Here is the full solution I came up with based on the comments:
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param c  character to convert; '0'-'9', 'A'-'F' and 'a'-'f' are accepted
 * @return   the digit value in the range 0..15, or -1 when c is not a
 *           hexadecimal digit
 */
int hex_char_to_int(char c) {
    if (c >= '0' && c <= '9') //numbers
        return c - '0';
    if (c >= 'A' && c <= 'F') //capital letters
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f') //lower case letters
        return c - 'a' + 10;
    // Every path must return a value; anything else is reported as invalid
    // instead of being mapped to an arbitrary number.
    return -1;
}
/**
 * Combines four hexadecimal digit values into one 16-bit word.
 *
 * @param i0  least significant digit (weight 1)
 * @param i1  weight 16
 * @param i2  weight 256
 * @param i3  most significant digit (weight 4096)
 * @return    the sum of the weighted digits, converted to uint16_t
 */
uint16_t ints_to_int16(int i0, int i1, int i2, int i3) {
    const int radix = 16;
    const int low = i0 + i1 * radix;
    const int high = i2 * radix * radix + i3 * radix * radix * radix;
    return high + low;
}
void readKey() {
const int bufferSize = 25;
char buffer[bufferSize] = { NULL };
ifstream* pStream = new ifstream("key.txt");
if (pStream->is_open() == true)
{
pStream->read(buffer, bufferSize);
}
cout << buffer << endl;
const size_t strLength = strlen(buffer);
int* hex_to_int = new int[strLength - 2];
for (int i = 2; i < strLength; i++) {
hex_to_int[i - 2] = hex_char_to_int(buffer[i]);
}
cout << endl;
uint16_t* key16 = new uint16_t[5];
int j = 0;
for (int i = 0; i < 5; i++) {
key16[i] = ints_to_int16(hex_to_int[j++], hex_to_int[j++], hex_to_int[j++], hex_to_int[j++]);
cout << "0x" << hex << key16[i] << " ";
}
cout << endl;
}
This outputs:
0xabcdef0123456789abcd
0xabcd 0xef01 0x2345 0x6789 0xabcd

How is numpy so fast?

I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N) and dtype=float32, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy broadcasting, this simply writes as:
def pairwise_sub_numpy(X):
    """Return all pairwise row differences of X via broadcasting.

    For a 2-D X the result D has one more axis than X, with
    D[i, j, :] == X[j, :] - X[i, :].
    """
    minuend = X[None, :, :]      # rows laid out along axis 1
    subtrahend = X[:, None, :]   # rows laid out along axis 0
    return minuend - subtrahend
Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
// Row-major accessors for the N x N input and the N x N x N result. The first
// index is widened to size_t before multiplying so that offsets of up to
// N*N*N elements do not overflow int.
#define X(i, j) _X[(size_t)(i)*N + (j)]
#define res(i, j, k) _res[((size_t)(i)*N + (j))*N + (k)]

// Computes every pairwise row difference of the N x N matrix _X:
//   result(i, j, k) = X(i, k) - X(j, k)
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_naive( const float* _X, int N )
{
    if (N <= 0)
        return NULL;
    // Size computed in size_t: N*N*N overflows int once N exceeds 1290.
    const size_t bytes = (size_t)N * N * N * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++)
                res(i,j,k) = X(i,k) - X(j,k);
        }
    }
    return _res;
}
Compiling using gcc 7.3.0 with -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
// Same result as the naive triple loop, result(i, j, k) = X(i, k) - X(j, k),
// but the row pointers are hoisted out of the inner loop.
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_better( const float* _X, int N )
{
    if (N <= 0)
        return NULL;
    // All offsets are computed in size_t: N*N*N overflows int once N > 1290.
    const size_t n = (size_t)N;
    const size_t bytes = n * n * n * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    for (size_t i = 0; i < n; i++) {
        const float* xi = _X + i * n;
        for (size_t j = 0; j < n; j++) {
            const float* xj = _X + j * n;
            float* r = _res + (i * n + j) * n;
            for (size_t k = 0; k < n; k++)
                r[k] = xi[k] - xj[k];
        }
    }
    return _res;
}
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
// AVX variant of the pairwise row subtraction res(i,j,k) = X(i,k) - X(j,k).
// Rows i and j are first copied into two 32-byte aligned scratch buffers and
// then subtracted with 256-bit loads/stores. Returns the aligned_alloc'ed
// result array; the scratch buffers are freed before returning.
float* pairwise_sub_simd( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
// Row j is re-copied on every (i, j) iteration.
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
// NOTE(review): the step is 256/sizeof(float) -- 64 elements for a 4-byte
// float -- while one __m256 load/store covers 8 floats. As written only 8 of
// every 64 elements of each output row are stored; `k += 8` looks intended.
// NOTE(review): _mm256_store_ps needs r+k to be 32-byte aligned, which
// presumably relies on N being a multiple of 8 -- confirm.
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficial:
// Tiled variant of the pairwise row subtraction
//   result(i, j, k) = X(i, k) - X(j, k)
// Work is split into B x B x B tiles; the two B x B input tiles of each step
// are staged in small stack arrays before the differences are written.
// Tiles at the edges are clipped, so N does not have to be a multiple of B.
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_blocks( const float* _X, int N )
{
    if (N <= 0)
        return NULL;
    // All offsets are computed in size_t: N*N*N overflows int once N > 1290.
    const size_t n = (size_t)N;
    const size_t bytes = n * n * n * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    #define B 8
    float cache1[B*B], cache2[B*B];
    for (size_t bi = 0; bi < n; bi+=B) {
        const size_t ni = (n - bi < (size_t)B) ? n - bi : (size_t)B;
        for (size_t bj = 0; bj < n; bj+=B) {
            const size_t nj = (n - bj < (size_t)B) ? n - bj : (size_t)B;
            for (size_t bk = 0; bk < n; bk+=B) {
                const size_t nk = (n - bk < (size_t)B) ? n - bk : (size_t)B;
                // load first 8x8 block in the cache
                for (size_t i = 0; i < ni; i++)
                    for (size_t k = 0; k < nk; k++)
                        cache1[B*i + k] = _X[(bi+i)*n + (bk+k)];
                // load second 8x8 block in the cache
                for (size_t j = 0; j < nj; j++)
                    for (size_t k = 0; k < nk; k++)
                        cache2[B*j + k] = _X[(bj+j)*n + (bk+k)];
                // compute local operations on the caches
                for (size_t i = 0; i < ni; i++)
                    for (size_t j = 0; j < nj; j++)
                        for (size_t k = 0; k < nk; k++)
                            _res[((bi+i)*n + (bj+j))*n + (bk+k)] = cache1[B*i + k] - cache2[B*j + k];
            }
        }
    }
    return _res;
}
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some efforts with some optimized C++ code, I can't come anywhere close to the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
def pairwise_sub_numpy(X):
    """Return all pairwise row differences of X via broadcasting.

    For a 2-D X the result D has one more axis than X, with
    D[i, j, :] == X[j, :] - X[i, :].
    """
    minuend = X[None, :, :]      # rows laid out along axis 1
    subtrahend = X[:, None, :]   # rows laid out along axis 0
    return minuend - subtrahend
# Benchmark input: an N x N float32 matrix of uniform random values.
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
# Five independent timings of a single call each (number=1, repeat=5);
# timeit reports seconds.
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
# Report the fastest run, converted to milliseconds.
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <stdlib.h> // aligned_alloc, malloc, free
#include <string.h>
#include <time.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
// Row-major accessors for the N x N input and the N x N x N result. The first
// index is widened to size_t before multiplying so that offsets of up to
// N*N*N elements do not overflow int.
#define X(i, j) _x[(size_t)(i)*N + (j)]
#define res(i, j, k) _res[((size_t)(i)*N + (j))*N + (k)]

// Computes every pairwise row difference of the N x N matrix _x:
//   result(i, j, k) = X(i, k) - X(j, k)
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_naive( const float* _x, int N )
{
    if (N <= 0)
        return NULL;
    // Size computed in size_t: N*N*N overflows int once N exceeds 1290.
    const size_t bytes = (size_t)N * N * N * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++)
                res(i,j,k) = X(i,k) - X(j,k);
        }
    }
    return _res;
}
// Same result as the naive triple loop, result(i, j, k) = X(i, k) - X(j, k),
// but the row pointers are hoisted out of the inner loop.
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_better( const float* _x, int N )
{
    if (N <= 0)
        return NULL;
    // All offsets are computed in size_t: N*N*N overflows int once N > 1290.
    const size_t n = (size_t)N;
    const size_t bytes = n * n * n * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    for (size_t i = 0; i < n; i++) {
        const float* xi = _x + i * n;
        for (size_t j = 0; j < n; j++) {
            const float* xj = _x + j * n;
            float* r = _res + (i * n + j) * n;
            for (size_t k = 0; k < n; k++)
                r[k] = xi[k] - xj[k];
        }
    }
    return _res;
}
// AVX variant of the pairwise row subtraction res(i,j,k) = X(i,k) - X(j,k).
// Rows i and j are first copied into two 32-byte aligned scratch buffers and
// then subtracted with 256-bit loads/stores. Returns the aligned_alloc'ed
// result array; the scratch buffers are freed before returning.
float* pairwise_sub_simd( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
// Row j is re-copied on every (i, j) iteration.
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
// NOTE(review): the step is 256/sizeof(float) -- 64 elements for a 4-byte
// float -- while one __m256 load/store covers 8 floats. As written only 8 of
// every 64 elements of each output row are stored; `k += 8` looks intended.
// NOTE(review): _mm256_store_ps needs r+k to be 32-byte aligned, which
// presumably relies on N being a multiple of 8 -- confirm.
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
// Tiled variant of the pairwise row subtraction
//   result(i, j, k) = X(i, k) - X(j, k)
// Work is split into B x B x B tiles; the two B x B input tiles of each step
// are staged in small stack arrays before the differences are written.
// Tiles at the edges are clipped, so N does not have to be a multiple of B.
// Returns a newly allocated, 32-byte aligned N*N*N array that the caller must
// free(), or NULL when N <= 0 or the allocation fails.
float* pairwise_sub_blocks( const float* _x, int N )
{
    if (N <= 0)
        return NULL;
    // All offsets are computed in size_t: N*N*N overflows int once N > 1290.
    const size_t n = (size_t)N;
    const size_t bytes = n * n * n * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* _res = (float*) aligned_alloc( 32, (bytes + 31) / 32 * 32 );
    if (_res == NULL)
        return NULL;
    #define B 8
    float cache1[B*B], cache2[B*B];
    for (size_t bi = 0; bi < n; bi+=B) {
        const size_t ni = (n - bi < (size_t)B) ? n - bi : (size_t)B;
        for (size_t bj = 0; bj < n; bj+=B) {
            const size_t nj = (n - bj < (size_t)B) ? n - bj : (size_t)B;
            for (size_t bk = 0; bk < n; bk+=B) {
                const size_t nk = (n - bk < (size_t)B) ? n - bk : (size_t)B;
                // load first 8x8 block in the cache
                for (size_t i = 0; i < ni; i++)
                    for (size_t k = 0; k < nk; k++)
                        cache1[B*i + k] = _x[(bi+i)*n + (bk+k)];
                // load second 8x8 block in the cache
                for (size_t j = 0; j < nj; j++)
                    for (size_t k = 0; k < nk; k++)
                        cache2[B*j + k] = _x[(bj+j)*n + (bk+k)];
                // compute local operations on the caches
                for (size_t i = 0; i < ni; i++)
                    for (size_t j = 0; j < nj; j++)
                        for (size_t k = 0; k < nk; k++)
                            _res[((bi+i)*n + (bj+j))*n + (bk+k)] = cache1[B*i + k] - cache2[B*j + k];
            }
        }
    }
    return _res;
}
int main()
{
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
{
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
}
printf("Best of 5 = %f ms\n", best / 1000);
free( _x );
return 0;
}
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine:
Implementation
Time
numpy
88 ms
C++ naive
194 ms
C++ better
195 ms
C++ SIMD
178 ms
C++ blocked
258 ms
C++ blocked (gcc 8.3.1)
217 ms
As pointed out in some of the comments, numpy uses SIMD in its implementation and it does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation, then I get a better time than numpy even with the scalar version (that is, the one without any optimizations).
Also, in terms of SIMD, the reason your implementation does not perform much better than the scalar one is that your memory access patterns are not ideal for SIMD usage: you do a memcpy and you load into SIMD registers from locations that are far apart from each other - e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the SIMD prefetcher.
There is also a mistake in how you load the SIMD registers(if I understood correctly what you're trying to compute): a 256 bit SIMD register can load 8 single-precision floating-point numbers 8 * 32 = 256, but in your loop you jump k by "256/sizeof(float)" which is 256/4 = 64; _x and _res are float pointers and the SIMD intrinsics expect also float pointers as arguments so instead of reading all elements from those lines every 8 floats you read them every 64 floats.
The computation can be optimized further by changing the access patterns but also by observing that you repeat some computations: e.g. when iterating with line0 as a base you compute line0 - line1 but at some future time, when iterating with line1 as a base, you need to compute line1 - line0 which is basically -(line0 - line1), that is for each line after line0 a lot of results could be reused from previous computations.
A lot of times SIMD usage or parallelization requires one to change how data is accessed or reasoned about in order to provide meaningful improvements.
Here is what I have done as a first step, based on your initial implementation; it is faster than numpy (don't mind the OpenMP stuff, as that's not how it's supposed to be done - I just wanted to see how it behaves when trying the naive way).
C++
Time scaler version: 55 ms
Time SIMD version: 53 ms
**Time SIMD 2 version: 33 ms**
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms
Python numpy
>> best of 5 = 88.794 ms
#include <cstdlib>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>
using namespace std;
// Scalar reference implementation. `input` is an n x n row-major matrix and
// `output` a caller-provided n*n*n buffer; on return
//   output[(i*n + j)*n + k] = input[i*n + k] - input[j*n + k].
// Returns `output`.
float* pairwise_sub_naive (const float* input, float* output, int n)
{
    // The destination is filled strictly front to back, so a running cursor
    // replaces the explicit (i, j, k) offset computation.
    float* cursor = output;
    for (int i = 0; i < n; i++)
    {
        const float* row_i = input + i * n;
        for (int j = 0; j < n; j++)
        {
            const float* row_j = input + j * n;
            for (int k = 0; k < n; k++)
            {
                *cursor++ = row_i[k] - row_j[k];
            }
        }
    }
    return output;
}
// AVX version of the pairwise row subtraction: each row pair is processed
// eight floats at a time and stored straight into `output`.
// The aligned load/store intrinsics are used, so every address touched must
// be 32-byte aligned, and the k loop advances in steps of 8.
// Returns `output`.
float* pairwise_sub_simd (const float* input, float* output, int n)
{
    for (int i = 0; i < n; i++)
    {
        const float* row_i = input + i * n;
        for (int j = 0; j < n; j++)
        {
            const float* row_j = input + j * n;
            float* dst = output + (i * n + j) * n;
            for (int k = 0; k < n; k += 8)
            {
                const __m256 lhs = _mm256_load_ps(row_i + k);
                const __m256 rhs = _mm256_load_ps(row_j + k);
                _mm256_store_ps(dst + k, _mm256_sub_ps(lhs, rhs));
            }
        }
    }
    return output;
}
// AVX version that computes each difference row into an aligned scratch line
// and then copies the finished line into `output`.
// The aligned load intrinsics are used on `input`, so its rows must be
// 32-byte aligned, and the k loop advances in steps of 8.
// Returns `output`, or nullptr when the scratch line cannot be allocated
// (in which case `output` is left untouched).
float* pairwise_sub_simd_2 (const float* input, float* output, int n)
{
    if (n <= 0)
    {
        return output;
    }
    const size_t row_bytes = static_cast<size_t>(n) * sizeof(float);
    // aligned_alloc expects the size to be a multiple of the alignment.
    float* line_buffer = (float*) aligned_alloc(32, (row_bytes + 31) / 32 * 32);
    if (line_buffer == nullptr)
    {
        return nullptr;
    }
    for (int i = 0; i < n; i++)
    {
        const int idxi = i * n;
        for (int j = 0; j < n; j++)
        {
            const int idxj = j * n;
            const int outidx = idxi + j;
            for (int k = 0; k < n; k += 8)
            {
                __m256 A = _mm256_load_ps(input + idxi + k);
                __m256 B = _mm256_load_ps(input + idxj + k);
                _mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
            }
            // memcpy counts bytes: the whole line is n floats, not n bytes.
            memcpy(output + outidx * n, line_buffer, row_bytes);
        }
    }
    free(line_buffer);
    return output;
}
// AVX version with the loop nest reordered: for every 8-float chunk of row i
// the chunk is loaded once and then subtracted against the same chunk of
// every row j. Consecutive stores are therefore a whole output row apart.
// The aligned load/store intrinsics are used, so every address touched must
// be 32-byte aligned. Returns `output`.
float* pairwise_sub_simd_3 (const float* input, float* output, int n)
{
    for (int i = 0; i < n; i++)
    {
        const float* row_i = input + i * n;
        for (int k = 0; k < n; k += 8)
        {
            const __m256 lhs = _mm256_load_ps(row_i + k);
            for (int j = 0; j < n; j++)
            {
                const float* row_j = input + j * n;
                float* dst = output + (i * n + j) * n;
                const __m256 rhs = _mm256_load_ps(row_j + k);
                _mm256_store_ps(dst + k, _mm256_sub_ps(lhs, rhs));
            }
        }
    }
    return output;
}
// AVX version whose outer loop over i carries an OpenMP `parallel for`.
// Without OpenMP enabled the pragma is ignored and the loop runs serially.
// The aligned load/store intrinsics are used, so every address touched must
// be 32-byte aligned. Returns `output`.
float* pairwise_sub_openmp (const float* input, float* output, int n)
{
    int i, j;
    #pragma omp parallel for private(j)
    for (i = 0; i < n; i++)
    {
        const float* row_i = input + i * n;
        for (j = 0; j < n; j++)
        {
            const float* row_j = input + j * n;
            float* dst = output + (i * n + j) * n;
            for (int k = 0; k < n; k += 8)
            {
                const __m256 lhs = _mm256_load_ps(row_i + k);
                const __m256 rhs = _mm256_load_ps(row_j + k);
                _mm256_store_ps(dst + k, _mm256_sub_ps(lhs, rhs));
            }
        }
    }
    // Scalar equivalent, kept for reference:
    /*for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            for (int k = 0; k < n; k++)
            {
                output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
            }
        }
    }*/
    return output;
}
int main ()
{
constexpr size_t n = 512;
constexpr size_t input_size = n * n;
constexpr size_t output_size = n * n * n;
float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));
iota(input, input + input_size, float(0.0));
fill(output, output + output_size, float(0.0));
iota(input_simd, input_simd + input_size, float(0.0));
fill(output_simd, output_simd + output_size, float(0.0));
iota(input_par, input_par + input_size, float(0.0));
fill(output_par, output_par + output_size, float(0.0));
std::chrono::milliseconds best_scaler{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_naive(input, output, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_scaler)
{
best_scaler = duration;
}
}
cout << "Time scaler version: " << best_scaler.count() << " ms\n";
std::chrono::milliseconds best_simd{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd)
{
best_simd = duration;
}
}
cout << "Time SIMD version: " << best_simd.count() << " ms\n";
std::chrono::milliseconds best_simd_2{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_2(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_2)
{
best_simd_2 = duration;
}
}
cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";
std::chrono::milliseconds best_simd_3{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_3(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_3)
{
best_simd_3 = duration;
}
}
cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";
std::chrono::milliseconds best_par{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_openmp(input_par, output_par, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_par)
{
best_par = duration;
}
}
cout << "Time OpenMP version: " << best_par.count() << " ms\n";
cout << "Verification\n";
if (equal(output, output + output_size, output_simd))
{
cout << "PASSED\n";
}
else
{
cout << "FAILED\n";
}
return 0;
}
Edit: Small correction as there was a wrong call related to the second version of SIMD implementation.
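As you can see now, the second implementation is the fastest, as it behaves best from the point of view of the cache's locality of reference. Examples 2 and 3 of the SIMD implementations are there to illustrate how changing memory access patterns influences the performance of your SIMD optimizations. (Caveat: the figure for the second version was measured with a `memcpy` whose length argument was `n` bytes rather than `n * sizeof(float)`, i.e. only a quarter of each output row was copied, so that timing understates the cost of a complete copy.)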
To summarize (knowing that I'm far from being complete in my advice): be mindful of your memory access patterns and of the loads and stores to/from the SIMD unit. The SIMD unit is a different hardware unit inside the processor's core, so there is a penalty in shuffling data back and forth; hence, when you load a register from memory, try to do as many operations as possible with that data and do not be too eager to store it back (of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available, and if you load too many then they will "spill", that is, they will be stored back to temporary locations in main memory behind the scenes, killing all your gains. SIMD optimization is a true balancing act!
There is some effort to put a cross-platform intrinsics wrapper into the standard(I developed myself a closed source one in my glorious past) and even it's far from being complete, it's worth taking a look at(read the accompanying papers if you're truly interested to learn how SIMD works).
https://github.com/VcDevel/std-simd
This is a complement to the answer posted by @celavek.
I think I finally got to understand what exactly was the issue. The issue was not about allocating the memory in the main function that does the computation.
What was actually taking time is accessing new (fresh) memory. I believe that the malloc call returns pages of memory which are virtual, i.e. that do not correspond to actual physical memory -- until they are explicitly accessed. What actually takes time is the process of allocating physical memory on the fly (which I think is OS-level) when it is accessed in the function code.
Here is a proof. Consider the two following trivial functions:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Allocates room for N floats, 32-byte aligned, without touching the memory.
// Returns whatever aligned_alloc returns; the caller releases it with free().
float* just_alloc( size_t N )
{
    const size_t bytes = sizeof(float)*N;
    float* block = (float*) aligned_alloc( 32, bytes );
    return block;
}
// Writes 1 to each of the first N elements of _arr, front to back.
void just_fill( float* _arr, size_t N )
{
    float* const end = _arr + N;
    for (float* p = _arr; p != end; ++p)
        *p = 1;
}
// Time(code_to_benchmark, cleanup_code): runs `code_to_benchmark` five times,
// measuring each run with clock_gettime(CLOCK_THREAD_CPUTIME_ID) and printing
// it in milliseconds, then prints the fastest run together with the
// benchmarked source text (stringified with #). `cleanup_code` runs after
// each measurement, outside the timed region, and in the same scope as
// `code_to_benchmark`, so it can refer to variables declared there.
#define Time( code_to_benchmark, cleanup_code ) \
do { \
double best = 9e9; \
for( int i = 0; i < 5; i++) { \
struct timespec start, stop; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
code_to_benchmark; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
printf("Time[%d] = %f ms\n", i, t); \
if (t < best) best = t; \
cleanup_code; \
} \
printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
} while(0)
// Compares the cost of allocating N*N*N floats with the cost of writing to
// them.
int main()
{
const size_t N = 512;
// Experiment 1: allocation only; every run allocates and then frees a block.
Time( float* arr = just_alloc(N*N*N), free(arr) );
// Experiment 2: one block is allocated up front and filled five times, so
// only the first fill writes to memory that has never been touched before.
float* arr = just_alloc(N*N*N);
Time( just_fill(arr, N*N*N), ; );
free(arr);
return 0;
}
I get the following timings, which I now detail for each of the calls:
Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms
Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms
As you can see, allocating memory is blazingly fast, but the first time that the memory is accessed, it is about 4-5 times slower than the other times. So, basically, the reason that my code was slow was that I was reallocating fresh memory each time, which had no physical address yet. (Correct me if I'm wrong, but I think that's the gist of it!)
A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ a high-level algebra manipulation capability and use SIMD under the hood. Just like numpy.
Here is the implementation
#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
#include <Eigen/Dense>
// For every column index k of `input`, stores into output[k] the matrix
// obtained by subtracting column k from each column of `input`.
// `output` must already hold at least input.cols() elements.
auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
    const auto num_cols = input.cols();
    for (int col = 0; col < num_cols; ++col) {
        // column * row-of-ones: the selected column repeated num_cols times
        const Eigen::MatrixXf repeated = input.col(col) * Eigen::RowVectorXf::Ones(num_cols);
        output[col] = input - repeated;
    }
}
// Times pairwise_eigen on a random 512 x 512 matrix and prints the fastest
// of five runs in whole milliseconds.
int main() {
    constexpr size_t n = 512;
    using timer_clock = std::chrono::high_resolution_clock;

    // allocate input and output
    Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
    std::vector<Eigen::MatrixXf> output(n);

    std::chrono::milliseconds best_eigen{100000};
    for (int run = 0; run < 5; ++run) {
        const auto start = timer_clock::now();
        pairwise_eigen(input, output);
        const auto end = timer_clock::now();
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        best_eigen = std::min(best_eigen, elapsed);
    }
    std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";
    return 0;
}
The full benchmark results, using the tests suggested by @celavek, on my system are
Time scaler version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms
Time Eigen version: 76 ms
Numpy >> best of 5 = 118.489 ms
With Eigen there is still a noticeable improvement with respect to Numpy, but it is not so impressive compared to the "raw" implementations (there is certainly some overhead).
An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines
// inside the pairwise method
for (int k = 0; k < input.cols(); ++k)
output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
// at allocation time
std::vector<Eigen::MatrixXf> output(n, input);
This pushes the best of 5 down to 60 ms.

Pass 2D thrust::device_vector Complex Matrix to CUDA kernel function

I'm new to CUDA and I'm trying to move my existing project to the GPU using CUDA.
My code are based on complex matrices and complex buffers.
For the first step, I tried to move That nested For loop Code to Cuda (the rest will be similar):
// Host-side reference version of the computation that is to be moved to the
// GPU. `bufsize` and `decfactor` are defined outside this excerpt.
typedef thrust::complex<double> smp_t;
// Work buffers of 8 * bufsize elements each.
// NOTE(review): malloc does not initialise memory, and cnbuf is accumulated
// into with += below -- presumably it is zeroed elsewhere; confirm.
uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
// Coefficient matrix, stored as a vector of row vectors (decfactor rows).
thrust::complex<double> i_unit(0.0, 1.0);
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
// Fill the 8 x 8 matrix: tw[row][col] = exp(-i * 2*pi * row*col / 8).
// NOTE(review): rows 0..7 are indexed, so decfactor must be at least 8.
for (size_t row = 0; row < 8; row++) {
for (size_t col = 0; col < 8; col++) {
std::complex<double> tmp =
exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
tw[row].push_back(tmp);
}
}
/* The code to move to GPU processing: for every sample index i,
   cnbuf[ch][i] += sum over k of sgbuf[k][i] * tw[ch][k]
   (both buffers hold 8 rows of bufsize samples) */
for (unsigned int i = 0; i < bufsize; i++) {
for (size_t ch = 0; ch < 8; ch++)
for (size_t k = 0; k < 8; k++)
cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
That is the Code from the .cu file that will replace the current nested for loop:
// Kernel: each thread takes ch = threadIdx.x and k = blockIdx.x and, for every
// x in [0, block_size), adds sgbuf[k*block_size + x] * tw[ch*k] to
// cnbuf[ch*block_size + x].
__global__ void kernel_func(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
// NOTE(review): ch*k is the product of the two indices, not a row-major
// offset such as ch*8 + k -- confirm against how tw is laid out.
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;
cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
// NOTE(review): cn_index does not depend on k (blockIdx.x), so threads of
// different blocks read-modify-write the same element here and no
// synchronisation is visible -- looks like a data race; confirm.
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
// Host wrapper: copies tw, sgbuf and cnbuf to the device, launches
// kernel_func with 8 blocks of 8 threads, and copies both buffers back.
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
smp_t *d_sgbuf;
smp_t *d_cnbuf;
thrust::device_vector<smp_t> d_tw(8*8);
// NOTE(review): this treats everything between &tw[0][0] and &tw[7][7] as one
// range, but each inner vector owns its own storage; the range also stops
// before the last element.
thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
// NOTE(review): cudaMalloc/cudaMemcpy take a size in bytes; buffer_size is
// passed as-is -- if it is an element count it needs * sizeof(smp_t).
cudaMalloc((void **)&d_sgbuf, buffer_size);
cudaMalloc((void **)&d_cnbuf, buffer_size);
// NOTE(review): the destination is device memory and the source is host
// memory, yet the kind is cudaMemcpyDeviceToHost.
cudaMemcpy(d_sgbuf, sgbuf, buffer_size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_cnbuf, cnbuf, buffer_size, cudaMemcpyDeviceToHost);
// The result of this call is discarded; the statement has no effect.
thrust::raw_pointer_cast(d_tw.data());
kernel_func<<<8, 8>>>(
reinterpret_cast<cuDoubleComplex*>(d_cnbuf),
reinterpret_cast<cuDoubleComplex*>(d_sgbuf),
thrust::raw_pointer_cast(d_tw.data()),
buffer_size
);
// cudaGetLastError reports the most recent runtime error, which may stem
// from one of the calls above rather than from the launch itself.
cudaError_t varCudaError1 = cudaGetLastError();
if (varCudaError1 != cudaSuccess)
{
std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
exit(EXIT_FAILURE);
}
// NOTE(review): host destination, device source, but the kind is
// cudaMemcpyHostToDevice. The device buffers are also never cudaFree'd.
cudaMemcpy(sgbuf, d_sgbuf, buffer_size, cudaMemcpyHostToDevice);
cudaMemcpy(cnbuf, d_cnbuf, buffer_size, cudaMemcpyHostToDevice);
}
When I'm running the code, I get the error:
Failed to launch subDelimiterExamine kernel (error code: invalid argument)!
I think that the argument that causing the troubles is the 'd_tw'.
So, my questions are:
What am I doing wrong with the conversion of thrust::host_vector<thrust::host_vector<smp_t>> to thrust::device_vector<smp_t> (from a 2D matrix to one flattened array)?
Is there a better way to work with 2D complex numbers in CUDA?
The documentation about complex arrays in CUDA is very poor; where can I read about working with CUDA complex matrices?
Thanks!!!!
There were various problems. I will list a few, and probably miss some. So please refer to the example code I have given for additional differences.
The most immediate problem is here:
thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
This is what is giving rise to the invalid argument error you are seeing. Underneath the hood, thrust is going to try to use a cudaMemcpyAsync operation for this, because this is inherently a copy from host to device. We will fix this by replacing it with an ordinary cudaMemcpy operation, but to understand how to construct that, it's necessary to understand item 2.
You seem to think that a vector of vectors implies contiguous storage. It does not and that statement is not specific to thrust. Since a thrust::host_vector of vectors (or even std::vector of vectors) does not imply contiguous storage, we can't easily construct a single operation, such as cudaMemcpy or thrust::copy to copy this data. Therefore it will be necessary to explicitly flatten it.
Your directions of copy on the cudaMemcpy operations are universally backward. Where you should have had cudaMemcpyHostToDevice you had cudaMemcpyDeviceToHost, and vice-versa.
The CUDA cuComplex.h header file predates thrust, and was provided as a quick C-style method to work with complex numbers. There is no documentation for it - you have to read the file itself and work out how to use it, as you seem to have already done. However, since you are using thrust::complex<> anyway, it's far simpler just to use that coding paradigm, and write your device code to look almost exactly like your host code.
You had various transfer sizes wrong. cudaMemcpy takes a size in bytes to transfer.
What follows is an example, cobbled together from the pieces you have shown, with a variety of "fixes". I'm not claiming it's in any way perfect or correct, but it avoids the issues I have outlined above. Furthermore, depending on whether you compile with or without a -DUSE_KERNEL define, it will either run your "original" host code and display the output, or the kernel code and display the output. According to my testing, the outputs match.
$ cat t1751.cu
#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cstdint>
#include <cuComplex.h>
typedef thrust::complex<double> smp_t;
// Original kernel from the question, kept for comparison. Each thread takes
// ch = threadIdx.x and k = blockIdx.x and, for every x in [0, block_size),
// adds sgbuf[k*block_size + x] * tw[ch*k] to cnbuf[ch*block_size + x].
__global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
// NOTE(review): ch*k is the product of the two indices, not a row-major
// offset such as ch*8 + k -- confirm against how tw is laid out.
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;
cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
// NOTE(review): cn_index does not depend on k (blockIdx.x), so threads of
// different blocks read-modify-write the same element here and no
// synchronisation is visible -- looks like a data race; confirm.
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
// One thread per output element: block index = row, thread index = column.
// Adds, for k = 0..7, sgbuf[k*block_size + col] * tw[row*block_size + k] to
// cnbuf[row*block_size + col].
__global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
    const unsigned row = blockIdx.x;
    const unsigned col = threadIdx.x;
    const unsigned idx = row*block_size+col;
    smp_t *tw_row = tw + row*block_size;   // coefficients for this row
    smp_t *sg_col = sgbuf + col;           // this column, stepped row by row
    for (int k = 0; k < 8; k++) {
        cnbuf[idx] += sg_col[k*block_size] * tw_row[k];
    }
}
/**
 * Host-side driver for kernel_func: uploads the inputs, launches the kernel,
 * copies the result back into cnbuf and prints it.
 *
 * @param cnbuf        host buffer of buffer_size*buffer_size elements; its contents are the
 *                     kernel's starting values and are overwritten with the result
 * @param sgbuf        host input buffer of buffer_size*buffer_size elements
 * @param tw           coefficient rows; tw[i][j] is read for every i, j < buffer_size
 * @param buffer_size  row length used for all flattened matrices
 *
 * Any failing CUDA runtime call prints a message and terminates the process.
 *
 * NOTE(review): the launch geometry and the print loop are fixed at 8x8, so
 * buffer_size is presumably expected to be 8 - confirm before using other sizes.
 */
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
    // Report a failed CUDA runtime call and stop.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            std::cout << "CUDA call failed: " << what << " (error code: " << cudaGetErrorString(status) << ")!" << std::endl;
            exit(EXIT_FAILURE);
        }
    };

    const size_t count = buffer_size*buffer_size;
    const size_t bytes = count*sizeof(smp_t);

    // Flatten the nested coefficient table into one row-major array.
    thrust::host_vector<smp_t> htw(count);
    for (size_t i = 0; i < buffer_size; i++)
        for (size_t j = 0; j < buffer_size; j++)
            htw[i*buffer_size + j] = tw[i][j];

    // The device copy is sized from the flattened table (not a fixed 8*8) so the
    // transfer can never read past htw and the kernel's row stride stays in range.
    thrust::device_vector<smp_t> d_tw(count);
    check(cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), thrust::raw_pointer_cast(htw.data()), bytes, cudaMemcpyHostToDevice), "cudaMemcpy(tw)");

    smp_t *d_sgbuf = nullptr;
    smp_t *d_cnbuf = nullptr;
    check(cudaMalloc((void **)&d_sgbuf, bytes), "cudaMalloc(sgbuf)");
    check(cudaMalloc((void **)&d_cnbuf, bytes), "cudaMalloc(cnbuf)");
    check(cudaMemcpy(d_sgbuf, sgbuf, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(sgbuf)");
    check(cudaMemcpy(d_cnbuf, cnbuf, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(cnbuf)");

    kernel_func<<<8, 8>>>(d_cnbuf,d_sgbuf,thrust::raw_pointer_cast(d_tw.data()),buffer_size);
    check(cudaGetLastError(), "kernel_func launch");

    // This blocking copy also waits for the kernel to finish.
    check(cudaMemcpy(cnbuf, d_cnbuf, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    // Release the raw device allocations; d_tw cleans up after itself.
    cudaFree(d_sgbuf);
    cudaFree(d_cnbuf);

    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
}
// Test driver. Builds an 8x8 table of complex exponentials and an input buffer filled
// with (1,1), then either accumulates on the host and prints the result (default) or
// hands the same data to kernel_wrap (when compiled with -DUSE_KERNEL).
int main(){
    const int bufsize = 8;
    const int decfactor = 8;
    const size_t count = 8 * bufsize;
    smp_t *sgbuf = (smp_t*) malloc(count * sizeof(smp_t));
    smp_t *cnbuf = (smp_t*) malloc(count * sizeof(smp_t));
    if (sgbuf == NULL || cnbuf == NULL) {
        std::cout << "Failed to allocate host buffers" << std::endl;
        free(sgbuf);
        free(cnbuf);
        return EXIT_FAILURE;
    }
    // The accumulator must start at zero because both code paths add onto it.
    memset(cnbuf, 0, count*sizeof(smp_t));
    // Create matrix.
    thrust::complex<double> i_unit(0.0, 1.0);
#ifndef USE_KERNEL
    std::vector<std::vector<smp_t> > tw(decfactor);
#else
    thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
#endif
    // Fill the Matrix
    for (size_t row = 0; row < 8; row++) {
        for (size_t col = 0; col < 8; col++) {
            std::complex<double> tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
            tw[row].push_back(tmp);
        }
    }
    thrust::complex<double> test(1.0, 1.0);
    for (int i = 0; i < 8*8; i++) sgbuf[i] = test;
#ifndef USE_KERNEL
    /* The Code To Move to the GPU processing */
    for (int i = 0; i < bufsize; i++) {
        for (size_t ch = 0; ch < 8; ch++)
            for (size_t k = 0; k < 8; k++)
                cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
    }
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
#else
    kernel_wrap(cnbuf,sgbuf,tw,bufsize);
#endif
    free(sgbuf);
    free(cnbuf);
    return 0;
}
$ nvcc -o t1751 t1751.cu -std=c++11
$ ./t1751 >out_host.txt
$ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
$ ./t1751 >out_device.txt
$ diff out_host.txt out_device.txt
$
Remember, this is mostly your code, I am not claiming it is correct, or defect-free, or suitable for any particular purpose. Use it at your own risk.

SSE addition and conversion

Here's the thing: how can I add two unsigned char arrays and store the result in an unsigned short array using SSE? Can anyone give me some help or a hint? This is what I have done so far. I just don't know where the error is; I need some help.
#include<iostream>
#include<intrin.h>
#include<windows.h>
#include<emmintrin.h>
#include<iterator>
using namespace std;
// Element-wise widening add: output[i] = input1[i] + input2[i] for i in [0, N).
//
// input1, input2  source arrays of at least N bytes; no alignment required
// output          destination array of at least N 16-bit elements; no alignment required
// N               element count; values <= 0 leave output untouched
//
// Full groups of 16 elements are handled with SSE2, any remainder with scalar code.
// No scratch memory is allocated.
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
{
    const __m128i zero = _mm_setzero_si128();
    int i = 0;
    // Written as N - i so the bound check cannot overflow for large N.
    for (; N - i >= 16; i += 16)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input1 + i));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input2 + i));
        // Interleaving with zero bytes zero-extends each 8-bit lane to 16 bits.
        // The sums are at most 510, so a plain 16-bit add cannot wrap.
        const __m128i lo = _mm_add_epi16(_mm_unpacklo_epi8(a, zero), _mm_unpacklo_epi8(b, zero));
        const __m128i hi = _mm_add_epi16(_mm_unpackhi_epi8(a, zero), _mm_unpackhi_epi8(b, zero));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), lo);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i + 8), hi);
    }
    // Scalar tail for the last N % 16 elements.
    for (; i < N; ++i)
        output[i] = static_cast<unsigned short>(input1[i] + input2[i]);
}
// Scalar reference: writes input1[i] + input2[i], widened to 16 bits, into output[i]
// for the first N elements. Nothing is written when N <= 0.
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
{
    const unsigned char *lhs = input1;
    const unsigned char *rhs = input2;
    unsigned short *dst = output;
    for (int left = N; left > 0; --left)
    {
        const unsigned short a = *lhs++;
        const unsigned short b = *rhs++;
        *dst++ = a + b;
    }
}
// Test harness: fills two byte arrays with random values, adds them with the scalar
// reference and with the SSE routine, and prints a line for every position where the
// two results disagree.
int main()
{
    // n is not a multiple of 16, so the final SSE vector is only partially filled.
    int n = 1023;
    unsigned char *p0 = new unsigned char[n];
    unsigned char *p1 = new unsigned char[n];
    // Outputs start zeroed so the comparison below never reads indeterminate values.
    unsigned short *p21 = new unsigned short[n]();
    unsigned short *p22 = new unsigned short[n]();
    // The random data belongs in the inputs, not in the output arrays.
    for (int j = 0; j < n; j++)
    {
        p0[j] = rand() % 256;
        p1[j] = rand() % 256;
    }
    C_add(p0, p1, p22, n);
    cout << "C_add finished!" << endl;
    sse_add(p0, p1, p21, n);
    cout << "sse_add finished!" << endl;
    for (int j = 0; j < n; j++)
    {
        if (p21[j] != p22[j])
        {
            cout << "diff!!!!!#######" << endl;
        }
    }
    //system("pause");
    delete[] p0;
    delete[] p1;
    delete[] p21;
    delete[] p22;
    return 0;
}
Assuming everything is aligned to _Alignof(__m128i) and the size of the array is a multiple of sizeof(__m128i), something like this should work:
/*
 * Widening add: res[i] = a[i] + b[i] for every i < size.
 *
 * size  number of elements in each array
 * res   destination, 16-bit elements
 * a, b  sources, 8-bit elements
 *
 * No alignment is required and size may be any value: whole groups of 16 elements
 * go through SSE2, the remaining size % 16 elements through a scalar loop.
 */
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) {
    const __m128i zero = _mm_setzero_si128();
    size_t i = 0;
    for ( ; size - i >= sizeof(__m128i) ; i += sizeof(__m128i)) {
        const __m128i va = _mm_loadu_si128((const __m128i*) (a + i));
        const __m128i vb = _mm_loadu_si128((const __m128i*) (b + i));
        /* Interleaving with zero bytes zero-extends the 8-bit lanes to 16 bits. */
        _mm_storeu_si128((__m128i*) (res + i),
                         _mm_add_epi16(_mm_unpacklo_epi8(va, zero), _mm_unpacklo_epi8(vb, zero)));
        _mm_storeu_si128((__m128i*) (res + i + 8),
                         _mm_add_epi16(_mm_unpackhi_epi8(va, zero), _mm_unpackhi_epi8(vb, zero)));
    }
    /* Scalar tail. */
    for ( ; i < size ; i++) {
        res[i] = (uint16_t) (a[i] + b[i]);
    }
}
FWIW, NEON would be a bit simpler (using vaddl_u8 and vaddl_high_u8).
If you're dealing with unaligned data you can use _mm_loadu_si128/_mm_storeu_si128. If size isn't a multiple of 16 you'll just have to do the remainder without SSE.
Note that this may be something your compiler can do automatically (I haven't checked). You may want to try something like this:
#pragma omp simd
for (size_t i = 0 ; i < size ; i++) {
res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]);
}
That uses OpenMP 4, but there is also Cilk++ (#pragma simd), clang (#pragma clang loop vectorize(enable)), gcc (#pragma GCC ivdep), or you could just hope the compiler is smart enough without the pragma hint.

Seeking knowledge on array of arrays memory performance

Context: Multichannel real time digital audio processing.
Access pattern: "Column-major", like so:
// Illustration of the access order: the outer loop walks sample positions and the inner
// loop reads that position from every channel in turn, so successive reads index a
// different arr[channel] each time.
for (int sample = 0; sample < size; ++sample)
{
// NOTE(review): this loop uses the same bound `size` as the sample loop; presumably a
// separate channel count was intended - confirm.
for (int channel = 0; channel < size; ++channel)
{
auto data = arr[channel][sample];
// do some computations
}
}
I'm seeking advice on how to make the life easier for the CPU and memory, in general. I realize interleaving the data would be better, but it's not possible.
My theory is, that as long as you sequentially access memory for a while, the CPU will prefetch it - will this hold for N (channel) buffers? What about size of the buffers, any "breaking points"?
Will it be very beneficial to have the channels in contiguous memory (increasing locality), or does that only hold for very small buffers (like, size of cache lines)? We could be talking buffersizes > 100 kb apart.
I guess there would also be a point where the time of the computational part makes memory optimizations negligible - ?
Is this a case, where manual prefetching makes sense?
I could test/profile my own system, but I only have that one system, so any design choices I make may only positively affect that particular system. Any knowledge on these matters is appreciated: links, literature, platform-specific knowledge, etc.
Let me know if the question is too vague, I primarily thought it would be nice to have some wiki-ish experience / info on this area.
edit:
I created a program that tests the three cases I mentioned (distant, adjacent and contiguous, listed in supposedly increasing order of performance) on small and big data sets. Maybe people will run it and report anomalies.
#include <algorithm>
#include <chrono>
#include <iostream>
#include <numeric>
// Element count of each "big" buffer.
const int b = 196000;
// Element count of each "small" buffer: the number of floats that fit in 64 bytes
// (presumably one cache line - confirm for the target CPU).
const int s = 64 / sizeof(float);
// Multiplier applied to the sweep count of every test in main.
const int extra_it = 16;
// Small and big buffers with static storage duration (the "static memory" buffers in main).
float sbuf1[s];
float bbuf1[b];
// Interactive micro-benchmark. Each round asks for a data-set size (0 = small, 1 = big)
// and an access pattern (0 = three distant buffers, 1 = one interleaved buffer,
// 2 = three adjacent sub-buffers), times the matching write loops and prints the
// elapsed milliseconds. Any other pattern number ends the session.
//
// Returns 0 after a normal exit and -1 if reading from stdin fails.
int main()
{
    // Every buffer is value-initialised: a round writes only one group of buffers,
    // but the checksum at the bottom of the loop reads all of them.
    float sbuf2[s] = {};
    float bbuf2[b] = {};
    float * sbuf3 = new float[s]();
    float * bbuf3 = new float[b]();
    float * sbuf4 = new float[s * 3]();
    float * bbuf4 = new float[b * 3]();
    float use = 0;
    int status = 0;
    while (1)
    {
        using namespace std;
        int c;
        // true selects the big data set, matching the "(0/1)" order in the prompt.
        bool big;
        cout << "small or big test (0/1)? ";
        if (!(cin >> big))
        {
            status = -1;
            break;
        }
        cout << endl << "test distant buffers (0), contiguous access (1) or adjecant access (2)? ";
        if (!(cin >> c))
        {
            status = -1;
            break;
        }
        auto t = std::chrono::high_resolution_clock::now();
        if (c == 0)
        {
            // "worst case scenario", 3 distant buffers constantly touched
            if (!big)
            {
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s; ++i)
                    {
                        sbuf1[i] = k; // static memory
                        sbuf2[i] = k; // stack memory
                        sbuf3[i] = k; // heap memory
                    }
            }
            else
            {
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b; ++i)
                    {
                        bbuf1[i] = k; // static memory
                        bbuf2[i] = k; // stack memory
                        bbuf3[i] = k; // heap memory
                    }
            }
        }
        else if (c == 1)
        {
            // "best case scenario", only contiguous memory touched, interleaved
            if (!big)
            {
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s * 3; i += 3)
                    {
                        sbuf4[i] = k;
                        sbuf4[i + 1] = k;
                        sbuf4[i + 2] = k;
                    }
            }
            else
            {
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b * 3; i += 3)
                    {
                        bbuf4[i] = k;
                        bbuf4[i + 1] = k;
                        bbuf4[i + 2] = k;
                    }
            }
        }
        else if (c == 2)
        {
            // "compromise", adjacent memory buffers touched
            if (!big)
            {
                auto b1 = sbuf4;
                auto b2 = sbuf4 + s;
                auto b3 = sbuf4 + s * 2;
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s; ++i)
                    {
                        b1[i] = k;
                        b2[i] = k;
                        b3[i] = k;
                    }
            }
            else
            {
                auto b1 = bbuf4;
                auto b2 = bbuf4 + b;
                auto b3 = bbuf4 + b * 2;
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b; ++i)
                    {
                        b1[i] = k;
                        b2[i] = k;
                        b3[i] = k;
                    }
            }
        }
        else
            break;
        cout << chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count() << " ms" << endl;
        // basically just touching the buffers, avoiding clever optimizations
        use += std::accumulate(sbuf1, sbuf1 + s, 0);
        use += std::accumulate(sbuf2, sbuf2 + s, 0);
        use += std::accumulate(sbuf3, sbuf3 + s, 0);
        use += std::accumulate(sbuf4, sbuf4 + s * 3, 0);
        use -= std::accumulate(bbuf1, bbuf1 + b, 0);
        use -= std::accumulate(bbuf2, bbuf2 + b, 0);
        use -= std::accumulate(bbuf3, bbuf3 + b, 0);
        use -= std::accumulate(bbuf4, bbuf4 + b * 3, 0);
    }
    // The checksum is printed only after a normal exit, not after an input failure.
    if (status == 0)
    {
        std::cout << use;
        std::cin.get();
    }
    delete[] sbuf3;
    delete[] bbuf3;
    delete[] sbuf4;
    delete[] bbuf4;
    return status;
}
On my Intel i7-3740qm, surprisingly, distant buffers consistently outperform the more locality-friendly tests. It is close, however.