I played a bit with the experimental device lambdas that were introduced in CUDA 7.5 and promoted in this blog post by Mark Harris.
For the following example I removed a lot of stuff that is not needed to show my problem (my actual implementation looks a bit nicer...).
I tried to write a foreach function that operates on vectors either on the device (1 thread per element) or on the host (serially), depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example I use assigning a scalar to each component of a vector (I attach the complete code at the end):
// Assigns the scalar a to every component of vector, either on the GPU
// (one thread per element) or serially on the host, selected at compile
// time by the onDevice template parameter.
// NOTE(review): under CUDA 7.5 this does NOT compile -- a __host__ __device__
// lambda defined in a host function cannot be used as the template argument
// of a __global__ function instantiation (see the error quoted below).
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
// Captures vector and a by value; index selects the element to write.
auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };
if( onDevice )
{
foreachDevice( size, assign );
}
else
{
foreachHost( size, assign );
}
}
However, this code gives a compiler error because of the __host__ __device__ lambda:
The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function
I get the same error if I remove the __device__ from the lambda expression and I get no compile error if I remove __host__ (only __device__ lambda), but in this case the host part is not executed...
If I define the lambda as either __host__ or __device__ separately, the code compiles and works as expected.
// Working alternative: defines two separate lambdas, one __device__ and one
// __host__, in the respective branches. Compiles under CUDA 7.5 but
// duplicates the lambda body.
template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
if( onDevice )
{
// Device-only lambda: legal as a __global__ template argument.
auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
foreachDevice( size, assign );
}
else
{
// Host-only lambda for the serial path.
auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
foreachHost( size, assign );
}
}
However, this introduces code duplication and actually makes the whole idea of using lambdas useless for this example.
Is there a way to accomplish what I want to do or is this a bug in the experimental feature? Actually, defining a __host__ __device__ lambda is explicitly mentioned in the first example in the programming guide. Even for that simpler example (just return a constant value from the lambda) I couldn't find a way to use the lambda expression on both host and device.
Here is the full code, compile with options -std=c++11 --expt-extended-lambda:
#include <iostream>
using namespace std;
// Serially applies the callable o to every index in [0, size) on the host.
// o is taken by value, matching the device-side launch semantics.
template<typename Operation> void foreachHost( size_t size, Operation o )
{
    size_t index = 0;
    while( index < size )
    {
        o( index );
        ++index;
    }
}
// GPU kernel: each thread applies o to its own flat global index.
// NOTE(review): there is no bounds check, so the launch configuration must
// supply exactly one thread per element -- confirm at the call sites.
template<typename Operation> __global__ void kernel_foreach( Operation o )
{
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
o( index );
}
// Launches kernel_foreach with one thread per element.
template<typename Operation> void foreachDevice( size_t size, Operation o )
{
size_t blocksize = 32;
// NOTE(review): integer division -- this silently assumes size is a
// multiple of 32; for size < 32 the grid size is 0 and nothing launches.
size_t gridsize = size/32;
kernel_foreach<<<gridsize,blocksize>>>( o );
}
// Debug kernel: prints vector[0] from the device (launch with <<<1,1>>>).
__global__ void printFirstElementOnDevice( double* vector )
{
printf( "dVector[0] = %f\n", vector[0] );
}
// Host-only variant: writes the scalar a into each of the first `size`
// components of vector via the serial foreach helper.
void assignScalarHost( size_t size, double* vector, double a )
{
// Pass the assignment as an unnamed host lambda directly to the helper.
foreachHost( size, [=] ( size_t index ) { vector[index] = a; } );
}
// Device-only variant: one GPU thread writes each component of vector.
void assignScalarDevice( size_t size, double* vector, double a )
{
// __device__-only lambda: accepted by CUDA 7.5 as a kernel template argument.
auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
foreachDevice( size, assign );
}
// compile error:
// Single-lambda version: one __host__ __device__ lambda shared by both
// execution paths. This is the version that CUDA 7.5 rejects (see the
// compiler error quoted above); CUDA 8 accepts it.
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };
if( onDevice )
{
foreachDevice( size, assign );
}
else
{
foreachHost( size, assign );
}
}
// works:
// Duplicated-lambda version: compiles and runs under CUDA 7.5, at the cost
// of writing the lambda body twice.
template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
if( onDevice )
{
auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
foreachDevice( size, assign );
}
else
{
auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
foreachHost( size, assign );
}
}
// Demo driver: exercises the host, device and per-branch lambda variants.
int main()
{
size_t SIZE = 32;   // must be a multiple of foreachDevice's 32-thread block size
double* hVector = new double[SIZE];
double* dVector;
cudaMalloc( &dVector, SIZE*sizeof(double) );   // NOTE(review): return code unchecked
// clear memory
for( size_t i = 0; i < SIZE; ++i )
{
hVector[i] = 0;
}
cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );
assignScalarHost( SIZE, hVector, 1.0 );
cout << "hVector[0] = " << hVector[0] << endl;
assignScalarDevice( SIZE, dVector, 2.0 );
printFirstElementOnDevice<<<1,1>>>( dVector );
cudaDeviceSynchronize();
assignScalar2<false>( SIZE, hVector, 3.0 );
cout << "hVector[0] = " << hVector[0] << endl;
assignScalar2<true>( SIZE, dVector, 4.0 );
printFirstElementOnDevice<<<1,1>>>( dVector );
cudaDeviceSynchronize();
// The single-lambda variant is left commented out: it fails to compile
// under CUDA 7.5 (see the error discussed above).
// assignScalar<false>( SIZE, hVector, 5.0 );
// cout << "hVector[0] = " << hVector[0] << endl;
//
// assignScalar<true>( SIZE, dVector, 6.0 );
// printFirstElementOnDevice<<<1,1>>>( dVector );
// cudaDeviceSynchronize();
// Single error check at the end catches any sticky CUDA error from above.
cudaError_t error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout << "ERROR: " << cudaGetErrorString(error);
}
// NOTE(review): hVector is never delete[]d and dVector never cudaFree'd;
// kept as posted since the process exits here anyway.
}
I used the production release of CUDA 7.5.
Update
I tried this third version for the assignScalar function:
// Third attempt: select the lambda's execution space with __CUDA_ARCH__.
// NOTE(review): __CUDA_ARCH__ is a per-compilation-pass macro, not a runtime
// switch -- as the author observes below, there is a compile path where it is
// defined, so this does not reliably produce a __host__ lambda here and the
// device path ends up not executing.
template<bool onDevice> void assignScalar3( size_t size, double* vector, double a )
{
#ifdef __CUDA_ARCH__
#define LAMBDA_HOST_DEVICE __device__
#else
#define LAMBDA_HOST_DEVICE __host__
#endif
// NOTE(review): LAMBDA_HOST_DEVICE is never #undef'ed, so it leaks into the
// rest of the translation unit.
auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };
if( onDevice )
{
foreachDevice( size, assign );
}
else
{
foreachHost( size, assign );
}
}
It compiles and runs without error, but the device version (assignScalar3<true>) is not executed. Actually, I thought that __CUDA_ARCH__ will always be undefined (since the function is not __device__) but I checked explicitly that there is a compile path where it is defined.
The task that I tried to accomplish with the examples provided in the question is not possible with CUDA 7.5, though it was not explicitly excluded from the allowed cases for the experimental lambda support.
NVIDIA announced that CUDA Toolkit 8.0 will support __host__ __device__ lambdas as an experimental feature, according to the blog post CUDA 8 Features Revealed.
I verified that my example works with the CUDA 8 Release Candidate (Cuda compilation tools, release 8.0, V8.0.26).
Here is the code that I finally used, compiled with nvcc -std=c++11 --expt-extended-lambda:
#include <iostream>
using namespace std;
// GPU kernel: each thread applies o to its own flat global index.
// NOTE(review): no bounds check -- the launch must provide exactly one
// thread per element.
template<typename Operation> __global__ void kernel_foreach( Operation o )
{
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
o( i );
}
// Unified foreach: runs o over [0, size) on the GPU (one thread per element)
// or serially on the host, chosen at compile time by onDevice.
template<bool onDevice, typename Operation> void foreach( size_t size, Operation o )
{
if( onDevice )
{
size_t blocksize = 32;
// NOTE(review): truncating division -- assumes size is a multiple of 32.
size_t gridsize = size/32;
kernel_foreach<<<gridsize,blocksize>>>( o );
}
else
{
for( size_t i = 0; i < size; ++i )
{
o( i );
}
}
}
// Debug kernel: prints vector[0] from the device (launch with <<<1,1>>>).
__global__ void printFirstElementOnDevice( double* vector )
{
printf( "dVector[0] = %f\n", vector[0] );
}
// Final version: a single __host__ __device__ lambda shared by both paths.
// Compiles with CUDA 8's --expt-extended-lambda (rejected by CUDA 7.5).
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
auto assign = [=] __host__ __device__ ( size_t i ) { vector[i] = a; };
foreach<onDevice>( size, assign );
}
// Demo driver for the unified foreach/assignScalar (CUDA 8).
int main()
{
size_t SIZE = 32;   // multiple of the 32-thread block size assumed by foreach
double* hVector = new double[SIZE];
double* dVector;
cudaMalloc( &dVector, SIZE*sizeof(double) );   // NOTE(review): return code unchecked
// clear memory
for( size_t i = 0; i < SIZE; ++i )
{
hVector[i] = 0;
}
cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );
assignScalar<false>( SIZE, hVector, 3.0 );
cout << "hVector[0] = " << hVector[0] << endl;
assignScalar<true>( SIZE, dVector, 4.0 );
printFirstElementOnDevice<<<1,1>>>( dVector );
cudaDeviceSynchronize();
// Single error check at the end catches any sticky CUDA error from above.
cudaError_t error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout << "ERROR: " << cudaGetErrorString(error);
}
// NOTE(review): hVector/dVector intentionally not freed; process exits here.
}
Related
I need to transfer an array of tiles from C++ to Angelscript. I have tried adding a function that returns a std::vector, but it produces this error:
Failed in call to function 'RegisterGlobalFunction' with 'array<DumbTile> GetTilesAt(int x, int y)' (Code: asINVALID_DECLARATION, -10)
my code:
// Returns copies of every tile in the (file-global) `tiles` container whose
// coordinates equal (x, y).
std::vector<DumbTile> GetTilesAt(int x, int y) {
std::vector<DumbTile> output;
for (DumbTile t : tiles) { // NOTE(review): copies each tile per iteration; const DumbTile& would avoid that if DumbTile is heavy
if (t.x == x && t.y == y) {
output.push_back(t);
}
}
return output;
}
engine->RegisterGlobalFunction("array<DumbTile> GetTilesAt(int x, int y)", asFUNCTIONPR(GetTilesAt, (int, int), std::vector<DumbTile>), asCALL_CDECL);
Is DumbTile registered before GetTilesAt() registration?
Is array<T> registered before GetTilesAt() registration?
Both needs to be registered before you can register your function.
Are you using the stock array implementation (sdk/add_on/scriptarray/) or your own, std::vector<>-based implementation? When using the stock addon, the application must convert the std::vector to CScriptArray first, as angelscript can't really do that on its own due to the way arrays work.
There is obvious alternative - switch from using std::vector to CScriptArray everywhere where you want scripts to access data, but this might be annoying.
Example std::vector<Object*> <-> CScriptArray* conversion
// Appends every element of vec to the end of the CScriptArray arr.
// Intended for reference types: Type is assumed to be a pointer-like handle,
// since each copied element gets AddRef() called on it.
template<typename Type>
void AppendVectorToArrayRef( vector<Type>& vec, CScriptArray* arr )
{
// No-op when the vector is empty or the array pointer is null.
if( !vec.empty() && arr )
{
uint i = (uint)arr->GetSize();
// Grow the script array by vec.size() slots, then fill the new tail.
arr->Resize( (asUINT)(i + (uint)vec.size() ) );
for( uint k = 0, l = (uint)vec.size(); k < l; k++, i++ )
{
Type* p = (Type*)arr->At( i );
*p = vec[k];
// The script array holds its own reference to the object.
(*p)->AddRef();
}
}
}
// Replaces the contents of vec with copies of the elements of arr.
// NOTE(review): unlike the append helper above, no AddRef() is done here --
// confirm whether Type is a value type in the callers.
template<typename Type>
void AssignScriptArrayInVector( vector<Type>& vec, CScriptArray* arr )
{
if( arr )
{
uint count = (uint)arr->GetSize();
// An empty script array leaves vec untouched (it is not cleared).
if( count )
{
vec.resize( count );
for( uint i = 0; i < count; i++ )
{
Type* p = (Type*)arr->At( i );
vec[i] = *p;
}
}
}
}
Code is bit old but i think it should still work, even if it begs for some refresh.
How do I delete a dynamically created matrix? This is likely a duplicate, for which I apologize, but I really can't find a clear answer on here so far. I initialize a matrix as follows:
// Allocates an Nrows x Ncols matrix as ONE contiguous float block plus an
// array of row pointers into it. Returns NULL for a zero dimension.
// NOTE(review): this snippet is truncated as posted -- the `return result;`
// and the function's closing brace are missing.
float ** createMatrix(unsigned int Nrows, unsigned int Ncols) {
float ** result = NULL;
if (Nrows != 0 && Ncols != 0) {
// create the matrix on the heap
result = new float * [Nrows];
// single contiguous, value-initialized block holding all elements
result[0] = new float [Nrows*Ncols]();
// link the rows
for (int i = 1; i < Nrows; i++) {
result[i] = result[i-1] + Ncols;
}
}
Now, I wish to create a function to delete it. Do I need two separate statements to delete M[0] and M, or just one for M? i.e. do I need:
// Correct teardown for createMatrix(): two new[] calls were made (the
// contiguous element block anchored at M[0], and the row-pointer array M),
// so exactly two delete[] calls are needed, in this order.
void deleteMatrix(float **M){
delete[] M[0];
delete[] M;
}
OR SIMPLY:
// Incorrect variant: frees only the row-pointer array, leaking the
// contiguous element block that M[0] points to.
void deleteMatrix(float **M){
delete[] M;
}
Any help/explanation would be massively appreciated. Both versions "work" and don't show any errors to me in the console when deleteMatrix(M) is run, so I'm confused. Thanks!
As many others have stated, every time you use new[] you have to have a matching delete[]. However, as it currently stands with your functions and how they are declared/defined, I believe you are running into an X/Y problem.
Here is why!
You propose to declare your delete function as such: Also you stated that both versions work and don't show any errors... well in your posted question both versions are exactly the same...
// Sketch of the proposed interface: with only a float** there is no way to
// know how many rows exist (see the discussion that follows).
void deleteMatrix( float** M ) {
// delete[] rows
// delete[] cols
}
The problem that I'm seeing here is that as you pass in a pointer to a pointer of floats, the function does not know the dimensions of the matrix. It could be a 2x2, 2x3, 3x2, 3x3, MxN etc. Without knowing the dimensions of the matrix how are you able to write the for loop to traverse the inner or nested arrays? You would have to pass those dimensions into the delete function:
// Frees a matrix whose rows were each allocated with their own new[]:
// delete every row, then the array of row pointers.
// colSize is unused but kept for a symmetric create/delete interface.
// (Fixed: the original loop body deleted M[i], but the loop variable is
// `row` -- `i` was never declared.)
void deleteMatrix( float** M, int rowSize, int colSize ) {
for ( int row = 0; row < rowSize; row++ ) {
delete [] M[row];
}
delete [] M;
}
Here is an example similar to what you are trying to implement: thispointer.com
Outside of your actual problem, this tends to be more of a C approach or an outdated C++ approach. It is ill-advised to use raw pointers and new & delete freely; using smart pointers is better. However, for a construct such as a matrix class, there are a plethora of freely usable libraries out there that have already defined such classes and interfaces to use, and some of the most popular ones are:
GLM
Eigen
Boost
Armadillo
MTL4
Edit - User PaulMcKenzie cleared up a valid point that I have long forgotten since I've been mostly using vector<object> or vector<shared_ptr<object>>. It's been over 15 years since I've first learned about pointers - multi-dimensional arrays and I had forgotten about the concept of about contiguous arrays. The answer to a question that he posted to me in the comment section gives a clear explanation of what I've forgotten; found here. If the arrays are not contiguous in memory then yes it would be an X/Y problem without knowing their dimensions, but since they are; the sizes are not really needed to be known. And what you've already proposed should then work:
// Works for the contiguous layout of createMatrix(): one delete[] for the
// element block (anchored at M[0]) and one for the row-pointer array.
void deleteMatrix(float **M){
delete[] M[0];
delete[] M;
}
Edit - I was going back through some of my classes in my libraries and here is a template matrix class that I've written with any dimensional size matrix MxNx...ith It is very versatile in its ease and use. You can expand upon it if you want: I have not done any type checking or assertions but that can easily be added.
Using it is as simple as:
#include <iostream>
#include "Matrix.h"
// Demo of the variadic Matrix class: a 3x3 matrix of ints and a 2x2x2
// matrix of floats, printed via elements(), operator[] and operator().
// (Fixed vs. the posted snippet: the int matrix was declared <int,2,2> yet
// initialized with 9 values and described as 3x3; and the two float loops
// had unbalanced braces, so the snippet did not compile.)
int main() {
    Matrix<int, 3, 3> imat3x3( 1, 2, 3, 4, 5, 6, 7, 8, 9 );
    // calling elements() and using vector's [] operator
    for ( int i = 0; i < 9; i++ )
        std::cout << imat3x3.elements()[i] << ' ';
    std::cout << '\n';
    // Using class's [] operator
    for ( int i = 0; i < 9; i++ )
        std::cout << imat3x3[i] << ' ';
    std::cout << '\n';
    // Using class's () operator
    for ( int i = 0; i < 9; i++ )
        std::cout << imat3x3(i) << ' ';
    std::cout << '\n';
    // Okay that was a 3x3 matrix of ints, lets do a 2x2x2 matrix of floats
    Matrix<float,2,2,2> fmat2x2x2( 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f );
    // now the operators
    for ( int i = 0; i < 8; i++ )
        std::cout << fmat2x2x2[i] << "f ";
    std::cout << '\n';
    for ( int i = 0; i < 8; i++ )
        std::cout << fmat2x2x2(i) << "f ";
    std::cout << '\n';
    std::cout << "\nPress any key and enter to quit.\n";
    std::cin.get();
    return 0;
}
Matrix.h
#ifndef MATRIX_H
#define MATRIX_H
#include <vector>
#include <algorithm>
#include <numeric>
// Variadic N-dimensional matrix: Matrix<T, D1, D2, ...> stores D1*D2*...
// elements in a flat std::vector, plus the per-dimension sizes ("strides").
template<typename Type, size_t... Dims>
class Matrix {
public:
// Number of dimensions, i.e. how many size arguments were supplied.
static const size_t _numDims = sizeof...(Dims);
private:
size_t _numElements;            // total element count (product of Dims)
std::vector<Type> _elements;    // flat storage, row-major over Dims
std::vector<size_t> _strides;   // the dimension sizes {Dims...}
public:
Matrix() noexcept;
// Element-wise constructor: forwards args into the flat element vector.
template<typename... Args>
Matrix( Args&&... args ) noexcept;
// Flat element access. NOTE(review): the non-const overloads return
// const Type&, so elements cannot be modified through [] or ().
const Type& operator[]( size_t idx );
const Type operator[]( size_t idx ) const;
const Type& operator() ( size_t idx );
const Type operator() ( size_t idx ) const;
size_t numElements() const {
return _elements.size();
}
const std::vector<size_t>& strides() const {
return _strides;
}
const std::vector<Type>& elements() const {
return _elements;
}
};
#include "Matrix.inl"
#endif // !MATRIX_H
Matrix.inl
// Default constructor: records the dimension sizes, computes the total
// element count as their product, and value-initializes the storage.
// (Fixed: the posted code wrote `end( strides )` -- `strides` is the member
// *function* name, which does not compile; the data member is `_strides`.)
template<typename Type, size_t... Dims>
Matrix<Type, Dims...>::Matrix() noexcept :
_strides( { Dims... } ) {
using std::begin;
using std::end;
// Product of all dimension sizes. std::multiplies<> requires <functional>
// (commonly pulled in transitively by the headers included in Matrix.h).
auto mult = std::accumulate( begin( _strides ), end( _strides ), 1, std::multiplies<>() );
_numElements = mult;
_elements.resize( _numElements );
}
// Element-wise constructor: the element count is taken from the number of
// arguments, not from the Dims product (no consistency check is performed).
template<typename Type, size_t... Dims>
template<typename... Args>
Matrix<Type, Dims...>::Matrix( Args&&... args ) noexcept :
_elements( { args... } ),
_strides( { Dims... } ) {
_numElements = _elements.size();
}
// Flat element access; no bounds checking.
template<typename Type, size_t... Dims>
const Type Matrix<Type, Dims...>::operator[]( size_t idx ) const {
return _elements[idx];
}
template<typename Type, size_t... Dims>
const Type& Matrix<Type, Dims...>::operator[]( size_t idx ) {
return _elements[idx];
}
template<typename Type, size_t... Dims>
const Type Matrix<Type, Dims...>::operator()( size_t idx ) const {
return _elements[idx];
}
template<typename Type, size_t... Dims>
const Type& Matrix<Type, Dims...>::operator()( size_t idx ) {
return _elements[idx];
}
Matrix.cpp - This cpp file is not necessary I only have it to easily compile it while debugging the class for basic compiler errors
#include "Matrix.h"
I did not demonstrate the use of the numElements() or strides() functions but they should be fairly self explanatory. The strides function is a very nice feature since if a user calls the template as such <type, 1,3,5> giving you a 1x3x5 Matrix; these are stored in the _strides member vector. This way you always have the indexes needed for the size of each dimension.
Now if you want your matrix on the heap; instead of trying to do a double pointer or [][] and putting each element on the heap, with this class you have two options.
You can either put the instantiated object on the heap directly or you can have this class hold heap objects as well.
std::shared_ptr<Matrix<int,2,2>> ptrMat2x2; // A heap pointer of the matrix
Matrix<shared_ptr<int>,3,3> mat3x3ptrs; // A matrix of heap objects.
The code might seem a bit strange at first glance but this shows that both cases can be done:
#include <iostream>
#include "Matrix.h"
// Demonstrates both heap arrangements: a Matrix owned by a shared_ptr, and
// a Matrix whose elements are shared_ptrs.
// NOTE(review): uses std::shared_ptr/std::make_shared without an explicit
// #include <memory> -- presumably pulled in transitively; confirm.
int main() {
// A Matrix<> on the heap via shared_ptr`
std::shared_ptr<Matrix<int, 2, 2>> ptrMat2x2 =
std::make_shared<Matrix<int, 2, 2>>( Matrix<int,2,2>( 1, 2, 3, 4 ) );
// accessing the elements from the shared pointer and printing
for( int i = 0; i < 4; i++ )
std::cout << (*ptrMat2x2.get())(i) << ' ';
std::cout << '\n';
// creating some basic shared_ptrs
auto a = std::make_shared<int>( 1 );
auto b = std::make_shared<int>( 2 );
auto c = std::make_shared<int>( 3 );
auto d = std::make_shared<int>( 4 );
// Matrix that holds shared_ptrs
Matrix<std::shared_ptr<int>, 2, 2> mat2x2ptrs( a, b, c, d );
// print the elements from the matrix (don't forget to dereference).
for( int i = 0; i < 4; i++ )
std::cout << *mat2x2ptrs[i].get() << ' ';
std::cout << '\n';
std::cout << "\nPress any key and enter to quit.\n";
std::cin.get();
return 0;
}
These "2D array" questions come up constantly. I think I'll answer one.
Don't use arrays[]. Don't use new[] and delete[]. Just don't. Use std::vector<std::vector<int>> and let the miracle of C++ do all the newing and deleting for you. Or for something serious, use a well-designed open source library, like boost::matrix. C++ is way cool.
The following is a starter-kit. It can be improved, "privatized", and abstracted in lots of ways.
#include <vector>
using std::size_t;
// Minimal matrix: a rows x cols grid backed by nested std::vector.
// All elements are value-initialized (0 for arithmetic T); the vectors own
// the storage, so no manual new/delete is needed.
template<class T>
struct Matrix {
using matrix_type = std::vector<std::vector<T>>;
matrix_type matrix;   // public storage; index as matrix[i][j]
// Build rows inner vectors of cols elements each.
Matrix(size_t rows, size_t cols)
: matrix(rows, std::vector<T>(cols))
{}
};
// Usage demo: construct a 5x2 Matrix<int> and alias its storage.
int main() {
size_t Nrows = 5u;
size_t Ncols = 2u;
Matrix<int> mx(Nrows, Ncols);
auto& matrix = mx.matrix; // Now use matrix[i][j] or whatever.
// Here you can do anything with matrix that you could do with
// an array of arrays ... and more. And it cleans up after itself.
}
You have allocated two separate arrays, you need to delete two separate arrays, in reverse order.
This question already has an answer here:
Parameter "size" of member operator new[] increases if class has destructor/delete[]
(1 answer)
Closed 5 years ago.
I am seeing a problem with operator new[]:
#include <stdlib.h>
#include <stdio.h>
// 16-byte vector type with 16-byte-aligned allocation.
// V4 deliberately leaves operator delete[] commented out -- with no sized
// array delete, new V4[n] requests exactly n*sizeof(V4) bytes (compare the
// printed sizes against W4 below).
class V4 { public:
float v[ 4 ];
V4() {}
void *operator new( size_t sz ) { return aligned_alloc( 16, sz ); }
void *operator new[]( size_t sz ) { printf( "sz: %zu\n", sz ); return aligned_alloc( 16, sz ); }
void operator delete( void *p, size_t sz ) { free( p ); }
//void operator delete[]( void *p, size_t sz ) { free( p ); }
};
// Identical to V4 except operator delete[] is defined. Declaring a sized
// array delete makes the runtime prepend an array cookie, so new W4[n]
// requests n*sizeof(W4) + extra bytes -- which breaks the assumption that
// aligned_alloc's result is where the first element lands.
class W4 { public:
float w[ 4 ];
W4() {}
void *operator new( size_t sz ) { return aligned_alloc( 16, sz ); }
void *operator new[]( size_t sz ) { printf( "sz: %zu\n", sz ); return aligned_alloc( 16, sz ); }
void operator delete( void *p, size_t sz ) { free( p ); }
void operator delete[]( void *p, size_t sz ) { free( p ); }
};
// Prints the request size seen by each operator new[] to show the 8-byte
// difference between V4 (no delete[]) and W4 (with delete[]).
// NOTE(review): the arrays are intentionally never deleted -- the point is
// only to observe the allocation sizes.
int main( int argc, char **argv ) {
printf( "sizeof( V4 ): %zu\n", sizeof( V4 ));
V4 *p = new V4[ 1 ];
printf( "p: %p\n", p );
printf( "sizeof( W4 ): %zu\n", sizeof( W4 ));
W4 *q = new W4[ 1 ];
printf( "q: %p\n", q );
exit(0);
}
Produces:
$ g++ -Wall main.cpp && ./a.out
sizeof( V4 ): 16
sz: 16
p: 0x55be98a10030
sizeof( W4 ): 16
sz: 24
q: 0x55be98a10058
Why does the alloc size increase to 24 when I include the operator delete[]? This is screwing up my aligned malloc.
$ g++ --version
g++ (Debian 7.2.0-18) 7.2.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
From looking at other questions, it seems as though the extra 8 bytes may be being used to store the array size. Even if this is expected behaviour, why is it triggered by operator delete[], and what is the correct procedure for allocating memory-aligned arrays?
EDIT Thanks, the linked questions appear to be relevant. I still think the question as asked needs an answer, however. It ought to be possible to change the example code to produce memory-aligned arrays without recourse to std::vector, in my opinion. My current thinking is that it will be necessary to allocate a yet-larger block of bytes which are 16-byte aligned, and return the pointer such that the initial 8 bytes bring the rest of the block to alignment on the 16-byte boundary. The delete[] operator would then have to perform the reverse operation before calling free(). This is pretty disgusting, but I think it is required to satisfy both the calling code (C runtime?) (which requires its 8 bytes for size storage) - and the use case which is to get 16-byte aligned Vector4s.
EDIT The linked answer is certainly relevant, but it does not address the problem of ensuring correct memory alignment.
EDIT It looks like this code will do what I want, but I don't like the magic number 8 in delete[]:
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
// 64-byte vector type whose operator new[] over-allocates and returns an
// offset pointer, so that the first array element (after the runtime's
// 8-byte array cookie) lands on a sizeof(W16) boundary.
class W16 { public:
float w[ 16 ];
W16() {}
void *operator new( size_t sz ) { return aligned_alloc( 16, sz ); }
void *operator new[]( size_t sz ) {
// r is the cookie overhead the runtime added to n*sizeof(W16).
size_t r = sz % sizeof( W16 );
// Shift the returned pointer so that p2 + cookie is aligned.
size_t ofs = sizeof( W16 ) - r;
size_t _sz = sz + ofs;
void *p1 = aligned_alloc( sizeof( W16 ), _sz );
void *p2 = ((uint8_t *) p1) + ofs;
printf( "sizeof( W16 ): %zx, sz: %zx, r: %zx, ofs: %zx, _sz: %zx\np1: %p\np2: %p\n\n", sizeof( W16 ), sz, r, ofs, _sz, p1, p2 );
return p2;
}
void operator delete( void *p, size_t sz ) { free( p ); }
void operator delete[]( void *p, size_t sz ) {
// Undo the offset applied in new[]. The literal 8 is the assumed cookie
// size -- the "magic number" the author wants rid of (see the final
// array_alloc/array_free version below).
void *p1 = ((int8_t*) p) + 8 - sizeof( W16 );
printf( "\np2: %p\np1: %p", p, p1 );
free( p1 );
}
};
// Allocates and frees a W16 array to trace the pointer adjustments done in
// the custom new[]/delete[] (see the printed p1/p2 values).
int main( int argc, char **argv ) {
printf( "sizeof( W16 ): %zx\n", sizeof( W16 ));
W16 *q = new W16[ 16 ];
printf( "&q[0]: %p\n", &q[0] );
delete[] q;
}
Output:
$ g++ -Wall main.cpp && ./a.out
sizeof( W16 ): 40
sizeof( W16 ): 40, sz: 408, r: 8, ofs: 38, _sz: 440
p1: 0x559876c68080
p2: 0x559876c680b8
&q[0]: 0x559876c680c0
p2: 0x559876c680b8
p1: 0x559876c68080
EDIT Title changed from feedback in comments. I don't think this is a 'duplicate' of the linked answer anymore, though I don't know if I can get it removed.
It looks as though this will do for me:
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
// Over-aligned backing store for class-specific operator new[].
// sz_obj is the object size (and desired alignment, a power of two times
// the element size in the intended use); sz_req is the byte count the
// runtime passed to operator new[] (n*sz_obj plus an optional array cookie).
// Returns a pointer p2 such that p2 + cookie is sz_obj-aligned; array_free()
// recovers the real allocation from p2 alone.
inline void *array_alloc( size_t sz_obj, size_t sz_req ) {
    size_t r = sz_req % sz_obj;            // cookie overhead, if any
    // Offset so the runtime's cookie pushes the first element onto a
    // sz_obj boundary. When sz_req is already a multiple of sz_obj there is
    // no cookie slack to absorb, so no offset must be applied.
    // (Fixed: the posted code used sz_obj - r unconditionally; for r == 0
    // that made array_free() recover the wrong base pointer and free an
    // address that was never returned by aligned_alloc.)
    size_t ofs = r ? sz_obj - r : 0;
    size_t sz = sz_req + ofs;              // still a multiple of sz_obj, as aligned_alloc requires
    void *p1 = aligned_alloc( sz_obj, sz );
    void *p2 = (void*) (((uintptr_t ) p1) + ofs);
    return p2;
}
// Frees a pointer produced by array_alloc(). Since p1 is sz_obj-aligned and
// 0 <= ofs < sz_obj, the offset is exactly p2 % sz_obj.
inline void array_free( size_t sz_obj, void *p2 ) {
    void *p1 = (void*) (((uintptr_t)p2) - (((uintptr_t)p2) % sz_obj));
    free( p1 );
}
// Final W16: delegates array allocation to the generic array_alloc/
// array_free helpers, removing the magic cookie constant from delete[].
class W16 { public:
float w[ 16 ];
W16() {}
void *operator new( size_t sz ) { return aligned_alloc( 16, sz ); }
void *operator new[]( size_t sz ) { return array_alloc( sizeof( W16 ), sz ); }
void operator delete( void *p, size_t sz ) { free( p ); }
void operator delete[]( void *p, size_t sz ) { array_free( sizeof( W16 ), p ); }
};
// Smoke test: allocate, print the (aligned) first-element address, free.
int main( int argc, char **argv ) {
//printf( "sizeof( W16 ): %zx\n", sizeof( W16 ));
W16 *q = new W16[ 16 ];
printf( "&q[0]: %p\n", &q[0] );
delete[] q;
}
EDIT Thanks to n.m., this code works without a magic number.
As I was implementing the Sieve of Eratosthenes I ran into an issue with std::vector<bool> : there is no access to the raw data.
So I decided to use a custom minimalistic implementation where I would have access to the data pointer.
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
// Minimal bitset with direct access to the raw block array (`data`), unlike
// std::vector<bool>. B is the unsigned block type (e.g. size_t).
// NOTE(review): no copy constructor/assignment is defined, so copying a
// bits_t double-frees `data` -- do not copy instances.
template <typename B>
class bits_t{
public:
typedef B block_t;
// Bits per block. NOTE(review): assumes 8-bit chars; CHAR_BIT would be
// more portable.
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;   // raw storage, `blocks` elements long
size_t size;     // number of bits
size_t blocks;   // number of allocated blocks
// Proxy returned by operator[]: reads/writes one bit inside a block.
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
// Allocates storage for n bits, all set to v. Old contents are discarded.
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
// Fix: the previously allocated buffer was leaked on every resize.
delete[] data;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), block_t(1) << index_in_block(i));
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
The code is nearly the same as the one in /usr/include/c++/4.7/bits/stl_bvector.h but is slower.
I tried an optimization,
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
// Lookup-table variant of bits_t: operator[] fetches a precomputed
// single-bit mask instead of shifting.
// NOTE(review): this uses the same include guard (LIB_BITS_T_H) as the
// first version, so the two headers cannot be included together.
template <typename B>
class bits_t{
// mask[k] has exactly bit k set. NOTE(review): the initializers are 64-bit
// binary literals; for block types B narrower than 64 bits they would
// narrow -- confirm B is a 64-bit type in the benchmark.
const B mask[64] = {
0b0000000000000000000000000000000000000000000000000000000000000001,
0b0000000000000000000000000000000000000000000000000000000000000010,
0b0000000000000000000000000000000000000000000000000000000000000100,
0b0000000000000000000000000000000000000000000000000000000000001000,
0b0000000000000000000000000000000000000000000000000000000000010000,
0b0000000000000000000000000000000000000000000000000000000000100000,
0b0000000000000000000000000000000000000000000000000000000001000000,
0b0000000000000000000000000000000000000000000000000000000010000000,
0b0000000000000000000000000000000000000000000000000000000100000000,
0b0000000000000000000000000000000000000000000000000000001000000000,
0b0000000000000000000000000000000000000000000000000000010000000000,
0b0000000000000000000000000000000000000000000000000000100000000000,
0b0000000000000000000000000000000000000000000000000001000000000000,
0b0000000000000000000000000000000000000000000000000010000000000000,
0b0000000000000000000000000000000000000000000000000100000000000000,
0b0000000000000000000000000000000000000000000000001000000000000000,
0b0000000000000000000000000000000000000000000000010000000000000000,
0b0000000000000000000000000000000000000000000000100000000000000000,
0b0000000000000000000000000000000000000000000001000000000000000000,
0b0000000000000000000000000000000000000000000010000000000000000000,
0b0000000000000000000000000000000000000000000100000000000000000000,
0b0000000000000000000000000000000000000000001000000000000000000000,
0b0000000000000000000000000000000000000000010000000000000000000000,
0b0000000000000000000000000000000000000000100000000000000000000000,
0b0000000000000000000000000000000000000001000000000000000000000000,
0b0000000000000000000000000000000000000010000000000000000000000000,
0b0000000000000000000000000000000000000100000000000000000000000000,
0b0000000000000000000000000000000000001000000000000000000000000000,
0b0000000000000000000000000000000000010000000000000000000000000000,
0b0000000000000000000000000000000000100000000000000000000000000000,
0b0000000000000000000000000000000001000000000000000000000000000000,
0b0000000000000000000000000000000010000000000000000000000000000000,
0b0000000000000000000000000000000100000000000000000000000000000000,
0b0000000000000000000000000000001000000000000000000000000000000000,
0b0000000000000000000000000000010000000000000000000000000000000000,
0b0000000000000000000000000000100000000000000000000000000000000000,
0b0000000000000000000000000001000000000000000000000000000000000000,
0b0000000000000000000000000010000000000000000000000000000000000000,
0b0000000000000000000000000100000000000000000000000000000000000000,
0b0000000000000000000000001000000000000000000000000000000000000000,
0b0000000000000000000000010000000000000000000000000000000000000000,
0b0000000000000000000000100000000000000000000000000000000000000000,
0b0000000000000000000001000000000000000000000000000000000000000000,
0b0000000000000000000010000000000000000000000000000000000000000000,
0b0000000000000000000100000000000000000000000000000000000000000000,
0b0000000000000000001000000000000000000000000000000000000000000000,
0b0000000000000000010000000000000000000000000000000000000000000000,
0b0000000000000000100000000000000000000000000000000000000000000000,
0b0000000000000001000000000000000000000000000000000000000000000000,
0b0000000000000010000000000000000000000000000000000000000000000000,
0b0000000000000100000000000000000000000000000000000000000000000000,
0b0000000000001000000000000000000000000000000000000000000000000000,
0b0000000000010000000000000000000000000000000000000000000000000000,
0b0000000000100000000000000000000000000000000000000000000000000000,
0b0000000001000000000000000000000000000000000000000000000000000000,
0b0000000010000000000000000000000000000000000000000000000000000000,
0b0000000100000000000000000000000000000000000000000000000000000000,
0b0000001000000000000000000000000000000000000000000000000000000000,
0b0000010000000000000000000000000000000000000000000000000000000000,
0b0000100000000000000000000000000000000000000000000000000000000000,
0b0001000000000000000000000000000000000000000000000000000000000000,
0b0010000000000000000000000000000000000000000000000000000000000000,
0b0100000000000000000000000000000000000000000000000000000000000000,
0b1000000000000000000000000000000000000000000000000000000000000000
};
public:
typedef B block_t;
// Bits per block (assumes 8-bit chars).
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;   // raw storage
size_t size;     // number of bits
size_t blocks;   // number of allocated blocks
// Proxy returned by operator[]: reads/writes one bit inside a block.
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
// Allocates storage for n bits, all set to v.
// NOTE(review): as in the first version, any previous buffer is not freed
// here (kept as posted -- see the answer's discussion of resize).
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
// The only behavioral difference from the first version: the bit mask
// comes from the lookup table instead of a shift.
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
(Compiling with g++4.7 -O3)
Eratosthenes sieve algorithm (33.333.333 bits)
std::vector<bool> 19.1s
bits_t<size_t> 19.9s
bits_t<size_t> (with lookup table) 19.7s
ctor + resize(33.333.333 bits) + dtor
std::vector<bool> 120ms
bits_t<size_t> 150ms
QUESTION : Where does the slowdown come from?
Outside of all the problems pointed out by some other users, your resize is allocating more memory each time the current block limit is reached, just to add ONE block. The std::vector will double the size of the buffer (so if you already had 16 blocks, now you have 32 blocks). In other words, it performs fewer calls to new than you do.
This being said, you do not do the necessary delete & copy and that could have a "positive" impact in your version... ("positive" impact speed wise, it is not positive that you do not delete the old data, nor copy it in your new buffer.)
Also, the std::vector will properly enlarge the buffer and thus copy data that is likely already in your CPU cache. With your version, that cache is lost since you just ignore the old buffer on each resize().
Also when a class handles a memory buffer it is customary to implement the copy and assignment operators, for some reasons... and you could look into using a shared_ptr<>() too. The delete is then hidden and the class is a template so it is very fast (it does not add any code that you would not already have in your own version.)
=== Update
There is one other thing. Your operator[] implementation:
// Quoted from the lookup-table class: builds a bit_ref proxy per access.
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}
(side note: the inline is not required since the fact that you write the code within the class already means you okayed the inline capability already.)
You only offer a non-const version which "is slow" because it creates a sub-class. You should try implementing a const version that returns bool and see whether that accounts for the ~3% difference you see.
bool operator[](const size_t i) const noexcept
{
return (block_at_index(i) & mask[index_in_block(i)]) != 0;
}
Also, using a mask[] array can also slow down things. (1LL << (index & 0x3F)) should be faster (2 CPU instructions with 0 memory access).
Apparently, the wrapping of i % block_size in a function was the culprit
inline size_t index_in_block ( const size_t i ) const noexcept {
return i % block_size;
}
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << index_in_block( i ) );
}
so replacing the above code with
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}
solves the issue. However, I still don't know why it is. My best guess is that I didn't get the signature of index_in_block right and that the optimizer is thus not able to inline this function in a similar way to the manual inlining way.
Here is the new code.
#ifndef LIB_BITS_2_T_H
#define LIB_BITS_2_T_H
#include <algorithm>
// Dynamic bit array backed by contiguous blocks of unsigned type B
// (e.g. uint32_t). Provides proxy-based writable access via operator[].
//
// NOTE(review): the allocating constructors and resize() are declared
// noexcept although operator new can throw; on allocation failure the
// program terminates instead of propagating std::bad_alloc. Kept as-is
// to preserve the original signatures.
template <typename B>
class bits_2_t {
public:
    typedef B block_t;
    // Number of bits stored in one block.
    static const int block_size = sizeof( block_t ) * __CHAR_BIT__;
private:
    block_t* _data;   // owned buffer of _blocks blocks
    size_t _size;     // logical number of bits
    size_t _blocks;   // number of allocated blocks
public:
    // Proxy referring to a single writable bit inside a block.
    class bit_ref {
    public:
        block_t* const block;
        const block_t mask;
        bit_ref ( block_t& block, const block_t mask) noexcept
        : block( &block ), mask( mask ) {}
        // Sets or clears the referenced bit; returns the assigned value.
        inline bool operator= ( const bool v ) const noexcept {
            if ( v ) *block |= mask;
            else *block &= ~mask;
            return v;
        }
        // Reads the referenced bit.
        inline operator bool() const noexcept {
            return (bool)( *block & mask );
        }
    };
    // Empty bit array.
    bits_2_t () noexcept : _data( nullptr ), _size( 0 ), _blocks( 0 ) {}
    // n bits, all cleared to 0.
    bits_2_t ( const size_t n ) noexcept : _data( nullptr ), _size( n ) {
        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];
        const block_t fill( 0 );
        std::fill( _data, _data + _blocks, fill );
    }
    // n bits, all initialized to v.
    bits_2_t ( const size_t n, const bool v ) noexcept : _data( nullptr ), _size( n ) {
        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];
        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( _data, _data + _blocks, fill );
    }
    // FIX (rule of three): the original class owned a raw new[] buffer but
    // had no user-defined copy operations, so the implicitly generated
    // shallow copy led to a double delete[] in the destructor.
    bits_2_t ( const bits_2_t& other )
    : _data( nullptr ), _size( other._size ), _blocks( other._blocks ) {
        _data = new block_t[_blocks];
        std::copy( other._data, other._data + _blocks, _data );
    }
    // Deep copy assignment; strong exception safety (allocate before free).
    bits_2_t& operator= ( const bits_2_t& other ) {
        if ( this != &other ) {
            block_t* tmp = new block_t[other._blocks];
            std::copy( other._data, other._data + other._blocks, tmp );
            delete[] _data;
            _data = tmp;
            _size = other._size;
            _blocks = other._blocks;
        }
        return *this;
    }
    // Move operations: steal the buffer and leave the source empty.
    bits_2_t ( bits_2_t&& other ) noexcept
    : _data( other._data ), _size( other._size ), _blocks( other._blocks ) {
        other._data = nullptr;
        other._size = 0;
        other._blocks = 0;
    }
    bits_2_t& operator= ( bits_2_t&& other ) noexcept {
        if ( this != &other ) {
            delete[] _data;
            _data = other._data;
            _size = other._size;
            _blocks = other._blocks;
            other._data = nullptr;
            other._size = 0;
            other._blocks = 0;
        }
        return *this;
    }
    // Grows/shrinks to n bits; new bits (if any) are cleared.
    void resize ( const size_t n ) noexcept {
        resize( n, false );
    }
    // Grows/shrinks to n bits; existing bits are preserved, new bits are
    // initialized to v (block-wise, so partial last-block bits follow v too).
    void resize ( const size_t n, const bool v ) noexcept {
        const size_t tmpblocks = number_of_blocks_needed( n );
        const size_t copysize = std::min( _blocks, tmpblocks );
        block_t* tmpdata = new block_t[tmpblocks];
        std::copy( _data, _data + copysize, tmpdata );
        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( tmpdata + copysize, tmpdata + tmpblocks, fill );
        delete[] _data;
        _data = tmpdata;
        _blocks = tmpblocks;
        _size = n;
    }
    // Ceiling division: blocks required to hold n bits.
    inline size_t number_of_blocks_needed ( const size_t n ) const noexcept {
        return ( n + block_size - 1 ) / block_size;
    }
    // Block containing bit i.
    inline block_t& block_at_index ( const size_t i ) const noexcept {
        return _data[i / block_size];
    }
    // Writable access: returns a bit_ref proxy.
    inline bit_ref operator[] ( const size_t i ) noexcept {
        return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
    }
    // Read-only access: returns the bit value directly (no proxy).
    inline bool operator[] ( const size_t i ) const noexcept {
        return (bool)( block_at_index( i ) & ( block_t( 1 ) << ( i % block_size ) ) );
    }
    inline block_t* data () {
        return _data;
    }
    inline const block_t* data () const {
        return _data;
    }
    inline size_t size () const {
        return _size;
    }
    // Frees the buffer and resets to the empty state.
    void clear () noexcept {
        delete[] _data;
        _size = 0;
        _blocks = 0;
        _data = nullptr;
    }
    ~bits_2_t () {
        clear();
    }
};
#endif // LIB_BITS_2_T_H
Here are the results for this new code on my amd64 machine for primes up to 1.000.000.000 (best of 3 runs, real time).
Sieve of Eratosthenes with 1 memory unit per number ( not skipping multiples of 2 ).
bits_t<uint8_t>
real 0m23.614s user 0m23.493s sys 0m0.092s
bits_t<uint16_t>
real 0m24.399s user 0m24.294s sys 0m0.084s
bits_t<uint32_t>
real 0m23.501s user 0m23.372s sys 0m0.108s <-- best
bits_t<uint64_t>
real 0m24.393s user 0m24.304s sys 0m0.068s
std::vector<bool>
real 0m24.362s user 0m24.276s sys 0m0.056s
std::vector<uint8_t>
real 0m38.303s user 0m37.570s sys 0m0.683s
Here is the code of the sieve (where (...) should be replaced by the bit array of your choice).
#include <iostream>
typedef (...) array_t;
// Sieve of Eratosthenes: counts/marks primes below the limit given as the
// single command-line argument. array_t must be a bool-indexable bit array
// constructible as array_t(count, true) — see the typedef placeholder above.
int main ( int argc, char const *argv[] ) {
    if ( argc != 2 ) {
        std::cout << "#0 missing" << std::endl;
        return 1;
    }
    const size_t count = std::stoull( argv[1] );
    array_t prime( count, true );
    prime[0] = prime[1] = false;
    // For each prime k, cross out its multiples starting at k*k
    // (smaller multiples were already crossed out by smaller primes).
    size_t k = 2;
    while ( k * k < count ) {
        if ( prime[k] ) {
            size_t i = k * k;
            while ( i < count ) {
                prime[i] = false;
                i += k;
            }
        }
        ++k;
    }
    return 0;
}
I want to insert the content of array of integers :
int arr[n], into a vector of QStrings, std::vector<QString> vQString. I can do it by inserting the array's elements one by one:
vQString.push_back(QString::number(arr[i]));
But I prefer to do that using one insert operation — any advice?
Thanks
This isn't a 1-line solution. But is an extendable solution. What you basically do is create a template function to do the conversion for you in an exception-safe manner, like below:
namespace yournamespace {
// Functor that converts a numeric value to its QString representation.
template <typename U>
struct NumberToString {
    QString operator()(const U& val) const {
        return QString::number(val);
    }
};
// Copies `size` elements from the raw array `src` into `dst`, converting
// each element with a default-constructed Convertor. Exception-safe:
// the result is built in a temporary and swapped in only on success.
template <typename T, typename U, typename Convertor>
void CopyArrayToVector(QVector<T>& dst, const U* src, const size_t size) {
    Convertor convert;  // FIX: original used `convert` without declaring it
    QVector<T> temp;
    temp.reserve(size);
    // FIX: loop index was `int`, causing a signed/unsigned comparison
    // against the size_t parameter.
    for (size_t i = 0; i < size; ++i) {
        temp.push_back(convert(src[i]));
    }
    dst.swap(temp);
}
}
Usage:
using yournamespace;
const size_t n = 10;
int *arr = new int[10];
QVector<String> dst;
CopyArrayToVector<QString,int,NumberToString<int> >(dst, arr, n);
DISCLAIMER: I'm not familiar with Qt framework. I whipped this up by looking at their documentation. Please feel free to correct me for any errors.
// Convert each int in `arr` to its QString form using std::transform
// with a lambda, writing into a pre-sized vector.
const int n = 5;
int arr[n] = { 4, 6, 2, 3, 1 };
// The vector must already hold n elements so transform can write
// through v.begin() without growing it.
vector< QString > v( n );
transform( arr, arr + n, v.begin(),
[] ( int i ) { return QString::number( i ); } );
// qPrintable converts the QString to a printable const char*.
for ( const QString& str : v ) {
cout << qPrintable( str ) << endl;
}
Slightly cheating...! Just use a for loop like everyone else.