atomicCAS for bool implementation - C++

I'm trying to figure out whether there is a bug in a (now deleted) answer about implementing a CUDA-like atomicCAS for bools. The code from the answer (reformatted):
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 7;             // byte position within the unsigned long long
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;
    do
    {
        assumed = old;
        if(val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while(assumed != old);
    return (bool)(old & ((0xFFU) << (8 * pos)));
}
According to the documentation, atomicCAS should set *address to (*address == compare ? val : *address), but in the implementation above the compare argument is never used!
The code I use to reproduce the bug:
#include <cstdio>
// atomicCAS definition here
__device__ bool b;
__global__ void kernel()
{
    b = false;
    atomicCAS(&b, true, true); // `(b == true ? true : b)`, where b is false, equals false
    printf("%d\n", b);         // b is false => expected output is 0
}
int main()
{
    kernel<<<1, 1>>>();
    cudaDeviceSynchronize();
}
The expected output is 0, but the actual output is 1.
I have a suggestion for how to fix it, but I'm not 100% sure it's thread-safe:
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;             // byte position within the int
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;
    do
    {
        if(*address != compare) // If we expected that bool to be different, then
            break;              // stop trying to update it and just return its current value
        assumed = old;
        if(val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while(assumed != old);
    return (bool)(old & ((0xFFU) << (8 * pos)));
}
My questions are:
Is there a bug in the first code sample from the answer? If there is,
does the last code sample fix it thread-safely?

Many, many thanks to @RobertCrovella; the first code sample does contain a bug, and the second does fix it, but it is not thread-safe: it reads *address directly, so the value it tests can change between that read and the atomicCAS (see the question comments for details). The current value has to be derived from old instead. The thread-safe fix:
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;             // byte position within the int
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;
    bool current_value;
    do
    {
        current_value = (bool)(old & ((0xFFU) << (8 * pos)));
        if(current_value != compare) // If we expected that bool to be different, then
            break;                   // stop trying to update it and just return its current value
        assumed = old;
        if(val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while(assumed != old);
    return current_value;
}
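As a quick usage sketch (my own example, not part of the original question): with the CAS semantics restored, the overload can be used to let exactly one thread claim a device-side flag.

__device__ bool claimed = false;

__global__ void claimOnce(int *winner)
{
    // atomicCAS returns the previous value, so exactly one thread
    // observes `false` here and atomically flips the flag to `true`.
    if (!atomicCAS(&claimed, false, true))
        *winner = blockIdx.x * blockDim.x + threadIdx.x;
}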


unsigned char variable is not incremented

I came across some strange behaviour: in my code one variable is decremented, but not incremented, and as a result my algorithm does not work. The variable is named blocksAvailable; it is defined in the Chunk class, initialized in the Chunk::init method, decremented in the Chunk::allocate method, and should be incremented in the Chunk::deallocate method. So there are just two places where this variable is modified: the allocate and deallocate methods. In one place it gets decremented (and that works), and in the other it gets incremented, and that does not work. This is the completely minimized, reproducible code:
#include <cstddef>
#include <iostream>
#include <vector>

using uchar = unsigned char;

class Chunk
{
private:
    friend class FixedAllocator;
    void init(size_t blockSize, uchar blocks);
    void release();
    void* allocate(size_t blockSize);
    void deallocate(void* p, size_t blockSize);
    inline bool hasBlock(void* p, size_t chunkLen) const
    {
        uchar * pc = static_cast<uchar*>(p);
        return (pData <= pc) && (pc <= (pData + chunkLen));
    }
    inline bool releasable(uchar numBlocks) const
    {
        return blocksAvailable == numBlocks;
    }
    uchar* pData;
    uchar firstAvailableBlock, blocksAvailable;
};

void Chunk::init(size_t blockSize, uchar blocks)
{
    // for n of Ts it will allocate n * sizeof(T) memory
    pData = new uchar[blockSize * blocks];
    firstAvailableBlock = 0;
    blocksAvailable = blocks;
    uchar i = 0;
    uchar* p = pData;
    // used by allocate method to move forward firstAvailableBlock
    for (; i != blocks; p += blockSize)
    {
        *p = ++i;
    }
}

void Chunk::release()
{
    ::operator delete(pData);
}

void* Chunk::allocate(size_t blockSize)
{
    if (!blocksAvailable) return 0;
    // move firstAvailableBlock one block ahead
    uchar* pResult = pData + firstAvailableBlock * blockSize;
    firstAvailableBlock = *pResult;
    --blocksAvailable;
    std::cout << "blocksAvailable after allocate " << blocksAvailable << std::endl;
    return pResult;
}

void Chunk::deallocate(void* p, size_t blockSize)
{
    uchar* toRelease = static_cast<uchar*>(p);
    // find last but one available block
    firstAvailableBlock = static_cast<uchar>((toRelease - pData) / blockSize);
    ++blocksAvailable;
    std::cout << "blocksAvailable after deallocate " << blocksAvailable << std::endl;
}

class FixedAllocator
{
private:
    size_t blockSize;
    uchar blocks;
    using Chunks = std::vector<Chunk>;
    Chunks chunks;
    Chunk* allocChunk;
public:
    FixedAllocator();
    ~FixedAllocator();
    void init(size_t blockSize, size_t pageSize);
    const int blockOwner(void* p) const;
    void * allocate();
    void deallocate(void* p);
};

FixedAllocator::FixedAllocator()
    : blockSize(0),
      blocks(0),
      chunks(0),
      allocChunk(nullptr)
{
}

FixedAllocator::~FixedAllocator()
{
    Chunks::iterator it;
    for (it = chunks.begin(); it != chunks.end(); ++it)
    {
        it->release();
    }
}

void FixedAllocator::init(size_t blockSize_, size_t pageSize)
{
    blockSize = blockSize_;
    size_t numBlocks = pageSize / blockSize;
    blocks = static_cast<uchar>(numBlocks);
}

const int FixedAllocator::blockOwner(void* p) const
{
    size_t chunkLen = blocks * blockSize;
    std::vector<int>::size_type i = 0, sz = chunks.size();
    for (; i < sz; i++)
    {
        if (chunks[i].hasBlock(p, chunkLen))
        {
            return i;
        }
    }
    return -1;
}

void* FixedAllocator::allocate()
{
    if (!allocChunk || allocChunk->blocksAvailable == 0)
    {
        Chunks::iterator i = chunks.begin();
        for (;; ++i)
        {
            if (i == chunks.end())
            {
                // allocate memory for one more chunk
                chunks.reserve(chunks.size() + 1);
                Chunk newChunk;
                newChunk.init(blockSize, blocks);
                // add new chunk to memory pool
                chunks.push_back(newChunk);
                // points to new just initiated chunk
                allocChunk = &chunks.back();
                break;
            }
            if (i->blocksAvailable > 0)
            {
                // points to chunk with available blocks
                allocChunk = &*i;
                break;
            }
        }
    }
    return allocChunk->allocate(blockSize);
}

void FixedAllocator::deallocate(void* p)
{
    // TODO. Optimize. Now very bruteforce and non-efficient
    const int chunkPos = blockOwner(p);
    if (chunkPos != -1)
    {
        Chunk chunk = chunks[chunkPos];
        chunk.deallocate(p, blockSize);
        // if chunk is releasable, release memory
        if (chunk.releasable(blocks))
        {
            chunk.release();
            chunks.erase(chunks.begin() + chunkPos);
            // allocChunk may point to deleted chunk
            // so, reset it
            allocChunk = &chunks.back();
        } else {
            // there are free blocks in chunk
            // so, reset allocChunk for faster future allocation
            allocChunk = &chunk;
        }
    }
}

int main() {
    FixedAllocator* alloc = new FixedAllocator();
    alloc->init(4, 12);
    void* p = alloc->allocate();
    void* q = alloc->allocate();
    void* r = alloc->allocate();
    alloc->deallocate(p);
    alloc->deallocate(q);
    alloc->deallocate(r);
    return 0;
}
As you can see, I have two debug statements in my code: one prints the blocksAvailable value after the increment, and one prints it after the decrement.
But this is what I get on my screen when I compile and run my code:
[screenshot of the program output]
As you can see, blocksAvailable is initialized with the value 3, and then it gets decremented three times (three calls to the allocate method), but after each increment (each call to deallocate) its value stays the same: 1. It really drives me crazy; it looks like some ghost in my code. You can easily reproduce it, compiling and running as simply as:
$ g++ main.cpp
$ ./a.out
I hope someone can help me find where this ghost came from.
Here is the only place in your code where you call Chunk::deallocate:
Chunk chunk = chunks[chunkPos];
chunk.deallocate(p, blockSize);
The first line makes a copy of your Chunk; the second line calls deallocate on it, which increments chunk.blocksAvailable. But chunk is just a copy of the data. Modifying it has no lasting effect.
In particular, chunks[chunkPos] is unaffected and still contains blocksAvailable = 0.
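One way to fix it (a minimal sketch of the idea, not verbatim from the answer) is to operate on the stored element through a reference:

// A reference aliases the element inside the vector, so deallocate()
// now mutates chunks[chunkPos] itself instead of a temporary copy.
Chunk& chunk = chunks[chunkPos];
chunk.deallocate(p, blockSize);

The same issue affects the later allocChunk = &chunk; line: with the copy it stores the address of a local variable that dies when the function returns, while with the reference it points at the real element.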

I need help reversing what a function does

So I have this function:
UINT32 Encrypt(UINT32 instruction, int increment)
{
    UINT32 linstruction = _rotl(instruction, 7);
    UINT32 rinstruction = _rotr(instruction, 3);
    UINT32 key = (0x1F3D8AF * increment) ^ (rinstruction ^ linstruction);
    return (key ^ instruction);
}
I need to make a function that decrypts this and recovers the instruction from the result, so it would be used like:
t = encrypt(t, i);
t = decrypt(t, key);
Basically, I want it to reverse the whole process of Encrypt so it decrypts the result and gives me back the instruction.
They are used in this function:
int RbxOpEncoder::encode(Instruction op, int i, int key) {
    int orig = ConvertInstruction(op);
    int t = orig;
    switch (GET_OPCODE(op)) {
    case OP_JMP:
        t = ((Encrypt(t, i) & 0x3FFFFFF) | ((orig >> 26 & MASK1(6, 0)) << 0x1A));
        break;
    case OP_CALL:
    case OP_TAILCALL:
    case OP_CLOSURE:
    case OP_RETURN:
        t = ((Calldecrypt(t, i) & 0x3FFFFFF) | ((orig >> 26 & MASK1(6, 0)) << 0x1A));
        break;
    }
    t = EncryptOpcode(t, key);
    return t;
}
You may use:
std::uint32_t decrypt(std::uint32_t instruction, int increment)
{
    instruction = instruction ^ (0x1F3D8AF * increment);
    for (int i = 0; i != 15; ++i) {
        instruction = Encrypt(instruction, 0);
    }
    return instruction;
}
This works because Encrypt(x, 0) is x ^ _rotl(x, 7) ^ _rotr(x, 3), a XOR-linear transformation of the 32-bit word whose sixteenth power is the identity, so applying it 15 more times undoes a single application. And then you have
decrypt(Encrypt(value, increment), increment) == value
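Here is a quick self-check of the round trip (my own test harness, not from the answer; portable rotate helpers stand in for the MSVC _rotl/_rotr intrinsics):

#include <cstdint>
#include <cstdio>

static std::uint32_t rotl32(std::uint32_t v, int s) { return (v << s) | (v >> (32 - s)); }
static std::uint32_t rotr32(std::uint32_t v, int s) { return (v >> s) | (v << (32 - s)); }

std::uint32_t Encrypt(std::uint32_t instruction, int increment)
{
    std::uint32_t l = rotl32(instruction, 7);
    std::uint32_t r = rotr32(instruction, 3);
    std::uint32_t key = (0x1F3D8AF * increment) ^ (r ^ l);
    return key ^ instruction;
}

std::uint32_t decrypt(std::uint32_t instruction, int increment)
{
    instruction ^= 0x1F3D8AF * increment; // remove the increment-dependent key
    for (int i = 0; i != 15; ++i)
        instruction = Encrypt(instruction, 0); // one more linear round
    return instruction;
}

int main()
{
    std::uint32_t v = 0xDEADBEEF;
    std::printf("%d\n", decrypt(Encrypt(v, 42), 42) == v); // prints 1
}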

Modifying bits in a byte with a class

I want to directly modify a bit in a byte.
In GCC, you can do it as follows:
struct virtualByte {
    unsigned char b0 : 1;
    unsigned char b1 : 1;
    unsigned char b2 : 1;
    unsigned char b3 : 1;
    unsigned char b4 : 1;
    unsigned char b5 : 1;
    unsigned char b6 : 1;
    unsigned char b7 : 1;
} __attribute__((__packed__));

#define sbit(_byte, _pos) (((volatile struct virtualByte *)&_byte)->b ## _pos)
Usage:
unsigned char myByte = 0x00;
#define firstBit sbit(myByte, 0)
firstBit = 1; // Implicit myByte |= 0x01;
To make things neater I want a class that does this for me. I came up with the following concept:
unsigned char myByteRef = 0x00;
Byte myByte(&myByteRef);
myByte[0] = 1; // Implicit myByteRef |= 0x01;
fprintf(stderr, "%2.2X\n", myByteRef);
But this does not work, because in C++ you cannot return a reference to a single bit. Overloading the constructor does not work either.
Is there a possibility to implement such behaviour? The assignment operator should directly modify its underlying byte (not a set of bytes).
You want to use std::bitset:
std::bitset<12> someBits; // 12 bits
someBits[0] = true; // set 1st bit
std::cout << someBits.count() << '\n'; // prints 1
std::bitset<12>::reference bit5 = someBits[5];
bit5 = true;
std::cout << someBits.count() << '\n'; // prints 2
You can use the index operator to return a reference to a bit in the way you want. Note that this reference is not a bool& but rather a std::bitset::reference.
Finally came to a solution; many thanks to @doc!
My solution:
#include <cstdint>
#include <cstdio>

class Bit {
private:
    volatile uint8_t *byte;
    uint8_t bitPos;
public:
    Bit(void) : byte(nullptr), bitPos(0) // initialize, so setValue's null check is meaningful
    {
    }
    void init(volatile uint8_t *const byte, uint8_t const bitPos)
    {
        this->byte = byte;
        this->bitPos = (bitPos > 7u ? 7u : bitPos);
    }
    void setValue(bool const bitValue)
    {
        if (!this->byte) return;
        if (bitValue) {
            *this->byte |= (1u << this->bitPos);
        } else {
            *this->byte &= ~(1u << this->bitPos);
        }
    }
};

class BitReference {
private:
    Bit &ref;
public:
    BitReference(Bit &ref) : ref(ref)
    {
    }
    void operator=(bool const bitValue)
    {
        this->ref.setValue(bitValue);
    }
};

class Byte {
private:
    Bit bits[8];
public:
    Byte(volatile uint8_t *const byte)
    {
        for (unsigned i = 0; i < 8; ++i) {
            this->bits[i].init(byte, i);
        }
    }
    /* This did the trick :)! */
    BitReference operator[](size_t index)
    {
        if (index > 7) index = 7;
        return BitReference(this->bits[index]);
    }
};
Usage:
uint8_t myPort = 0x00;

int main(int const argc, const char **const argv)
{
    Byte abc(&myPort);
    abc[0] = 1;
    abc[1] = 1;
    abc[2] = 1;
    abc[3] = 1;
    fprintf(stderr, "%2.2X\n", myPort);
    return 0;
}
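For comparison, a common alternative (my own sketch, not from the accepted solution) is a single proxy object that stores the byte pointer and bit index directly, the same pattern std::bitset::reference uses; it avoids the array of eight Bit objects:

#include <cstdint>
#include <cstdio>

class BitProxy {
public:
    BitProxy(volatile uint8_t *byte, uint8_t pos) : byte(byte), pos(pos) {}
    BitProxy &operator=(bool v)
    {
        if (v) *byte |= (1u << pos);   // set the bit
        else   *byte &= ~(1u << pos);  // clear the bit
        return *this;
    }
    operator bool() const { return (*byte >> pos) & 1u; } // read access
private:
    volatile uint8_t *byte;
    uint8_t pos;
};

class ProxyByte {
public:
    explicit ProxyByte(volatile uint8_t *byte) : byte(byte) {}
    BitProxy operator[](size_t index)
    {
        return BitProxy(byte, index > 7 ? 7 : uint8_t(index));
    }
private:
    volatile uint8_t *byte;
};

int main()
{
    uint8_t myPort = 0x00;
    ProxyByte abc(&myPort);
    abc[0] = 1;
    abc[3] = 1;
    fprintf(stderr, "%2.2X\n", myPort); // prints 09
    return 0;
}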

Optimization of loops and if

I have a procedure that looks like this:
void Process1(unsigned char* data)
{
}
void Process2(unsigned char* data)
{
}
void Process3(unsigned char* data)
{
}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    for (unsigned int i = 0; i < bytes; i++)
    {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}
As written, flags & FLAG1 (a.k.a. b1) never changes across the loop iterations, yet we still branch on it in every iteration. I'm just wondering whether there's a way to avoid these unnecessary branches dynamically.
Here is a demo of Lundin's solution:
#include <windows.h>
#include <stdio.h>
#include <time.h>

LARGE_INTEGER ls, le, ll;
#define START_CLOCK() QueryPerformanceCounter(&ls)
#define END_CLOCK() printf ("%.0lf ns\n", (QueryPerformanceCounter(&le), ((double)le.QuadPart - ls.QuadPart) / ll.QuadPart * 1000000));

void Process1(unsigned char* data)
{
    (*data)++;
}
void Process2(unsigned char* data)
{
    (*data)--;
}
void Process3(unsigned char* data)
{
    (*data) *= (*data);
}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    for (unsigned int i = 0; i < bytes; i++)
    {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}

typedef void (*proc_t)(unsigned char*);

inline static void do_nothing (unsigned char* ptr)
{
    (void)ptr;
}

void ProcessData_x(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0; // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;
    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;
    for (unsigned int i = 0; i < bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}

int main()
{
    if (!QueryPerformanceFrequency(&ll)) return 1;
    const unsigned int bytes = 0xffff;
    srand((unsigned int)time(NULL));
    unsigned int flags = rand() & 0x7;
    unsigned char* data = new unsigned char[bytes];
    for (unsigned int i = 0; i < bytes; i++)
    {
        data[i] = (unsigned char)(rand() & 0xff);
    }
    START_CLOCK();
    ProcessData(data, bytes, flags);
    END_CLOCK();
    START_CLOCK();
    ProcessData_x(data, bytes, flags);
    END_CLOCK();
}
Here is the output:
134 ns
272 ns
I've run it several times, but unexpectedly the function-pointer version costs even more time. It was compiled with VS2010, Release, x86.
First of all, it doesn't make any sense to speak about optimizations without a particular system in mind...
That being said, I'd optimize away the branches in the following way:
typedef void (*proc_t)(unsigned char*);

inline static void do_nothing (unsigned char* ptr)
{
    (void)ptr;
}

...

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0; // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;
    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;
    for (unsigned int i = 0; i < bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}
A C++ solution. Similar to Lundin's answer, but without the calls to an empty function. I'm not sure whether that makes any difference in performance; the main advantage is that you don't need to list all the process calls in the loop manually. If you want to micro-optimize, or want C, you could use an array on the stack, but then you have to manage a counter yourself (see the sketch after this code).
typedef void (*proc_t)(unsigned char*);

std::vector<proc_t> processes;
if (b1) processes.push_back(Process1);
if (b2) processes.push_back(Process2);
if (b3) processes.push_back(Process3);

for (auto p : processes)
    for (unsigned int i = 0; i < bytes; i++)
        p(data + i);
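For the "array on the stack" variant mentioned above, a sketch (my own, assuming the Process functions and FLAG macros from the question):

typedef void (*proc_t)(unsigned char*);

void ProcessData_arr(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    proc_t procs[3];
    int n = 0; // the counter std::vector managed for us before
    if (flags & FLAG1) procs[n++] = Process1;
    if (flags & FLAG2) procs[n++] = Process2;
    if (flags & FLAG3) procs[n++] = Process3;
    for (int j = 0; j < n; j++)
        for (unsigned int i = 0; i < bytes; i++)
            procs[j](data + i);
}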
Another option is to select the flag combination once up front and then run a dedicated loop for that case (a possible implementation of SelectCaseAtOnce is sketched after the code):

bool b1 = !!(flags & FLAG1);
bool b2 = !!(flags & FLAG2);
bool b3 = !!(flags & FLAG3);
int caseNow = SelectCaseAtOnce(b1, b2, b3);

if (caseNow == 0)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process1(data + i);
    }
else if (caseNow == 1)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process2(data + i);
    }
else if (caseNow == 2)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process3(data + i);
    }
else if (caseNow == 3)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process1(data + i);
        Process2(data + i);
    }
else if (caseNow == 4)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process1(data + i);
        Process3(data + i);
    }
else if (caseNow == 5)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process2(data + i);
        Process3(data + i);
    }
else if (caseNow == 6)
    for (unsigned int i = 0; i < bytes; i++)
    {
        Process1(data + i);
        Process2(data + i);
        Process3(data + i);
    }
else {}
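SelectCaseAtOnce is not defined in the answer; a plausible implementation matching the numbering used above (my assumption) would be:

int SelectCaseAtOnce(bool b1, bool b2, bool b3)
{
    // 0..2: exactly one flag set; 3..6: combinations; -1: none set,
    // which falls through to the empty else branch above.
    if (b1 && b2 && b3) return 6;
    if (b2 && b3)       return 5;
    if (b1 && b3)       return 4;
    if (b1 && b2)       return 3;
    if (b3)             return 2;
    if (b2)             return 1;
    if (b1)             return 0;
    return -1;
}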
Here's another solution using templates - this way you'll get an optimized version of the inner loop for each variant. If the ProcessN functions are short / simple enough to be worth inlining then this could be a worthwhile optimization.
#include <tuple>
#include <map>
#include <utility>

using namespace std;

inline void Process1(unsigned char* data) {}
inline void Process2(unsigned char* data) {}
inline void Process3(unsigned char* data) {}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

template <bool b1, bool b2, bool b3>
void ProcessData(unsigned char* data, unsigned int bytes) {
    for (unsigned int i = 0; i < bytes; i++) {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags) {
    typedef void (*ProcessFunc)(unsigned char*, unsigned int bytes);
    static map<tuple<bool, bool, bool>, ProcessFunc> funcs{
        {make_tuple(false, false, false), ProcessData<false, false, false>},
        {make_tuple(false, false, true), ProcessData<false, false, true>},
        {make_tuple(false, true, false), ProcessData<false, true, false>},
        {make_tuple(false, true, true), ProcessData<false, true, true>},
        {make_tuple(true, false, false), ProcessData<true, false, false>},
        {make_tuple(true, false, true), ProcessData<true, false, true>},
        {make_tuple(true, true, false), ProcessData<true, true, false>},
        {make_tuple(true, true, true), ProcessData<true, true, true>}};
    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    funcs[make_tuple(b1, b2, b3)](data, bytes);
}
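As a small follow-up on the design (my own sketch, not part of the answer): since the three bools form a 3-bit index, the std::map lookup can be replaced by a flat array of the eight instantiations:

void ProcessDataArr(unsigned char* data, unsigned int bytes, unsigned int flags) {
    typedef void (*ProcessFunc)(unsigned char*, unsigned int);
    static const ProcessFunc funcs[8] = {
        ProcessData<false, false, false>, ProcessData<false, false, true>,
        ProcessData<false, true,  false>, ProcessData<false, true,  true>,
        ProcessData<true,  false, false>, ProcessData<true,  false, true>,
        ProcessData<true,  true,  false>, ProcessData<true,  true,  true>};
    // Pack (b1, b2, b3) into bits 2..0 of the index.
    unsigned index = (!!(flags & FLAG1) << 2)
                   | (!!(flags & FLAG2) << 1)
                   |  !!(flags & FLAG3);
    funcs[index](data, bytes);
}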

Solving C++ 'target of assignment not really an lvalue' errors

Given this code:
void FrMemCopy(void *to, const void *from, size_t sz)
{
    size_t sz8 = sz >> 3;
    size_t sz1 = sz - (sz8 << 3);
    while (sz8-- != 0) {
        *((double *)to)++ = *((double *)from)++;
    }
    while (sz1-- != 0) {
        *((char *)to)++ = *((char *)from)++;
    }
}
I am receiving "target of assignment not really an lvalue" warnings on the two lines inside the while loops.
Can anyone break down those lines? A cast, then an increment?
What is a simpler way to write that?
What does the error mean?
The compiler does not like the *((char *)to)++ expression.
Try this:
void FrMemCopy(void *to, const void *from, size_t sz)
{
    size_t sz8 = sz >> 3;
    size_t sz1 = sz - (sz8 << 3);
    double * tod = (double *) to;
    double * fromd = (double *) from;
    while (sz8-- != 0) {
        *(tod++) = *(fromd++);
    }
    char * toc = (char *) tod;
    char * fromc = (char *) fromd;
    while (sz1-- != 0) {
        *(toc++) = *(fromc++);
    }
}
You can't apply ++ to the result of a cast, only to an lvalue (a variable). So you need to create new variables of the appropriate types for the increments:
void FrMemCopy(void *to, const void *from, size_t sz)
{
    size_t sz8 = sz >> 3;
    size_t sz1 = sz - (sz8 << 3);
    double *to1 = (double *)to;
    double *from1 = (double *)from;
    while (sz8-- != 0) {
        *to1++ = *from1++;
    }
    char *to2 = (char *)to1;
    char *from2 = (char *)from1;
    while (sz1-- != 0) {
        *to2++ = *from2++;
    }
}
I tried to rewrite it in a way that no warning appears:
void FrMemCopy(void *to, const void *from, size_t sz)
{
    size_t sz8 = sz >> 3;
    size_t sz1 = sz - (sz8 << 3);
    double *xto = (double *)to;
    double *xfrom = (double *)from;
    while (sz8-- != 0) {
        *xto++ = *xfrom++;
    }
    char *cto = (char *)xto;     // continue from where the double copy stopped
    char *cfrom = (char *)xfrom;
    while (sz1-- != 0) {
        *cto++ = *cfrom++;
    }
}
The result of an explicit type conversion is an rvalue in this case, according to 5.4.1 of the C++11 Standard. You cannot apply the increment operator to an rvalue; it must be an lvalue. See C++ value categories for details.
Use temporary variables to obtain the required effect:
double* to_dbl = static_cast<double*>(to);
const double* from_dbl = static_cast<const double*>(from); // from is const void*, so keep the const
while (sz8-- != 0)
{
    *(to_dbl++) = *(from_dbl++);
}
You are performing an increment operation on the left-hand side of the assignment, but the target of ++ (like the target of an assignment) must be an lvalue. The result of the cast is not an lvalue but a temporary value, and leaving a temporary on the left side is what produces the error.
First, before answering, let me just say: don't try to out-micro-optimize your compiler/library. The compiler writers will win something like 99 times out of 100. Use std::copy or memcpy, depending on the types you're copying and your needs.
Other answers have noted that you can solve the immediate compilation errors with temporary variables.
I don't recommend doing the following under any circumstances, but I believe you can also accomplish this by casting to a reference type:
void FrMemCopy(void *to, const void *from, size_t sz)
{
    size_t sz8 = sz >> 3;
    size_t sz1 = sz - (sz8 << 3);
    while (sz8-- != 0) {
        *((double *&)to)++ = *((double *&)from)++;
    }
    while (sz1-- != 0) {
        *((char *&)to)++ = *((char *&)from)++;
    }
}
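Circling back to the opening advice (a sketch of that recommendation, not code from the answer): unless there is a strong reason to hand-roll the copy loop, the whole function can simply delegate to the standard library, which already handles alignment and tail bytes and is tuned per platform:

#include <cstring>

void FrMemCopy(void *to, const void *from, std::size_t sz)
{
    std::memcpy(to, from, sz);
}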