How can I verify the thread-safety of the bit manipulation in a multi-threaded application? I am trying to set and reset the bits of an atomic variable shared among various threads.
std::atomic<uint8_t> bitset_;

void set(unsigned n) {
    uint8_t val = bitset_.load();
    while (!bitset_.compare_exchange_weak(val, val | (1 << n), std::memory_order_release));
}

void reset(unsigned n) {
    uint8_t val = bitset_.load();
    while (!bitset_.compare_exchange_weak(val, val & ~(1 << n), std::memory_order_release));
}

bool test(unsigned i) { return (bitset_.load(std::memory_order_acquire) >> i) & 1; }

bool allset() { return bitset_.load(std::memory_order_acquire) == 255; }
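For comparison, a minimal sketch (assuming only single-bit set/reset is needed) of the same operations written with std::atomic's fetch_or / fetch_and, which are single atomic read-modify-write operations and avoid the explicit CAS loop:

#include <atomic>
#include <cstdint>

std::atomic<uint8_t> bits_{0};

void set_bit(unsigned n)   { bits_.fetch_or(uint8_t(1u << n), std::memory_order_release); }
void reset_bit(unsigned n) { bits_.fetch_and(uint8_t(~(1u << n)), std::memory_order_release); }
bool test_bit(unsigned n)  { return (bits_.load(std::memory_order_acquire) >> n) & 1u; }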
I am implementing a MidiReader, and it needs to read either MSB-first or LSB-first UInts (8, 16, 32 or 64 bits).
I know little about binary and types, so I'm currently copying other people's code from C#.
class ByteArrayReader
{
public:
    unsigned char* ByteArray;
    unsigned int Size;
    unsigned int Index = 0;

    ByteArrayReader(unsigned char* byteArray)
    {
        if (byteArray == NULL)
        {
            throw byteArray;
        }
        ByteArray = byteArray;
        Size = (unsigned int)sizeof(byteArray);
        Index = 0;
    }

    char inline Read()
    {
        return ByteArray[Index++];
    }

    void inline Forward(unsigned int length = 1)
    {
        Index += length;
    }

    void inline Backward(unsigned int length = 1)
    {
        if (length > Index)
        {
            throw length;
        }
        Index -= length;
    }

    bool operator==(ByteArrayReader) = delete;
};
These are what I copied:
uint16_t inline ReadUInt16()
{
    return (uint16_t)((Read() << 8) | Read());
}

uint32_t inline ReadUInt32()
{
    return (uint32_t)((((((Read() << 8) | Read()) << 8) | Read()) << 8) | Read());
}
But it's said that one of these reads an MSB-first UInt. So I want to ask how to read UInt types from binary data elegantly, and also to learn how a uint is represented in bytes.
The part
(uint32_t)((((((Read() << 8) | Read()) << 8) | Read()) << 8) | Read());
does not work reliably, because each call to the Read method increments the Index counter, and the order in which the compiler evaluates the Read() calls is unspecified, so the bytes can be combined in any order.
It would be better to compute them in order, like this:
auto chunk1 = Read(); // Index=x
auto chunk2 = Read(); // Index=x+1
auto chunk3 = Read(); // Index=x+2
...
auto result = (chunk1 << 24) | (chunk2 << 16) | ...;
so that the increments are guaranteed to happen in order.
The order of bytes differs between little-endian and big-endian systems; that is asked here: Detecting endianness programmatically in a C++ program
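For completeness, a minimal sketch of one common runtime check (C++20 also exposes this directly as std::endian in <bit>):

#include <cstdint>
#include <cstring>

bool is_little_endian()
{
    std::uint32_t value = 1;
    unsigned char first_byte;
    std::memcpy(&first_byte, &value, 1); // inspect the lowest-addressed byte
    return first_byte == 1;              // 1 => least significant byte is stored first
}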
Try this (note that the bytes are read as unsigned values; Read() returns char, and a negative char would otherwise be sign-extended before the shifts):
uint32_t inline ReadUInt32MSBfirst()
{
    uint32_t b1 = (uint8_t)Read();
    uint32_t b2 = (uint8_t)Read();
    uint32_t b3 = (uint8_t)Read();
    uint32_t b4 = (uint8_t)Read();
    return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
}

uint32_t inline ReadUInt32LSBfirst()
{
    uint32_t b1 = (uint8_t)Read();
    uint32_t b2 = (uint8_t)Read();
    uint32_t b3 = (uint8_t)Read();
    uint32_t b4 = (uint8_t)Read();
    return b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);
}
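Since the question also mentions 16- and 64-bit values, the same pattern extends; a sketch along the same lines (again reading each byte as an unsigned value):

uint16_t inline ReadUInt16MSBfirst()
{
    uint16_t b1 = (uint8_t)Read();
    uint16_t b2 = (uint8_t)Read();
    return (uint16_t)((b1 << 8) | b2);
}

uint64_t inline ReadUInt64MSBfirst()
{
    uint64_t value = 0;
    for (int i = 0; i != 8; ++i)
        value = (value << 8) | (uint8_t)Read(); // most significant byte first
    return value;
}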
I have been working on a program that can connect to a Minecraft server and exchange packets with it, but Minecraft's server packets rely heavily on signed VarInts. The site documenting how the communication works has an explanation of VarInts and even an example implementation of them in Java.
So I followed it, and it works for non-negative numbers, but for negative numbers the output just grows infinitely long. I searched for other ways of implementing it but couldn't find any.
Here is my version (as I said, it only works for positive numbers, which is not fully what I want):
class VarInt
{
public:
    char* data = NULL;
    int length = 0;
    int Read();
    void Write(int value);
    ~VarInt();
private:
    int SEGMENT_BIT_MASK = 0x7F;
    int CONTINUE_BIT_MASK = 0x80;
};

int VarInt::Read()
{
    int value = 0;
    int position = 0;
    byte currentByte;
    int i = 0;
    while (true)
    {
        currentByte = data[i];
        value |= (currentByte & SEGMENT_BIT_MASK) << position;
        if ((currentByte & CONTINUE_BIT_MASK) == 0)
            break;
        i++;
        position += 7;
        if (position >= 32)
            std::cout << "VarInt is too Big" << std::endl;
    }
    return value;
}

void VarInt::Write(int value)
{
    bool state = true;
    std::vector<byte> bytes;
    while (state)
    {
        if ((value & ~SEGMENT_BIT_MASK) == 0)
        {
            bytes.push_back(value);
            state = false;
            break;
        }
        bytes.push_back(((value & SEGMENT_BIT_MASK) | CONTINUE_BIT_MASK));
        value >>= 7;
    }
    int bytes_size = bytes.size();
    length = bytes_size;
    data = (char*)malloc(bytes_size);
    int i = 0;
    while (i != bytes_size)
    {
        data[i] = bytes.at(i);
        i++;
    }
}

VarInt::~VarInt()
{
}
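For reference, the reason Write never terminates for negative inputs is that value >>= 7 on a signed int sign-extends, so the value never reaches 0; the Java example relies on the unsigned shift operator >>>. Below is a sketch of Write with only that one change, shifting the value as unsigned (my adaptation, not the original code; it assumes the byte type from the same header):

void VarInt::Write(int value)
{
    std::vector<byte> bytes;
    while (true)
    {
        if ((value & ~SEGMENT_BIT_MASK) == 0)
        {
            bytes.push_back((byte)value);
            break;
        }
        bytes.push_back((byte)((value & SEGMENT_BIT_MASK) | CONTINUE_BIT_MASK));
        value = (int)((unsigned int)value >> 7); // logical shift, like Java's >>>
    }
    length = (int)bytes.size();
    data = (char*)malloc(bytes.size());
    for (size_t i = 0; i != bytes.size(); ++i)
        data[i] = (char)bytes.at(i);
}

With this change, Write(-127) should emit five bytes (81 FF FF FF 0F) and Read() should recover -127.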
And here are my means of testing it:
#include <iostream>
#include "MDatatypes.h"

int main()
{
    ndt::VarInt test;
    // Sets value of test to -127
    test.Write(-127);
    // Sets value of test2 to 255
    ndt::VarInt test2;
    test2.Write(255);
    // Outputting the length of each VarInt in bytes
    std::cout << test.length << '|' << test2.length << std::endl;
    // Outputting the values of each VarInt
    std::cout << test.Read() << '|' << test2.Read() << std::endl;
}
I'm trying to figure out whether there is a bug in the (now deleted) answer about implementing a CUDA-like atomicCAS for bools. The code from the answer (reformatted):
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 7;             // byte position within the unsigned long long
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;

    do
    {
        assumed = old;
        if (val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while (assumed != old);

    return (bool)(old & ((0xFFU) << (8 * pos)));
}
According to the documentation, atomicCAS should set *address to (*address == compare ? val : *address), but in the implementation above the compare argument is never used!
The code I use to reproduce the bug:
#include <cstdio>

// atomicCAS definition here

__device__ bool b;

__global__ void kernel()
{
    b = false;
    atomicCAS(&b, true, true); // (b == true ? true : b), where b is false, evaluates to false
    printf("%d\n", b);         // b is false => expected output is 0
}

int main()
{
    kernel<<<1, 1>>>();
    cudaDeviceSynchronize();
}
The expected output is 0, but the actual output is 1.
I have a suggestion about how to fix it but am not 100% sure it's thread-safe:
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;             // byte position within the int
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;

    do
    {
        if (*address != compare) // If we expected that bool to be different, then
            break;               // stop trying to update it and just return its current value
        assumed = old;
        if (val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while (assumed != old);

    return (bool)(old & ((0xFFU) << (8 * pos)));
}
My questions are:
1. Is there a bug in the first code sample from the answer?
2. If there is, does the last code sample fix it thread-safely?
Many many thanks to @RobertCrovella; the first code sample does contain a bug, and the second does fix it, but it is not thread-safe (see the question comments for details). The thread-safe fix:
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;             // byte position within the int
    int *int_addr = (int *)(addr - pos); // int-aligned address
    int old = *int_addr, assumed, ival;
    bool current_value;

    do
    {
        current_value = (bool)(old & ((0xFFU) << (8 * pos)));
        if (current_value != compare) // If we expected that bool to be different, then
            break;                    // stop trying to update it and just return its current value
        assumed = old;
        if (val)
            ival = old | (1 << (8 * pos));
        else
            ival = old & (~((0xFFU) << (8 * pos)));
        old = atomicCAS(int_addr, assumed, ival);
    } while (assumed != old);

    return current_value;
}
So I have got this function.
UINT32 Encrypt(UINT32 instruction, int increment)
{
    UINT32 linstruction = _rotl(instruction, 7);
    UINT32 rinstruction = _rotr(instruction, 3);
    UINT32 key = (0x1F3D8AF * increment) ^ (rinstruction ^ linstruction);
    return (key ^ instruction);
}
I need to make a function that decrypts this and recovers the instruction from the result using the key, so it would be used like:
t = Encrypt(t, i);
t = decrypt(t, key);
Basically, I want it to reverse the whole process of Encrypt, so that it decrypts the result and gives me back the instruction.
They are used in this function
int RbxOpEncoder::encode(Instruction op, int i, int key) {
    int orig = ConvertInstruction(op);
    int t = orig;
    switch (GET_OPCODE(op)) {
    case OP_JMP:
        t = ((Encrypt(t, i) & 0x3FFFFFF) | ((orig >> 26 & MASK1(6, 0)) << 0x1A));
        break;
    case OP_CALL:
    case OP_TAILCALL:
    case OP_CLOSURE:
    case OP_RETURN:
        t = ((Calldecrypt(t, i) & 0x3FFFFFF) | ((orig >> 26 & MASK1(6, 0)) << 0x1A));
        break;
    }
    t = EncryptOpcode(t, key);
    return t;
}
You may use:
std::uint32_t decrypt(std::uint32_t instruction, int increment)
{
    instruction = instruction ^ (0x1F3D8AF * increment);
    for (int i = 0; i != 15; ++i) {
        instruction = Encrypt(instruction, 0);
    }
    return instruction;
}
This works because, once the (0x1F3D8AF * increment) term has been XORed away, Encrypt(x, 0) = x ^ rotl(x, 7) ^ rotr(x, 3) is a linear transformation of the bits; applying it 16 times gives the identity (after squaring four times, both rotations become a rotation by 16 and cancel each other), so applying it 15 more times undoes the original application. And then you have
decrypt(Encrypt(value, increment), increment) == value
Demo
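A quick round-trip check (a sketch; it assumes the Encrypt and decrypt functions above are visible, and that _rotl/_rotr are available, e.g. with MSVC's <stdlib.h>):

#include <cstdint>
#include <cstdio>

int main()
{
    const std::uint32_t values[] = { 0u, 1u, 0x12345678u, 0xDEADBEEFu };
    const int increments[] = { 0, 1, 7, 1000 };

    for (std::uint32_t value : values)
        for (int increment : increments)
        {
            std::uint32_t back = decrypt(Encrypt(value, increment), increment);
            std::printf("%08X -> %08X %s\n", value, back, value == back ? "ok" : "MISMATCH");
        }
}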
I can't understand why the theory and the implementations of CRC are not identical. I mean, in the implementations I find, the bit shift is performed first and then the XOR, but then the first bit is never XORed, while in the explanations the XOR starts from the first bit.
Here is my code for CRC4:
public enum CRC4_POLY
{
    CRC4 = 0x0B // 1011
};

public class CRC4Calc
{
    private byte[] table = new byte[16];

    public byte Checksum(params byte[] val)
    {
        if (val == null)
            throw new ArgumentNullException("val");

        byte c = 0;
        foreach (byte b in val)
        {
            c = table[c ^ b];
        }
        return c;
    }

    public byte[] Table
    {
        get
        {
            return this.table;
        }
        set
        {
            this.table = value;
        }
    }

    public byte[] GenerateTable(CRC4_POLY polynomial)
    {
        byte[] csTable = new byte[16];
        for (int i = 0; i < 16; ++i)
        {
            int curr = i;
            for (int j = 0; j < 4; ++j)
            {
                if ((curr & 0x8) != 0)
                {
                    curr = ((curr << 1) & 0x0F) ^ (int)polynomial; // why not?: curr = ((curr ^ (int)polynomial) << 1) & 0x0F;
                }
                else
                {
                    curr <<= 1;
                }
            }
            csTable[i] = (byte)curr;
        }
        return csTable;
    }

    public CRC4Calc(CRC4_POLY polynomial)
    {
        this.table = this.GenerateTable(polynomial);
    }
}
The top bit of the register before shifting out, i.e. the bit being shifted out, determines whether the polynomial is exclusive-or'ed with what remains after shifting. This is precisely the classic shift register implementation.
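To make that concrete, here is a small C++ sketch (my illustration, not part of the original code) of the bit-at-a-time register step that the table in GenerateTable encodes: the top bit is tested, the register is shifted, and only then is the 4-bit remainder of the polynomial XORed in, because the polynomial's implicit x^4 term is exactly the bit that was just shifted out.

#include <cstdint>
#include <cstdio>

// One 4-bit CRC step: polynomial 0x0B, MSB first, no reflection, no final XOR.
static uint8_t crc4_update(uint8_t crc, uint8_t nibble)
{
    crc ^= (nibble & 0x0F);                   // bring the next 4 data bits into the register
    for (int bit = 0; bit < 4; ++bit)
    {
        if (crc & 0x8)                        // the bit about to be shifted out is 1
            crc = ((crc << 1) & 0x0F) ^ 0x0B; // shift first, then XOR the polynomial remainder
        else
            crc = (crc << 1) & 0x0F;          // shifted-out bit is 0: no XOR
    }
    return crc;
}

int main()
{
    // crc4_update(0, i) should reproduce csTable[i] from GenerateTable for i = 0..15.
    for (int i = 0; i < 16; ++i)
        std::printf("%X ", crc4_update(0, (uint8_t)i));
    std::printf("\n");
}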