Optimization of loops and if - c++

I have a procedure looks like this:
// Placeholder processing stages from the question: each one receives a
// pointer to a single byte of the buffer. Bodies are intentionally empty here.
void Process1(unsigned char* data)
{
}
void Process2(unsigned char* data)
{
}
void Process3(unsigned char* data)
{
}
// Bit flags selecting which stages run. Note bit 0 is unused: FLAG1 is bit 1.
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)
// Run the enabled stages over every byte of `data[0..bytes)`.
// The flag tests are hoisted out of the loop into cached bools; the per-byte
// branches test those loop-invariant values.
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    const bool run1 = (flags & FLAG1) != 0;
    const bool run2 = (flags & FLAG2) != 0;
    const bool run3 = (flags & FLAG3) != 0;
    for (unsigned char* p = data; p != data + bytes; ++p)
    {
        if (run1) Process1(p);
        if (run2) Process2(p);
        if (run3) Process3(p);
    }
}
As written, flags & FLAG1 (a.k.a. b1) does not change across loop iterations, yet we still execute a branch on it in every iteration. I am just wondering whether there is a way to avoid this unnecessary branching dynamically.
here is a demo of Lundin's solution.
#include <windows.h>
#include <stdio.h>
#include <time.h>
// ls/le hold the start/end QueryPerformanceCounter samples; ll holds the
// counter frequency (ticks per second), filled in by main().
LARGE_INTEGER ls, le, ll;
#define START_CLOCK() QueryPerformanceCounter(&ls)
// NOTE(review): (ticks / frequency) * 1e6 yields MICROseconds, so the "ns"
// label in the format string is misleading.
#define END_CLOCK() printf ("%.0lf ns\n", (QueryPerformanceCounter(&le), ((double)le.QuadPart - ls.QuadPart) / ll.QuadPart * 1000000));
// Demo stage 1: increment the addressed byte (wraps modulo 256).
void Process1(unsigned char* data)
{
    *data += 1;
}
// Demo stage 2: decrement the addressed byte (wraps modulo 256).
void Process2(unsigned char* data)
{
    *data -= 1;
}
// Demo stage 3: square the addressed byte (result truncated modulo 256).
void Process3(unsigned char* data)
{
    const unsigned char v = *data;
    *data = (unsigned char)(v * v);
}
// Stage-selection flags for the demo (bit 0 unused, matching the question).
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)
// Branch-per-byte variant from the question: the three flag tests are
// loop-invariant, yet a conditional branch is still executed per byte.
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
bool b1 = !!(flags & FLAG1);
bool b2 = !!(flags & FLAG2);
bool b3 = !!(flags & FLAG3);
for (unsigned int i = 0; i < bytes; i ++)
{
if (b1) Process1(data + i);
if (b2) Process2(data + i);
if (b3) Process3(data + i);
}
}
// Pointer type matching the signature of Process1/2/3.
typedef void (*proc_t)(unsigned char*);
// No-op stand-in used when a stage's flag is clear, so the loop can make
// three unconditional indirect calls instead of branching.
inline static void do_nothing (unsigned char* ptr)
{
(void)ptr;
}
// Branch-free variant: each stage resolves once to either the real function
// or do_nothing, then the loop makes three unconditional indirect calls.
// (Per the question's timings, this was SLOWER here: an indirect call is
// typically costlier than a well-predicted branch.)
void ProcessData_x(unsigned char* data, unsigned int bytes, unsigned int flags)
{
bool b1 = (flags & FLAG1) != 0; // de-obfuscate the boolean logic
bool b2 = (flags & FLAG2) != 0;
bool b3 = (flags & FLAG3) != 0;
proc_t p1 = b1 ? Process1 : do_nothing;
proc_t p2 = b2 ? Process2 : do_nothing;
proc_t p3 = b3 ? Process3 : do_nothing;
for (unsigned int i = 0; i<bytes; i++)
{
p1(data + i);
p2(data + i);
p3(data + i);
}
}
// Benchmark driver: fills a buffer with random bytes and times both
// ProcessData variants under the same random flag set.
int main()
{
    if (!QueryPerformanceFrequency(&ll)) return 1;
    const unsigned int bytes = 0xffff;
    srand((unsigned int)time(NULL));
    // FIX: the original used `rand() & 0x7`, which sets bits 0..2 — but
    // FLAG1/2/3 are bits 1..3, so FLAG3 could never be enabled and bit 0 was
    // wasted. Shift into the flag bits so all eight combinations are possible.
    unsigned int flags = (rand() & 0x7) << 1;
    unsigned char* data = new unsigned char[bytes];
    for (unsigned int i = 0; i < bytes; i++)
    {
        data[i] = (unsigned char)(rand() & 0xff);
    }
    START_CLOCK();
    ProcessData(data, bytes, flags);
    END_CLOCK();
    START_CLOCK();
    ProcessData_x(data, bytes, flags);
    END_CLOCK();
    delete[] data;  // FIX: the original leaked the buffer
    return 0;
}
here is the output:
134 ns
272 ns
I've run it several times, but unexpectedly the branch-free version costs even more time :( .. It was compiled with VS2010, Release, x86.

First of all, it doesn't make any sense to speak about optimizations without a particular system in mind...
That being said, I'd optimize away the branches in the following way:
// Pointer type matching Process1/2/3 (Lundin's answer).
typedef void (*proc_t)(unsigned char*);
// No-op used in place of a disabled stage so the loop needs no branches.
inline static void do_nothing (unsigned char* ptr)
{
(void)ptr;
}
...
// Lundin's branch-free version: select real-function-or-no-op once before
// the loop, then call unconditionally through the pointers.
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
bool b1 = (flags & FLAG1) != 0; // de-obfuscate the boolean logic
bool b2 = (flags & FLAG2) != 0;
bool b3 = (flags & FLAG3) != 0;
proc_t p1 = b1 ? Process1 : do_nothing;
proc_t p2 = b2 ? Process2 : do_nothing;
proc_t p3 = b3 ? Process3 : do_nothing;
for (unsigned int i = 0; i<bytes; i++)
{
p1(data + i);
p2(data + i);
p3(data + i);
}
}

A c++ solution. Similar to Lundin's answer but without calls to empty function. I'm not sure if that makes any difference in performance, the main advantage is that you don't need to manually list all the process calls in the loop. If you want to micro optimize or want c, you could use an array on stack, but you'll have to manage some counters yourself.
// Collect only the enabled stages, then run each stage over the whole buffer.
// Note the loop nesting is inverted relative to the question (stage-major
// instead of byte-major); for stages that touch only their own byte the net
// result is the same, and no do_nothing calls are made at all.
typedef void (*proc_t)(unsigned char*);
std::vector<proc_t> processes;
if (b1) processes.push_back(Process1);
if (b2) processes.push_back(Process2);
if (b3) processes.push_back(Process3);
for(auto p : processes)
for (unsigned int i = 0; i<bytes; i++)
p(data + i);

// Dispatch once on the flag combination, then run a specialized loop with the
// branches fully removed. (SelectCaseAtOnce is assumed to map the seven
// non-empty flag combinations to 0..6; the all-flags-off case falls through
// to the final empty else.)
bool b1 = !!(flags & FLAG1);
bool b2 = !!(flags & FLAG2);
bool b3 = !!(flags & FLAG3);
int caseNow=SelectCaseAtOnce(b1,b2,b3);
if(caseNow==0)
for (unsigned int i = 0; i < bytes; i ++)
{
Process1(data + i);
}
else if(caseNow==1)
for (unsigned int i = 0; i < bytes; i ++)
{
Process2(data + i);
}
else if(caseNow==2)
for (unsigned int i = 0; i < bytes; i ++)
{
Process3(data + i);
}
else if(caseNow==3)
for (unsigned int i = 0; i < bytes; i ++)
{
Process1(data + i);
Process2(data + i);
}
else if(caseNow==4)  // FIX: was a bare `if`, silently breaking the else-if chain
for (unsigned int i = 0; i < bytes; i ++)
{
Process1(data + i);
Process3(data + i);
}
else if(caseNow==5)
for (unsigned int i = 0; i < bytes; i ++)
{
Process2(data + i);
Process3(data + i);
}
else if(caseNow==6)
for (unsigned int i = 0; i < bytes; i ++)
{
Process1(data + i);
Process2(data + i);
Process3(data + i);
}
else {}

Here's another solution using templates - this way you'll get an optimized version of the inner loop for each variant. If the ProcessN functions are short / simple enough to be worth inlining then this could be a worthwhile optimization.
#include <tuple>
#include <map>
#include <utility>
using namespace std;
// Inline placeholder stages: empty bodies let the compiler elide the calls
// entirely in each template instantiation below.
inline void Process1(unsigned char* data) {}
inline void Process2(unsigned char* data) {}
inline void Process3(unsigned char* data) {}
// Flag bits (bit 0 unused, matching the question).
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)
// The flag tests become template parameters — compile-time constants — so
// each of the 8 instantiations compiles to a loop containing only the calls
// for the enabled stages, with no runtime branching.
template <bool b1, bool b2, bool b3>
void ProcessData(unsigned char* data, unsigned int bytes) {
for (unsigned int i = 0; i < bytes; i++) {
if (b1) Process1(data + i);
if (b2) Process2(data + i);
if (b3) Process3(data + i);
}
}
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags) {
typedef void (*ProcessFunc)(unsigned char*, unsigned int bytes);
static map<tuple<bool, bool, bool>, ProcessFunc> funcs{
{make_tuple(false, false, false), ProcessData<false, false, false>},
{make_tuple(false, false, true), ProcessData<false, false, true>},
{make_tuple(false, true, false), ProcessData<false, true, false>},
{make_tuple(false, true, true), ProcessData<false, true, true>},
{make_tuple(true, false, false), ProcessData<true, false, false>},
{make_tuple(true, false, true), ProcessData<true, false, true>},
{make_tuple(true, true, false), ProcessData<true, true, false>},
{make_tuple(true, true, true), ProcessData<true, true, true>}};
bool b1 = !!(flags & FLAG1);
bool b2 = !!(flags & FLAG2);
bool b3 = !!(flags & FLAG3);
funcs[make_tuple(b1, b2, b3)](data, bytes);
}

Related

Why is the custom allocator significantly faster after adding a line of "memset"?

I am writing a "freelist allocator". I started with a line of memset in the initialization phase for debugging purposes. Later I removed the memset, but I found that the allocation part of the testing function test1_managed runs 3-4 times faster with the memset in the initialization than without it. I am confused by this result. What could the reason be?
I compiled the code in Visual Studio 2019 (in Release mode) with MSVC 19.29.30140 and "/O2" optimization enabled.
Here is the code:
#pragma once
#include <cstdio>
#include <iostream>
#include <chrono>
using namespace std;
using namespace chrono;
const uint64_t MemSize = 1 << 20;   // 1 MiB backing buffer for the allocator
const uint64_t TestSize = 1 << 12;  // 4096 allocations per benchmark run
typedef int TestType;
// Header placed in front of every segment. Segments form a circular list in
// address order (m_next) and free segments additionally form a circular free
// list (m_nextFree) threaded through the m_segFree sentinel below.
struct MemSegInfo {
MemSegInfo* m_next{ 0 };
MemSegInfo* m_nextFree{ 0 };
uint64_t m_size{ 0 };     // payload bytes following this header
uint64_t m_handle{ 0 };
};
const uint64_t infoSize = sizeof(MemSegInfo);
const uint64_t ptrSize = sizeof(MemSegInfo*);
const uint64_t maxAlignment = 16;
char* m_dataBuffer{ 0 };      // start of the malloc'd arena
char* m_dataBufferEnd{ 0 };   // one past the end of the arena
MemSegInfo* m_head;           // first segment (at the start of the arena)
MemSegInfo m_segFree;         // sentinel node anchoring the free list
MemSegInfo* m_segFreeCursor;  // roving pointer where the next search starts
// A segment's payload lives immediately after its MemSegInfo header.
inline void* OffsetFromMemSegInfo(MemSegInfo* seg) {
    char* const raw = reinterpret_cast<char*>(seg);
    return raw + infoSize;
}
// Inverse mapping: recover the header from a payload pointer.
inline MemSegInfo* OffsetToMemSegInfo(void* ptr) {
    char* const raw = static_cast<char*>(ptr);
    return reinterpret_cast<MemSegInfo*>(raw - infoSize);
}
// Write a value of type T into a segment's payload (rvalue overload).
template<class T>
inline void SetMem(MemSegInfo* seg, T&& val) {
*((T*)OffsetFromMemSegInfo(seg)) = val;
}
// Write a value of type T into a segment's payload (lvalue overload).
template<class T>
inline void SetMem(MemSegInfo* seg, const T& val) {
*((T*)OffsetFromMemSegInfo(seg)) = val;
}
// Read a value of type T back out of a segment's payload.
template<class T>
inline T GetMem(MemSegInfo* seg) {
return *((T*)OffsetFromMemSegInfo(seg));
}
// Set up the arena: one big free segment spanning the whole buffer, linked
// into the circular free list through the m_segFree sentinel.
void Init() {
m_dataBuffer = (char*)malloc(MemSize);
if (!m_dataBuffer)
throw "Out of mem! ";
m_dataBufferEnd = m_dataBuffer + MemSize;
m_head = (MemSegInfo*)m_dataBuffer;
m_head->m_next = m_head;
//m_head->m_last = m_head;
m_head->m_size = MemSize - infoSize;
m_head->m_nextFree = &m_segFree;
m_segFree.m_size = 0;
m_segFree.m_nextFree = m_head;
m_segFreeCursor = &m_segFree;
/***THE MEMSET I AM TALKING ABOUT***/
// NOTE(review): memset's value argument is converted to unsigned char, so
// 0xfafafafa fills with 0xfa. Writing every byte here also touches every
// page of the freshly malloc'd arena — presumably this pre-commits the pages
// and is why the timed allocation loop runs faster with the memset; confirm
// by profiling page faults.
memset(OffsetFromMemSegInfo(m_head), 0xfafafafa, m_head->m_size);
}
// First-fit allocation from the circular free list. Returns the HEADER of a
// segment whose payload holds at least `size` bytes (rounded up to
// maxAlignment). Throws when the search wraps back to the cursor.
// NOTE(review): the fit test requires m_size >= sizeAligned + infoSize even
// when the segment will not be split — slightly conservative.
MemSegInfo* Allocate(size_t size) {
const uint64_t sizeAligned = ((size - 1) / maxAlignment + 1) * maxAlignment;
const uint64_t sizeAlloc = sizeAligned + infoSize;
MemSegInfo* segCurPrev = m_segFreeCursor;
while (segCurPrev->m_nextFree->m_size < sizeAlloc) { // Go through freelist to find the First Fit.
segCurPrev = segCurPrev->m_nextFree;
if (segCurPrev == m_segFreeCursor)
throw "Out of mem! ";
}
MemSegInfo* const segCur = segCurPrev->m_nextFree;
const uint64_t sizeRest = segCur->m_size - sizeAligned;
if (sizeRest <= infoSize) { // There is not enough space for another seg. Just gonna use it.
segCurPrev->m_nextFree = segCur->m_nextFree; // This can also deal with the case where this is the last one available.
m_segFreeCursor = segCurPrev;
}
else { // There is enough space for another seg. Separate and make a new seg.
MemSegInfo* const segNew = (MemSegInfo*)(((char*)segCur) + sizeAlloc);
MemSegInfo* const segNext = segCur->m_next;
MemSegInfo* const segNextFree = segCur->m_nextFree;
// Rearrange seg list
//segNew->m_last = segCur;
//segNext->m_last = segNew;
segNew->m_next = segNext;
segCur->m_next = segNew;
segNew->m_size = sizeRest - infoSize;
segCur->m_size = sizeAligned;
segCur->m_nextFree = nullptr;  // allocated segments are not on the free list
// Join freelist
segNew->m_nextFree = segNextFree;
segCurPrev->m_nextFree = segNew;
m_segFreeCursor = segCurPrev;
}
return segCur;
}
// Push a segment back onto the free list, just after the sentinel.
// BUG FIX: the original second line was
//     m_segFree.m_nextFree = segFree->m_nextFree;
// which re-assigns the sentinel's old head to itself — segFree was never
// linked in, so every freed segment was silently leaked and never reused.
inline void Free(MemSegInfo* segFree) {
segFree->m_nextFree = m_segFree.m_nextFree;
m_segFree.m_nextFree = segFree;
}
// Release the whole arena; individual segment state is simply discarded.
void Cleanup() {
free(m_dataBuffer);
}
// Scratch handles used by the smoke test below.
MemSegInfo* mems[16];
// 24-byte sample payload used to exercise multi-field writes.
struct MyStruct
{
uint64_t m_a;
uint64_t m_b;
uint32_t m_c;
uint32_t m_d;
};
// Smoke test: a few allocations of mixed sizes, writes through the payloads,
// one Free, then prints the data to eyeball for corruption.
// NOTE(review): "%lld" is used for uint64_t (should be %llu) and "%ld" for
// uint32_t — works on MSVC in practice but the specifiers are mismatched.
void test0() {
Init();
mems[0] = Allocate(8);
SetMem<uint64_t>(mems[0], 128);
uint64_t t0 = GetMem<uint64_t>(mems[0]);
mems[1] = Allocate(4);
uint64_t t1 = GetMem<uint64_t>(mems[0]);
SetMem<uint32_t>(mems[1], 256);
uint64_t t2 = GetMem<uint64_t>(mems[0]);
mems[2] = Allocate(64);
int* a = (int*)OffsetFromMemSegInfo(mems[2]);
for (int i = 0; i < 16; ++i) {
a[i] = i;
}
mems[3] = Allocate(sizeof(MyStruct) * 8);
MyStruct* b = (MyStruct*)OffsetFromMemSegInfo(mems[3]);
for (int i = 0; i < 8; ++i) {
b[i] = { (uint64_t)i, (uint64_t)i * 2, (uint32_t)i * 3, (uint32_t)i * 4 };
}
for (int i = 0; i < 16; ++i) {
printf("%d ", a[i]);
}
printf("\n");
Free(mems[1]);
printf("%lld\n", GetMem<uint64_t>(mems[0]));
for (int i = 0; i < 16; ++i) {
printf("%d ", a[i]);
}
printf("\n");
for (int i = 0; i < 8; ++i) {
printf("%lld %lld %ld %ld\t", b[i].m_a, b[i].m_b, b[i].m_c, b[i].m_d);
}
Cleanup();
}
// Pointers produced by the custom allocator during the timed run.
TestType* bufferManaged[TestSize];
// Timed run against the custom allocator: TestSize placement-new allocations,
// then TestSize frees. The signal fences keep the compiler from reordering
// work across the timestamp reads.
// NOTE(review): the free timing also includes Cleanup(), and the alloc timing
// includes the summation loop — keep that in mind when comparing numbers.
void test1_managed() {
Init();
/***ALLOCATION START***/
system_clock::time_point beg_alloc = system_clock::now();
TestType sum = 0;
for (uint64_t i = 0; i < TestSize; ++i) {
TestType val = i % INT32_MAX;
bufferManaged[i] = new (OffsetFromMemSegInfo(Allocate(sizeof(TestType)))) TestType(val);
sum += *bufferManaged[i];
}
/***ALLOCATION END***/
std::atomic_signal_fence(std::memory_order_seq_cst);
system_clock::time_point end_alloc = system_clock::now();
printf("managed %llu ns\n", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_alloc - beg_alloc)).count());
printf("sum %d\n", sum);
system_clock::time_point beg_free = system_clock::now();
for (uint64_t i = 0; i < TestSize; ++i) {
Free((MemSegInfo*)OffsetToMemSegInfo(bufferManaged[i]));
}
Cleanup();
std::atomic_signal_fence(std::memory_order_seq_cst);
system_clock::time_point end_free = system_clock::now();
printf("managedfree %llu ns\n\n", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_free - beg_free)).count());
}
// Pointers produced by the global operator new during the baseline run.
TestType* bufferUnmanaged[TestSize];
// Baseline run: identical workload but using plain new/delete, for comparison
// against test1_managed.
void test1_unmnged() {
system_clock::time_point beg_alloc = system_clock::now();
TestType sum = 0;
for (uint64_t i = 0; i < TestSize; ++i) {
TestType val = i % INT32_MAX;
bufferUnmanaged[i] = new TestType(val);
sum += *bufferUnmanaged[i];
}
std::atomic_signal_fence(std::memory_order_seq_cst);
system_clock::time_point end_alloc = system_clock::now();
printf("unmnged %llu ns\n", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_alloc - beg_alloc)).count());
printf("sum %d\n", sum);
system_clock::time_point beg_free = system_clock::now();
for (uint64_t i = 0; i < TestSize; ++i) {
delete bufferUnmanaged[i];
}
std::atomic_signal_fence(std::memory_order_seq_cst);
system_clock::time_point end_free = system_clock::now();
printf("unmngedfree %llu ns\n\n", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_free - beg_free)).count());
}
// Seed rand() from the current wall-clock time, then interleave ten
// managed/unmanaged benchmark pairs.
void test1() {
    const std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
    const auto epoch = now.time_since_epoch();
    const auto value = std::chrono::duration_cast<std::chrono::milliseconds>(epoch);
    // FIX: the original funneled the 64-bit millisecond count through `long`
    // (32-bit on Windows), an implicit narrowing conversion; make the
    // truncation explicit with a cast to srand's parameter type.
    srand(static_cast<unsigned int>(value.count()));
    for (int i = 0; i < 10; i++) {
        printf("test%d\n", i);
        test1_managed();
        test1_unmnged();
        printf("\n");
    }
}
// Entry point for the benchmark suite; currently only runs test1.
void test() {
test1();
}

(FUZZING) Get a pointer to data range of dynamic array

EDIT: Clarification:
If I have an array int* a = new int[10], I want to get a pointer to a, but only the values from 0 to 5, without having to allocate another array for those values.
Original post:
I created a small class to fuzz my functions, but the thing is that it is painfully slow. It takes roughly 10-20 seconds to run my function 1000 times.
I decided to improve my code by allocating a very large array at first, then filling it from 0 to a randomly generated number and then just returning a pointer to that range to use in my function instead of allocating memory and deleting it each time.
Below is my code.
I attempt to allocate 1 million bytes at first, then I want to return a range from 0 to whatever size my class generated. Currently I allocate memory once more for returning it, but that's not efficient.
I use Xorshift to generate random numbers, which should be much faster than rand() so I think besides memory allocation it's pretty good, but any suggestions are very much welcome!
Note: if you do not understand part of my code ask me (it's written quickly, so it might be unintelligible at certain parts) ;)
// Fuzzing helper: owns a 1 MB scratch buffer that is refilled with xorshift
// pseudo-random bytes each iteration, avoiding per-iteration allocation.
class fuzz {
public:
    fuzz() {
        this->alloc_init_buff();
    }
    ~fuzz() {
        this->dealloc_init_buff();
    }
    // Fill the first `size` bytes with random data (clamped to the buffer
    // capacity). Returns the number of bytes written.
    int fill_buff(unsigned int size) {
        if (size > this->m_buffsize) { size = this->m_buffsize; }
        // FIX: loop index was a signed int compared against unsigned size.
        for (unsigned int i = 0; i < size; ++i) {
            this->m_buff[i] = (unsigned char)this->rand_xor();
        }
        return (int)size;
    }
    // Same, but with a random size in [1, m_buffsize).
    int fill_buff() {
        unsigned int size = this->rand_xor(1, this->m_buffsize);
        if (size > this->m_buffsize) { size = this->m_buffsize; }
        for (unsigned int i = 0; i < size; ++i) {
            this->m_buff[i] = (unsigned char)this->rand_xor();
        }
        return (int)size;
    }
    // Return a heap-allocated copy of the first `size` bytes; caller owns it
    // and must delete[] it.
    // BUG FIX: the original returned `unsigned char*&` — a reference to the
    // local `temp`, which dangles the moment the function returns (UB).
    // Returning the pointer by value keeps every existing call site working.
    unsigned char* get_buff(int size) {
        unsigned char* temp = new unsigned char[size];
        memcpy((void*)temp, (void*)this->m_buff, size);
        return temp;
    }
private:
    // Xorshift128 state; fixed seed makes runs reproducible.
    struct xr_xorshift_state {
        unsigned int a = 123456789, b = 362436069, c = 521288629, d = 88675123;
    };
    // Marsaglia xorshift128, masked to 31 bits.
    unsigned int xorshift(xr_xorshift_state* state) {
        unsigned int res = 0;
        res = state->a ^ (state->a << 11);
        state->a = state->b; state->b = state->c; state->c = state->d;
        state->d = state->d ^ (state->d >> 19) ^ (res ^ (res >> 8));
        res &= 0x7fffffff;
        return res;
    }
    unsigned int rand_xor() {
        return this->xorshift(&this->m_state);
    }
    // Random value in [min, max) — note max is exclusive and the modulo
    // introduces a small bias; acceptable for fuzzing.
    unsigned int rand_xor(unsigned int min, unsigned int max) {
        return (min + (this->rand_xor() % (max - min)));
    }
    void alloc_init_buff() {
        this->m_buff = new unsigned char[this->m_buffsize];
    }
    void dealloc_init_buff() {
        delete[] this->m_buff;
    }
    xr_xorshift_state m_state = { 0 };
    unsigned char* m_buff = { 0 };
    unsigned int m_buffsize = { 1000000 };
};
// Return the index of the first '\n' within the first `size` bytes of `text`,
// or 0 if none is found. (0 is ambiguous with "newline at index 0"; kept for
// compatibility with the original contract.)
int find_newline(const char* text, int size) {
    // BUG FIX: the original evaluated `*text` in the while-condition BEFORE
    // the `pos == size` check, reading one byte past the end of the buffer
    // whenever it contained no newline. Bound the loop first, then compare.
    for (int pos = 0; pos < size; ++pos) {
        if (text[pos] == '\n') { return pos; }
    }
    return 0;
}
// Driver: endlessly fuzz find_newline with random buffers, reporting progress
// every 1000 iterations.
// NOTE(review): the loop is `for(;;)` with no exit, so getchar()/return are
// unreachable. Also, if an exception is thrown after get_buff succeeds, the
// `delete[] randdata` is skipped — presumably acceptable for a fuzz driver,
// but worth confirming.
int main() {
fuzz fz = {};
unsigned char* randdata = nullptr;
int lap = 0;
int th = 0;
for (;;) {
if (lap == 1000) {
lap = 0;
++th;
printf("%d thousand laps done!\n", th);
}
try {
int size = fz.fill_buff();
randdata = fz.get_buff(size);
const char* d = (const char*)randdata;
find_newline(d, size);
delete[] randdata;
++lap;
}
catch (...) {
printf("error!\n");
++lap;
}
}
getchar();
return 0;
}

va_arg not incrementing C++

I have a bug in the printf() function I'm implementing for my OS. Basically, the problem is that it doesn't increment through the argument list. For example, let's say I have:
printf("%d %d",19,58);
what will show on my OS is :
19 19
the 58 for some reason is not going through. I have debugged this for quite some time but can't find the problem :( . Here is the stdio.c++:
#include "stdio.h"
// Cursor position within the VGA text screen (VGA_WIDTH x VGA_HEIGHT cells).
static size_t terminal_row = 0;
static size_t terminal_column = 0;
// VGA text-mode framebuffer: one uint16_t per cell, high byte = color
// attribute, low byte = character.
static uint16_t* VideoMemory =((uint16_t*)0xb8000);
// Set after a '%d' specifier is handled so the next call (which receives the
// already-consumed 'd') is skipped.
static bool continue_ex = false;
SerialPort sp_std_io;
// Minimal printf: walks the format string, handing each character plus a
// one-character lookahead to putchar.
// NOTE(review): this is the question's bug — `arg` is passed to putchar BY
// VALUE (on targets where va_list is not an array type), so va_arg advances
// only putchar's copy and every %d re-reads the first argument. The fix
// (per the accepted answer) is to pass the va_list by reference, which also
// requires updating putchar's declaration in stdio.h.
void printf(char *str, ...)
{
va_list arg;
va_start(arg, str);
for(int32_t i=0;str[i]!='\0'; ++i)
{
putchar(str[i],str[i+1],arg);
}
va_end(arg);
}
// Append NUL-terminated `source` to the end of `destination`.
// `destination` must have room for the combined string plus terminator.
void strcat(char *destination, const char *source)
{
    char *dst = destination;
    while (*dst != '\0') {
        ++dst;              // advance to the current terminator
    }
    while (*source != '\0') {
        *dst++ = *source++; // copy characters
    }
    *dst = '\0';            // re-terminate
}
// Write one character at the cursor (used for the '-' sign of negative
// numbers), preserving the cell's attribute byte, and advance the column.
void put_char_helper_neg(char chr)
{
const size_t index = (terminal_row * VGA_WIDTH + terminal_column);
terminal_column++;
VideoMemory[index]= (VideoMemory[index] & 0xFF00)|chr;
}
// Write one character of a converted number at the cursor, preserving the
// cell's attribute byte, and advance the column.
void putstring_t(char str)
{
size_t index = (terminal_row * VGA_WIDTH + terminal_column);
terminal_column++;
VideoMemory[index]= (VideoMemory[index] & 0xFF00)|str;
}
// Emit one character of the format string to VGA memory, handling '\n', '\b'
// and the "%d" specifier. `next_str` is a one-character lookahead; when a
// "%d" is consumed, continue_ex makes the following call (the 'd') a no-op.
// NOTE(review): `arg` is received by value, so va_arg advances only this
// call's copy — the root cause of the question's bug (see the answer below:
// pass va_list by reference, updating stdio.h to match).
void putchar(char str,char next_str, va_list arg)
{
if(!continue_ex)
{
// FIX: ch_per was uint32_t, so `ch_per < 0` below was always false and
// negative numbers were printed as huge unsigned values without a sign.
int32_t ch_per;
char* str_use,str_use_space; // NOTE(review): str_use_space is a plain char (the * binds to str_use only) and is unused
const char per = '%';
if(str == '\b')
{
terminal_column--;
}
const size_t index = (terminal_row * VGA_WIDTH + terminal_column);
char space = ' ';
switch(str)
{
case '\n':
terminal_row++;
terminal_column = 0;
break;
case '\b':
VideoMemory[index]= (VideoMemory[index] & 0xFF00)|space;
break;
case '%':
switch(next_str)
{
case 'd':
ch_per = va_arg(arg,int);
if(ch_per<0)
{
ch_per = -ch_per;
put_char_helper_neg('-');
}
str_use = itoa(ch_per);
terminal_column++;
for(int32_t i=0;str_use[i]!='\0'; ++i)
{
putstring_t(str_use[i]);
}
// sp_std_io.write_number_serial(ch_per);
// sp_std_io.write_string_serial(str_use);
continue_ex = true;
break;
default:
// Unknown specifier: print the '%' literally.
terminal_column++;
VideoMemory[index]= (VideoMemory[index] & 0xFF00)|per;
}
break;
default:
terminal_column++;
VideoMemory[index]= (VideoMemory[index] & 0xFF00)|str;
break;
}
}
else
{
// Previous call consumed this character as part of a specifier.
continue_ex = false;
}
}
// Count the bytes preceding the NUL terminator.
int32_t strlen(int8_t* str)
{
    int32_t count = 0;
    for (const int8_t* p = str; *p != '\0'; ++p) {
        ++count;
    }
    return count;
}
// Convert a signed int to decimal ASCII. Returns a pointer into a static
// buffer: NOT reentrant — each call overwrites the previous result.
// IMPROVED: the original looped on `val % 10` directly, producing garbage for
// negative inputs (callers had to pre-negate). Negative values are now
// handled here via an unsigned magnitude, which also makes INT_MIN safe;
// callers that already pass non-negative values are unaffected.
char *itoa(int val)
{
uint8_t *ptr;
static uint8_t buffer[16]; // fits "-2147483648" plus NUL
ptr = buffer + sizeof(buffer);
*--ptr = '\0';
// Unsigned magnitude: 0u - val avoids signed-negation overflow on INT_MIN.
unsigned int mag = (val < 0) ? (0u - (unsigned int)val) : (unsigned int)val;
if (mag == 0)
{
*--ptr = '0';
}
else while (mag != 0)
{
*--ptr = (uint8_t)((mag % 10) + '0');
mag = mag / 10;
}
if (val < 0)
{
*--ptr = '-';
}
return((char*)ptr);
}
and stdio.h:
// Freestanding stdio replacement for the OS kernel.
// NOTE(review): mixes an include guard with #pragma once (harmless but
// redundant), and _STD_LIB_H_ is a reserved identifier (leading underscore +
// capital) — consider renaming the guard.
#ifndef _STD_LIB_H_
#pragma once
#define _STD_LIB_H_ 1
#include <stddef.h>
#include <stdint.h>
#include <stdarg.h>
#include "math.h"
#include "serial.h"
// VGA text-mode dimensions (80x25 cells).
static const size_t VGA_WIDTH = 80;
static const size_t VGA_HEIGHT = 25;
//static int num_count_viedo_memory = 0;
void printf(char *str,...);
// NOTE(review): va_list is taken by value here — see the %d bug discussed in
// the question; the fix is to take it by reference and update stdio.c++.
void putchar(char str,char next_str,va_list arg);
int32_t strlen(int8_t *str);
void strcat(char * Dest, char const * Src);
//int8_t* str_cat(int8_t *dest, const int8_t *src);
void reverse(char str[], int32_t length);
char* itoa(int val);
#endif
As described above, it is not incrementing through the args for some reason. Help would be appreciated! :)
Pass arg into your putchar function by reference instead of by value:
void putchar(char str,char next_str, va_list& arg)
What's happening is that it gets incremented inside your putchar function, but then the function returns and it has no effect on the variable in printf because putchar is passed a copy rather than a reference to it.

Modifying bits in a byte with a class

I want to directly modify a bit in a byte.
In GCC, you can do it as follow:
// GCC bit-field view of a byte: eight 1-bit fields, packed so the struct
// occupies exactly one byte.
struct virtualByte {
unsigned char b0 : 1;
unsigned char b1 : 1;
unsigned char b2 : 1;
unsigned char b3 : 1;
unsigned char b4 : 1;
unsigned char b5 : 1;
unsigned char b6 : 1;
unsigned char b7 : 1;
} __attribute__((__packed__));
// Reinterpret any byte as a virtualByte and select field b<_pos> by token
// pasting; yields an assignable lvalue for that single bit.
#define sbit(_byte, _pos) (((volatile struct virtualByte *)&_byte)->b ## _pos)
Usage:
unsigned char myByte = 0x00;
// Alias bit 0 of myByte; assigning through the alias rewrites only that bit.
#define firstBit sbit(myByte, 0)
firstBit = 1; // Implicit myByte |= 0x01;
To make things neater I want to have class that does this for me. I came up with the following concept:
// Desired API: wrap an existing byte and index it per bit.
unsigned char myByteRef = 0x00;
Byte myByte(&myByteRef);
myByte[0] = 1; // Implicit myByteRef |= 0x01;
fprintf(stderr, "%2.2X\n", myByteRef);
But this does not work because in c++ you cannot return a reference to a single bit. Overloading the constructor does not work either.
Is there a possibility to implement such behaviour? The assignment operator should directly modify its underlying byte (not a set of bytes).
You want to use std::bitset:
std::bitset<12> someBits; // 12 bits, all zero-initialized
someBits[0] = true; // set 1st bit
std::cout << someBits.count() << '\n'; // prints 1
// operator[] returns a proxy object (std::bitset::reference), not a bool&:
std::bitset<12>::reference bit5 = someBits[5];
bit5 = true;
std::cout << someBits.count() << '\n'; // prints 2
You can use the index operator to return a reference to a bit in the way you want. Note that this reference is not a bool& but rather a std::bitset::reference:
Finally came to a solution, many thanks to #doc!
My solution:
// A single writable bit at a fixed position within a byte.
class Bit {
private:
    // BUG FIX: both members were left uninitialized by the default
    // constructor, so setValue()'s `if (!this->byte)` guard read an
    // indeterminate pointer (undefined behavior) on any unbound Bit.
    volatile uint8_t *byte{ nullptr };
    uint8_t bitPos{ 0 };
public:
    Bit(void)
    {
    }
    // Bind this object to bit `bitPos` (clamped to 7) of `*byte`.
    void init(volatile uint8_t *const byte, uint8_t const bitPos)
    {
        this->byte = byte;
        this->bitPos = (bitPos > 7u ? 7u : bitPos);
    }
    // Set or clear the bound bit; a no-op when the Bit is unbound.
    void setValue(bool const bitValue)
    {
        if (!this->byte) return;
        if (bitValue) {
            *this->byte |= (1u << this->bitPos);
        } else {
            *this->byte &= ~(1u << this->bitPos);
        }
    }
};
class BitReference {
private:
Bit &ref;
public:
BitReference(Bit &ref) : ref(ref)
{
}
void operator=(bool const bitValue)
{
this->ref.setValue(bitValue);
}
};
// Wraps an existing byte and exposes per-bit assignment via operator[].
class Byte {
private:
Bit bits[8]; // one pre-bound Bit per position
public:
// Bind all eight Bit objects to the target byte.
Byte(volatile uint8_t *const byte)
{
for (unsigned i = 0; i < 8; ++i) {
this->bits[i].init(byte, i);
}
}
/* This did the trick :)! */
// Return a proxy for bit `index` (clamped to 7); assignment through the
// proxy writes the single bit.
BitReference operator[](size_t index)
{
if (index > 7) index = 7;
return BitReference(this->bits[index]);
}
};
Usage:
uint8_t myPort = 0x00; // stands in for a memory-mapped I/O register
int main(int const argc, const char **const argv)
{
Byte abc(&myPort);
abc[0] = 1;
abc[1] = 1;
abc[2] = 1;
abc[3] = 1;
// Expect "0F": the four low bits were set one at a time.
fprintf(stderr, "%2.2X\n", myPort);
return 0;
}

Odd performance issue with nested for loops

Below is the full source code you can just copy paste into Visual Studio for easy repro.
#include <Windows.h>
#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>
LARGE_INTEGER gFreq; // QueryPerformanceFrequency, sampled once in _tmain
struct CProfileData;
// Yes, we map the pointer itself not the string, for performance reasons
std::vector<CProfileData*> gProfileData; // registry of every profile point
// simulate a draw buffer access to avoid CBlock::Draw being optimized away
float gDrawBuffer = 0;
// Simple QPC-based stopwatch; starts on construction.
struct CTimer
{
CTimer()
{
Reset();
}
// Microseconds elapsed since construction / last Reset().
size_t GetElapsedMicro()
{
LARGE_INTEGER now;
::QueryPerformanceCounter(&now);
return (1000000 * (now.QuadPart - m_timer.QuadPart)) / gFreq.QuadPart;
}
inline void Reset()
{
::QueryPerformanceCounter(&m_timer);
}
LARGE_INTEGER m_timer;
};
// One accumulator record per profiled location; registers itself in the
// global gProfileData list on construction.
struct CProfileData
{
// FIX: initializers reordered to match member declaration order. Members
// are always initialized in declaration order regardless of the list order,
// so the original compiled to the same thing but triggered -Wreorder and
// misled readers about initialization sequence.
CProfileData() : m_totalTime(0), m_minTime(-1), m_maxTime(0),
m_hitCount(0), m_name(NULL)
{
gProfileData.push_back(this);
}
size_t m_totalTime;
size_t m_minTime; // size_t(-1) sentinel: first recorded sample always wins
size_t m_maxTime;
size_t m_hitCount;
const char * m_name; // set lazily by CSimpleProfiler's destructor
};
// RAII profiler: construction timestamps entry, destruction accumulates the
// elapsed microseconds into the shared CProfileData record.
class CSimpleProfiler
{
public:
CSimpleProfiler(const char * aLocationName, CProfileData * aData)
: m_location(aLocationName), m_data(aData)
{
::QueryPerformanceCounter(&m_clock);
}
// Record hit count, total, min and max on scope exit.
~CSimpleProfiler()
{
CProfileData & data = *m_data;
data.m_name = m_location;
++data.m_hitCount;
LARGE_INTEGER now;
::QueryPerformanceCounter(&now);
size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
data.m_totalTime += elapsed;
elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
}
// Dump a fixed-width table of all registered profile points to the debugger.
static void PrintAll()
{
std::stringstream str;
str.width(20);
str << "Location";
str.width(15);
str << "Total time";
str.width(15);
str << "Average time";
str.width(15);
str << "Hit count";
str.width(15);
str << "Min";
str.width(15);
str << "Max" << std::endl;
::OutputDebugStringA(str.str().c_str());
for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
{
CProfileData & data = **i;
std::stringstream str;
str.width(20);
str << data.m_name;
str.width(15);
str << data.m_totalTime;
str.width(15);
str << data.m_totalTime / (float)data.m_hitCount;
str.width(15);
str << data.m_hitCount;
str.width(15);
str << data.m_minTime;
str.width(15);
str << data.m_maxTime << std::endl;
::OutputDebugStringA(str.str().c_str());
}
}
// Reset all counters between iterations.
// NOTE(review): m_minTime is reset to 0, not to the size_t(-1) sentinel the
// constructor uses — after a Clear(), `elapsed < 0` is never true, so the
// Min column stays 0 (consistent with the "Min 0" in the sample output).
static void Clear()
{
for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
{
(*i)->m_totalTime = 0;
(*i)->m_minTime = 0;
(*i)->m_maxTime = 0;
(*i)->m_hitCount = 0;
}
}
private:
LARGE_INTEGER m_clock;
const char * m_location;
CProfileData * m_data;
};
#define PROFILING_ENABLED
#ifdef PROFILING_ENABLED
// FIX: pasting `## __LINE__` directly does NOT macro-expand __LINE__, so the
// original always produced the same identifiers `pdata__LINE__`/`p__LINE__`
// — two profile points in the same scope would collide. The standard
// two-level concat idiom forces __LINE__ to expand first.
#define SP_CONCAT2(a, b) a ## b
#define SP_CONCAT(a, b) SP_CONCAT2(a, b)
#define SIMPLE_PROFILE \
static CProfileData SP_CONCAT(pdata, __LINE__); \
CSimpleProfiler SP_CONCAT(p, __LINE__)(__FUNCTION__, & SP_CONCAT(pdata, __LINE__))
#define SIMPLE_PROFILE_WITH_NAME(Name) \
static CProfileData SP_CONCAT(pdata, __LINE__); \
CSimpleProfiler SP_CONCAT(p, __LINE__)(Name, & SP_CONCAT(pdata, __LINE__))
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif
// Thrash a 256 KB working set to evict cache contents between measurements.
// NOTE: ~10^9 writes — deliberately slow; the buffer is allocated once and
// intentionally never freed (static).
void InvalidateL1Cache()
{
    const int size = 256 * 1024;
    static char *c = (char *)malloc(size);
    if (!c) return; // FIX: the original wrote through an unchecked malloc result
    for (int i = 0; i < 0x0fff; i++)
        for (int j = 0; j < size; j++)
            c[j] = i*j;
}
// Repro driver: builds 109 rows of ~25 blocks, then loops forever timing the
// innermost draw loop and dumping the profile table whenever one row takes
// more than 1000 microseconds.
// NOTE(review): `while (1)` never exits, so `return 0;` is unreachable.
int _tmain(int argc, _TCHAR* argv[])
{
::QueryPerformanceFrequency(&gFreq);
LARGE_INTEGER pc;
::QueryPerformanceCounter(&pc);
// Minimal stand-in for the real draw object; writes to the global
// gDrawBuffer so the loop cannot be optimized away.
struct CBlock
{
float x;
float y;
void Draw(float aBlend)
{
for (size_t i = 0; i < 100; ++i )
gDrawBuffer += aBlend;
}
};
typedef std::vector<std::vector<CBlock>> Layer;
typedef std::vector<Layer> Layers;
Layers mBlocks;
// populate with dummy data;
mBlocks.push_back(Layer());
Layer & layer = mBlocks.back();
layer.resize(109);
srand(0); // for reproducibility (determinism)
for (auto i = layer.begin(); i != layer.end(); ++i)
{
i->resize(25 + rand() % 10 - 5);
}
// end populating dummy data
while (1)
{
CSimpleProfiler::Clear();
// NOTE(review): rand() % 100 can be 0, making aBlend infinite (float
// division by zero) — harmless for the timing but worth knowing.
float aBlend = 1.f / (rand() % 100);
{
for (auto i = mBlocks.begin(); i != mBlocks.end(); ++i)
{
for (auto j = i->begin(); j != i->end(); ++j)
{
CTimer t;
{
SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
for (auto blockIt = j->begin(); blockIt != j->end();)
{
CBlock * b = nullptr;
{
b = &*blockIt;
}
{
b->Draw(aBlend);
}
{
++blockIt;
}
}
}
// Report when one row of blocks exceeded 1 ms.
if (t.GetElapsedMicro() > 1000)
{
::OutputDebugStringA("SLOWDOWN!\n");
CSimpleProfiler::PrintAll();
}
}
}
}
}
return 0;
}
I get the following profiling from time to time, expressed in microseconds:
SLOWDOWN!
Location Total time Average time Hit count Min Max
Main_Draw_3 2047 36.5536 56 0 1040
This spikes from time to time. Normally, it takes 100 microseconds for Main_Draw_3 block to finish, but it spikes to 1000 (the Max column) from time to time. What causes this?
I'm aware cache misses could play a role, but is it really that in this case?... What is happening here and how can I mitigate this?
More info:
compiler VS 2013, compiled with Maximize Speed (/O2)
I think there might be two issues:
Are you compiling with optimizations on? What are the flags?
Maybe you could increase the sample size (by doing for instance ten (or hundred, or thousand etc) runs of this code in one profiling run). The reason is that if the sample size is small, the standard deviation is very high