Related
Have made function that operates on several streams of data in same time, creates output result which is put to destination stream. It has been put huge amount of time to optimize performance of this function (openmp, intrinsics, and etc...). And it performs beautifully.
There is alot math involved here, needless to say very long function.
Now I want to implement in same function with math replacement code for each instance of this without writing each version of this function. Where I want to differentiate between different instances of this function using only #defines or inlined function (code has to be inlined in each version).
Went for templates, but templates allow only type specifiers, and realized that #defines can't be used here. Remaining solution would be inlined math functions, so simplified idea is to create header like this:
'alm_quasimodo.h':
#pragma once
typedef struct ALM_DATA
{
int l, t, r, b;
int scan;
BYTE* data;
} ALM_DATA;
typedef BYTE (*MATH_FX)(BYTE&, BYTE&);
// etc
inline BYTE math_a1(BYTE& A, BYTE& B){ return ((BYTE)((B > A) ? B:A)); }
inline BYTE math_a2(BYTE& A, BYTE& B){ return ((BYTE)(255 - ((long)((long)(255 - A) * (255 - B)) >> 8))); }
inline BYTE math_a3(BYTE& A, BYTE& B){ return ((BYTE)((B < 128)?(2*(((long)A>>1)+64))*((float)B/255):(255-(2*(255-(((long)A>>1)+64))*(float)(255-B)/255)))); }
// etc
template <typename MATH>
inline int const template_math_av (MATH math, ALM_DATA& a, ALM_DATA& b)
{
// ultra simplified version of very complex code
for (int y = a.t; y <= a.b; y++)
{
int yoffset = y * a.scan;
for (int x = a.l; x <= a.r; x++)
{
int xoffset = yoffset + x;
a.data[xoffset] = math(a.data[xoffset], b.data[xoffset]);
}
}
return 0;
}
ALM_API int math_caller(int condition, ALM_DATA& a, ALM_DATA& b);
and math_caller is defined in 'alm_quasimodo.cpp' as follows:
#include "stdafx.h"
#include "alm_quazimodo.h"
ALM_API int math_caller(int condition, ALM_DATA& a, ALM_DATA& b)
{
switch(condition)
{
case 1: return template_math_av<MATH_FX>(math_a1, a, b);
break;
case 2: return template_math_av<MATH_FX>(math_a2, a, b);
break;
case 3: return template_math_av<MATH_FX>(math_a3, a, b);
break;
// etc
}
return -1;
}
Main concern here is optimization, mainly in-lining of MATH function code, and not to break existing optimizations of original code. Without writing each instance of function for specific math operation, of course ;)
So does this template inlines properly all math functions?
And any suggestions how to optimize this function template?
If nothing, thanks for reading this lengthy question.
It all depends on your compiler, optimization level, and how and where are math_a1 to math_a3 functions defined.
Usually, the compiler can optimize this if the functions in question are inline function in the same compilation unit as the rest of the code.
If this doesn't happen for you, you may want to consider functors instead of functions.
Here are some simple examples I experimented with. You can do the same for your function, and check the behavior of different compilers.
For my example, GCC 7.3 and clang 6.0 are pretty good in optimizing-out function calls (provided they see the definition of the function of course). However, somewhat surprisingly, ICC 18.0.0 is only able to optimize-out functors and closures. Even inline functions give it some trouble.
Just to have some code here in case the link stops working in the future.
For the following code:
template <typename T, int size, typename Closure>
T accumulate(T (&array)[size], T init, Closure closure) {
for (int i = 0; i < size; ++i) {
init = closure(init, array[i]);
}
return init;
}
int sum(int x, int y) { return x + y; }
inline int sub_inline(int x, int y) { return x - y; }
struct mul_functor {
int operator ()(int x, int y) const { return x * y; }
};
extern int extern_operation(int x, int y);
int accumulate_function(int (&array)[5]) {
return accumulate(array, 0, sum);
}
int accumulate_inline(int (&array)[5]) {
return accumulate(array, 0, sub_inline);
}
int accumulate_functor(int (&array)[5]) {
return accumulate(array, 1, mul_functor());
}
int accumulate_closure(int (&array)[5]) {
return accumulate(array, 0, [](int x, int y) { return x | y; });
}
int accumulate_exetern(int (&array)[5]) {
return accumulate(array, 0, extern_operation);
}
GCC 7.3 (x86) produces the following assembly:
sum(int, int):
lea eax, [rdi+rsi]
ret
accumulate_function(int (&) [5]):
mov eax, DWORD PTR [rdi+4]
add eax, DWORD PTR [rdi]
add eax, DWORD PTR [rdi+8]
add eax, DWORD PTR [rdi+12]
add eax, DWORD PTR [rdi+16]
ret
accumulate_inline(int (&) [5]):
mov eax, DWORD PTR [rdi]
neg eax
sub eax, DWORD PTR [rdi+4]
sub eax, DWORD PTR [rdi+8]
sub eax, DWORD PTR [rdi+12]
sub eax, DWORD PTR [rdi+16]
ret
accumulate_functor(int (&) [5]):
mov eax, DWORD PTR [rdi]
imul eax, DWORD PTR [rdi+4]
imul eax, DWORD PTR [rdi+8]
imul eax, DWORD PTR [rdi+12]
imul eax, DWORD PTR [rdi+16]
ret
accumulate_closure(int (&) [5]):
mov eax, DWORD PTR [rdi+4]
or eax, DWORD PTR [rdi+8]
or eax, DWORD PTR [rdi+12]
or eax, DWORD PTR [rdi]
or eax, DWORD PTR [rdi+16]
ret
accumulate_exetern(int (&) [5]):
push rbp
push rbx
lea rbp, [rdi+20]
mov rbx, rdi
xor eax, eax
sub rsp, 8
.L8:
mov esi, DWORD PTR [rbx]
mov edi, eax
add rbx, 4
call extern_operation(int, int)
cmp rbx, rbp
jne .L8
add rsp, 8
pop rbx
pop rbp
ret
Edit: It seems this is a compiler bug indeed: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82803
I am writing a wrapper for writing logs that uses TLS to store a std::stringstream buffer. This code will be used by shared-libraries. When looking at the code on godbolt.org it seems that neither gcc nor clang will cache the result of a TLS lookup (the loop repeatedly calls '__tls_get_addr()' when I believe I have designed my class in a way that should let it.
#include <sstream>
class LogStream
{
public:
LogStream()
: m_buffer(getBuffer())
{
}
LogStream(std::stringstream& buffer)
: m_buffer(buffer)
{
}
static std::stringstream& getBuffer()
{
thread_local std::stringstream buffer;
return buffer;
}
template <typename T>
inline LogStream& operator<<(const T& t)
{
m_buffer << t;
return *this;
}
private:
std::stringstream& m_buffer;
};
int main()
{
LogStream log{};
for (int i = 0; i < 12345678; ++i)
{
log << i;
}
}
Looking at the assembly code output both gcc and clang generate pretty similar output:
clang 5.0.0:
xor ebx, ebx
.LBB0_3: # =>This Inner Loop Header: Depth=1
data16
lea rdi, [rip + LogStream::getBuffer[abi:cxx11]()::buffer[abi:cxx11]#TLSGD]
data16
data16
rex64
call __tls_get_addr#PLT // Called on every loop iteration.
lea rdi, [rax + 16]
mov esi, ebx
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)#PLT
inc ebx
cmp ebx, 12345678
jne .LBB0_3
gcc 7.2:
xor ebx, ebx
.L3:
lea rdi, guard variable for LogStream::getBuffer[abi:cxx11]()::buffer#tlsld[rip]
call __tls_get_addr#PLT // Called on every loop iteration.
mov esi, ebx
add ebx, 1
lea rdi, LogStream::getBuffer[abi:cxx11]()::buffer#dtpoff[rax+16]
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)#PLT
cmp ebx, 12345678
jne .L3
How can I convince both compilers that the lookup doesn't need to be repeatedly done?
Compiler options: -std=c++11 -O3 -fPIC
Godbolt link
This really looks like an optimization bug in both Clang and GCC.
Here's what I think happens. (I might be completely off.) The compiler completely inlines everything down to this code:
int main()
{
// pseudo-access
std::stringstream& m_buffer = LogStream::getBuffer::buffer;
for (int i = 0; i < 12345678; ++i)
{
m_buffer << i;
}
}
And then, not realizing that access to a thread-local is very expensive under -fPIC, it decides that the temporary reference to the global is not necessary and inlines that as well:
int main()
{
for (int i = 0; i < 12345678; ++i)
{
// pseudo-access
LogStream::getBuffer::buffer << i;
}
}
Whatever actually happens, this is clearly a pessimization of the code your wrote. You should report this as a bug to GCC and Clang.
GCC bugtracker: https://gcc.gnu.org/bugzilla/
Clang bugtracker: https://bugs.llvm.org/
I have simple class using a kind of ATL database access.
All functions are defined in a header file.
The problematic functions all do the same. There are some macros in use. The generated code looks like this
void InitBindings()
{
if (sName) // Static global char*
m_sTableName = sName; // Save into member
{ AddCol("Name", some_constant_data... _GetOleDBType(...), ...); };
{ AddCol("Name1", some_other_constant_data_GetOleDBType(...), ...); };
...
}
AddCol returns a reference to a structure, but as you see it is ignored.
When I look into the assembler code where I have a function that uses 6 AddCol calls I can see that the function requires 2176 bytes of stack space. I have functions that requires 20kb and more. And in the debugger I can see that the stack isn't use at all. (All initialized to 0xCC and never touched)
See assembler code at the end.
The problem can be seen with VS-2015, and VS-2017.Only in Debug mode.
In Release mode the function reserves no extra stack space at all.
The only rule I see is; more AddCol calls, will cause more stack to be reserved. I can see that approximativ 500bytes per AddCol call is reserved.
Again: The function returns no object, it returns a reference to the binding information.
I already used the following pragmas in front of the function (but inside the class definition in the header):
__pragma(runtime_checks("", off)) __pragma(optimize("ts", on)) __pragma(strict_gs_check(push, off))
But no avail. This pragmas should turn optimization on, switches off runtime checks and stack checks. How can I reduce this unneeded stack space that is allocated. In some cases I can see stack overflows in the debug version, when this functions are used. No problems in the release version.
; 325 : BIND_BEGIN(CMasterData, _T("tblMasterData"))
push ebp
mov ebp, esp
sub esp, 2176 ; 00000880H
push ebx
push esi
push edi
mov DWORD PTR _this$[ebp], ecx
mov eax, OFFSET ??_C#_1BM#GOLNKAI#?$AAt?$AAb?$AAl?$AAM?$AAa?$AAs?$AAt?$AAe?$AAr?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
test eax, eax
je SHORT $LN2#InitBindin
push OFFSET ??_C#_1BM#GOLNKAI#?$AAt?$AAb?$AAl?$AAM?$AAa?$AAs?$AAt?$AAe?$AAr?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
add ecx, 136 ; 00000088H
call DWORD PTR __imp_??4?$CStringT#_WV?$StrTraitMFC_DLL#_WV?$ChTraitsCRT#_W#ATL#####ATL##QAEAAV01#PB_W#Z
$LN2#InitBindin:
; 326 : // Columns:
; 327 : B$C_IDENT (_T("Id"), m_lId);
push 0
push 0
push 1
push 4
push 0
call ?_GetOleDBType#ATL##YAGAAJ#Z ; ATL::_GetOleDBType
add esp, 4
movzx eax, ax
push eax
push 0
push OFFSET ??_C#_15NCCOGFKM#?$AAI?$AAd?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol
; 328 : B$C (_T("Name"), m_szName);
push 0
push 0
push 0
push 122 ; 0000007aH
mov eax, 4
push eax
call ?_GetOleDBType#ATL##YAGQA_W#Z ; ATL::_GetOleDBType
add esp, 4
movzx ecx, ax
push ecx
push 4
push OFFSET ??_C#_19DINFBLAK#?$AAN?$AAa?$AAm?$AAe?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol
; 329 : B$C (_T("Data"), m_data);
push 0
push 0
push 0
push 4
push 128 ; 00000080H
call ?_GetOleDBType#ATL##YAGAAVCComBSTR#1##Z ; ATL::_GetOleDBType
add esp, 4
movzx eax, ax
push eax
push 128 ; 00000080H
push OFFSET ??_C#_19IEEMEPMH#?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol
It is a compiler bug. Already known in connect.
EDIT The problem seams to be fixed in VS-2017 15.5.1
The problem has to do with a bug in the built in offsetof.
It is not possible for me to #undef _CRT_USE_BUILTIN_OFFSETOF as written in this case.
For me it only works to #undef offsetof and to use one of this:
#define myoffsetof1(s,m) ((size_t)&reinterpret_cast<char const volatile&>((((s*)0)->m)))
#define myoffsetof2(s, m) ((size_t)&(((s*)0)->m))
#undef offsetof
#define offsetof myoffsetof1
All ATL DB consumers are affected.
Here is a minimum repro, that shows the bug. Set a breakpint on the Init function. Look into the assembler code and wonder how much stack is used!
// StackUsage.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <string>
#include <list>
#include <iostream>
using namespace std;
struct CRec
{
char t1[20];
char t2[20];
char t3[20];
char t4[20];
char t5[20];
int i1, i2, i3, i4, i5;
GUID g1, g2, g3, g4, g5;
DBTIMESTAMP d1, d2, d3, d4, d5;
};
#define sizeofmember(s,m) sizeof(reinterpret_cast<const s *>(0)->m)
#define typeofmember(c,m) _GetOleDBType(((c*)0)->m)
#define myoffsetof1(s,m) ((size_t)&reinterpret_cast<char const volatile&>((((s*)0)->m)))
#define myoffsetof2(s, m) ((size_t)&(((s*)0)->m))
// Undef this lines to fix the bug
// #undef offsetof
// #define offsetof myoffsetof1
#define COL(n,v) { AddCol(n,offsetof(CRec,v),typeofmember(CRec,v),sizeofmember(CRec,v)); }
class CFoo
{
public:
CFoo()
{
Init();
}
void Init()
{
COL("t1", t1);
COL("t2", t2);
COL("t3", t3);
COL("t4", t4);
COL("t5", t5);
COL("i1", i1);
COL("i2", i2);
COL("i3", i3);
COL("i4", i4);
COL("i5", i5);
COL("g1", g1);
COL("g2", g2);
COL("g2", g3);
COL("g2", g4);
COL("g2", g5);
COL("d1", d1);
COL("d2", d2);
COL("d2", d3);
COL("d2", d4);
COL("d2", d5);
}
void AddCol(PCSTR szName, ULONG nOffset, DBTYPE wType, ULONG nSize)
{
cout << szName << '\t' << nOffset << '\t' << wType << '\t' << nSize << endl;
}
};
int main()
{
CFoo foo;
return 0;
}
So I've asked this before but with significantly less detail. The question title accurately describes the problem: I have a method in C++ that I am trying to call from assembly (x86) that has both parameters and a return value. I have a rough understanding, at best, of assembly and a fairly solid understanding of C++ (otherwise I would not have undertaken this problem). Here's what I have as far as code goes:
// methodAddr is a pointer to the method address
void* methodAddr = method->Address;
// buffer is an int array of parameter values. The parameters can be anything (of any type)
// but are copied into an int array so they can be pushed onto the stack in reverse order
// 4 bytes at a time (as in push (int)). I know there's an issue here that is irrelevent to my baseline testing, in that if any parameter is over 4 bytes it will be broken and
// reversed (which is not good) but for basic testing this isn't an issue, so overlook this.
for (int index = bufferElementCount - 1; index >= 0; index--)
{
int val = buffer[index];
__asm
{
push val
}
}
int returnValueCount = 0;
// if there is a return value, allocate some space for it and push that onto the stack after
// the parameters have been pushed on
if (method->HasReturnValue)
{
*returnSize = method->ReturnValueSize;
outVal = new char[*returnSize];
returnValueCount = (*returnSize / 4) + (*returnSize % 4 != 0 ? 1 : 0);
memset(outVal, 0, *returnSize);
for (int index = returnValueCount - 1; index >= 0; index--)
{
char* addr = ((char*)outVal) + (index * 4);
__asm
{
push addr
}
}
}
// calculate the stack pointer offset so after the call we can pop the parameters and return value
int espOffset = (bufferElementCount + returnValueCount) * 4;
// call the method
__asm
{
call methodAddr;
add esp, espOffset
};
For my basic testing I am using a method with the following signature:
Person MyMethod3( int, char, int );
The problem is this: when omit the return value from the method signature, all of the parameter values are properly passed. But when I leave the method as is, the parameter data that is passed is incorrect but the value returned is correct. So my question, obviously, is what is wrong? I've tried pushing the return value space onto the stack before the parameters. The person structure is as follows:
class Person
{
public:
Text Name;
int Age;
float Cash;
ICollection<Person*>* Friends;
};
Any help would be greatly appreciated. Thanks!
I'm using Visual Studio 2013 with the November 2013 CTP compiler for C++, targeting x86.
As it relates to disassembly, this is the straight method call:
int one = 876;
char two = 'X';
int three = 9738;
Person p = MyMethod3(one, two, three);
And here is the disassembly for that:
00CB0A20 mov dword ptr [one],36Ch
char two = 'X';
00CB0A27 mov byte ptr [two],58h
int three = 9738;
00CB0A2B mov dword ptr [three],260Ah
Person p = MyMethod3(one, two, three);
00CB0A32 push 10h
00CB0A34 lea ecx,[p]
00CB0A37 call Person::__autoclassinit2 (0C6AA2Ch)
00CB0A3C mov eax,dword ptr [three]
00CB0A3F push eax
00CB0A40 movzx ecx,byte ptr [two]
00CB0A44 push ecx
00CB0A45 mov edx,dword ptr [one]
00CB0A48 push edx
00CB0A49 lea eax,[p]
00CB0A4C push eax
00CB0A4D call MyMethod3 (0C6B783h)
00CB0A52 add esp,10h
00CB0A55 mov dword ptr [ebp-4],0
My interpretation of this is as follows:
Execute the assignments to the local variables. Then create the output register. Then put the parameters in a particular register (the order here happens to be eax, ecx, and edx, which makes sense (eax and ebx are for one, ecx is for two, and edx and some other register for the last parameter?)). Then call LEA (load-effective address) which I don't understand but have understood to be a MOV. Then it calls the method with an address as the parameter? And then moves the stack pointer to pop the parameters and return value.
Any further explanation is appreciated, as I'm sure my understanding here is somewhat flawed.
I've run into an issue porting a codebase from linux (gcc) to windows (msvc). It seems like the C99 function vsscanf isn't available and has no obvious replacement.
I've read about a solution using the internal function _input_l and linking statically to the crt runtime, but unfortunately I cannot link statically since it would mess with all the plugins (as dlls) being loaded by the application.
So is there any replacement or a way to write a wrapper for vsscanf?
Update 2016-02-24:
When this was first asked there was no native replacement but since then MSVC has implemented support for this and much more.
VS2013 and later implements vsscanf and friends.
C++11 includes support as well.
A hack that should work:
int vsscanf(const char *s, const char *fmt, va_list ap)
{
void *a[20];
int i;
for (i=0; i<sizeof(a)/sizeof(a[0]); i++) a[i] = va_arg(ap, void *);
return sscanf(s, fmt, a[0], a[1], a[2], a[3], a[4], a[5], a[6], /* etc... */);
}
Replace 20 with the max number of args you think you might need. This code isn't terribly portable but it's only intended to be used on one particular broken system missing vsscanf so that shouldn't matter so much.
A quick search turned up several suggestions, including http://www.flipcode.net/archives/vsscanf_for_Win32.shtml
As this is tagged C++ have you considered just biting the bullet and moving away from the scanf line of functions completely? The C++ idiomatic way would be to use a std::istringstream. Rewriting to make use of that instead of looking for a vsscanf replacement would possibly be easier and more portable, not to mention having much greater type safety.
Funny it never came up for me before today. I could've sworn I'd used the function in the past. But anyway, here's a solution that works and is as safe as your arguments and format string:
template < size_t _NumArgs >
int VSSCANF_S(LPCTSTR strSrc, LPCTSTR ptcFmt, INT_PTR (&arr)[_NumArgs]) {
class vaArgs
{
vaArgs() {}
INT_PTR* m_args[_NumArgs];
public:
vaArgs(INT_PTR (&arr)[_NumArgs])
{
for(size_t nIndex=0;nIndex<_NumArgs;++nIndex)
m_args[nIndex] = &arr[nIndex];
}
};
return sscanf_s(strSrc, ptcFmt, vaArgs(arr));
}
///////////////////////////////////////////////////////////////////////////////
int _tmain(int, LPCTSTR argv[])
{
INT_PTR args[3];
int nScanned = VSSCANF_S(_T("-52 Hello 456 #"), _T("%d Hello %u %c"), args);
return printf(_T("Arg1 = %d, arg2 = %u, arg3 = %c\n"), args[0], args[1], args[2]);
}
Out:
Arg1 = -52, arg2 = 456, arg3 = #
Press any key to continue . . .
Well I can't get the formatting right but you get the idea.
if you want to wrap sscanf and you are using C++11, you can do this:
template<typename... Args>
int mysscanf(const char* str, const char* fmt, Args... args) {
//...
return sscanf(str, fmt, args...);
}
to make this work on msvc, you need to download this update:
http://www.microsoft.com/en-us/download/details.aspx?id=35515
modified from :
http://www.gamedev.net/topic/310888-no-vfscanf-in-visual-studio/
#if defined(_WIN32) && (_MSC_VER <= 1500)
static int vsscanf(
const char *buffer,
const char *format,
va_list argPtr
)
{
// Get an upper bound for the # of args
size_t count = 0;
const char* p = format;
while(1)
{
char c = *(p++);
if (c == 0)
break;
if (c == '%' && (p[0] != '*' && p[0] != '%'))
++count;
}
if (count <= 0)
return 0;
int result;
// copy stack pointer
_asm
{
mov esi, esp;
}
// push variable parameters pointers on stack
for (int i = count - 1; i >= 0; --i)
{
_asm
{
mov eax, dword ptr[i];
mov ecx, dword ptr [argPtr];
mov edx, dword ptr [ecx+eax*4];
push edx;
}
}
int stackAdvance = (2 + count) * 4;
_asm
{
// now push on the fixed params
mov eax, dword ptr [format];
push eax;
mov eax, dword ptr [buffer];
push eax;
// call sscanf, and more the result in to result
call dword ptr [sscanf];
mov result, eax;
// restore stack pointer
mov eax, dword ptr[stackAdvance];
add esp, eax;
}
return result;
}
#endif // _WIN32 / _MSC_VER <= 1500
tested only on Visual Studio 2008