Error compiling in Assembly Code for cmpxchg16b - c++

I am working on a project that requires a double-width-compare-and-swap operation (cmpxchg16b). I found the following code by luke h, however when I compile it with "g++-4.7 -g -DDEBUG=1 -std=c++0x dwcas2.c -o dwcas2.o" I get the following error:
Error:
g++-4.7 -g -DDEBUG=1 -m64 -std=c++0x dwcas2.c -o dwcas2.o
dwcas2.c: Assembler messages:
dwcas2.c:29: Error: junk `ptr ' after expression
Any ideas why?, I feel like it is something small and easy to fix, I just can not see it.
Computer Specs:
64-core ThinkMate RAX QS5-4410 server running Ubuntu 12.04 LTS. It is a NUMA system with four AMD Opteron 6272 CPUs (16 cores per chip #2.1 GHz) and 314 GB of shared memory.
Code:
#include <stdint.h>
namespace types
{
struct uint128_t
{
uint64_t lo;
uint64_t hi;
}
__attribute__ (( __aligned__( 16 ) ));
}
template< class T > inline bool cas( volatile T * src, T cmp, T with );
template<> inline bool cas( volatile types::uint128_t * src, types::uint128_t cmp, types::uint128_t with )
{
bool result;
__asm__ __volatile__
(
"lock cmpxchg16b oword ptr %1\n\t"
"setz %0"
: "=q" ( result )
, "+m" ( *src )
, "+d" ( cmp.hi )
, "+a" ( cmp.lo )
: "c" ( with.hi )
, "b" ( with.lo )
: "cc"
);
return result;
}
int main()
{
using namespace types;
uint128_t test = { 0xdecafbad, 0xfeedbeef };
uint128_t cmp = test;
uint128_t with = { 0x55555555, 0xaaaaaaaa };
return ! cas( & test, cmp, with );
}

On x86 GCC defaults to using AT&T syntax assembly, but your source is in Intel syntax. You probably also need "memory" in the clobber list.

Related

C++ Inline ASM code to get clock frequency for ARM

I was trying to compile ODE(Open Dynamics Engine) physics C++ library for Android Native application on Android Studio.
When i tried to build it, it gave me some error telling that some inline ASM code is not correct as they are written for INTEL processor syntax. This are mostly to get CPU clock frequency for physics simulation purpose.
(Editor's note: this x86 GNU C inline asm is inefficient and not even safe or portable. See How to get the CPU cycle count in x86_64 from C++? for correct ways to use i386 / x86-64 rdtsc.)
static inline void getClockCount (unsigned long cc[2])
{
#ifndef X86_64_SYSTEM
asm volatile (
"rdtsc\n"
"movl %%eax,(%%esi)\n"
"movl %%edx,4(%%esi)\n"
: : "S" (cc) : "%eax","%edx","cc","memory");
#else
asm volatile (
"rdtsc\n"
"movl %%eax,(%%rsi)\n"
"movl %%edx,4(%%rsi)\n"
: : "S" (cc) : "%eax","%edx","cc","memory");
#endif
}
static inline void serialize()
{
#ifndef X86_64_SYSTEM
asm volatile (
"mov $0,%%eax\n"
"push %%ebx\n"
"cpuid\n"
"pop %%ebx\n"
: : : "%eax","%ecx","%edx","cc","memory");
#else
asm volatile (
"mov $0,%%rax\n"
"push %%rbx\n"
"cpuid\n"
"pop %%rbx\n"
: : : "%rax","%rcx","%rdx","cc","memory");
#endif
}
static inline double loadClockCount (unsigned long a[2])
{
double ret;
#ifndef X86_64_SYSTEM
asm volatile ("fildll %1; fstpl %0" : "=m" (ret) : "m" (a[0]) :
"cc","memory");
#else
asm volatile ("fildll %1; fstpl %0" : "=m" (ret) : "m" (a[0]) :
"cc","memory");
#endif
return ret;
}
I don't know how to do same for ARM? Any help?

vector storing lambdas -- CLANG vs the other two

Here is the code.
// .
#include <stdio.h>
#include <memory>
#include <vector>
#define CAT_(a, b) a##b
#define CAT(a, b) CAT_(a, b)
#define REG(x) inline auto CAT(unused_name_, __LINE__ ) = tu_register(x)
#define LNE printf("%d", __LINE__ )
typedef void (*fp_required)( ) ;
using fvec_type = std::vector< fp_required > ;
static fvec_type fvec{};
inline bool tu_register( fp_required const & fp ){
printf("\nRegistered: %p" , fp ) ;
fvec.push_back( fp ) ;
return true ;
}
REG([]{ LNE; }) ; // 3 stateless lambdas
REG([]{ LNE; }) ;
REG([]{ LNE; }) ;
int main()
{
printf("\n\nRegistered %lu lambdas. %s\n", fvec.size(), (fvec.size() > 0 ? "Now calling them": ""));
for ( auto & fun : fvec ) {
printf("\nCalling lambda %p , rezult: " , fun);
fun() ;
}
return 0;
}
CLANG compiles with no warnings, but places exactly nothing in the vector?
https://wandbox.org/permlink/nkYjgqvr5QOprKEn
G++ compiles with no warnings, and works as expected.
https://wandbox.org/permlink/a6HB6xzavE8FOyOi
MSVC (the latest VS2019, all up to date) too, compiles and works no problems.
Who is right?
That code should work, and in Clang 8.0.0, it does. Looks like it was broken after that, and doesn't work in 9.0.0 nor 10.0.0 on Wandbox.

MSVC++ 2015 - SSE compiler bug or bug/undefined behavior in my program?

I ran into some weird behavior while working on a SIMD color lerp function, and I trimmed it down into a minimal program. The SIMD code in this example no longer performs a lerp but it performs unpacking from a 32-bit color to an XMM register and then back to 32-bit.
In MSVC++ 2015 (Update 3), in Release x64 mode, the following code does not produce the correct result, but in Debug x64 or Release/Debug x86 it works correctly. This is the only code in an otherwise empty Win32 C++ console application project:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "emmintrin.h"
struct Color4
{
uint8_t red;
uint8_t green;
uint8_t blue;
uint8_t alpha;
Color4(uint8_t red, uint8_t green, uint8_t blue, uint8_t alpha = 255)
: red(red), green(green), blue(blue), alpha(alpha) {}
explicit Color4(uint32_t rgba)
{
red = (uint8_t)(rgba & 0xFF);
green = (uint8_t)((rgba >> 8)&0xFF);
blue = (uint8_t)((rgba >> 16) & 0xFF);
alpha = (uint8_t)((rgba >> 24) & 0xFF);
}
};
Color4 PackUnpack(Color4 col)
{
uint32_t tmp;
memcpy(&tmp, &col, sizeof(tmp));
__m128 aFloat = _mm_cvtepi32_ps(
_mm_unpacklo_epi16(
_mm_unpacklo_epi8(
_mm_set1_epi32(tmp),
_mm_setzero_si128()
),
_mm_setzero_si128()
)
);
__m128i ret = _mm_packus_epi16(
_mm_packs_epi32(
_mm_cvtps_epi32(aFloat),
_mm_setzero_si128()
),
_mm_setzero_si128()
);
return Color4((uint32_t)_mm_cvtsi128_si32(ret));
}
int main()
{
#ifdef _DEBUG
printf("DEBUG\n");
#else
printf("RELEASE\n");
#endif
Color4 c = PackUnpack(Color4(32, 64, 128, 255));
// Debug x64 or Debug/Release x86: Prints "32 64 128 255"
// Release x64: Prints "255 0 0 0"
printf("%d %d %d %d\n", c.red, c.green, c.blue, c.alpha);
return 0;
}
The Release x64 output is:
RELEASE
255 0 0 0
Debug x64 and all x86 output is:
DEBUG
32 64 128 255
The disassembly looks like it's messed up pre-computing a constant value to load into an XMM register to skip the _mm_set1_epi32 (see first movdqa instruction.)
main:
00007FF674391070 sub rsp,38h
00007FF674391074 lea rcx,[string "RELEASE\n" (07FF674392200h)]
00007FF67439107B call printf (07FF674391010h)
00007FF674391080 movdqa xmm0,xmmword ptr [__xmm#000000ff000000ff000000ff000000ff (07FF674392220h)]
00007FF674391088 lea rcx,[string "%d %d %d %d\n" (07FF674392210h)]
00007FF67439108F xorps xmm2,xmm2
00007FF674391092 mov dword ptr [rsp+40h],0FF804020h
00007FF67439109A punpcklbw xmm0,xmm2
00007FF67439109E punpcklwd xmm0,xmm2
00007FF6743910A2 cvtdq2ps xmm0,xmm0
00007FF6743910A5 cvtps2dq xmm1,xmm0
00007FF6743910A9 packssdw xmm1,xmm2
00007FF6743910AD packuswb xmm1,xmm2
00007FF6743910B1 movd r10d,xmm1
00007FF6743910B6 mov edx,r10d
00007FF6743910B9 mov r8d,r10d
00007FF6743910BC shr edx,10h
00007FF6743910BF mov eax,r10d
00007FF6743910C2 shr r8d,8
00007FF6743910C6 movzx r9d,dl
00007FF6743910CA shr eax,18h
00007FF6743910CD movzx edx,r10b
00007FF6743910D1 movzx r8d,r8b
00007FF6743910D5 mov dword ptr [rsp+20h],eax
00007FF6743910D9 call printf (07FF674391010h)
00007FF6743910DE xor eax,eax
00007FF6743910E0 add rsp,38h
00007FF6743910E4 ret
I have tried this with g++ 4.8.4 on Ubuntu 14.04 x64 and it works fine with -O3 on or off.
So my question is, is this a compiler bug, the result of using undefined/implementation defined behavior, or a more mundane bug in my code?
(The code used to use type punning via unions to get the uint32_t value out of the Color4, which I replaced with a memcpy because that's not standard... still no dice.)
Not actually an answer, but, since I don't like to put too much text into the comment, this the smallest code I could reproduce the issue with:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "emmintrin.h"
int main()
{
uint8_t src[4] = { 32, 64, 128, 255 };
uint32_t tmp = 0;
memcpy( &tmp, &src, sizeof( tmp ) );
auto a = _mm_set1_epi32( tmp );
printf( "tmp = 0x%08x\n", tmp );
printf( "a.m128i_i32[0] = 0x%08x\n", a.m128i_i32[0] );
return 0;
}
Expected output:
tmp = 0xff804020
a.m128i_i32[0] = 0xff804020
Output with Release x64:
tmp = 0xff804020
a.m128i_i32[0] = 0x000000ff
This is due to a compiler bug. A workaround is to use
tmp = color.red + 256 * (col.blue + 256 * (col.green + 256 * col.alpha)));
in place of the memcpy or type punning.

android ndk asm compile error: inconsistent operand constraints in an 'asm'

I am compiling a piece of asm code for android:
static void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d)
{
__asm__ __volatile__ (
"cpuid"
: "=a" (*a) ,
"=b" (*b) ,
"=c" (*c) ,
"=d" (*d)
: "0" (function)) ;
}
The APP_ABI is set to 'all':
APP_ABI := all
The compiling failed when come to x86:
$ ndk-build
[armeabi-v7a] Gdbserver : [arm-linux-androideabi-4.6] libs/armeabi-v7a/gdbserver
[armeabi-v7a] Gdbsetup : libs/armeabi-v7a/gdb.setup
[armeabi] Gdbserver : [arm-linux-androideabi-4.6] libs/armeabi/gdbserver
[armeabi] Gdbsetup : libs/armeabi/gdb.setup
[x86] Gdbserver : [x86-4.6] libs/x86/gdbserver
[x86] Gdbsetup : libs/x86/gdb.setup
[mips] Gdbserver : [mipsel-linux-android-4.6] libs/mips/gdbserver
[mips] Gdbsetup : libs/mips/gdb.setup
[armeabi-v7a] Compile thumb : hello-jni <= CpuArch.c
[armeabi-v7a] SharedLibrary : libhello-jni.so
[armeabi-v7a] Install : libhello-jni.so => libs/armeabi-v7a/libhello-jni.so
[armeabi] Compile thumb : hello-jni <= CpuArch.c
[armeabi] SharedLibrary : libhello-jni.so
[armeabi] Install : libhello-jni.so => libs/armeabi/libhello-jni.so
[x86] Compile : hello-jni <= CpuArch.c
D:/adt/ndk/samples/hello-jni/jni/CpuArch.c: In function 'MyCPUID':
D:/adt/ndk/samples/hello-jni/jni/CpuArch.c:75:3: error: inconsistent operand constraints in an 'asm'
/cygdrive/d/adt/ndk/build/core/build-binary.mk:391: recipe for target '/cygdrive/d/adt/ndk/samples/hello-jni/obj/local/x86/objs-debug/hello-jni/CpuArch.o' failed
make: *** [/cygdrive/d/adt/ndk/samples/hello-jni/obj/local/x86/objs-debug/hello-jni/CpuArch.o] Error 1
I don't have much experience in asm. And the error msg seems not enough to find a solution. :(
BTW,the compiling is made in win7 using cygwin.
Full version:
static void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d)
{
#ifdef USE_ASM
#ifdef _MSC_VER
UInt32 a2, b2, c2, d2;
__asm xor EBX, EBX;
__asm xor ECX, ECX;
__asm xor EDX, EDX;
__asm mov EAX, function;
__asm cpuid;
__asm mov a2, EAX;
__asm mov b2, EBX;
__asm mov c2, ECX;
__asm mov d2, EDX;
*a = a2;
*b = b2;
*c = c2;
*d = d2;
#else
__asm__ __volatile__ (
"cpuid"
: "=a" (*a) ,
"=b" (*b) ,
"=c" (*c) ,
"=d" (*d)
: "0" (function)) ;
#endif
#else
int CPUInfo[4];
__cpuid(CPUInfo, function);
*a = CPUInfo[0];
*b = CPUInfo[1];
*c = CPUInfo[2];
*d = CPUInfo[3];
#endif
}
This code is based up something I wrote in this Stackoverflow answer. One has to be careful to preserve %ebx register on some x86 based architectures/ABI. %ebx is used to relocate code (shared object etc) when position independent code (-fPIC gcc option) is being generated. The code below avoids using =b in the extended assembler output and uses a register the compiler knows is free and usable. %ebx is preserved by swapping it to the free register before and after the call to cpuid. I've also fixed a small gotchya bug related to the %ecx register. I clear it to 0 ("c"(0)) since on some architectures failure to do so will result in stale values being returned by cpuid.
static void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d)
{
#if defined(__i386__)
__asm__ __volatile__ (
"xchgl\t%%ebx, %k1\n\t" \
"cpuid\n\t" \
"xchgl\t%%ebx, %k1\n\t"
: "=a"(*a), "=&r"(*b), "=c"(*c), "=d"(*d)
: "a"(function), "c"(0));
#elif defined(__x86_64__)
__asm__ __volatile__ (
"xchgq\t%%rbx, %q1\n\t" \
"cpuid\n\t" \
"xchgq\t%%rbx, %q1\n\t"
: "=a"(*a), "=&r"(*b), "=c"(*c), "=d"(*d)
: "a"(function), "c"(0));
#else
#error "Unknown architecture."
#endif
}

VS C++ ASM to GCC ASM

Did I convert this correctly?
Original VS C++ version:
_TEB *pTeb = NULL;
_asm
{
mov eax, fs:[0x18];
mov pTeb, eax;
}
My attempt (GCC):
_TEB *pTeb = NULL;
asm ("movl %%fs:0x18, %%eax\n\t"
"movl %%eax, %0"
: "=rm" (pTeb) : : "%eax");
If you need GCC syntax for Windows-related code, a good source to check is ReactOS sources. Here's their implementation of NtCurrentTeb() (with irrelevant parts removed):
unsigned long __readfsdword(const unsigned long Offset)
{
unsigned long value;
__asm__ __volatile__("movl %%fs:%a[Offset], %k[value]" : [value] "=r" (value) : [Offset] "ir" (Offset));
return value;
}
struct _TEB * NtCurrentTeb(VOID)
{
return (PTEB)__readfsdword(0x18);
}