What do these instructions in the disassembly phase indicate? - c++
Hello. When I run C++ code in the CLion IDE debugger, after main() returns the debugger steps into a file called "disassembly", which contains what looks like assembly code. What are those instructions? What do they do? Should I care? I'm new to C++ and am familiarizing myself with the language, the IDE, and anything else of relevance.
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
There is also this:
_tlv_exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movq 0x268db5e9(%rip), %rdi
callq 0x2e92a ; symbol stub for: pthread_getspecific
testq %rax, %rax
je 0x18e20 ; <+54>
movq %rax, %rbx
movq 0x268db5d5(%rip), %rdi
xorl %esi, %esi
callq 0x2e942 ; symbol stub for: pthread_setspecific
movq %rbx, %rdi
addq $0x8, %rsp
popq %rbx
popq %rbp
jmp 0x1983e ; tlv_finalize_list
addq $0x8, %rsp
popq %rbx
popq %rbp
retq
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
pthread_getspecific:
jmpq *0x268c2470(%rip)
__cxa_finalize_ranges:
pushq %rbp
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $0x18, %rsp
movl %esi, -0x2c(%rbp)
movq %rdi, -0x38(%rbp)
leaq 0x26834d24(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
movq 0x26834ca0(%rip), %r13
testq %r13, %r13
je 0x5a17c ; <+383>
movl -0x2c(%rbp), %ebx
addq $0x8, -0x38(%rbp)
movslq 0x8(%r13), %r15
testq %r15, %r15
jle 0x5a16f ; <+370>
decq %r15
movq %r15, %r14
shlq $0x5, %r14
movl 0x10(%r13,%r14), %r12d
testl %r12d, %r12d
je 0x5a03d ; <+64>
cmpl $0x0, -0x2c(%rbp)
je 0x5a102 ; <+261>
cmpl $0x1, %r12d
je 0x5a0a4 ; <+167>
cmpl $0x3, %r12d
je 0x5a0d1 ; <+212>
cmpl $0x2, %r12d
jne 0x5a102 ; <+261>
movq 0x28(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a096 ; <+153>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a085 ; <+136>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0c0 ; <+195>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0af ; <+178>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq 0x10(%rax), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0f1 ; <+244>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0e0 ; <+227>
jmp 0x5a03d ; <+64>
leaq 0x10(%r13,%r14), %rax
movl $0x0, (%rax)
movb $0x0, 0x26834b94(%rip)
leaq 0x26834c25(%rip), %rdi
callq 0x804e2 ; symbol stub for: pthread_mutex_unlock
cmpl $0x1, %r12d
je 0x5a13e ; <+321>
cmpl $0x3, %r12d
je 0x5a145 ; <+328>
cmpl $0x2, %r12d
jne 0x5a14d ; <+336>
movq 0x20(%r13,%r14), %rdi
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
movq 0x18(%r13,%r14), %rdi
callq *0x10(%rdi)
leaq 0x26834bec(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
cmpb $0x0, 0x26834b48(%rip)
je 0x5a03d ; <+64>
movq 0x26834b5b(%rip), %r13
jmp 0x5a173 ; <+374>
movq (%r13), %r13
testq %r13, %r13
jne 0x5a039 ; <+60>
leaq 0x26834bbd(%rip), %rdi
addq $0x18, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
jmp 0x804e2 ; symbol stub for: pthread_mutex_unlock
__cxa_finalize:
testq %rdi, %rdi
je 0x5a1c5 ; <+47>
pushq %rbp
movq %rsp, %rbp
subq $0x10, %rsp
leaq -0x10(%rbp), %rax
movq %rdi, (%rax)
movq $0x1, 0x8(%rax)
movq %rax, %rdi
movl $0x1, %esi
callq 0x59ffd ; __cxa_finalize_ranges
addq $0x10, %rsp
popq %rbp
retq
xorl %edi, %edi
xorl %esi, %esi
jmp 0x59ffd ; __cxa_finalize_ranges
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
_tlv_exit:
jmpq *0x2680cbd6(%rip)
pthread_getspecific:
movq %gs:(,%rdi,8), %rax
retq
Assembly output is just a dump of the executable code the compiler generated, but in a human-readable form [1]. It is not actually used by the compiler; it's just an artifact of the compilation process, to be used for reference.
Remember, the compiled executable can be converted into assembly code at any time; tools like IDA Pro and Ghidra excel at doing this on any executable. The compiler, however, can add contextual information, in the form of comments or useful labels, that is lost in the final compilation phase.
The compiler often emits debug hints for your compiled executable so it can turn a stack-trace into something that maps back to your original source code. These artifacts are much more useful as they allow you to step through C++ code instead of assembly code. If you ever have to debug in a library you don't have the source for you'll be stuck stepping through an assembly view of the executable code.
[1] Presuming you can read assembly code.
The code you posted is support code from your libc runtime. The runtime is responsible for, among other things:
- implementing atexit hooks;
- setting up your IO streams (cin, cout);
- running constructors of any global static variables.
This answer has a more complete overview. You can search for articles about __libc_start_main and related functions to learn more.
Related
Behaviour of assert
I'm trying to understand how assert behaves when its condition is false. According to the documentation, it calls __assert_fail, which calls std::abort(). So, I want to know what is going on in the assembly code of __assert_fail: endbr64 pushq %r13 movl %edx, %r13d movl $0x5, %edx pushq %r12 movq %rsi, %r12 leaq 0x18467e(%rip), %rsi pushq %rbp movq %rdi, %rbp leaq 0x1804f0(%rip), %rdi ; _libc_intl_domainname pushq %rbx movq %rcx, %rbx subq $0x8, %rsp callq 0x37980 ; __dcgettext movq %rbx, %r8 movl %r13d, %ecx movq %r12, %rdx movq %rax, %rdi movq %rbp, %rsi callq 0x36d70 What do %r13 and %r12 stand for in this code? Where is the call to std::abort(), and what is going on before and after this call?
Move semantics appear to add additional overhead in the simplest possible* example
* I believe this is the simplest possible example, but if I'm incorrect please let me know. https://godbolt.org/z/neaTse I'm attempting to learn and understand move semantics and some of their intricacies, but I've hit a bit of a snag. When attempting to compare the following 2 code snippets, the code using move semantics ends up with 8 additional lines of assembly and 2 additional moves (15 for move, 13 for no-move). Move: #include <utility> template<class T> void swap(T& a, T& b) { T tmp(std::move(a)); a = std::move(b); b = std::move(tmp); } int main(){ int a, b; swap(a, b); } No-Move: template<class T> void swap(T& a, T& b) { T tmp(a); a = b; b = tmp; } int main(){ int a, b; swap(a, b); } Move generated assembly: main: pushq %rbp movq %rsp, %rbp subq $16, %rsp leaq -8(%rbp), %rdx leaq -4(%rbp), %rax movq %rdx, %rsi movq %rax, %rdi call void swap<int>(int&, int&) movl $0, %eax leave ret void swap<int>(int&, int&): pushq %rbp movq %rsp, %rbp subq $32, %rsp movq %rdi, -24(%rbp) movq %rsi, -32(%rbp) movq -24(%rbp), %rax movq %rax, %rdi call std::remove_reference<int&>::type&& std::move<int&>(int&) movl (%rax), %eax movl %eax, -4(%rbp) movq -32(%rbp), %rax movq %rax, %rdi call std::remove_reference<int&>::type&& std::move<int&>(int&) movl (%rax), %edx movq -24(%rbp), %rax movl %edx, (%rax) leaq -4(%rbp), %rax movq %rax, %rdi call std::remove_reference<int&>::type&& std::move<int&>(int&) movl (%rax), %edx movq -32(%rbp), %rax movl %edx, (%rax) nop leave ret No-move generated assembly: main: pushq %rbp movq %rsp, %rbp subq $16, %rsp leaq -8(%rbp), %rdx leaq -4(%rbp), %rax movq %rdx, %rsi movq %rax, %rdi call void swap<int>(int&, int&) movl $0, %eax leave ret void swap<int>(int&, int&): pushq %rbp movq %rsp, %rbp movq %rdi, -24(%rbp) movq %rsi, -32(%rbp) movq -24(%rbp), %rax movl (%rax), %eax movl %eax, -4(%rbp) movq -32(%rbp), %rax movl (%rax), %edx movq -24(%rbp), %rax movl %edx, (%rax) movq -32(%rbp), %rax movl -4(%rbp), %edx movl %edx, (%rax) nop popq %rbp ret I 
think the way I've internalized or abstracted move semantics for myself is that they "enable 'newly available' optimization through the removal of costly temporary copies". Have I just internalized this incorrectly? -Or- Is this just failing because I'm using a primitive type? -Or- Have I just missed the mark entirely?
OK, the problem is inlining and effective optimization; to get around it I've annotated the functions with __attribute__((noinline)). I've made a class to handle the move: class mover { public : int *ptr { nullptr }; __attribute__((noinline)) mover() : ptr(new int(42)) { } __attribute__((noinline)) mover(mover & other) { delete ptr; ptr = new int(*other.ptr); } __attribute__((noinline)) mover& operator= ( mover && other) { delete ptr; ptr = other.ptr; other.ptr = nullptr; return *this; } __attribute__((noinline)) mover& operator= ( mover & other) { delete ptr; ptr = new int(*other.ptr); return *this; } __attribute__((noinline)) mover(mover && other) { ptr = other.ptr; other.ptr = nullptr; } __attribute__((noinline)) ~mover() { delete ptr; } }; It deliberately doesn't use smart pointers, so that you can see what goes on. The move swap now looks like this, calling the correct constructors and operators: void swap<mover>(mover&, mover&): pushq %r12 movq %rdi, %r12 pushq %rbp movq %rsi, %rbp movq %rdi, %rsi subq $24, %rsp leaq 8(%rsp), %rdi call mover::mover(mover&&) movq %rbp, %rsi movq %r12, %rdi call mover::operator=(mover&&) [clone .isra.0] leaq 8(%rsp), %rsi movq %rbp, %rdi call mover::operator=(mover&&) [clone .isra.0] leaq 8(%rsp), %rdi call mover::~mover() [complete object destructor] addq $24, %rsp popq %rbp popq %r12 ret And the copy swap looks like this, calling the copy constructor and copy assignment:
void swap<mover>(mover&, mover&): pushq %r12 movq %rdi, %r12 pushq %rbp movq %rsi, %rbp movq %rdi, %rsi subq $24, %rsp leaq 8(%rsp), %rdi call mover::mover(mover&) movq %rbp, %rsi movq %r12, %rdi call mover::operator=(mover&) [clone .isra.0] leaq 8(%rsp), %rsi movq %rbp, %rdi call mover::operator=(mover&) [clone .isra.0] leaq 8(%rsp), %rdi call mover::~mover() [complete object destructor] addq $24, %rsp popq %rbp popq %r12 ret The biggest difference is between the move constructor mover::mover(mover&&): movq (%rsi), %rax movq $0, (%rsi) movq %rax, (%rdi) ret and the copy constructor mover::mover(mover&): pushq %rbp movq %rsi, %rbp pushq %rbx movq %rdi, %rbx subq $8, %rsp movq $0, (%rdi) movl $4, %edi call operator new(unsigned long) // <---- new movq 0(%rbp), %rdx movq %rax, (%rbx) movl (%rdx), %edx movl %edx, (%rax) addq $8, %rsp popq %rbx popq %rbp ret The latter contains a call to operator new (a heap allocation), which the move constructor avoids.
64-bit architecture optimization
I'm testing a function that calculates the XOR of two char buffers. In order to increase the speed, I'm checking the speed of doing with a integer pointer (32 bits) and long long integer pointer (64 bits). I use the function with a char pointer for reference. Of course, I'm testing on a 64bits machine. But I'm not having the results that I expected. I'm trying with these 3 functions at the end. When I compare "XOR_Diff_Char" with "XOR_Diff_Int", I get an increase of speed around 3x, because the function "_Int" iterates 4 times less in the main "for". But when I compare "XOR_Diff_Int" with "XOR_Diff_QWORD", the improvement is arount 5-10%, really slower than I expected because the main "for" iterates 2x times less in "_QWORD" than in "_Int". I had tried (in order to compare speeds) to compile with different flags, between -O0 and -O3, but I found no differences. I use g++ 4.9.2-10 compiler under Debian 64bits. Do I have to put another flag? Do I suppose something and I'm wrong? Is the compiler so good that doesn't matter if you use 32 or 64 bits? ///////////////////////////////// int XOR_Diff_Int(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff) { int i = 0; /* Check errors ... */ int *pBuff1 = (int*)pBuffIn1; int *pBuff2 = (int*)pBuffIn2; int *pOut = (int*)pBuffOut; unsigned int sizeInt = (sizeBuff/sizeof(int)); unsigned int modInt = sizeBuff-(sizeBuff%sizeof(int)); for (i = 0; i < sizeInt; i++, pBuff1++, pBuff2++, pOut++) *pOut = *pBuff1 ^ *pBuff2; // If size is not sizeof(int) multiple for (i = modInt; i < sizeBuff; i++) pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i]; return sizeBuff; } ///////////////////////////////// int XOR_Diff_Char(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff) { int i = 0; /* Check errors ... 
*/ for (i = 0; i < sizeBuff; i++) pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i]; return 1; } ///////////////////////////////// int XOR_Diff_QWORD(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff) { int i = 0; /* Check errors ... */ long long int *pBuff1 = (long long int*)pBuffIn1; long long int *pBuff2 = (long long int*)pBuffIn2; long long int *pOut = (long long int*)pBuffOut; unsigned int sizeLong = (sizeBuff/sizeof(long long int)); unsigned int modLong = sizeBuff-(sizeBuff%sizeof(long long int)); for (i = 0; i < sizeLong; i++, pBuff1++, pBuff2++, pOut++) *pOut = *pBuff1 ^ *pBuff2; // If size is not sizeof(long long int) multiple for (i = modLong; i < sizeBuff; i++) pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i]; return 1; } EDIT: I was using the gcov utility, and I can see that the function with _QWORD executes the half number of iterations than _Int, so the speed should be the double (despite overhead of functions and so on). So I understand less why the speed is similar in both cases. For testing, I just using something as simple as gettimeofday(&t1, NULL); count = XOR_Diff_Int(pDataIn, prevData, pOut, SIZE); gettimeofday(&t2, NULL); changing "_Int" for "_QWORD" and recompiling for both types of test. EDIT 2: I don't know so much about assembler, but I compared both function (the main "for"), and I got this: // 64bits XOR movq (%rsi,%r8,8), %r9 xorq (%rdi,%r8,8), %r9 movq %r9, (%rdx,%r8,8) addq $1, %r8 cmpl %r8d, %ecx ja .L8 // 32bits XOR movl (%rsi,%r8,4), %r9d xorl (%rdi,%r8,4), %r9d movl %r9d, (%rdx,%r8,4) addq $1, %r8 cmpl %r8d, %ecx jg .L8 So I understand that the 64bits case is faster because uses 8 bytes instructions. I think that is not a "instructions" problems, but the operating system or something like that. At the moment I haven't anymore idea about this.
It seems that what you've tried to do is outsmart the compiler. The compiler won. Given the following simple function: void f(const char* lhs, const char* rhs, char* out, size_t sz) { for (size_t i = 0; i < sz; ++i ) out[i] = lhs[i] ^ rhs[i]; } and compiling with GCC with -O3 -Wall, the compiler spits out nearly 300 lines of assembler: f(char const*, char const*, char*, unsigned long): testq %rcx, %rcx je .L38 leaq 16(%rdi), %rax leaq 16(%rdx), %r9 cmpq %rax, %rdx setnb %r8b cmpq %r9, %rdi setnb %al orl %eax, %r8d leaq 16(%rsi), %rax cmpq %rax, %rdx setnb %r10b cmpq %r9, %rsi setnb %al orl %r10d, %eax testb %al, %r8b je .L3 cmpq $19, %rcx jbe .L3 movq %rdi, %r8 pushq %r13 pushq %r12 negq %r8 pushq %rbp pushq %rbx andl $15, %r8d cmpq %rcx, %r8 cmova %rcx, %r8 xorl %eax, %eax testq %r8, %r8 je .L4 movzbl (%rdi), %eax xorb (%rsi), %al cmpq $1, %r8 movb %al, (%rdx) je .L15 movzbl 1(%rdi), %eax xorb 1(%rsi), %al cmpq $2, %r8 movb %al, 1(%rdx) je .L16 movzbl 2(%rdi), %eax xorb 2(%rsi), %al cmpq $3, %r8 movb %al, 2(%rdx) je .L17 movzbl 3(%rdi), %eax xorb 3(%rsi), %al cmpq $4, %r8 movb %al, 3(%rdx) je .L18 movzbl 4(%rdi), %eax xorb 4(%rsi), %al cmpq $5, %r8 movb %al, 4(%rdx) je .L19 movzbl 5(%rdi), %eax xorb 5(%rsi), %al cmpq $6, %r8 movb %al, 5(%rdx) je .L20 movzbl 6(%rdi), %eax xorb 6(%rsi), %al cmpq $7, %r8 movb %al, 6(%rdx) je .L21 movzbl 7(%rdi), %eax xorb 7(%rsi), %al cmpq $8, %r8 movb %al, 7(%rdx) je .L22 movzbl 8(%rdi), %eax xorb 8(%rsi), %al cmpq $9, %r8 movb %al, 8(%rdx) je .L23 movzbl 9(%rdi), %eax xorb 9(%rsi), %al cmpq $10, %r8 movb %al, 9(%rdx) je .L24 movzbl 10(%rdi), %eax xorb 10(%rsi), %al cmpq $11, %r8 movb %al, 10(%rdx) je .L25 movzbl 11(%rdi), %eax xorb 11(%rsi), %al cmpq $12, %r8 movb %al, 11(%rdx) je .L26 movzbl 12(%rdi), %eax xorb 12(%rsi), %al cmpq $13, %r8 movb %al, 12(%rdx) je .L27 movzbl 13(%rdi), %eax xorb 13(%rsi), %al cmpq $14, %r8 movb %al, 13(%rdx) je .L28 movzbl 14(%rdi), %eax xorb 14(%rsi), %al movb %al, 14(%rdx) movl $15, %eax .L4: movq 
%rcx, %r11 leaq -1(%rcx), %r10 subq %r8, %r11 leaq -16(%r11), %r9 subq %r8, %r10 shrq $4, %r9 addq $1, %r9 movq %r9, %rbx salq $4, %rbx cmpq $14, %r10 jbe .L6 leaq (%rdi,%r8), %r13 leaq (%rsi,%r8), %r12 xorl %r10d, %r10d addq %rdx, %r8 xorl %ebp, %ebp .L8: movdqu (%r12,%r10), %xmm0 addq $1, %rbp pxor 0(%r13,%r10), %xmm0 movups %xmm0, (%r8,%r10) addq $16, %r10 cmpq %r9, %rbp jb .L8 addq %rbx, %rax cmpq %rbx, %r11 je .L1 .L6: movzbl (%rsi,%rax), %r8d xorb (%rdi,%rax), %r8b movb %r8b, (%rdx,%rax) leaq 1(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 1(%rdi,%rax), %r8d xorb 1(%rsi,%rax), %r8b movb %r8b, 1(%rdx,%rax) leaq 2(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 2(%rdi,%rax), %r8d xorb 2(%rsi,%rax), %r8b movb %r8b, 2(%rdx,%rax) leaq 3(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 3(%rdi,%rax), %r8d xorb 3(%rsi,%rax), %r8b movb %r8b, 3(%rdx,%rax) leaq 4(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 4(%rdi,%rax), %r8d xorb 4(%rsi,%rax), %r8b movb %r8b, 4(%rdx,%rax) leaq 5(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 5(%rdi,%rax), %r8d xorb 5(%rsi,%rax), %r8b movb %r8b, 5(%rdx,%rax) leaq 6(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 6(%rdi,%rax), %r8d xorb 6(%rsi,%rax), %r8b movb %r8b, 6(%rdx,%rax) leaq 7(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 7(%rdi,%rax), %r8d xorb 7(%rsi,%rax), %r8b movb %r8b, 7(%rdx,%rax) leaq 8(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 8(%rdi,%rax), %r8d xorb 8(%rsi,%rax), %r8b movb %r8b, 8(%rdx,%rax) leaq 9(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 9(%rdi,%rax), %r8d xorb 9(%rsi,%rax), %r8b movb %r8b, 9(%rdx,%rax) leaq 10(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 10(%rdi,%rax), %r8d xorb 10(%rsi,%rax), %r8b movb %r8b, 10(%rdx,%rax) leaq 11(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 11(%rdi,%rax), %r8d xorb 11(%rsi,%rax), %r8b movb %r8b, 11(%rdx,%rax) leaq 12(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 12(%rdi,%rax), %r8d xorb 12(%rsi,%rax), %r8b movb %r8b, 12(%rdx,%rax) leaq 13(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 13(%rdi,%rax), %r8d xorb 13(%rsi,%rax), %r8b movb %r8b, 
13(%rdx,%rax) leaq 14(%rax), %r8 cmpq %r8, %rcx jbe .L1 movzbl 14(%rdi,%rax), %ecx xorb 14(%rsi,%rax), %cl movb %cl, 14(%rdx,%rax) .L1: popq %rbx popq %rbp popq %r12 popq %r13 .L38: rep ret .L3: xorl %eax, %eax .L13: movzbl (%rdi,%rax), %r8d xorb (%rsi,%rax), %r8b movb %r8b, (%rdx,%rax) addq $1, %rax cmpq %rax, %rcx jne .L13 rep ret .L28: movl $14, %eax jmp .L4 .L15: movl $1, %eax jmp .L4 .L16: movl $2, %eax jmp .L4 .L17: movl $3, %eax jmp .L4 .L18: movl $4, %eax jmp .L4 .L19: movl $5, %eax jmp .L4 .L20: movl $6, %eax jmp .L4 .L21: movl $7, %eax jmp .L4 .L22: movl $8, %eax jmp .L4 .L23: movl $9, %eax jmp .L4 .L24: movl $10, %eax jmp .L4 .L25: movl $11, %eax jmp .L4 .L26: movl $12, %eax jmp .L4 .L27: movl $13, %eax jmp .L4 It does better if we add -march=native -mtune=native The compiler has done its own striding, and done a much better job at it than it can with the variants you are producing. void f(const char* lhs, const char* rhs, char* out, size_t sz) { const int* ilhs = (const int*)lhs; const int* irhs = (const int*)rhs; int* iout = (int*)out; const size_t isz = (sz / sizeof(*ilhs)); const size_t imod = (isz * sizeof(*ilhs)); for (size_t i = 0; i < isz; ++i) *(iout++) = *(ilhs++) ^ *(irhs)++; for (size_t i = imod; i < sz; ++i) out[i] = lhs[i] ^ rhs[i]; } This produces almost 400 lines of assembler. 
f(char const*, char const*, char*, unsigned long): movq %rcx, %r8 pushq %r15 pushq %r14 shrq $2, %r8 pushq %r13 pushq %r12 testq %r8, %r8 pushq %rbp leaq 0(,%r8,4), %rax pushq %rbx je .L11 leaq 16(%rsi), %r9 leaq 16(%rdx), %r10 cmpq %r9, %rdx setnb %r11b cmpq %r10, %rsi setnb %r9b orl %r11d, %r9d cmpq $8, %r8 seta %r11b testb %r11b, %r9b je .L4 leaq 16(%rdi), %r9 cmpq %r9, %rdx setnb %r11b cmpq %r10, %rdi setnb %r9b orb %r9b, %r11b je .L4 movq %rdi, %r9 andl $15, %r9d shrq $2, %r9 negq %r9 andl $3, %r9d cmpq %r8, %r9 cmova %r8, %r9 testq %r9, %r9 je .L25 movl (%rdi), %r10d xorl (%rsi), %r10d cmpq $1, %r9 leaq 4(%rdx), %r13 leaq 4(%rdi), %rbp leaq 4(%rsi), %rbx movl %r10d, (%rdx) movl $1, %r10d je .L5 movl 4(%rdi), %r10d xorl 4(%rsi), %r10d cmpq $2, %r9 leaq 8(%rdx), %r13 leaq 8(%rdi), %rbp leaq 8(%rsi), %rbx movl %r10d, 4(%rdx) movl $2, %r10d je .L5 movl 8(%rdi), %r10d xorl 8(%rsi), %r10d leaq 12(%rdx), %r13 leaq 12(%rdi), %rbp leaq 12(%rsi), %rbx movl %r10d, 8(%rdx) movl $3, %r10d .L5: movq %r8, %r15 movq %rax, -16(%rsp) subq %r9, %r15 salq $2, %r9 leaq -4(%r15), %r11 leaq (%rsi,%r9), %r12 movq %r15, -24(%rsp) leaq (%rdi,%r9), %r15 addq %rdx, %r9 shrq $2, %r11 movq %r12, -40(%rsp) movq %r9, -32(%rsp) addq $1, %r11 xorl %r9d, %r9d xorl %r12d, %r12d leaq 0(,%r11,4), %r14 .L8: movq -40(%rsp), %rax addq $1, %r12 movdqu (%rax,%r9), %xmm0 movq -32(%rsp), %rax pxor (%r15,%r9), %xmm0 movups %xmm0, (%rax,%r9) addq $16, %r9 cmpq %r11, %r12 jb .L8 leaq 0(,%r14,4), %r9 addq %r14, %r10 movq -16(%rsp), %rax addq %r9, %rbp addq %r9, %rbx addq %r9, %r13 cmpq %r14, -24(%rsp) je .L11 movl 0(%rbp), %r9d xorl (%rbx), %r9d movl %r9d, 0(%r13) leaq 1(%r10), %r9 cmpq %r9, %r8 jbe .L11 movl 4(%rbp), %r9d xorl 4(%rbx), %r9d addq $2, %r10 cmpq %r10, %r8 movl %r9d, 4(%r13) jbe .L11 movl 8(%rbp), %r9d xorl 8(%rbx), %r9d movl %r9d, 8(%r13) .L11: cmpq %rax, %rcx jbe .L1 leaq 16(%rax), %r9 leaq (%rsi,%rax), %rbx movq %rcx, %r11 leaq (%rdx,%rax), %rbp subq %rax, %r11 leaq (%rdi,%rax), %r10 leaq 
(%rdx,%r9), %r12 leaq (%rdi,%r9), %r13 cmpq %rbx, %r12 setbe %bl addq %rsi, %r9 cmpq %r9, %rbp setnb %r9b orl %r9d, %ebx cmpq %r12, %r10 setnb %r12b cmpq %r13, %rbp setnb %r9b orl %r12d, %r9d testb %r9b, %bl je .L24 cmpq $19, %r11 jbe .L24 negq %r10 movq %rax, %r9 andl $15, %r10d cmpq %r11, %r10 cmova %r11, %r10 testq %r10, %r10 je .L15 movzbl (%rdi,%r8,4), %r9d xorb (%rsi,%r8,4), %r9b cmpq $1, %r10 movb %r9b, (%rdx,%r8,4) leaq 1(%rax), %r9 je .L15 movzbl 1(%rdi,%rax), %r8d leaq 2(%rax), %r9 xorb 1(%rsi,%rax), %r8b cmpq $2, %r10 movb %r8b, 1(%rdx,%rax) je .L15 movzbl 2(%rdi,%rax), %r8d leaq 3(%rax), %r9 xorb 2(%rsi,%rax), %r8b cmpq $3, %r10 movb %r8b, 2(%rdx,%rax) je .L15 movzbl 3(%rdi,%rax), %r8d leaq 4(%rax), %r9 xorb 3(%rsi,%rax), %r8b cmpq $4, %r10 movb %r8b, 3(%rdx,%rax) je .L15 movzbl 4(%rdi,%rax), %r8d leaq 5(%rax), %r9 xorb 4(%rsi,%rax), %r8b cmpq $5, %r10 movb %r8b, 4(%rdx,%rax) je .L15 movzbl 5(%rdi,%rax), %r8d leaq 6(%rax), %r9 xorb 5(%rsi,%rax), %r8b cmpq $6, %r10 movb %r8b, 5(%rdx,%rax) je .L15 movzbl 6(%rdi,%rax), %r8d leaq 7(%rax), %r9 xorb 6(%rsi,%rax), %r8b cmpq $7, %r10 movb %r8b, 6(%rdx,%rax) je .L15 movzbl 7(%rdi,%rax), %r8d leaq 8(%rax), %r9 xorb 7(%rsi,%rax), %r8b cmpq $8, %r10 movb %r8b, 7(%rdx,%rax) je .L15 movzbl 8(%rdi,%rax), %r8d leaq 9(%rax), %r9 xorb 8(%rsi,%rax), %r8b cmpq $9, %r10 movb %r8b, 8(%rdx,%rax) je .L15 movzbl 9(%rdi,%rax), %r8d leaq 10(%rax), %r9 xorb 9(%rsi,%rax), %r8b cmpq $10, %r10 movb %r8b, 9(%rdx,%rax) je .L15 movzbl 10(%rdi,%rax), %r8d leaq 11(%rax), %r9 xorb 10(%rsi,%rax), %r8b cmpq $11, %r10 movb %r8b, 10(%rdx,%rax) je .L15 movzbl 11(%rdi,%rax), %r8d leaq 12(%rax), %r9 xorb 11(%rsi,%rax), %r8b cmpq $12, %r10 movb %r8b, 11(%rdx,%rax) je .L15 movzbl 12(%rdi,%rax), %r8d leaq 13(%rax), %r9 xorb 12(%rsi,%rax), %r8b cmpq $13, %r10 movb %r8b, 12(%rdx,%rax) je .L15 movzbl 13(%rdi,%rax), %r8d leaq 14(%rax), %r9 xorb 13(%rsi,%rax), %r8b cmpq $14, %r10 movb %r8b, 13(%rdx,%rax) je .L15 movzbl 14(%rdi,%rax), %r8d leaq 15(%rax), 
%r9 xorb 14(%rsi,%rax), %r8b movb %r8b, 14(%rdx,%rax) .L15: movq %r11, %rbp leaq -1(%r11), %r8 subq %r10, %rbp leaq -16(%rbp), %rbx subq %r10, %r8 shrq $4, %rbx addq $1, %rbx movq %rbx, %r12 salq $4, %r12 cmpq $14, %r8 jbe .L17 addq %r10, %rax xorl %r8d, %r8d xorl %r10d, %r10d leaq (%rdi,%rax), %r13 leaq (%rsi,%rax), %r11 addq %rdx, %rax .L19: movdqu (%r11,%r8), %xmm0 addq $1, %r10 pxor 0(%r13,%r8), %xmm0 movups %xmm0, (%rax,%r8) addq $16, %r8 cmpq %rbx, %r10 jb .L19 addq %r12, %r9 cmpq %r12, %rbp je .L1 .L17: movzbl (%rdi,%r9), %eax xorb (%rsi,%r9), %al movb %al, (%rdx,%r9) leaq 1(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 1(%rdi,%r9), %eax xorb 1(%rsi,%r9), %al movb %al, 1(%rdx,%r9) leaq 2(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 2(%rdi,%r9), %eax xorb 2(%rsi,%r9), %al movb %al, 2(%rdx,%r9) leaq 3(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 3(%rdi,%r9), %eax xorb 3(%rsi,%r9), %al movb %al, 3(%rdx,%r9) leaq 4(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 4(%rdi,%r9), %eax xorb 4(%rsi,%r9), %al movb %al, 4(%rdx,%r9) leaq 5(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 5(%rdi,%r9), %eax xorb 5(%rsi,%r9), %al movb %al, 5(%rdx,%r9) leaq 6(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 6(%rdi,%r9), %eax xorb 6(%rsi,%r9), %al movb %al, 6(%rdx,%r9) leaq 7(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 7(%rdi,%r9), %eax xorb 7(%rsi,%r9), %al movb %al, 7(%rdx,%r9) leaq 8(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 8(%rdi,%r9), %eax xorb 8(%rsi,%r9), %al movb %al, 8(%rdx,%r9) leaq 9(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 9(%rdi,%r9), %eax xorb 9(%rsi,%r9), %al movb %al, 9(%rdx,%r9) leaq 10(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 10(%rdi,%r9), %eax xorb 10(%rsi,%r9), %al movb %al, 10(%rdx,%r9) leaq 11(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 11(%rdi,%r9), %eax xorb 11(%rsi,%r9), %al movb %al, 11(%rdx,%r9) leaq 12(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 12(%rdi,%r9), %eax xorb 12(%rsi,%r9), %al movb %al, 12(%rdx,%r9) leaq 13(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 13(%rdi,%r9), %eax xorb 
13(%rsi,%r9), %al movb %al, 13(%rdx,%r9) leaq 14(%r9), %rax cmpq %rax, %rcx jbe .L1 movzbl 14(%rdi,%r9), %eax xorb 14(%rsi,%r9), %al movb %al, 14(%rdx,%r9) .L1: popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 ret .L24: movzbl (%rdi,%rax), %r8d xorb (%rsi,%rax), %r8b movb %r8b, (%rdx,%rax) addq $1, %rax cmpq %rax, %rcx jne .L24 jmp .L1 .L25: movq %rdx, %r13 movq %rsi, %rbx movq %rdi, %rbp xorl %r10d, %r10d jmp .L5 .L4: xorl %r9d, %r9d .L13: movl (%rdi,%r9,4), %r10d xorl (%rsi,%r9,4), %r10d movl %r10d, (%rdx,%r9,4) addq $1, %r9 cmpq %r9, %r8 jne .L13 jmp .L11 In the compiler's version of the simple function, there's an immediate and simple test for sz being zero: f(char const*, char const*, char*, unsigned long): testq %rcx, %rcx je .L38 In your version, the compiler hasn't recognized that you're making an attempt at striding, and the code has to walk through a number of steps to get there: f(char const*, char const*, char*, unsigned long): movq %rcx, %r8 pushq %r15 pushq %r14 shrq $2, %r8 pushq %r13 pushq %r12 testq %r8, %r8 pushq %rbp leaq 0(,%r8,4), %rax pushq %rbx je .L11 ... .L11: cmpq %rax, %rcx jbe .L1 ... .L1: popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 ret We also have quite a lot of register spill here keeping track of all these variables. 
Lets compare a couple of early blocks of the code: Compiler: leaq 16(%rdi), %rax leaq 16(%rdx), %r9 cmpq %rax, %rdx setnb %r8b cmpq %r9, %rdi setnb %al orl %eax, %r8d leaq 16(%rsi), %rax cmpq %rax, %rdx setnb %r10b cmpq %r9, %rsi setnb %al orl %r10d, %eax testb %al, %r8b je .L3 cmpq $19, %rcx jbe .L3 movq %rdi, %r8 pushq %r13 pushq %r12 negq %r8 pushq %rbp pushq %rbx andl $15, %r8d cmpq %rcx, %r8 cmova %rcx, %r8 xorl %eax, %eax testq %r8, %r8 je .L4 Yours: leaq 16(%rsi), %r9 leaq 16(%rdx), %r10 cmpq %r9, %rdx setnb %r11b cmpq %r10, %rsi setnb %r9b orl %r11d, %r9d cmpq $8, %r8 seta %r11b testb %r11b, %r9b je .L4 leaq 16(%rdi), %r9 cmpq %r9, %rdx setnb %r11b cmpq %r10, %rdi setnb %r9b orb %r9b, %r11b je .L4 movq %rdi, %r9 andl $15, %r9d shrq $2, %r9 negq %r9 andl $3, %r9d cmpq %r8, %r9 cmova %r8, %r9 testq %r9, %r9 je .L25 We can see here that the compiler is just having to emit more instructions for each operation than it was producing by itself for the original version.
Vectorization of sin and cos
I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write: #include <cmath> #define NN 512 typedef float T; typedef T __attribute__((aligned(NN))) AT; inline T s(const T x) { return sinf(x); } void func(AT* __restrict x, AT* __restrict y, int length) { if (length & NN-1) __builtin_unreachable(); for (int i = 0; i < length; i++) { y[i] = s(x[i]); } } compile with gcc 6.2 and -O3 -march=native -ffast-math and get func(float*, float*, int): testl %edx, %edx jle .L10 leaq 8(%rsp), %r10 andq $-32, %rsp pushq -8(%r10) pushq %rbp movq %rsp, %rbp pushq %r14 xorl %r14d, %r14d pushq %r13 leal -8(%rdx), %r13d pushq %r12 shrl $3, %r13d movq %rsi, %r12 pushq %r10 addl $1, %r13d pushq %rbx movq %rdi, %rbx subq $8, %rsp .L4: vmovaps (%rbx), %ymm0 addl $1, %r14d addq $32, %r12 addq $32, %rbx call _ZGVcN8v_sinf // YAY! Vectorized trig! vmovaps %ymm0, -32(%r12) cmpl %r13d, %r14d jb .L4 vzeroupper addq $8, %rsp popq %rbx popq %r10 popq %r12 popq %r13 popq %r14 popq %rbp leaq -8(%r10), %rsp .L10: ret But when I add a cosine to the function, there is no vectorization: #include <cmath> #define NN 512 typedef float T; typedef T __attribute__((aligned(NN))) AT; inline T f(const T x) { return cosf(x)+sinf(x); } void func(AT* __restrict x, AT* __restrict y, int length) { if (length & NN-1) __builtin_unreachable(); for (int i = 0; i < length; i++) { y[i] = f(x[i]); } } which gives: func(float*, float*, int): testl %edx, %edx jle .L10 pushq %r12 leal -1(%rdx), %eax pushq %rbp leaq 4(%rdi,%rax,4), %r12 movq %rsi, %rbp pushq %rbx movq %rdi, %rbx subq $16, %rsp .L4: vmovss (%rbx), %xmm0 leaq 8(%rsp), %rsi addq $4, %rbx addq $4, %rbp leaq 12(%rsp), %rdi call sincosf // No vectorization vmovss 12(%rsp), %xmm0 vaddss 8(%rsp), %xmm0, %xmm0 vmovss %xmm0, -4(%rbp) cmpq %rbx, %r12 jne .L4 addq $16, %rsp popq %rbx popq %rbp popq %r12 .L10: ret I see two good alternatives. 
Either call a vectorized version of sincosf or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about a complex float type, although there is none in the code. Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example? (As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)
OpenCL clGetProgramInfo causing Core Foundation crash
I am writing a C++ command line program in XCode (6.4) on OSX Yosemite (10.10.4). I am using Apple's openCL framework and am trying to save my openCL binaries to disk. First I create my program from source and build as follows: cl_int err; cl_program result = clCreateProgramWithSource(context, numFiles, (const char **)sourceStrings, NULL, &err); ErrorManager::CheckError(err, "Failed to create a compute program"); err = clBuildProgram(result, deviceCount, devices, NULL, NULL, NULL); ErrorManager::CheckError(err, "Failed to build program"); The above code works fine and I can launch my kernels without any error. Then I try to access the binary sizes... size_t *programBinarySizes = new size_t[deviceCount]; err = clGetProgramInfo(result, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * deviceCount, programBinarySizes, NULL); However this call to clGetProgramInfo causes Xcode to throw an EXC_BREAKPOINT that has the following output: CoreFoundation`__CFTypeCollectionRetain: 0x7fff8e2c2480 <+0>: pushq %rbp 0x7fff8e2c2481 <+1>: movq %rsp, %rbp 0x7fff8e2c2484 <+4>: pushq %r14 0x7fff8e2c2486 <+6>: pushq %rbx 0x7fff8e2c2487 <+7>: movq %rsi, %rbx 0x7fff8e2c248a <+10>: testq %rbx, %rbx 0x7fff8e2c248d <+13>: je 0x7fff8e2c25ae ; <+302> 0x7fff8e2c2493 <+19>: cmpb $0x0, -0x17f41f28(%rip) ; __CFDeallocateZombies 0x7fff8e2c249a <+26>: je 0x7fff8e2c257c ; <+252> 0x7fff8e2c24a0 <+32>: testq %rdi, %rdi 0x7fff8e2c24a3 <+35>: je 0x7fff8e2c24b5 ; <+53> 0x7fff8e2c24a5 <+37>: leaq -0x17f917cc(%rip), %rax ; kCFAllocatorSystemDefault 0x7fff8e2c24ac <+44>: cmpq %rdi, (%rax) 0x7fff8e2c24af <+47>: jne 0x7fff8e2c257c ; <+252> 0x7fff8e2c24b5 <+53>: testb $0x1, %bl 0x7fff8e2c24b8 <+56>: je 0x7fff8e2c24e4 ; <+100> 0x7fff8e2c24ba <+58>: movl %ebx, %eax 0x7fff8e2c24bc <+60>: shrl %eax 0x7fff8e2c24be <+62>: andl $0x7, %eax 0x7fff8e2c24c1 <+65>: cmpl $0x6, %eax 0x7fff8e2c24c4 <+68>: ja 0x7fff8e2c259c ; <+284> 0x7fff8e2c24ca <+74>: leaq 0xef(%rip), %rcx ; <+320> 0x7fff8e2c24d1 <+81>: movslq (%rcx,%rax,4), %rax 
0x7fff8e2c24d5 <+85>: addq %rcx, %rax 0x7fff8e2c24d8 <+88>: jmpq *%rax 0x7fff8e2c24da <+90>: callq 0x7fff8e2df7b0 ; CFStringGetTypeID 0x7fff8e2c24df <+95>: jmp 0x7fff8e2c2594 ; <+276> 0x7fff8e2c24e4 <+100>: movl 0x8(%rbx), %eax 0x7fff8e2c24e7 <+103>: shrl $0x8, %eax 0x7fff8e2c24ea <+106>: andl $0x3ff, %eax 0x7fff8e2c24ef <+111>: movq (%rbx), %rcx 0x7fff8e2c24f2 <+114>: testq %rcx, %rcx 0x7fff8e2c24f5 <+117>: je 0x7fff8e2c2522 ; <+162> 0x7fff8e2c24f7 <+119>: cmpq -0x17f41f96(%rip), %rcx ; __CFConstantStringClassReferencePtr 0x7fff8e2c24fe <+126>: je 0x7fff8e2c2522 ; <+162> 0x7fff8e2c2500 <+128>: leaq -0x17f43fa7(%rip), %rdx ; __CFRuntimeObjCClassTable 0x7fff8e2c2507 <+135>: movq (%rdx,%rax,8), %r14 0x7fff8e2c250b <+139>: cmpq %r14, %rcx 0x7fff8e2c250e <+142>: je 0x7fff8e2c2522 ; <+162> 0x7fff8e2c2510 <+144>: testb $0x1, %cl 0x7fff8e2c2513 <+147>: je 0x7fff8e2c2594 ; <+276> 0x7fff8e2c2515 <+149>: movq %rbx, %rdi 0x7fff8e2c2518 <+152>: callq 0x7fff8e481b6a ; symbol stub for: object_getClass 0x7fff8e2c251d <+157>: cmpq %r14, %rax 0x7fff8e2c2520 <+160>: jne 0x7fff8e2c2594 ; <+276> 0x7fff8e2c2522 <+162>: callq 0x7fff8e481aaa ; symbol stub for: objc_collectableZone 0x7fff8e2c2527 <+167>: movq %rax, %rdi 0x7fff8e2c252a <+170>: movq %rbx, %rsi 0x7fff8e2c252d <+173>: callq 0x7fff8e481594 ; symbol stub for: auto_zone_is_valid_pointer 0x7fff8e2c2532 <+178>: testl %eax, %eax 0x7fff8e2c2534 <+180>: je 0x7fff8e2c255b ; <+219> 0x7fff8e2c2536 <+182>: movl 0x8(%rbx), %eax 0x7fff8e2c2539 <+185>: shrl $0x8, %eax 0x7fff8e2c253c <+188>: andl $0x3ff, %eax 0x7fff8e2c2541 <+193>: leaq -0x17f45ff8(%rip), %rcx ; __CFRuntimeClassTable 0x7fff8e2c2548 <+200>: movq (%rcx,%rax,8), %rax 0x7fff8e2c254c <+204>: testb $0x4, (%rax) 0x7fff8e2c254f <+207>: je 0x7fff8e2c2594 ; <+276> 0x7fff8e2c2551 <+209>: movq %rbx, %rdi 0x7fff8e2c2554 <+212>: callq 0x7fff8e2bf500 ; CFRetain 0x7fff8e2c2559 <+217>: jmp 0x7fff8e2c2594 ; <+276> 0x7fff8e2c255b <+219>: cmpl $0x0, 0xc(%rbx) 0x7fff8e2c255f <+223>: je 
0x7fff8e2c2594 ; <+276> 0x7fff8e2c2561 <+225>: leaq -0x17f72248(%rip), %rsi ; #"storing a non-GC object %p in a GC collection, break on CFCollection_non_gc_storage_error to debug." 0x7fff8e2c2568 <+232>: movl $0x4, %edi 0x7fff8e2c256d <+237>: xorl %eax, %eax 0x7fff8e2c256f <+239>: movq %rbx, %rdx 0x7fff8e2c2572 <+242>: callq 0x7fff8e3b61e0 ; CFLog 0x7fff8e2c2577 <+247>: callq 0x7fff8e3c3290 ; CFCollection_non_gc_storage_error 0x7fff8e2c257c <+252>: movq %rbx, %rdi 0x7fff8e2c257f <+255>: popq %rbx 0x7fff8e2c2580 <+256>: popq %r14 0x7fff8e2c2582 <+258>: popq %rbp 0x7fff8e2c2583 <+259>: jmp 0x7fff8e2bf500 ; CFRetain 0x7fff8e2c2588 <+264>: callq 0x7fff8e2df7f0 ; CFNumberGetTypeID 0x7fff8e2c258d <+269>: jmp 0x7fff8e2c2594 ; <+276> 0x7fff8e2c258f <+271>: callq 0x7fff8e2df850 ; CFDateGetTypeID 0x7fff8e2c2594 <+276>: movq %rbx, %rax 0x7fff8e2c2597 <+279>: popq %rbx 0x7fff8e2c2598 <+280>: popq %r14 0x7fff8e2c259a <+282>: popq %rbp 0x7fff8e2c259b <+283>: retq 0x7fff8e2c259c <+284>: int3 0x7fff8e2c259d <+285>: callq 0x7fff8e482020 ; symbol stub for: getpid 0x7fff8e2c25a2 <+290>: movl $0x9, %esi 0x7fff8e2c25a7 <+295>: movl %eax, %edi 0x7fff8e2c25a9 <+297>: callq 0x7fff8e482080 ; symbol stub for: kill 0x7fff8e2c25ae <+302>: leaq 0x36325e(%rip), %rax ; "*** __CFTypeCollectionRetain() called with NULL; likely a collection has been corrupted ***" 0x7fff8e2c25b5 <+309>: movq %rax, -0x17f460c4(%rip) ; gCRAnnotations + 8 0x7fff8e2c25bc <+316>: int3 -> 0x7fff8e2c25bd <+317>: jmp 0x7fff8e2c259d ; <+285> 0x7fff8e2c25bf <+319>: nop This exception doesn't get thrown if I call clGetProgramInfo with a param name other than CL_PROGRAM_BINARY_SIZES. I cannot figure out the reason why I'm getting this exception. It's puzzling because I'm writing a C++ program, not an ObjC or Swift one, so I don't see why I would be getting any errors related to Core Foundation or retain counts. 
My only guess is that some build setting is configured incorrectly, causing my program to think this is an Objective-C environment, but that seems unlikely. Any help would be appreciated.