inline assembly block with multiple outputs [duplicate]

inline assembly block with multiple outputs [duplicate] - c++

This question already has answers here:
How to invoke a system call via syscall or sysenter in inline assembly?
(2 answers)
Unexpected GCC inline ASM behaviour (clobbered variable overwritten)
(1 answer)
When to use earlyclobber constraint in extended GCC inline assembly?
(2 answers)
inline assembly constraint for value that might be overwritten
(1 answer)
Closed 1 year ago.
How does one specify multiple outputs with an inline asm statement using gcc? I don't follow how the garbage value for ret is printed, but I suspect it's possibly related to both syscall and the mov at the top of the inline assembly section both writing to an output register.
Source:
#include <string.h>
#include <iostream>
int main() {
const char* str = "Hello World\n";
long len = strlen(str);
long ret = 0;
long test = 0;
__asm__ __volatile__ (
"mov $22, %0\n\t"
"movq $1, %%rax \n\t"
"movq $1, %%rdi \n\t"
"movq %2, %%rsi \n\t"
"movl %3, %%edx \n\t"
"syscall"
: "=r"(test), "=g"(ret)
: "g"(str), "g" (len));
std::cout << ret << "\n";
return 0;
}
Output:
Hello World
4202512
Disassembly
Dump of assembler code for function main():
0x0000000000401080 <+0>: sub $0x8,%rsp
0x0000000000401084 <+4>: mov $0x16,%rax
0x000000000040108b <+11>: mov $0x1,%rax
0x0000000000401092 <+18>: mov $0x1,%rdi
0x0000000000401099 <+25>: mov $0x402010,%rsi
0x00000000004010a0 <+32>: mov $0xc,%edx
0x00000000004010a5 <+37>: syscall
0x00000000004010a7 <+39>: mov $0x404080,%edi
0x00000000004010ac <+44>: callq 0x401040 <_ZNSo9_M_insertIlEERSoT_#plt>
0x00000000004010b1 <+49>: mov $0x1,%edx
0x00000000004010b6 <+54>: mov $0x40201b,%esi
0x00000000004010bb <+59>: mov %rax,%rdi
0x00000000004010be <+62>: callq 0x401050 <_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#plt>
0x00000000004010c3 <+67>: xor %eax,%eax
0x00000000004010c5 <+69>: add $0x8,%rsp
0x00000000004010c9 <+73>: retq

Related

Why do I get SIGSEGV when trying to pass a char to my function and how to fix it?

I'm overloading operator= in my class and I need to determine whether my string contains digits or not. Unfortunately, I need to use C arrays (char*) mainly, for 2 reasons:
Training
Using std::string will require me to change 100+ lines of code.
When trying to pass a value from a char* array to my function, I get a really nice SEGFAULT and unfortunately I'm not sure why. Probably I'm not using correctly my pointer.
I was searching for hours but I wasn't able to find a solution to this problem. Anyways, here's the code. If you need further information regarding this matter, please let me know in the comments.
#include <limits.h>
#include <iostream>
#include <string.h>
#include <exception>
struct notdig : public std::exception {
virtual const char* what() const throw() override{
return "int_huge: isdig(*(o + i)) returned false. \nYour declaration of an int_huge integer should contain decimal digits.";
}
};
class int_huge {
private:
char* buffera;
char* bufferb;
int numDig(unsigned long long int number){ //Gets the number of digits
int i = 0;
while(number){
number /= 10;
i++;
}
return i;
}
inline bool isdig( char character ) { //Checks whether character is digit or not
return ( '0' <= character && character <= '9' );
}
public:
int_huge(){
this->buffera = "0";
this->bufferb = "0";
}
void operator=(char* operand){
for (int i = 0; i < strlen(operand); i++){
if (!isdig(operand[i])){
throw notdig();
}
}
if (strlen(operand) >= numDig(ULLONG_MAX)){
if (strlen(operand) - numDig(ULLONG_MAX)){ //Equivalent with if (strlen(operand) != numDig(ULLONG_MAX)
int i = 0;
while (i < strlen(operand)-numDig(ULLONG_MAX)){
this->buffera[i] = operand[i];
i++;
}
this->bufferb = operand + i;
} else {
this->buffera[0] = operand[0];
this->bufferb = operand + 1;
}
} else {
this->buffera = "0";
this->bufferb = operand;
}
}
};
int main() {
int_huge object;
try {
object = "90";
} catch (std::exception &e) {
std::cout << e.what();
}
}
Disassembler results:
0x4019b4 push %ebp
0x4019b5 mov %esp,%ebp
0x4019b7 push %ebx
0x4019b8 sub $0x24,%esp
0x4019bb mov 0xc(%ebp),%eax
0x4019be mov %eax,(%esp)
0x4019c1 call 0x401350 <strlen>
0x4019c6 mov %eax,%ebx
0x4019c8 movl $0xffffffff,0x4(%esp)
0x4019d0 movl $0xffffffff,0x8(%esp)
0x4019d8 mov 0x8(%ebp),%eax
0x4019db mov %eax,(%esp)
0x4019de call 0x40195c <int_huge::numDig(unsigned long long)>
0x4019e3 cmp %eax,%ebx
0x4019e5 setae %al
0x4019e8 test %al,%al
0x4019ea je 0x401aa8 <int_huge::int_huge(char*)+244>
0x4019f0 mov 0xc(%ebp),%eax
0x4019f3 mov %eax,(%esp)
0x4019f6 call 0x401350 <strlen>
0x4019fb mov %eax,%ebx
0x4019fd movl $0xffffffff,0x4(%esp)
0x401a05 movl $0xffffffff,0x8(%esp)
0x401a0d mov 0x8(%ebp),%eax
0x401a10 mov %eax,(%esp)
0x401a13 call 0x40195c <int_huge::numDig(unsigned long long)>
0x401a18 cmp %eax,%ebx
0x401a1a setne %al
0x401a1d test %al,%al
0x401a1f je 0x401a8d <int_huge::int_huge(char*)+217>
0x401a21 movl $0x0,-0xc(%ebp)
0x401a28 mov 0xc(%ebp),%eax
0x401a2b mov %eax,(%esp)
0x401a2e call 0x401350 <strlen>
0x401a33 mov %eax,%ebx
0x401a35 movl $0xffffffff,0x4(%esp)
0x401a3d movl $0xffffffff,0x8(%esp)
0x401a45 mov 0x8(%ebp),%eax
0x401a48 mov %eax,(%esp)
0x401a4b call 0x40195c <int_huge::numDig(unsigned long long)>
0x401a50 sub %eax,%ebx
0x401a52 mov %ebx,%edx
0x401a54 mov -0xc(%ebp),%eax
0x401a57 cmp %eax,%edx
0x401a59 seta %al
0x401a5c test %al,%al
0x401a5e je 0x401a7d <int_huge::int_huge(char*)+201>
0x401a60 mov 0x8(%ebp),%eax
0x401a63 mov (%eax),%edx
0x401a65 mov -0xc(%ebp),%eax
0x401a68 add %eax,%edx
0x401a6a mov -0xc(%ebp),%ecx
0x401a6d mov 0xc(%ebp),%eax
0x401a70 add %ecx,%eax
0x401a72 movzbl (%eax),%eax
0x401a75 mov %al,(%edx) ;This is where the compiler stops. Probably due to SIGSEGV.
0x401a77 addl $0x1,-0xc(%ebp)
0x401a7b jmp 0x401a28 <int_huge::int_huge(char*)+116>
0x401a7d mov -0xc(%ebp),%edx
0x401a80 mov 0xc(%ebp),%eax
0x401a83 add %eax,%edx
0x401a85 mov 0x8(%ebp),%eax
0x401a88 mov %edx,0x4(%eax)
0x401a8b jmp 0x401aba <int_huge::int_huge(char*)+262>
0x401a8d mov 0x8(%ebp),%eax
0x401a90 mov (%eax),%eax
0x401a92 mov 0xc(%ebp),%edx
0x401a95 movzbl (%edx),%edx
0x401a98 mov %dl,(%eax)
0x401a9a mov 0xc(%ebp),%eax
0x401a9d lea 0x1(%eax),%edx
0x401aa0 mov 0x8(%ebp),%eax
0x401aa3 mov %edx,0x4(%eax)
0x401aa6 jmp 0x401aba <int_huge::int_huge(char*)+262>
0x401aa8 mov 0x8(%ebp),%eax
0x401aab movl $0x4030d2,(%eax)
0x401ab1 mov 0x8(%ebp),%eax
0x401ab4 mov 0xc(%ebp),%edx
0x401ab7 mov %edx,0x4(%eax)
0x401aba nop
0x401abb add $0x24,%esp
0x401abe pop %ebx
0x401abf pop %ebp
0x401ac0 ret

In
MyClass(){
this->a = "0";
this->b = "0";
}
and in main at
o = "90";
string literals, which may not be in writable storage, are assigned to non-constant pointers to char. The compiler should be warning you about this or outright refusing to compile if the compiler supports the C++11 or newer standards.
Where this blows up is in operator= here:
this->a[i] = o[i];
and here:
this->a[0] = o[0];
as the program attempts to write storage that cannot be written.
Solution:
Use std::string. If that is off the table, and it sounds like it is, don't use string literals. Allocate a buffer in writable memory with new, copy the literal into the buffer, and then assign the buffer. And also remember that the buffers are of a fixed size. Trying to place a larger string in the buffer will end in disaster.
This will be a memory management headache and a potential Rule of Three horror show, so be careful.

Error in simple g++ inline assembler

I'm trying to write a "hello world" program to test inline assembler in g++.
(still leaning AT&T syntax)
The code is:
#include <stdlib.h>
#include <stdio.h>
# include <iostream>
using namespace std;
int main() {
int c,d;
__asm__ __volatile__ (
"mov %eax,1; \n\t"
"cpuid; \n\t"
"mov %edx, $d; \n\t"
"mov %ecx, $c; \n\t"
);
cout << c << " " << d << "\n";
return 0;
}
I'm getting the following error:
inline1.cpp: Assembler messages:
inline1.cpp:18: Error: unsupported instruction `mov'
inline1.cpp:19: Error: unsupported instruction `mov'
Can you help me to get it done?
Tks

Your assembly code is not valid. Please carefully read on Extended Asm. Here's another good overview.
Here is a CPUID example code from here:
static inline void cpuid(int code, uint32_t* a, uint32_t* d)
{
asm volatile ( "cpuid" : "=a"(*a), "=d"(*d) : "0"(code) : "ebx", "ecx" );
}
Note the format:
first : followed by output operands: : "=a"(*a), "=d"(*d); "=a" is eax and "=b is ebx
second : followed by input operands: : "0"(code); "0" means that code should occupy the same location as output operand 0 (eax in this case)
third : followed by clobbered registers list: : "ebx", "ecx"

I kept #AMA answer as accepted one because it was complete enough. But I've put some thought on it and I concluded that it is not 100% correct.
The code I was trying to implement in GCC is the one below (Microsoft Visual Studio version).
int c,d;
_asm
{
mov eax, 1;
cpuid;
mov d, edx;
mov c, ecx;
}
When cpuid executes with eax set to 1, feature information is returned in ecx and edx.
The suggested code returns the values from eax ("=a") and edx (="d").
This can be easily seen at gdb:
(gdb) disassemble cpuid
Dump of assembler code for function cpuid(int, uint32_t*, uint32_t*):
0x0000000000000a2a <+0>: push %rbp
0x0000000000000a2b <+1>: mov %rsp,%rbp
0x0000000000000a2e <+4>: push %rbx
0x0000000000000a2f <+5>: mov %edi,-0xc(%rbp)
0x0000000000000a32 <+8>: mov %rsi,-0x18(%rbp)
0x0000000000000a36 <+12>: mov %rdx,-0x20(%rbp)
0x0000000000000a3a <+16>: mov -0xc(%rbp),%eax
0x0000000000000a3d <+19>: cpuid
0x0000000000000a3f <+21>: mov -0x18(%rbp),%rcx
0x0000000000000a43 <+25>: mov %eax,(%rcx) <== HERE
0x0000000000000a45 <+27>: mov -0x20(%rbp),%rax
0x0000000000000a49 <+31>: mov %edx,(%rax) <== HERE
0x0000000000000a4b <+33>: nop
0x0000000000000a4c <+34>: pop %rbx
0x0000000000000a4d <+35>: pop %rbp
0x0000000000000a4e <+36>: retq
End of assembler dump.
The code that generates something closer to what I want is (EDITED based on feedbacks on the comments):
static inline void cpuid2(uint32_t* d, uint32_t* c)
{
int a = 1;
asm volatile ( "cpuid" : "=d"(*d), "=c"(*c), "+a"(a) :: "ebx" );
}
The result is:
(gdb) disassemble cpuid2
Dump of assembler code for function cpuid2(uint32_t*, uint32_t*):
0x00000000000009b0 <+0>: push %rbp
0x00000000000009b1 <+1>: mov %rsp,%rbp
0x00000000000009b4 <+4>: push %rbx
0x00000000000009b5 <+5>: mov %rdi,-0x20(%rbp)
0x00000000000009b9 <+9>: mov %rsi,-0x28(%rbp)
0x00000000000009bd <+13>: movl $0x1,-0xc(%rbp)
0x00000000000009c4 <+20>: mov -0xc(%rbp),%eax
0x00000000000009c7 <+23>: cpuid
0x00000000000009c9 <+25>: mov %edx,%esi
0x00000000000009cb <+27>: mov -0x20(%rbp),%rdx
0x00000000000009cf <+31>: mov %esi,(%rdx)
0x00000000000009d1 <+33>: mov -0x28(%rbp),%rdx
0x00000000000009d5 <+37>: mov %ecx,(%rdx)
0x00000000000009d7 <+39>: mov %eax,-0xc(%rbp)
0x00000000000009da <+42>: nop
0x00000000000009db <+43>: pop %rbx
0x00000000000009dc <+44>: pop %rbp
0x00000000000009dd <+45>: retq
End of assembler dump.
Just to be clear... I know that there are better ways of doing it. But the purpose here is purely educational. Just want to understand how it works ;-)
-- edited (removed personal opinion) ---

gcc -O0 outperforming -O3 on matrix sizes that are powers of 2 (matrix transpositions)

(For testing purposes) I have written a simple Method to calculate the transpose of a nxn Matrix
void transpose(const size_t _n, double* _A) {
for(uint i=0; i < _n; ++i) {
for(uint j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
When using optimization levels O3 or Ofast I expected the compiler to unroll some loops which would lead to higher performance especially when the matrix size is a multiple of 2 (i.e., the double loop body can be performed each iteration) or similar. Instead what I measured was the exact opposite. Powers of 2 actually show a significant spike in execution time.
These spikes are also at regular intervals of 64, more pronounced at intervals of 128 and so on. Each spike extends to the neighboring matrix sizes like in the following table
size n time(us)
1020 2649
1021 2815
1022 3100
1023 5428
1024 15791
1025 6778
1026 3106
1027 2847
1028 2660
1029 3038
1030 2613
I compiled with a gcc version 4.8.2 but the same thing happens with a clang 3.5 so this might be some generic thing?
So my question basically is: Why is there this periodic increase in execution time? Is it some generic thing coming with any of the optimization options (as it happens with clang and gcc alike)? If so which optimization option is causing this?
And how can this be so significant that even the O0 version of the program outperforms the 03 version at multiples of 512?
EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024x1024 matrix with optimization actually takes as much time as transposing a 1300x1300 matrix without optimization. If this is a cache-fault / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program, that it fails for powers of two, just to recover high performance for slightly larger matrices. Shouldn't cache-faults create more of a step-like pattern? Why does the execution times go down again at all? (and why should optimization create cache-faults that weren't there before?)
EDIT: the following should be the assembler codes that gcc produced
no optimization (O0):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov QWORD PTR [rbp-32], rsi
mov DWORD PTR [rbp-4], 0
jmp .L2
.L5:
mov eax, DWORD PTR [rbp-4]
add eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L3
.L4:
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rax, rdx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rbp-16], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-32]
mov rcx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rsi, rax
mov eax, DWORD PTR [rbp-4]
add rax, rsi
sal rax, 3
add rax, rcx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rdx], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-4]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-16]
mov QWORD PTR [rdx], rax
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-8]
cmp rax, QWORD PTR [rbp-24]
jb .L4
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp rax, QWORD PTR [rbp-24]
jb .L5
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits
with optimization (O3)
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xor r11d, r11d
xor ebx, ebx
.L2:
cmp r11, rdi
mov r9, r11
jae .L10
.p2align 4,,10
.p2align 3
.L7:
add ebx, 1
mov r11d, ebx
cmp rdi, r11
mov rax, r11
jbe .L2
mov r10, r9
mov r8, QWORD PTR [rsi]
mov edx, ebx
imul r10, rdi
.p2align 4,,10
.p2align 3
.L6:
lea rcx, [rax+r10]
add edx, 1
imul rax, rdi
lea rcx, [r8+rcx*8]
movsd xmm0, QWORD PTR [rcx]
add rax, r9
lea rax, [r8+rax*8]
movsd xmm1, QWORD PTR [rax]
movsd QWORD PTR [rcx], xmm1
movsd QWORD PTR [rax], xmm0
mov eax, edx
cmp rdi, rax
ja .L6
cmp r11, rdi
mov r9, r11
jb .L7
.L10:
pop rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits

The periodic increase of execution time must be due to the cache being only N-way associative instead of fully associative. You are witnessing hash collision related to cache line selection algorithm.
The fastest L1 cache has a smaller number of cache lines than the next level L2. In each level each cache line can be filled only from a limited set of sources.
Typical HW implementations of cache line selection algorithms will just use few bits from the memory address to determine in which cache slot the data should be written -- in HW bit shifts are free.
This causes a competition between memory ranges e.g. between addresses 0x300010 and 0x341010.
In fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:
for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];
But when the number of the inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache forces another input out of the cache.
// one possible method of optimization with 2 outputs and 6 inputs
// with two unrelated execution paths -- should be faster, but maybe it isn't
for (i=0;i<500;i++) {
a[i] += b[i] * c[i] + d[i];
a[i+500] += b[i+500] * c[i+500] + d[i+500];
}
A graph in Example 5: Cache Associativity illustrates 512 byte offset between matrix lines being a global worst case dimension for the particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension char matrix[512][512 + 64].

The improvement in performance is likely related to CPU/RAM caching.
When the data is not a power of 2, a cache line load (like 16, 32, or 64 words) transfers more than the data that is required tying up the bus—uselessly as it turns out. For a data set which is a power of 2, all of the pre-fetched data is used.
I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!

Comment with code: In the -O3 case, with
#include <cstdlib>
extern void transpose(const size_t n, double* a)
{
for (size_t i = 0; i < n; ++i) {
for (size_t j = i + 1; j < n; ++j) {
std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
}
}
}
compiling with
$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3
I get
_Z9transposemPd:
.LFB68:
.cfi_startproc
.LBB2:
testq %rdi, %rdi
je .L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
.LBB3:
addq $1, %r8
leaq -8(%r10), %rcx
cmpq %rdi, %r8
leaq (%rsi,%rcx), %r9
je .L1
.p2align 4,,10
.p2align 3
.L10:
movq %r9, %rdx
movq %r8, %rax
.p2align 4,,10
.p2align 3
.L5:
.LBB4:
movsd (%rdx), %xmm1
movsd (%rsi,%rax,8), %xmm0
movsd %xmm1, (%rsi,%rax,8)
.LBE4:
addq $1, %rax
.LBB5:
movsd %xmm0, (%rdx)
addq %rcx, %rdx
.LBE5:
cmpq %rdi, %rax
jne .L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne .L10
.L1:
rep ret
.LBE3:
.LBE2:
.cfi_endproc
And something quite different if I add -m32.
(Note: it makes no difference to the assembly whether I use std::swap or your variant)
In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.

To add to others: g++ -std=c++11 -march=core2 -O3 -c -S - gcc version 4.8.2 (MacPorts gcc48 4.8.2_0) - x86_64-apple-darwin13.0.0 :
__Z9transposemPd:
LFB0:
testq %rdi, %rdi
je L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
leaq -8(%r10), %rcx
addq $1, %r8
leaq (%rsi,%rcx), %r9
cmpq %rdi, %r8
je L1
.align 4,0x90
L10:
movq %r9, %rdx
movq %r8, %rax
.align 4,0x90
L5:
movsd (%rdx), %xmm0
movsd (%rsi,%rax,8), %xmm1
movsd %xmm0, (%rsi,%rax,8)
addq $1, %rax
movsd %xmm1, (%rdx)
addq %rcx, %rdx
cmpq %rdi, %rax
jne L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne L10
L1:
rep; ret
Basically the same as #ksfone's code, for:
#include <cstddef>
void transpose(const size_t _n, double* _A) {
for(size_t i=0; i < _n; ++i) {
for(size_t j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
Apart from the Mach-O 'as' differences (extra underscore, align and DWARF locations), it's the same. But very different from the OP's assembly output. A much 'tighter' inner loop.

How to know if loop has optimization potential

My code spends a considerable amount of time in the following loop. I used gcc-4.8 with -O3 -march=native to compile the code. Since I am an absolute newbie in optimization, how do I know if the compiler did all it could? I am running on a AMD FX(tm)-6200
float* __restrict__ ApsiPtr = Apsi.begin();
const float* const __restrict__ psiPtr = psi.begin();
const float* const __restrict__ diagPtr = diag().begin();
register const label nCells = diag().size();
for (register label cell=0; cell<nCells; cell++) {
ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
}
ddd dumps me the following assembler code.
Dump of assembler code from 0x7ffff1d32010 to 0x7ffff1d32110:¬
=> mov (%rsp),%rdi¬
callq 0x7ffff1ba5ca0 <_ZNK4Foam9lduMatrix4diagEv#plt>¬
mov 0x8(%rax),%edx¬
test %edx,%edx¬
mov %edx,%esi¬
mov %edx,0x34(%rsp)¬
jle 0x7ffff1d3251f ¬
mov 0x38(%rsp),%r10¬
lea 0x10(%rbx),%r8¬
lea 0x10(%rbp),%rax¬
lea 0x10(%r10),%rdi¬
cmp %rdi,%rbx¬
setae %r9b¬
cmp %r8,%r10¬
setae %r11b¬
or %r11d,%r9d¬
cmp %rax,%rbx¬
setae %dl¬
cmp %r8,%rbp¬
setae %cl¬
or %ecx,%edx¬
test %dl,%r9b¬
je 0x7ffff1d32728 ¬
cmp $0xc,%esi¬
jbe 0x7ffff1d32728 ¬
shr %esi¬
lea 0x40(%r10),%rax¬
mov %rbx,%rdx¬
lea -0x5(%rsi),%r8d¬
lea (%rsi,%rsi,1),%r9d¬
mov %esi,0x38(%rsp)¬
shr $0x2,%r8d¬
mov %r9d,0x4c(%rsp)¬
mov %rbp,%rsi¬
shl $0x6,%r8¬
mov $0x0,%r9d¬
lea 0x80(%r10,%r8,1),%r11¬
mov %r11,%rcx¬
sub %rax,%rcx¬
and $0x40,%ecx¬
movupd -0x40(%rax),%xmm0¬
movupd 0x0(%rbp),%xmm1¬
prefetcht0 0x1e0(%r10)¬
prefetcht0 0x1e0(%rbp)¬
prefetchw 0x1e0(%rbx)¬
mov $0x4,%r9d¬
mulpd %xmm1,%xmm0¬
lea 0x40(%rbp),%rsi¬
lea 0x40(%rbx),%rdx¬
mov %r10,0x40(%rsp)¬
movlpd %xmm0,(%rbx)¬
movhpd %xmm0,0x8(%rbx)¬
movupd -0x30(%rax),%xmm2¬
movupd 0x10(%rbp),%xmm3¬
mulpd %xmm3,%xmm2¬
movlpd %xmm2,0x10(%rbx)¬
movhpd %xmm2,0x18(%rbx)¬
movupd -0x20(%rax),%xmm4¬
movupd 0x20(%rbp),%xmm5¬
End of assembler dump.¬

Porting Inline GASM to x64 MASM Access Violation Issue

I am currently porting some code to MS Windows x64 from the https://github.com/mono project which was written for GCC Linux and I am having some challenges.
Currently I am unsure if my translation from x64 AT&T inline ASM to x64 MASM is correct. It compiles fine but my test case fails as memcpy throws exceptions/memory access violations after my ASM function executes. Is my translation correct?
One of the things I was really challenged by was the fact that rip is not accessible in Windows x64 MASM? I really don't know how to translate those remaining lines of the AT&T syntax (see below). But I gave it a best try. Did I handle the lack of rip access correctly?
If my work is correct then why is memcpy failing?
Here is the related C++:
void mono_context_get_current(MonoContext cnt); //declare the ASM func
//Pass the static struct pointer to the ASM function mono_context_get_current
//The purpose here is to clobber it
#ifdef _MSC_VER
#define MONO_CONTEXT_GET_CURRENT(ctx) do { \
mono_context_get_current(ctx); \
} while (0)
#endif
static MonoContext cur_thread_ctx = {0};
MONO_CONTEXT_GET_CURRENT (cur_thread_ctx);
memcpy (&info->ctx, &cur_thread_ctx, sizeof (MonoContext)); //memcpy throws Exception.
Here is the current ASM function.
mono_context_get_current PROTO
.code
mono_context_get_current PROC
mov rax, rcx ;Assume that rcx contains the pointer being passed
mov [rax+00h], rax
mov [rax+08h], rbx
mov [rax+10h], rcx
mov [rax+18h], rdx ;purpose is to offset from my understanding of the GCC assembly
mov [rax+20h], rbp
mov [rax+28h], rsp
mov [rax+30h], rsi
mov [rax+38h], rdi
mov [rax+40h], r8
mov [rax+48h], r9
mov [rax+50h], r10
mov [rax+58h], r11
mov [rax+60h], r12
mov [rax+68h], r13
mov [rax+70h], r14
mov [rax+78h], r15
call $ + 5
mov rdx, [rax+80h]
pop rdx
mono_context_get_current ENDP
END
To my understanding the rcx register should contain the struct pointer and that I should be using rdx to pop.
As I mentioned I have GCC ASM for non-Win64 platforms which appears to work on those platforms. This is what that code looks like:
#define MONO_CONTEXT_GET_CURRENT(ctx) \
__asm__ __volatile__( \
"movq $0x0, 0x00(%0)\n" \
"movq %%rbx, 0x08(%0)\n" \
"movq %%rcx, 0x10(%0)\n" \
"movq %%rdx, 0x18(%0)\n" \
"movq %%rbp, 0x20(%0)\n" \
"movq %%rsp, 0x28(%0)\n" \
"movq %%rsi, 0x30(%0)\n" \
"movq %%rdi, 0x38(%0)\n" \
"movq %%r8, 0x40(%0)\n" \
"movq %%r9, 0x48(%0)\n" \
"movq %%r10, 0x50(%0)\n" \
"movq %%r11, 0x58(%0)\n" \
"movq %%r12, 0x60(%0)\n" \
"movq %%r13, 0x68(%0)\n" \
"movq %%r14, 0x70(%0)\n" \
"movq %%r15, 0x78(%0)\n" \
"leaq (%%rip), %%rdx\n" \
"movq %%rdx, 0x80(%0)\n" \
: \
: "a" (&(ctx)) \
: "rdx", "memory")
Thanks for any help you may be able to offer! I'll be the first to admit my assembly is pretty rusty.

You can let gcc create the asm file for you (gcc can produce MASM syntax as well):
gcc -S -masm=intel myfile.c

Comparing between the two versions there appears to be some discrepancy:
movq $0x0, 0x00(%0)
It doesn't look like rax is being saved but instead that memory slot is zero'ed out.
leaq (%%rip), %%rdx
You should be able to translate that into intel synatx:
lea rdx, [rip]
which is valid if you're using 64-bit relative addressing mode.
And this line is incorrectly translated from att:
call $ + 5
mov rdx, [rax+80h] ; looks reversed
pop rdx
Here's how I've translated the original gas syntax above:
mov qword ptr [rcx], 0
mov [rcx + 0x08], rbx
mov [rcx + 0x10], rax
mov [rcx + 0x18], rdx
mov [rcx + 0x20], rbp
mov [rcx + 0x28], rsp
mov [rcx + 0x30], rsi
mov [rcx + 0x38], rdi
mov [rcx + 0x40], r8
mov [rcx + 0x48], r9
mov [rcx + 0x50], r10
mov [rcx + 0x58], r11
mov [rcx + 0x60], r12
mov [rcx + 0x68], r13
mov [rcx + 0x70], r14
mov [rcx + 0x78], r15
lea rdx, [rip]
mov [rcx + 0x80], rdx
mov rdx, [rcx + 0x18] ; restore old rdx since it's on clobber list
Note that I switched rcx around with rax just to save an extra mov. So rax gets saved in place of rcx in the gas syntax. You might need to modify this depending on your invariants.
If it still crashes I'd advise stepping through it with a debugger.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

inline assembly block with multiple outputs [duplicate] - c++

Related

Why do I get SIGSEGV when trying to pass a char to my function and how to fix it?

Error in simple g++ inline assembler

gcc -O0 outperforming -O3 on matrix sizes that are powers of 2 (matrix transpositions)

How to know if loop has optimization potential

Porting Inline GASM to x64 MASM Access Violation Issue

Categories

Resources