mmap Mac: Segmentation fault - c++

The following on my Mac succeeds:
// Maps 100 bytes through the libc mmap() wrapper and stores one int at the start of the mapping.
int main() {
// prot = 1 | 2 and flags = 2 | 4096 (0x3 and 0x1002 in the dtruss trace); presumably
// PROT_READ | PROT_WRITE and MAP_PRIVATE | MAP_ANON on macOS -- TODO confirm against <sys/mman.h>.
int* addr = (int*) mmap(0, 100, 1 | 2, 2 | 4096, -1, 0);
// NOTE(review): the result is not compared against MAP_FAILED before the store.
*addr = 25;
return 0;
}
However the below code is identical but fails when I try to write to *addr with segmentation fault:
// Issues the same mapping request through the generic syscall() entry point instead of mmap().
int main() {
// The disassembly of this version sign-extends %eax (movslq) after the call, i.e. only the
// low 32 bits of the value returned by syscall() reach the int* below.
int* addr = (int*) syscall(SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
// This store is where the reported segmentation fault happens.
*addr = 25;
return 0;
}
I.e. syscall successfully returns me a memory address, but when I try writing to it it fails.
I compile it like this:
g++ ./c++/mmap.cc -o ./mmap && ./mmap
If I run both versions with dtruss:
g++ ./c++/mmap.cc -o ./mmap && sudo dtruss ./mmap
then both versions succeed and I see an identical mmap call for both:
mmap(0x0, 0x64, 0x3, 0x1002, 0xFFFFFFFF, 0x0) = 0xXXXXXXX 0
Why does the syscall version give me segmentation fault, what am I missing?
P.S. If I do something similar on Linux it works fine.
So, as I understand it, the mmap function on Mac does not execute syscall(SYS_mmap, ...). What does it do then? Can anyone please give me some links where I can see the implementation?
EDIT:
It looks like syscall on Mac returns only first 4 bytes. Is there a 64-bit syscall version?
DISASSEMBLED:
mmap version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 xorl %eax, %eax
0000000100000cfa movl %eax, %ecx
0000000100000cfc movl $0x64, %eax
0000000100000d01 movl %eax, %esi
0000000100000d03 movl $0x3, %edx
0000000100000d08 movl $0x1002, %eax
0000000100000d0d movl $0xffffffff, %r8d
0000000100000d13 movl $0x0, -0x14(%rbp)
0000000100000d1a movq %rcx, %rdi
0000000100000d1d movq %rcx, -0x28(%rbp)
0000000100000d21 movl %eax, %ecx
0000000100000d23 movq -0x28(%rbp), %r9
0000000100000d27 callq 0x100000ed6 ## symbol stub for: _mmap
0000000100000d2c movq 0x2cd(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d33 movq %rax, -0x20(%rbp)
0000000100000d37 movq -0x20(%rbp), %rax
0000000100000d3b movq %rax, %rsi
syscall version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 movl $0xc5, %edi
0000000100000cfd xorl %esi, %esi
0000000100000cff movl $0x64, %edx
0000000100000d04 movl $0x3, %ecx
0000000100000d09 movl $0x1002, %r8d
0000000100000d0f movl $0xffffffff, %r9d
0000000100000d15 movl $0x0, -0x14(%rbp)
0000000100000d1c movl $0x0, (%rsp)
0000000100000d23 movb $0x0, %al
0000000100000d25 callq 0x100000ed6 ## symbol stub for: _syscall
0000000100000d2a movq 0x2cf(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d31 movslq %eax, %r10
0000000100000d34 movq %r10, -0x20(%rbp)
0000000100000d38 movq -0x20(%rbp), %r10
0000000100000d3c movq %r10, %rsi

Apparently Mac does not have a 64-bit syscall function; here is a simple implementation:
#include <sys/types.h>
#define CARRY_FLAG_BIT 1
/*
 * Raw six-argument system call using the x86-64 `syscall` instruction, returning
 * the full 64-bit result.
 *
 * num        - system call number, placed in rax.
 * arg1..arg3 - placed in rdi, rsi and rdx.
 * arg4..arg6 - copied into r10, r8 and r9 just before the trap.
 *
 * Returns the value left in rax. If the carry bit is set in the flags word
 * captured from r11 after the trap, the call is treated as failed and the
 * value is returned negated.
 */
inline int64_t syscall6(int64_t num, int64_t arg1, int64_t arg2, int64_t arg3, int64_t arg4, int64_t arg5, int64_t arg6) {
    int64_t result;
    int64_t flags;
    /* rdx is read-write: a kernel may hand back a second result there, so the
     * compiler must not assume it still holds arg3 afterwards.
     * NOTE(review): confirm against the target kernel's return convention. */
    int64_t rdx_io = arg3;

    __asm__ __volatile__ (
        "movq %[a4], %%r10\n\t"
        "movq %[a5], %%r8\n\t"
        "movq %[a6], %%r9\n\t"
        "syscall\n\t"
        "movq %%r11, %[fl]\n\t"
        : "=a" (result), [fl] "=r" (flags), "+d" (rdx_io)
        : "a" (num), "D" (arg1), "S" (arg2), [a4] "r" (arg4), [a5] "r" (arg5), [a6] "r" (arg6)
        /* `syscall` itself overwrites rcx and r11. "memory" is required because
         * the kernel reads and writes user memory through pointer arguments, so
         * the compiler must not cache or reorder memory accesses across the trap. */
        : "r10", "r8", "r9", "rcx", "r11", "memory", "cc"
    );
    (void) rdx_io;
    return (flags & CARRY_FLAG_BIT) ? -result : result;
}
And you use it on mac by shifting system call numbers by 0x2000000:
int* addr = (int*) syscall6(0x2000000 + SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
You can find more here.

Related

Vectorization of sin and cos

I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write:
#include <cmath>
#define NN 512
typedef float T;
typedef T __attribute__((aligned(NN))) AT;
// Scalar kernel applied to each element: single-precision sine of x via sinf().
inline T s(const T x)
{
return sinf(x);
}
// Writes s(x[i]) into y[i] for every i in [0, length).
// x and y are __restrict-qualified pointers to the over-aligned element type AT.
void func(AT* __restrict x, AT* __restrict y, int length)
{
// Any length with a low bit set below NN is declared unreachable, so the
// compiler may assume length is a multiple of NN.
if (length & NN-1) __builtin_unreachable();
for (int i = 0; i < length; i++)
{
y[i] = s(x[i]);
}
}
compile with gcc 6.2 and -O3 -march=native -ffast-math and get
func(float*, float*, int):
testl %edx, %edx
jle .L10
leaq 8(%rsp), %r10
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
pushq %r14
xorl %r14d, %r14d
pushq %r13
leal -8(%rdx), %r13d
pushq %r12
shrl $3, %r13d
movq %rsi, %r12
pushq %r10
addl $1, %r13d
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
.L4:
vmovaps (%rbx), %ymm0
addl $1, %r14d
addq $32, %r12
addq $32, %rbx
call _ZGVcN8v_sinf // YAY! Vectorized trig!
vmovaps %ymm0, -32(%r12)
cmpl %r13d, %r14d
jb .L4
vzeroupper
addq $8, %rsp
popq %rbx
popq %r10
popq %r12
popq %r13
popq %r14
popq %rbp
leaq -8(%r10), %rsp
.L10:
ret
But when I add a cosine to the function, there is no vectorization:
#include <cmath>
#define NN 512
typedef float T;
typedef T __attribute__((aligned(NN))) AT;
// Scalar kernel applied to each element: cosf(x) + sinf(x) of the same argument.
inline T f(const T x)
{
return cosf(x)+sinf(x);
}
// Writes f(x[i]) into y[i] for every i in [0, length).
// x and y are __restrict-qualified pointers to the over-aligned element type AT.
void func(AT* __restrict x, AT* __restrict y, int length)
{
// Any length with a low bit set below NN is declared unreachable, so the
// compiler may assume length is a multiple of NN.
if (length & NN-1) __builtin_unreachable();
for (int i = 0; i < length; i++)
{
y[i] = f(x[i]);
}
}
which gives:
func(float*, float*, int):
testl %edx, %edx
jle .L10
pushq %r12
leal -1(%rdx), %eax
pushq %rbp
leaq 4(%rdi,%rax,4), %r12
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $16, %rsp
.L4:
vmovss (%rbx), %xmm0
leaq 8(%rsp), %rsi
addq $4, %rbx
addq $4, %rbp
leaq 12(%rsp), %rdi
call sincosf // No vectorization
vmovss 12(%rsp), %xmm0
vaddss 8(%rsp), %xmm0, %xmm0
vmovss %xmm0, -4(%rbp)
cmpq %rbx, %r12
jne .L4
addq $16, %rsp
popq %rbx
popq %rbp
popq %r12
.L10:
ret
I see two good alternatives. Either call a vectorized version of sincosf or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about complex float, of which there is none.
Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example?
(As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)

C function runs on Mac OS X 10.11 or earlier or on Windows, but crashes in Mac OS X 10.12

I have a dynamically loadable bundle on Mac OS X and it crashes on Mac OS X Sierra. It works fine on earlier Mac OS X versions (at least on 64-bit) and also (this is a cross-platform plug-in and this code is shared) on Windows, both 32- and 64-bit. The bundle is compiled on Mac OS X 10.11 with clang in universal mode without optimizations against 10.11 SDK; the minimal OS version was set first to 10.4, then to 10.11 without any effect.
It crashes in a rather simple function. I don't have access to a computer with Sierra, so all I have is a crash log, the binary that crashes, and the source. I'm trying to read the assembly dump to understand what might be wrong. So far everything seems to be OK (with some redundant operations, but no error), but I'm not fluent with assembly. Below are the relevant parts:
The crash log:
Exception Type: EXC_BAD_ACCESS (SIGSEGV)
Exception Codes: KERN_INVALID_ADDRESS at 0x0000140009f3cabc
Exception Note: EXC_CORPSE_NOTIFY
...
Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
0 ... 0x0000000109f2825b Do_GetString2(unsigned char, unsigned long, unsigned short*) + 171
...
Thread 0 crashed with X86 Thread State (64-bit):
rax: 0x0000000000000000 rbx: 0x00007fcb95adc528 rcx: 0x0000140009f3cabc rdx: 0x0000000000000000
rdi: 0x0000000000000082 rsi: 0x0000000000000100 rbp: 0x00007fff5b5f5f10 rsp: 0x00007fff5b5f5f10
r8: 0x00000000000014c0 r9: 0x0000000000000001 r10: 0x0000000106c823fc r11: 0x0000000106c82400
r12: 0x0000000000000082 r13: 0x0000000000000000 r14: 0x00007fcb95adc552 r15: 0xffffffffffffffff
rip: 0x0000000109f2825b rfl: 0x0000000000010287 cr2: 0x0000140009f3cabc
The Do_GetString2 is a simple function that copies a few short ASCII strings into wider UTF-16 strings:
/*
 * Widens one of the built-in strings into a buffer of 16-bit units.
 *
 * type   - selects the string: 128, 129 and 131 map to strings[0],
 *          strings[1] and strings[2] respectively.
 * limit  - maximum number of elements written to target.
 * target - destination buffer; must have room for `limit` elements.
 *
 * Copying stops at `limit` or at the source terminator, whichever comes
 * first. No terminator is written to target. For any other `type` the
 * function returns without touching target.
 */
static void
Do_GetString2(uint8_t type, uint64_t limit, uint16_t *target)
{
    const char *source;
    uint64_t i;

    switch (type) {
    case 128: source = strings[0]; break;
    case 129: source = strings[1]; break;
    case 131: source = strings[2]; break;
    default:
        /* Without this, an unmatched type leaves `source` uninitialized and
         * the loop below dereferences a garbage pointer. */
        return;
    }

    for (i = 0; i < limit && source[i] != '\0'; ++i)
        target[i] = source[i];
}
strings is like that:
// String table read by Do_GetString2, which indexes entries 0, 1 and 2.
static const char* strings[] = {
"... A ...",
"... B ...",
"... C ..."
};
The assembly code is below. The crashing address, as far as I understand, is (+171) 000000000000125b, which means it crashes when it tries to read source[i], the very first character (i is 0). The type parameter is 131, the limit is 256 (0x100). (I could be wrong here, as I said, I'm not that familiar with assembly.) This is probably the very first call to the library.
__ZL13Do_GetString2hmPt:
00000000000011b0 pushq %rbp
00000000000011b1 movq %rsp, %rbp
00000000000011b4 movb %dil, %al
00000000000011b7 movb %al, -0x1(%rbp)
00000000000011ba movq %rsi, -0x10(%rbp)
00000000000011be movq %rdx, -0x18(%rbp)
00000000000011c2 movzbl -0x1(%rbp), %edi
00000000000011c6 movl %edi, %ecx
00000000000011c8 subl $0x80, %ecx
00000000000011ce movl %edi, -0x2c(%rbp)
00000000000011d1 movl %ecx, -0x30(%rbp)
00000000000011d4 je 0x121b
00000000000011da jmp 0x11df
00000000000011df movl -0x2c(%rbp), %eax
00000000000011e2 subl $0x81, %eax
00000000000011e7 movl %eax, -0x34(%rbp)
00000000000011ea je 0x122b
00000000000011f0 jmp 0x11f5
00000000000011f5 movl -0x2c(%rbp), %eax
00000000000011f8 subl $0x83, %eax
00000000000011fd movl %eax, -0x38(%rbp)
0000000000001200 jne 0x1236
0000000000001206 jmp 0x120b
000000000000120b movq __ZL14plugin_strings(%rip), %rax ## plugin_strings
0000000000001212 movq %rax, -0x20(%rbp)
0000000000001216 jmp 0x1236
000000000000121b movq 0x15406(%rip), %rax
0000000000001222 movq %rax, -0x20(%rbp)
0000000000001226 jmp 0x1236
000000000000122b movq 0x153fe(%rip), %rax
0000000000001232 movq %rax, -0x20(%rbp)
0000000000001236 movq $0x0, -0x28(%rbp)
000000000000123e xorl %eax, %eax
0000000000001240 movb %al, %cl
0000000000001242 movq -0x28(%rbp), %rdx
0000000000001246 cmpq -0x10(%rbp), %rdx
000000000000124a movb %cl, -0x39(%rbp)
000000000000124d jae 0x126d
0000000000001253 movq -0x28(%rbp), %rax
0000000000001257 movq -0x20(%rbp), %rcx
000000000000125b movsbl (%rcx,%rax), %edx # CRASHES HERE
000000000000125f cmpl $0x0, %edx
0000000000001265 setne %sil
0000000000001269 movb %sil, -0x39(%rbp)
000000000000126d movb -0x39(%rbp), %al
0000000000001270 testb $0x1, %al
0000000000001272 jne 0x127d
0000000000001278 jmp 0x12ad
000000000000127d movq -0x28(%rbp), %rax
0000000000001281 movq -0x20(%rbp), %rcx
0000000000001285 movb (%rcx,%rax), %dl
0000000000001288 movsbl %dl, %esi
000000000000128b movw %si, %di
000000000000128e movq -0x28(%rbp), %rax
0000000000001292 movq -0x18(%rbp), %rcx
0000000000001296 movw %di, (%rcx,%rax,2)
000000000000129a movq -0x28(%rbp), %rax
000000000000129e addq $0x1, %rax
00000000000012a4 movq %rax, -0x28(%rbp)
00000000000012a8 jmp 0x123e
00000000000012ad popq %rbp
00000000000012ae retq
00000000000012af nop
Does anyone see any error here? What could I try to debug this crash? This is a GUI application, so I'm not sure I can debug it with gdb or something like that over SSH.

gcc assembly when passing by reference and by value

I have a simple function that computes a product of
two double arrays:
#include <stdlib.h>
#include <emmintrin.h>
// Bundle of three array pointers: f() loads through x and y and stores through z.
struct S {
double *x; // first input array
double *y; // second input array
double *z; // output array
};
// Multiplies s.x and s.y element-wise, two doubles per iteration, and stores the products in s.z.
// s is taken by reference, so the three pointers are read through s inside the loop body.
void f(S& s, size_t n) {
// NOTE(review): i is int while n is size_t, so the loop condition mixes signedness.
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys) );
}
return;
}
// Allocates three 16-byte-aligned arrays of four doubles, fills the two inputs,
// runs f() once and releases the arrays. Returns 1 if an allocation fails.
int main(void) {
    S s;
    size_t size = 4;
    void *blocks[3] = { NULL, NULL, NULL };

    for (int k = 0; k < 3; ++k) {
        // posix_memalign reports failure through its return value.
        if (posix_memalign(&blocks[k], 16, sizeof(double) * size) != 0) {
            while (k > 0)
                free(blocks[--k]);
            return 1;
        }
    }
    s.x = (double *) blocks[0];
    s.y = (double *) blocks[1];
    s.z = (double *) blocks[2];

    // Give f() defined inputs so it does not multiply indeterminate values.
    for (size_t i = 0; i < size; ++i) {
        s.x[i] = (double) i;
        s.y[i] = 2.0;
    }

    f(s, size);

    free(s.x);
    free(s.y);
    free(s.z);
    return 0;
}
Note that the first argument of function f is passed in by reference.
Let's look at the resulting assembly of f() (I removed some irrelevant
pieces, inserted comments and put some labels):
$ g++ -O3 -S asmtest.cpp
.globl _Z1fR1Sm
_Z1fR1Sm:
xorl %eax, %eax
testq %rsi, %rsi
je .L1
.L5:
movq (%rdi), %r8 # array x (1)
movq 8(%rdi), %rcx # array y (2)
movq 16(%rdi), %rdx # array z (3)
movapd (%r8,%rax,8), %xmm0 # load x[0]
mulpd (%rcx,%rax,8), %xmm0 # multiply x[0]*y[0]
movaps %xmm0, (%rdx,%rax,8) # store to z
addq $2, %rax # and loop
cmpq %rax, %rsi
ja .L5
Notice that addresses of arrays x, y, z are loaded into general-purpose
registers on each iteration, see statements (1),(2),(3). Why doesn't gcc move
these instructions outside the loop?
Now make a local copy (not a deep copy) of the structure:
// Same product loop as before, but working from a by-value copy of the three pointers
// that is taken once before the loop starts.
void __attribute__((noinline)) f(S& args, size_t n) {
S s = args; // copies the three pointers only, not the arrays they point to
// NOTE(review): i is int while n is size_t, so the loop condition mixes signedness.
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys) );
}
return;
}
Assembly:
_Z1fR1Sm:
.LFB525:
.cfi_startproc
xorl %eax, %eax
testq %rsi, %rsi
movq (%rdi), %r8 # (1)
movq 8(%rdi), %rcx # (2)
movq 16(%rdi), %rdx # (3)
je .L1
.L5:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja .L5
.L1:
rep ret
Notice that unlike in the previous code,
loads (1), (2), (3) are now outside the loop.
I would appreciate an explanation why these two assembly
codes are different. Is memory aliasing relevant here?
Thanks.
$ gcc --version
gcc (Debian 5.2.1-21) 5.2.1 20151003
Yes, gcc is reloading s.x and s.y with each iteration of the loop because gcc does not know if &s.z[i] for some i aliases part of the S object passed by reference to f(S&, size_t).
With gcc 5.2.0, applying __restrict__ to S::z and the s reference parameter to f(), i.e.:
// Variant of S in which only the output pointer z carries __restrict__.
struct S {
double *x; // first input array
double *y; // second input array
double *__restrict__ z; // output array, restrict-qualified
};
// Product loop with the reference parameter itself marked __restrict__;
// the loads and stores are otherwise the same as in the unqualified version.
void f(S&__restrict__ s, size_t n) {
// NOTE(review): i is int while n is size_t, so the loop condition mixes signedness.
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
}
return;
}
.. causes gcc to generate:
__Z1fR1Sm:
LFB518:
testq %rsi, %rsi
je L1
movq (%rdi), %r8
xorl %eax, %eax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
.align 4,0x90
L4:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja L4
L1:
ret
With Apple Clang 700.1.76, only __restrict__ on the s reference is needed:
__Z1fR1Sm: ## #_Z1fR1Sm
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
testq %rsi, %rsi
je LBB0_3
## BB#1: ## %.lr.ph
movq (%rdi), %rax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
xorl %edi, %edi
.align 4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
movapd (%rax,%rdi,8), %xmm0
mulpd (%rcx,%rdi,8), %xmm0
movapd %xmm0, (%rdx,%rdi,8)
addq $2, %rdi
cmpq %rsi, %rdi
jb LBB0_2
LBB0_3: ## %._crit_edge
popq %rbp
retq
.cfi_endproc

C++ Reverse a null terminated string using the position of the terminator as swap space

I'm working on the classic "Reverse a String" problem.
Is it a good idea to use the position of the null terminator as swap space? The idea is to save the declaration of one variable.
Specifically, starting with Kernighan and Ritchie's algorithm:
/* Reverses the NUL-terminated string s in place by swapping characters
 * from both ends towards the middle. */
void reverse(char s[])
{
    int length = strlen(s);
    int lo = 0;
    int hi = length - 1;

    while (lo < hi) {
        char held = s[lo];
        s[lo] = s[hi];
        s[hi] = held;
        ++lo;
        --hi;
    }
}
...can we instead do the following?
/* Reverses the NUL-terminated string s in place, using the terminator's own
 * slot as the temporary for each swap and restoring the terminator at the end. */
void reverseUsingNullPosition(char s[]) {
    int length = strlen(s);
    char *scratch = &s[length];   /* the terminator's position doubles as swap space */
    int lo = 0;
    int hi = length - 1;

    while (lo < hi) {
        *scratch = s[lo];
        s[lo] = s[hi];
        s[hi] = *scratch;
        ++lo;
        --hi;
    }
    *scratch = 0;                 /* put the terminator back */
}
Notice how the "c" variable is no longer needed. We simply use the last position in the array--where the null termination resides--as our swap space. When we're done, we simply replace the 0.
Here's the main routine (Xcode):
#include <stdio.h>
#include <string>
// Demo driver: prints the word, reverses it with each routine in turn,
// and prints it again after each reversal.
int main(int argc, const char * argv[]) {
    char cheese[] = "cheddar";

    printf("Cheese is: %s\n", cheese);   // prints "cheddar"

    reverse(cheese);
    printf("Cheese is: %s\n", cheese);   // prints "raddehc"

    reverseUsingNullPosition(cheese);
    printf("Cheese is: %s\n", cheese);   // back to "cheddar"

    return 0;
}
Yes, this can be done. No, this is not a good idea, because it makes your program much harder to optimize.
When you declare char c in the local scope, the optimizer can figure out that the value is not used beyond the s[j] = c; assignment, and could place the temporary in a register. In addition to effectively eliminating the variable for you, the optimizer could even figure out that you are performing a swap, and emit a hardware-specific instruction. All this would save you a memory access per character.
When you use s[length] for your temporary, the optimizer does not have as much freedom. It is forced to emit the write into memory. This could be just as fast due to caching, but on embedded platforms this could have a significant effect.
First of all such microoptimizations are totally irrelevant until proven relevant. We're talking about C++, you have std::string, std::reverse, you shouldn't even think about such facts.
In any case, if you compile both versions with -Os on Xcode you obtain for reverse:
.cfi_startproc
Lfunc_begin0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
Ltmp6:
.cfi_offset %rbx, -32
Ltmp7:
.cfi_offset %r14, -24
movq %rdi, %r14
Ltmp8:
callq _strlen
Ltmp9:
leal -1(%rax), %ecx
testl %ecx, %ecx
jle LBB0_3
Ltmp10:
movslq %ecx, %rcx
addl $-2, %eax
Ltmp11:
xorl %edx, %edx
LBB0_2:
Ltmp12:
movb (%r14,%rdx), %sil
movb (%r14,%rcx), %bl
movb %bl, (%r14,%rdx)
movb %sil, (%r14,%rcx)
Ltmp13:
incq %rdx
decq %rcx
cmpl %eax, %edx
leal -1(%rax), %eax
jl LBB0_2
Ltmp14:
LBB0_3:
popq %rbx
popq %r14
popq %rbp
ret
Ltmp15:
Lfunc_end0:
.cfi_endproc
and for reverseUsingNullPosition:
.cfi_startproc
Lfunc_begin1:
pushq %rbp
Ltmp19:
.cfi_def_cfa_offset 16
Ltmp20:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp21:
.cfi_def_cfa_register %rbp
pushq %rbx
pushq %rax
Ltmp22:
.cfi_offset %rbx, -24
movq %rdi, %rbx
Ltmp23:
callq _strlen
Ltmp24:
leal -1(%rax), %edx
testl %edx, %edx
Ltmp25:
movslq %eax, %rdi
jle LBB1_3
Ltmp26:
movslq %edx, %rdx
addl $-2, %eax
Ltmp27:
xorl %esi, %esi
LBB1_2:
Ltmp28:
movb (%rbx,%rsi), %cl
movb %cl, (%rbx,%rdi)
movb (%rbx,%rdx), %cl
movb %cl, (%rbx,%rsi)
movb (%rbx,%rdi), %cl
movb %cl, (%rbx,%rdx)
Ltmp29:
incq %rsi
decq %rdx
cmpl %eax, %esi
leal -1(%rax), %eax
jl LBB1_2
Ltmp30:
LBB1_3: ## %._crit_edge
movb $0, (%rbx,%rdi)
addq $8, %rsp
popq %rbx
Ltmp31:
popq %rbp
ret
Ltmp32:
Lfunc_end1:
.cfi_endproc
If you check the inner loop you have
movb (%r14,%rdx), %sil
movb (%r14,%rcx), %bl
movb %bl, (%r14,%rdx)
movb %sil, (%r14,%rcx)
vs
movb (%rbx,%rsi), %cl
movb %cl, (%rbx,%rdi)
movb (%rbx,%rdx), %cl
movb %cl, (%rbx,%rsi)
movb (%rbx,%rdi), %cl
movb %cl, (%rbx,%rdx)
So I wouldn't say you are saving so much overhead as you think (since you are accessing the array more times), maybe yes, maybe no. Which teaches you another thing: thinking that some code is more performant than other code is irrelevant, the only thing that matters is a well-done benchmark and profile of the code.
Legal: Yes
Good idea: No
The cost of an "extra" variable is zero so there is absolutely no reason to avoid it. The stack pointer needs to be changed anyway so it doesn't matter if it needs to cope with an extra int.
Further:
With compiler optimization turned on, the variable c in the original code will most likely not even exist. It will just be a register in the CPU.
With your code: Optimization will be more difficult so it is not easy to say how well the compiler will do. Maybe you'll get the same - maybe you'll get something worse. But you won't get anything better.
So just forget the idea.
We can use printf and the STL and also manually unroll things and use pointers.
#include <stdio.h>
#include <string>
#include <cstring>
// Reverses the NUL-terminated string s in place.
// Swaps four characters from each end per iteration while two disjoint groups
// of four remain, then finishes the middle one pair at a time.
void reverse(char s[])
{
    char * b = s;                  // first character not yet swapped
    char * e = s + ::strlen(s);    // one past the last character not yet swapped

    // Keeping e one-past-the-end means no pointer is ever formed before s,
    // even for strings shorter than four characters.
    while (e - b >= 8)
    {
        std::swap(b[0], e[-1]);
        std::swap(b[1], e[-2]);
        std::swap(b[2], e[-3]);
        std::swap(b[3], e[-4]);
        b += 4;
        e -= 4;
    }
    while (e - b > 1)
    {
        --e;
        std::swap(*b, *e);
        ++b;
    }
}
// Demo driver: prints the word, reverses it once, and prints it again.
int main(int argc, const char * argv[]) {
    char cheese[] = "cheddar";

    printf("Cheese is: %s\n", cheese);   // prints "cheddar"

    reverse(cheese);
    printf("Cheese is: %s\n", cheese);   // prints "raddehc"

    return 0;
}
Hard to tell if it's faster with just the test case of "cheddar".

g++ dumped assembly output doesn't work

I have the following C++ code in the main.cpp file.
/* Returns the sum of a and b. */
int add(int a,int b)
{
    return a + b;
}
// Calls add() once; the result is stored in d but never printed or returned.
int main()
{
// NOTE(review): a and b are never read; the call below repeats the literals instead.
int a = 2;
int b = 4;
int d = add(2,4);
}
when I ran g++ -S main.cpp I got the following assembly code.(after removing all the debug symbols). Also I have changed the code to print the sum of the 2 numbers using sys_write system call.
.text
# add(int, int): adds the values arriving in %edi and %esi and returns the sum in %eax.
.globl _Z3addii
_Z3addii:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp) # spill the first incoming value
movl %esi, -24(%rbp) # spill the second incoming value
movl -24(%rbp), %eax
movl -20(%rbp), %edx
addl %edx, %eax # %eax = sum of the two values
movl %eax, -4(%rbp) # store the sum in a stack slot ...
movl -4(%rbp), %eax # ... and reload it as the return value
popq %rbp
ret
# main: calls add(2, 4), then issues the write and exit system calls directly.
.globl main
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $2, -12(%rbp)
movl $4, -8(%rbp)
movl $4, %esi
movl $2, %edi
call _Z3addii
movl %eax, -4(%rbp) # keep the returned sum in the stack slot at -4(%rbp)
movl $4, %edx #message length
# NOTE(review): the next instruction loads the 4-byte value stored at -4(%rbp)
# into %esi; it does not load the address of that slot.
movl -4(%rbp), %esi #message to write
movl $1, %edi #file descriptor (stdout)
movl $1, %eax #system call number (sys_write)
syscall #call kernel
movl $60, %eax # Invoke the Linux 'exit' syscall
movl $0, %edi # With a return value of 0
syscall # call kernel
ret
My problem is when I run the above assembly it gives nothing as output. I can't understand what I am missing here? Can someone please tell me what I am missing? Thanks.
commands used:
g++ -o main main.s and ./main -->no output
OS: Ubuntu 12.04 64bit and g++ version: 4.8.2
There are two things you're doing wrong:
Firstly, you're using the 64-bit syscall instruction, but initializing only the %e part of the registers. Secondly, this:
movl -4(%rbp), %esi
loads the value that is at -4(%rbp) (the 6 you just calculated) into %esi, when sys_write expects the memory address of that value there (by which I mean in %rsi). It works with this:
movq $1, %rax #system call number (sys_write)
movq $1, %rdi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movq $4, %rdx #message length
syscall #call kernel
Of course, you're not going to get formatted output this way. To see that the 6 is printed, you will have to pipe the output through hexdump or something similar.
Addendum: That you only initialize the %e part of the registers is actually only really critical here in the case of %rsi. %rbp holds, at the time of reading, a value with set high bits, and these are lost if only -4(%ebp) is written to %esi. Technically this also works:
movl $1, %eax #system call number (sys_write)
movl $1, %edi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movl $4, %edx #message length
syscall #call kernel
...but I feel that it is rather poor style.