Replacing even array elements with zeros in assembler - C++

There is the following problem:
I have an array; I need to take its size and then, for each element of the array that has an even value, set that element to zero, and return the modified array.
Here is my C++ code:
#include <iostream>
#include <cstdlib> // for system()
int main()
{
    const int size = 10;
    int arr[size] = { 1,2,3,4,5,6,7,8,9,10 };
    for (int i = 0; i < size; i++)
    {
        if (arr[i] % 2 == 0)
            arr[i] = 0;
    }
    for (int i = 0; i < size; i++)
    {
        std::cout << arr[i] << ' ';
    }
    system("pause");
    return 0;
}
Here is my NASM code:
%include "io64.inc"
section .text
global CMAIN
CMAIN:
mov DWORD size$[rbp], 10 ; size of array
; elements of array
mov DWORD arr$[rbp], 1
mov DWORD arr$[rbp+4], 2
mov DWORD arr$[rbp+8], 3
mov DWORD arr$[rbp+12], 4
mov DWORD arr$[rbp+16], 5
mov DWORD arr$[rbp+20], 6
mov DWORD arr$[rbp+24], 7
mov DWORD arr$[rbp+28], 8
mov DWORD arr$[rbp+32], 9
mov DWORD arr$[rbp+36], 10
mov DWORD i$4[rbp], 0
jmp SHORT $LN4#main
$LN2#main:
mov eax, DWORD i$4[rbp]
inc eax
mov DWORD i$4[rbp], eax
$LN4#main:
cmp DWORD i$4[rbp], 10
jge SHORT $LN3#main
movsxd rax, DWORD i$4[rbp]
mov eax, DWORD arr$[rbp+rax*4]
cdq
and eax, 1
xor eax, edx
sub eax, edx
test eax, eax
jne SHORT $LN8#main
movsxd rax, DWORD i$4[rbp]
mov DWORD arr$[rbp+rax*4], 0
$LN8#main:
jmp SHORT $LN2#main
$LN3#main:
mov DWORD i$5[rbp], 0
jmp SHORT $LN7#main
$LN5#main:
mov eax, DWORD i$5[rbp]
inc eax
mov DWORD i$5[rbp], eax
$LN7#main:
cmp DWORD i$5[rbp], 10
jge SHORT $LN6#main
$LN6#main:
mov edi, eax
lea rcx, QWORD [rbp-32]
mov eax, edi
lea rsp, QWORD [rbp+360]
pop rdi
pop rbp
ret
My question is: will this code work and how can it be optimized?
I just get errors like this:
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:1: fatal: unable to open include file `io64.inc'
gcc.exe: error: C:\Users\79268\AppData\Local\Temp\SASM\program.o: No such file or directory
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:6: error: comma, colon, decorator or end of line expected after operand
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:9: error: comma, colon, decorator or end of line expected after operand
C:\Users\79268\AppData\Local\Temp\SASM\program.asm:10: error: comma, colon, decorator or end of line expected after operand
So NASM and the SASM IDE are installed correctly...
I tried to rewrite the code and compile it in SASM:
section .data
arr dd 1, 2, 3, 4, 5, 6, 7, 8, 9,10
section .text
global CMAIN
CMAIN:
call calc
push arr
calc:
lea rsi, [arr]
mov rcx, [10]
mov rdi, rsi
xor rbx, rbx
##for:
lodsq
test al, 1
cmovz rax, rbx
stosq
loop ##for
ret
And I get the following error:
c:/program files (x86)/sasm/mingw64/bin/../lib/gcc/x86_64-w64-mingw32/4.8.1/../../../../x86_64-w64-mingw32/lib/../lib/libmingw32.a(lib64_libmingw32_a-crt0_c.o):
crt0_c.c:(.text.startup+0x25): undefined reference to `WinMain'
Now I get an error for the updated code as well:
BITS 64
section .text
global _start
_start:
push arr
lea rsi, [arr]
mov rcx, [10]
mov rdi, rsi
xor rbx, rbx
##for:
lodsq
test al, 1
cmovz rax, rbx
stosq
loop ##for
ret
section .data
arr dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$nasm -f elf *.asm; ld -m elf_i386 -s -o demo *.o
$demo
/usr/bin/timeout: the monitored command dumped core
sh: line 1: 21184 Segmentation fault /usr/bin/timeout 10s demo
Any idea how to fix and compile this program? SASM is not working on my machine, and I tried to use this online NASM compiler: ASM Online compiler
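For reference, here is a minimal standalone sketch of the same loop that assembles with plain NASM on 64-bit Linux (assumptions: nasm -f elf64 plus ld, a _start entry point instead of SASM's io64.inc/CMAIN, and no libc, so it only modifies the array in memory and then exits via the exit system call):
BITS 64
section .data
arr dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
arr_len equ ($ - arr) / 4       ; number of DWORD elements (10)
section .text
global _start
_start:
    lea rsi, [rel arr]          ; pointer to the current element
    mov ecx, arr_len            ; element count as an immediate (mov rcx, [10] would load from address 10)
.next:
    mov eax, [rsi]              ; load one 32-bit element
    test eax, 1                 ; low bit clear means the value is even
    jnz .odd
    mov dword [rsi], 0          ; even element: store zero
.odd:
    add rsi, 4                  ; the array is declared with dd, so step 4 bytes (lodsq/stosq step 8)
    dec ecx
    jnz .next
    mov eax, 60                 ; Linux exit(0) system call
    xor edi, edi
    syscall
The result can be checked in a debugger; printing it would need extra code (or SASM's macros once the io64.inc include path is sorted out).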

Related

Why is the C++ function parameter stored 20 bytes off of rbp in x86-64 when the function body only has one 4-byte variable?

Consider the following program, compiled using x86-64 GCC 12.2 with flags --std=c++17 -O0:
int square(int num, int num2) {
    int foo = 37;
    return num * num;
}
int main() {
    return square(10, 5);
}
The resulting assembly using godbolt is:
square(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov DWORD PTR [rbp-4], 37
mov eax, DWORD PTR [rbp-20]
imul eax, eax
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 5
mov edi, 10
call square(int, int)
nop
pop rbp
ret
I read about shadow space and it appears that in x64 there must be at minimum 32 bytes allocated: "32 bytes above the return address which the called function owns" ...
With that said, how is the offset -20 determined for the parameter num? If there's 32 bytes from rbp, wouldn't that be -24?
I noticed even if you add more local variables, it'll remain -20 until it gets pushed over to -36, but I cannot understand why. Thanks!

How do I convert a C++ function to assembly (x86_64)?

This is my .cpp file:
#include <iostream>
#include <cstdio> // for printf
using namespace std;

extern "C" void KeysAsm(int arr[], int n, int thetha, int rho);

// Keep this and call it from assembler
extern "C"
void crim(int *xp, int *yp) {
    int temp = *xp;
    *xp = *yp;
    *yp = temp + 2;
}

// Translate this into Intel assembler
void KeysCpp(int arr[], int n, int thetha, int rho) {
    for (int i = 0; i < n - 1; i++) {
        for (int j = 0; j < n - i - 1; j++) {
            if (arr[j] > arr[j + 1]) {
                crim(&arr[j], &arr[j + 1]);
            }
        }
        arr[i] = arr[i] + thetha / rho * 2 - 4;
    }
}

// Function to print an array
void printArray(int arr[], int size) {
    for (int i = 0; i < size; i++)
        cout << arr[i] << "\n";
    cout << endl;
}

int main() {
    int gamma1[]{ 9, 270, 88, -12, 456, 80, 45, 123, 427, 999 };
    int gamma2[]{ 900, 312, 542, 234, 234, 1, 566, 123, 427, 111 };
    printf("Array:\n");
    printArray(gamma1, 10);
    KeysAsm(gamma1, 10, 5, 6);
    printf("Array Result Asm:\n");
    printArray(gamma1, 10);
    KeysCpp(gamma2, 10, 5, 6);
    printf("Array Result Cpp:\n");
    printArray(gamma2, 10);
}
What I want to do is convert the KeysCpp function into assembly language and call it from this very .cpp file. I want to keep the crim function as it is in C++ and convert only KeysCpp.
Here is my .asm file:
PUBLIC KeysAsm
includelib kernel32.lib
_DATA SEGMENT
EXTERN crim:PROC
_DATA ENDS
_TEXT SEGMENT
KeysAsm PROC
push rbp
mov rbp, rsp
sub rsp, 40
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-32], edx
mov DWORD PTR [rbp-36], ecx
mov DWORD PTR [rbp-4], 0
jmp L3
L3:
mov eax, DWORD PTR [rbp-28]
sub eax, 1
cmp DWORD PTR [rbp-4], eax
jl L7
L4:
mov eax, DWORD PTR [rbp-28]
sub eax, DWORD PTR [rbp-4]
sub eax, 1
cmp DWORD PTR [rbp-8], eax
jl L6
L5:
add DWORD PTR [rbp-8], 1
L6:
mov eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov edx, DWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
cmp edx, eax
jle L5
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov rsi, rdx
mov rdi, rax
call crim
L7:
mov DWORD PTR [rbp-8], 0
jmp L4
KeysAsm ENDP
_TEXT ENDS
END
I am using Visual Studio 2017 to run this project.
I am getting the following error when I run this code:
Unhandled exception at 0x00007FF74B0E429C in MatrixMultiplication.exe: Stack cookie instrumentation code detected a stack-based buffer overrun. occurred
Your asm looks like it's expecting the x86-64 System V calling convention, with args in RDI, ESI, EDX, ECX. But you said you're compiling with Visual Studio, so the compiler-generated code will use the Windows x64 calling convention: RCX, EDX, R8D, R9D.
And when you call crim, it can use shadow space (32 bytes above its return address, which you didn't reserve space for).
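For illustration only (this is a sketch of one call site under the Windows x64 convention, not a drop-in fix; the register assumed to hold the address of arr[j] is hypothetical), a call to crim would look roughly like this. In real code the 32 bytes are normally reserved once in the prologue rather than around every call, and RSP must be 16-byte aligned at the call:
; Windows x64: first two integer args in RCX and RDX, caller reserves shadow space.
; Assume RBX holds the address of arr[j] at this point (illustrative choice).
    sub rsp, 32         ; 32 bytes of shadow space that crim may use
    mov rcx, rbx        ; 1st arg: &arr[j]
    lea rdx, [rbx+4]    ; 2nd arg: &arr[j+1]
    call crim
    add rsp, 32         ; release the shadow space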
It looks like you got this asm from un-optimized compiler output, probably from https://godbolt.org/z/ea4MPh81r using GCC for Linux, without using -mabi=ms to override the default -mabi=sysv when compiling for non-Windows targets. And then you modified it to make the loop infinite, with a jmp at the bottom instead of a ret? Maybe a different GCC version than 12.2 since the label numbers and code don't match exactly.
(The signs of being un-optimized compiler output are all the reloads from [rbp-whatever], and redoing sign-extension with cdqe before using an int to index an array; a human would know the int must be non-negative. The signs of it being GCC specifically are the numbered labels like .L1: (where you just removed the .), the heavy use of RAX for as much as possible in a debug build, choices like lea rdx, [0+rax*4] to copy-and-shift, and the exact syntax it used to print that instruction in Intel syntax.)
To compile a single function for Windows x64, isolate it and give the compiler only prototypes for anything it calls:
extern "C" void crim(int *xp, int *yp); // prototype only
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
Then on Godbolt, use gcc -O3 -mabi=ms, or use MSVC which always targets Windows. https://godbolt.org/z/Mj5Gb54b5 shows both GCC and MSVC with optimization enabled.
KeysCpp(int*, int, int, int): ; demangled name
cmp edx, 1
jle .L11 ; "shrink wrap" optimization: early-out on n<=1 before saving regs
push r15 ; save some call-preserved regs
push r14
lea r14, [rcx+4] ; arr + 1
push r13
mov r13, rcx
Unfortunately GCC fails to hoist the thetha / rho * 2 - 4 loop-invariant, instead redoing idiv every time through the loop. Seems like an obvious optimization since those are local vars whose address hasn't been taken at all, and it keeps thetha (typo for theta?) and rho in registers. So MSVC is much more efficient here. Clang also misses this optimization.
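For illustration (a sketch with hypothetical register choices, using the Windows x64 argument registers named above), hoisting that invariant by hand just means doing the division once before both loops and keeping the result in a register:
; Compute thetha / rho * 2 - 4 once, outside the loops.
; Windows x64 args: arr = RCX, n = EDX, thetha = R8D, rho = R9D.
    mov r10d, edx       ; save n first: cdq/idiv clobber EDX
    mov eax, r8d        ; EAX = thetha
    cdq                 ; sign-extend EAX into EDX:EAX for the signed divide
    idiv r9d            ; EAX = thetha / rho
    lea r11d, [rax*2-4] ; R11D = thetha/rho*2 - 4, reused for every arr[i] update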

A strange bug of g++ 9.3.0 -O2 on Linux

I've run into a strange bug with g++ 9.3.0 -O2 on Linux.
The code below is converted from my code of the SJT algorithm.
If I keep the last init() call in generate, the time cost is 1200+ ms.
If I delete it, the time cost is 600+ ms.
This bug appears on Ubuntu 20.04 with g++ 9.3.0. I've tested it on Windows 10 and macOS with g++ 9.3.0, and the bug doesn't appear. I've also tested it on Linux with g++ 8 and g++ 10, and the bug doesn't appear either.
Here is the code. The original question is 69468547.
I want to know what causes this strange "time cost doubles" behavior.
2021-10-08: I reproduced this bug in another way. Here is the whole code. I execute strange_func (the SJT algorithm) twice in generate; the first call costs 653 ms and the second 1322 ms. You can reproduce the bug with GCC 9.3.0 on Linux. I've also tried GCC 10; there is no bug.
#include <cstdio>
#include <cstring>
#include <chrono>
using namespace std::chrono;

#define MAXN 100

struct Permutation {
    int N;
    char s[2*MAXN];
    int r[MAXN];

    inline void init() {
        memset(s, 0, sizeof(s));
        memset(r, 0, sizeof(r));
    }

    void generate(int n) {
        N = n;
        init();
        auto start = steady_clock::now();
        strange_func();
        auto end = steady_clock::now();
        auto duration = duration_cast<milliseconds>(end - start);
        printf("time cost(ms): %ld\n", duration.count());
        init();
    }

    void strange_func() {
        int k = N, t = -1;
        while (true) {
            r[N] += 1;
            if (r[N] < N) {
                char c = s[k]; s[k] = s[k+t]; s[k+t] = c;
                k += t;
            } else {
                int i = N;
                while (r[i] == i)
                    r[i] = 0, r[--i] += 1;
                if (i == 0) break;
                t = 0;
            }
        }
    }
} perm;

int main() {
    int n;
    scanf("%d", &n);
    perm.generate(n);
    return 0;
}
The fact that init() is called after the strange_func() call changes the assembly generated for the variable swap (between s[k] and s[k+t]) in the loop in strange_func()! This apparently minor assembly change has a huge impact on performance, because the loop is very sensitive to micro-optimizations and the code generated with the init() call is clearly less efficient. Such a change is likely due to fragile compiler heuristics (with clearly chaotic behaviour in this specific case) and the fact that the strange_func() call is inlined.
To understand what is going on, let us analyse the assembly generated by the two variants.
Here is the assembly code of the hot loop without (left) and with (right) init():
.L2: | .L2:
add ecx, 1 | add esi, 1
mov DWORD PTR 12[rbx+rdx*4], ecx | mov DWORD PTR 12[r12+rdx*4], esi
cmp r8d, ecx | cmp ecx, esi
jle .L3 | jle .L3
|
.L13: | .L13:
movsx r9, eax | movsx r9, eax
add eax, esi | add eax, edi
add ecx, 1 | add esi, 1
movsx rdi, eax | movzx r11d, BYTE PTR 4[r12+r9]
movzx r11d, BYTE PTR 4[rbx+r9] | movsx r8, eax
mov DWORD PTR 12[rbx+rdx*4], ecx | mov DWORD PTR 12[r12+rdx*4], esi
movzx r14d, BYTE PTR 4[rbx+rdi] | mov BYTE PTR 15[rsp], r11b
| movzx r11d, BYTE PTR 4[r12+r8]
mov BYTE PTR 4[rbx+r9], r14b | mov BYTE PTR 4[r12+r9], r11b
| movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[rbx+rdi], r11b | mov BYTE PTR 4[r12+r8], r9b
cmp r8d, ecx | cmp ecx, esi
jg .L13 | jg .L13
|
.L3: | .L3:
jne .L9 | jne .L9
mov rsi, r10 | mov rdi, r10
mov ecx, r8d | mov esi, ecx
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L6: | .L6:
mov edi, DWORD PTR 200[rsi] | mov r11d, DWORD PTR 200[rdi]
sub ecx, 1 | sub esi, 1
sub rsi, 4 | sub rdi, 4
mov DWORD PTR 208[rsi], 0 | mov DWORD PTR 208[rdi], 0
add edi, 1 | lea r8d, 1[r11]
mov DWORD PTR 204[rsi], edi | mov DWORD PTR 204[rdi], r8d
cmp ecx, edi | cmp esi, r8d
je .L6 | je .L6
test ecx, ecx | test esi, esi
je .L14 | je .L14
|
.L7: | .L7:
mov ecx, DWORD PTR 12[rbx+rdx*4] | mov esi, DWORD PTR 12[r12+rdx*4]
xor esi, esi | xor edi, edi
jmp .L2 | jmp .L2
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L9: | .L9:
mov ecx, r8d | mov esi, ecx
test ecx, ecx | test esi, esi
jne .L7 | jne .L7
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
As we can see, the L13 block contains more instructions with the init() call. The rest of the blocks look similar.
Here is a detailed analysis of the blocks without init():
movsx r9, eax
add eax, esi
add ecx, 1
movsx rdi, eax
movzx r11d, BYTE PTR 4[rbx+r9] ; Perform r11b=s[k]
mov DWORD PTR 12[rbx+rdx*4], ecx ; Perform r[N]+=1 (r[N] was stored in ecx previously)
movzx r14d, BYTE PTR 4[rbx+rdi] ; Perform r14b=s[k+t]
mov BYTE PTR 4[rbx+r9], r14b ; Perform s[k]=r14b
mov BYTE PTR 4[rbx+rdi], r11b ; Perform s[k+t]=r11b
cmp r8d, ecx
jg .L13
Here is a detailed analysis of the blocks with init():
movsx r9, eax
add eax, edi
add esi, 1
movzx r11d, BYTE PTR 4[r12+r9]
movsx r8, eax
mov DWORD PTR 12[r12+rdx*4], esi ; Perform r[N]+=1 (r[N] was stored in esi previously)
mov BYTE PTR 15[rsp], r11b ; Perform c = s[k] (c is stored in memory)
movzx r11d, BYTE PTR 4[r12+r8]
mov BYTE PTR 4[r12+r9], r11b ; Perform s[k]=s[k+t]
movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[r12+r8], r9b ; Perform s[k+t]=c
cmp ecx, esi
jg .L13
We can see that in the first case, GCC is able to swap s[k] and s[k+t] entirely in registers, while in the second case it stores one of the values in a temporary location on the stack, which is clearly less efficient. The in-memory swap loses because of the added store/reload data dependency and the L1 cache load latency (generally about 3-4 cycles on modern x86 AMD/Intel processors).
Whether this is a bug or just a missed optimization in GCC 9.3.0 is still unclear; it is very hard to tell without delving into an old version of the GCC code base that is not actively maintained anymore (since March 12, 2020).
A quick workaround is to tell GCC not to inline the function using __attribute__((noinline)). Alternatively, it should be possible to tune GCC's heuristic parameters (via the command line) so that this does not happen. Another solution would be to optimize the loop to compute several permutations at once so that such micro-optimizations no longer matter so much.
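As a sketch of that first workaround (the type and function names below are placeholders, not the code from the question):
// GCC/Clang-specific attribute: keep the hot function out of its caller so that
// code around the call site (such as a trailing init() call) cannot perturb its code generation.
struct Example {
    __attribute__((noinline))
    void hot_loop() { /* corresponds to strange_func() in the question */ }
    void run() { hot_loop(); } // hot_loop() is now compiled as a standalone function
};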

GCC generated assembly

Why does calling printf change the function prologue?
C code 1:
#include <cstdio>
int main(){
    int a = 11;
    printf("%d", a);
}
GCC -m32 generated assembly:
.LC0:
.string "%d"
main:
lea ecx, [esp+4] // What's the purpose of these
and esp, -16 // three lines?
push DWORD PTR [ecx-4] //
push ebp
mov ebp, esp
push ecx
sub esp, 20 // why sub 20?
mov DWORD PTR [ebp-12], 11
sub esp, 8
push DWORD PTR [ebp-12]
push OFFSET FLAT:.LC0
call printf
add esp, 16
mov eax, 0
mov ecx, DWORD PTR [ebp-4]
leave
lea esp, [ecx-4]
ret
C code 2:
#include <cstdio>
int main(){
    int a = 11;
}
GCC -m32:
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-4], 11
mov eax, 0
leave
ret
What is the purpose of the first three lines added in the first code?
Please explain the first assembly listing, if you can.
EDIT:
64-bit mode:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 11
mov eax, DWORD PTR [rbp-4]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
The insight is that the compiler keeps the stack aligned at function calls.
The required alignment is 16 bytes.
lea ecx, [esp+4] ;Save original ESP to ECX (ESP+4 actually)
and esp, -16 ;Align stack on 16 bytes (Lower esp)
push DWORD PTR [ecx-4] ;Push main return address (Stack at 16B + 4)
;My guess is to aid debugging tools that expect the RA
;to be at [ebp+04h]
push ebp
mov ebp, esp ;Prolog (Stack at 16B+8)
push ecx ;Save ECX (Original stack pointer) (Stack at 16B+12)
sub esp, 20 ;Reserve 20 bytes (Stack at 16B+0, ALIGNED AGAIN)
;4 for alignment + 1x16 for a variable (variable space is
;allocated in multiple of 16)
mov DWORD PTR [ebp-12], 11 ;a = 11
sub esp, 8 ;Stack at 16B+8 for later alignment
push DWORD PTR [ebp-12] ;a
push OFFSET FLAT:.LC0 ;"%d" (Stack at 16B)
call printf
add esp, 16 ;Remove args+pad from the stack (Stack at 16B)
mov eax, 0 ;Return 0
mov ecx, DWORD PTR [ebp-4] ;Restore ECX without the need to add to esp
leave ;Restore EBP
lea esp, [ecx-4] ;Restore original ESP
ret
I don't know why the compiler saves esp+4 in ecx instead of esp (esp+4 is the address of the first parameter of main).

Unhandled exception at 0x93b3237d in project00.exe: 0xC0000005: Access violation

In my program, when I exit the ASM section and return to the C++ code, I get "Unhandled exception at 0x93b3237d in project00.exe: 0xC0000005: Access violation", in Crtexe.c at the line mainret = main(argc, argv, envp); and in the disassembly at the point where 0Ch is added to ESP. I think the problem might be that the return address of main gets corrupted before returning, and that is what makes it fail. The program finds the intersection and union of two sets. Using VS10 and I am out of ideas.
include "iostream.h"//modify line to show up in code block
using namespace std;
typedef int DWORD; //4 btye double word
typedef char BYTE; //1 byte
typedef short WORD; //2 byte double word
int main(){
int i =0;
BYTE str0[50] = "1qaz2wsx3edc4rfv5tgb6yhn7ujm8ik,9ol.0p;/-[?]F!Q";
BYTE str1[50] ="QAZ#WSX#EDC$RFV%TGB^YHN&UJM*IK)P:?_{?}|1`";
DWORD length0 ;
DWORD length1 ;
BYTE IntersectArray[50];
BYTE result [100] ;
__asm{
p:
pusha
lea eax, str0
call COUNT
mov length0,ecx
lea eax, str1
call COUNT
mov length1,ecx
call INTERSECTION
call JoinSet
xor eax,eax
popa
ret
COUNT:
mov ecx,0;
Q: mov dl, [eax]
cmp dl,0h
JE cEND
inc eax
inc ecx
jmp Q
cEnd: ret
INTERSECTION:
lea edx, str0
mov ebx, length0
lea esi, IntersectArray
first: mov al, [edx]
mov ecx, length1
lea edi, str1
repne SCASB
cmp ecx,0
JNZ INTER
Back: inc edx
cmp ebx,0
JZ EXITSTUFF
dec ebx
jmp first
INTER: mov [esi] , al
inc esi
jmp Back
EXITSTUFF:
mov [esi], 0
ret
JoinSet :
lea edi, result
lea esi, str0
mov ecx, length0
REP MOVSB
lea edx, str1
mov ebx, length1
lea esi, result + [ebx]
f: mov al, [edx]
mov ecx, length0
lea edi, str0
repne SCASB
cmp ecx,0
JNZ B
mov [esi] , al
inc esi
B: inc edx
cmp ebx,0
JZ EXITSTU
dec ebx
jmp f
EXITSTU:
mov [esi], 0
ret
}
for (int i = 0; i < 50; i++) {
cout << IntersectArray[i];
}
cout << endl;
for (int i = 0; i < 100; i++)
cout << result[i];
cout << endl;
system("pause");
return 0;
}
Probably a dumb comment since I've never done any x86 assembly. But I thought asm was inline? So what is your first 'ret' actually returning from?
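To make that concrete: the first ret runs on main's own stack frame, so whatever happens to be on the stack at that point is used as a return address. A minimal sketch of the usual pattern (a hypothetical helper, 32-bit MSVC inline asm like the question's) is to jump past the local subroutines instead of falling off the end with ret:
#include <iostream>

int main() {
    char text[] = "hello";
    int len = 0;
    __asm {
        pusha
        lea eax, text
        call COUNT            // call the local helper defined below
        mov len, ecx
        popa
        jmp asm_done          // skip over the helper; a ret here would return out of main itself
    COUNT:                    // same string-length idea as the question's COUNT routine
        mov ecx, 0
    countLoop:
        cmp byte ptr [eax], 0
        je countEnd
        inc eax
        inc ecx
        jmp countLoop
    countEnd:
        ret                   // this ret pairs with the call COUNT above
    asm_done:
        nop                   // fall through into the C++ code below
    }
    std::cout << len << std::endl; // prints 5
    return 0;
}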