I am writing a simple program that uses inline assembly to check whether a given word is a palindrome. The problem is that it doesn't return correct answers. During debugging I found out that something is wrong with the esi register: the value in al is correct ('a'), but the value in bl is not (it is 0). I'm not sure what I'm doing wrong.
#include "stdafx.h"
#include <iostream>
#include <string>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
char s[] = "arabara";
int sizeofstring = 8; // size of s[]
int x = 0;
int y = 1; //flag when is_palindrome
lea edi, s
mov esi, edi
add esi, sizeofstring
dec esi //point to the last char
mov ecx, sizeofstring
cmp ecx, 1
je is_palindrome //single char is always a palindrome
shr ecx, 1 //divide by 2
mov al, [edi]
mov bl, [esi]
cmp al, bl
jne stop
inc edi
dec esi
loop nextchar
mov eax, y
mov x, eax //change flag to 1
cout << x << endl; //shoud print 1 when palindrome
return 0;
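You set sizeofstring to 8, but your string "arabara" is seven characters long. The eighth byte of s[] is the terminating NUL, so after "add esi, sizeofstring" and "dec esi" the esi register points at that '\0' instead of the last 'a' — which is why bl is 0.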
I am experimenting mixing assembly(x86) and C++.
I wrote a procedure in assembly and then called it from C++.
However, when writing the returned value to a local variable I get a write permission violation error.
#include <iostream>
// will return 1 if all ok and 0 if b is 0
extern "C" int integerMulDiv(int a, int b, int* prod, int* quo, int* rem);
// Calls integerMulDiv once with a valid divisor and once with a zero divisor,
// printing the three out-parameters followed by the returned status each time.
int main() {
    // { a, b } argument pairs, in call order.
    const int cases[2][2] = { { 13, 4 }, { 31, 0 } };
    // Out-parameters are shared between the calls and are not initialised
    // here, so a call that writes nothing leaves the previous contents.
    int p, q, r;
    for (int k = 0; k < 2; ++k) {
        const int res = integerMulDiv(cases[k][0], cases[k][1], &p, &q, &r);
        std::cout << p << '\t' << q << '\t' << r << std::endl;
        std::cout << res << std::endl << std::endl;
    }
    return 0;
}
The assembly procedure returns a few values through pointers and an int through EAX (this is 32-bit code, so the return register is EAX, not RAX).
; Returns : 0 Error (division by 0)
; : 1 All ok
; *prod = a * b
; *quo = a / b
; *rem = a % b
;-----------------------------------------------------------------------
; int integerMulDiv(int a, int b, int* prod, int* quo, int* rem)
; ABI:   32-bit cdecl (arguments on the stack, caller cleans up)
; In:    [ebp+8]  = a        [ebp+12] = b
;        [ebp+16] = prod     [ebp+20] = quo     [ebp+24] = rem
; Out:   eax = 1 and *prod, *quo, *rem written; eax = 0 (nothing written)
;        when b == 0
; Clobb: ecx, edx, flags (ebx is saved and restored)
; NOTE:  a = INT_MIN with b = -1 still raises a divide error in idiv.
;-----------------------------------------------------------------------
integerMulDiv proc
        push    ebp
        mov     ebp, esp
        push    ebx                     ; ebx is callee-saved and used as the pointer scratch

        mov     ecx, [ebp + 8]          ; ecx = a
        mov     edx, [ebp + 12]         ; edx = b (the divisor)
        test    edx, edx
        jz      invalidDivizor          ; refuse to divide by zero

        imul    edx, ecx                ; edx = a * b
        mov     ebx, [ebp + 16]         ; ebx = prod
        mov     [ebx], edx              ; *prod = a * b

        mov     eax, ecx                ; dividend = a
        cdq                             ; sign-extend eax into edx:eax
        idiv    dword ptr [ebp + 12]    ; divide by b itself (reloaded: edx was overwritten)

        ; The stores must go through ebx. Writing to [ebp] would overwrite
        ; the caller's saved frame pointer, and the caller would fault on
        ; its first local-variable access after the call returns.
        mov     ebx, [ebp + 20]         ; ebx = quo
        mov     [ebx], eax              ; *quo = a / b
        mov     ebx, [ebp + 24]         ; ebx = rem
        mov     [ebx], edx              ; *rem = a % b

        mov     eax, 1                  ; success
        jmp     returnFromProc

invalidDivizor:
        xor     eax, eax                ; failure

returnFromProc:
        pop     ebx
        pop     ebp
        ret                             ; cdecl: caller removes the arguments
integerMulDiv endp
I get the error after the first call of integerMulDiv, when it tries to write the result in the res variable.
The disassembly looks like this:
int res = integerMulDiv(a, b, &p, &q, &r);
002D24BD lea eax,[r]
002D24C0 push eax
002D24C1 lea ecx,[q]
002D24C4 push ecx
002D24C5 lea edx,[p]
002D24C8 push edx
002D24C9 mov eax,dword ptr [b]
002D24CC push eax
002D24CD mov ecx,dword ptr [a]
002D24D0 push ecx
002D24D1 call _integerMulDiv (02D133Eh)
002D24D6 add esp,14h
002D24D9 mov dword ptr [res],eax <- The #PF happens here
Does anyone know what is happening and why?
The following section of code stands out to me.
idiv dword ptr[ebx + 12]
mov ebx, [ebp + 20] ; get address of quo
mov [ebp], eax ; write quo
mov ebx, [ebp + 24] ; get address of rem
mov [ebp], edx ; write rem
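I doubt you meant to divide by the contents of memory 12 bytes past the address of the product. The divisor b lives at [ebp + 12], so you probably meant idiv dword ptr [ebp + 12].
After that, you load the addresses of quo and rem into ebx but then write the values to [ebp] instead of [ebx]. Writing to [ebp] overwrites the caller's saved ebp, so after "pop ebp" the caller has a bogus frame pointer and faults the first time it touches a local variable (here, res).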
I have a problem with sum of elements of two vectors type double which are the same size. Code always returns 0.
#include <iostream>
using namespace std;
int main()
int n = 5;
double* tab = new double[n];
double* tab3 = new double[n];
for (size_t i = 0; i < n; i++)
tab[i] = 1;
tab3[i] = 1;
double sum;
__asm {
mov eax, n; //vector size
mov edi, tab; //first vector
mov esi, tab3; //second vector
fadd[edi + 8 * eax - 8];
fadd[esi + 8 * eax - 8];
dec eax;
jnz l;
fstp sum;
cout << sum;
Sadly I am not on Windows, so I had to modify the code to use g++ instead of MSVC, but I used Intel-syntax assembly too. During debugging it turned out that the fadd instructions had no effect. I fixed it by adding qword ptr before [edi + 8 * eax - 8] and [esi + 8 * eax - 8] to tell the assembler that the operands are 8-byte values (since you are using double instead of float):
fadd qword ptr [edi + 8 * eax - 8];
fadd qword ptr [esi + 8 * eax - 8];
So you are looking for asm for this bit of C code, right? https://godbolt.org/z/vbdfEb94s
#include <cstddef>
// Returns the sum of the first len elements of a and of b.
// Elements are accumulated in the order a[0], b[0], a[1], b[1], ... so the
// floating-point rounding matches an interleaved pointer walk over both arrays.
double add(double *a, double *b, std::size_t len) {
    double total = 0;
    for (std::size_t i = 0; i < len; ++i) {
        total += a[i];
        total += b[i];
    }
    return total;
}
I (meaning gcc) come up with this code for 64bit:
# gcc x86-64 output (Intel syntax) for add(). The operands match the System V
# AMD64 argument registers: rdi = a, rsi = b, rdx = len; the sum is built in xmm0.
# NOTE(review): the .L3 and .L1 label lines and the closing ret are not present
# in this excerpt, so the jump targets below cannot be located here.
add(double*, double*, unsigned long):
xor eax, eax # rax = 0: element index
xorps xmm0, xmm0 # xmm0 = 0.0: running sum
cmp rdx, rax # len == index?
je .L1
addsd xmm0, QWORD PTR [rdi+rax*8] # sum += a[index]
addsd xmm0, QWORD PTR [rsi+rax*8] # sum += b[index]
inc rax
jmp .L3
and this for 32bit i386:
# gcc i386 output (Intel syntax) for add(); arguments are read from the stack
# relative to ebp and the sum is accumulated with x87 fadd.
# NOTE(review): the .L3 and .L1 label lines, any instruction that loads the
# initial 0.0 onto the x87 stack, and the closing ret are not visible in this excerpt.
add(double*, double*, unsigned int):
push ebp
xor eax, eax # eax = 0: element index
mov ebp, esp
mov ecx, DWORD PTR [ebp+8] # ecx = first argument (a)
mov edx, DWORD PTR [ebp+12] # edx = second argument (b)
cmp DWORD PTR [ebp+16], eax # third argument (len) == index?
je .L1
fadd QWORD PTR [ecx+eax*8] # st(0) += a[index]
fadd QWORD PTR [edx+eax*8] # st(0) += b[index]
inc eax
jmp .L3
pop ebp
Godbolt Link: https://godbolt.org/g/Hv6MAL
// Element type of the table that main() walks.
typedef int cell;
// Global that receives every value loaded in the innermost loop.
cell y;
// Base of the table, placed at the fixed address 0x12340. Only the pointee is
// const-qualified; the pointer variable itself is an ordinary global.
const cell *phys_addr = (const cell*)0x12340;
// Walks a three-level structure: entry i of phys_addr is divided by
// sizeof(cell) and added to its own address to find a sub-array (presumably
// a self-relative byte offset), entry j of the sub-array is resolved the same
// way, and element k of the result is stored into y.
// Both pointer computations are written inside the innermost loop although
// neither expression mentions k.
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
// Uses only i.
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
// Uses only subarray and j.
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
y = subsubarray[k];
It feels natural to expect the compiler to optimize the above code to something similar to:
// Hand-hoisted form of the loop nest: each pointer is computed in the
// outermost loop whose index it uses. This snippet does not declare cell,
// y or phys_addr itself.
int main() {
for (int i = 0; i < 20; i++) {
// Computed once per i.
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
for (int j = 0; j < 30; j++) {
// Computed once per (i, j).
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
for (int k = 0; k < 50; k++) {
y = subsubarray[k];
but the assembly generated by gcc 8.2 with -O3 -m32 as flags is:
# gcc 8.2 -O3 -m32 output (Intel syntax) for the loop nest that stores into the global y.
# NOTE(review): the main: label, the .L2/.L3/.L4 branch-target label lines and the
# final ret are not present in this excerpt, so loop boundaries are inferred only
# from the compare/branch pairs near the end.
push ebp
push edi
push esi
push ebx
sub esp, 8 # two 4-byte stack slots: [esp] and [esp+4]
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0 # slot later incremented and compared with 20
mov DWORD PTR [esp+4], eax # saved copy of phys_addr
mov ebp, eax
xor esi, esi # esi is later compared with 30
lea edi, [0+esi*4]
xor eax, eax # eax is later compared with 50
mov edx, DWORD PTR [ebp+0] # load through ebp; ebp advances by 4 per pass of the 20-count loop
mov ecx, DWORD PTR [esp+4]
shr edx, 2 # presumably the division by sizeof(cell)
add edx, DWORD PTR [esp]
lea ebx, [ecx+edx*4]
lea edx, [eax+esi]
add eax, 1
mov ecx, DWORD PTR [ebx+edi]
shr ecx, 2 # presumably the second division by sizeof(cell)
add edx, ecx
mov edx, DWORD PTR [ebx+edx*4]
mov DWORD PTR y, edx # store to the global y
cmp eax, 50
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov eax, DWORD PTR [esp]
add ebp, 4
cmp eax, 20
jne .L4
add esp, 8
xor eax, eax # return value 0
pop ebx
pop esi
pop edi
pop ebp
Why isn't the compiler moving the subarray and subsubarray calculation outside the inner loops?
random volatile does magic
I randomly added volatile to prevent DCE from getting rid of all the code and then somehow the loop invariants got hoisted out of the inner loops.
// Same loop nest, but the destination is now a volatile local declared inside
// the innermost loop instead of the global y. This snippet does not declare
// cell or phys_addr itself.
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// volatile: the store has to be emitted even though the value is never read.
volatile cell y = subsubarray[k];
return 0;
This mostly wasn't because of y being a local variable since using std::cout << subsubarray[k]; prevented the optimization.
The assembly generated by gcc 8.2 with -O3 -m32 as flags for the aforementioned code is:
# gcc 8.2 -O3 -m32 output (Intel syntax) for the variant that stores into a
# volatile local.
# NOTE(review): the main: label, the .L2/.L3/.L4 branch-target label lines and the
# final ret are not present in this excerpt.
push ebp
push edi
xor edi, edi # edi is later compared with 20
push esi
push ebx
sub esp, 20
mov ebp, DWORD PTR phys_addr
mov eax, DWORD PTR [ebp+0+edi*4] # load indexed by edi
xor ecx, ecx # ecx is later compared with 30 (after a reload from [esp])
shr eax, 2 # presumably the division by sizeof(cell)
add eax, edi
lea ebx, [ebp+0+eax*4]
lea esi, [ebx+200] # 200 bytes past ebx; 200 = 50 * 4
mov edx, DWORD PTR [ebx+ecx*4]
mov DWORD PTR [esp], ecx # spill ecx; it is reused as a data register below
shr edx, 2 # presumably the second division by sizeof(cell)
add edx, ecx
sal edx, 2
lea eax, [ebx+edx] # eax = cursor that advances by 4 per store
add edx, esi # edx = value the cursor is compared against
mov ecx, DWORD PTR [eax]
add eax, 4
mov DWORD PTR [esp+16], ecx # store to a stack slot (the volatile local)
cmp edx, eax
jne .L2
mov ecx, DWORD PTR [esp]
add ecx, 1
cmp ecx, 30
jne .L3
add edi, 1
cmp edi, 20
jne .L4
add esp, 20
xor eax, eax # return value 0
pop ebx
pop esi
pop edi
pop ebp
The loop invariants are hoisted out of the inner loops. What did the random volatile do to allow GCC to hoist the invariants? The optimization does not happen with clang 6.0.0.
It is not about random volatile fixing your problem - problem is deeper.
As you already guessed problem indeed relates to "y"
Check this example:
// Element type of the table that main() walks.
typedef int cell;
// Base of the table, placed at the fixed address 0x12340.
const cell *phys_addr = (const cell*)0x12340;
// Same loop nest, with y as a non-volatile local that is repeatedly divided
// by the loaded element and finally returned.
int main() {
cell y = 1;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// Each iteration's result feeds the next one through y.
y /= subsubarray[k];
return y;
I've used trick with division to avoid hard optimization (gcc can evaluate all loops and provide y directly in plain assignment; when using add, sub or multiply it will unroll innermost loop too - please play in godbolt to see how it looks)
Now disassembly looks like that:
# Compiler output (Intel syntax, 32-bit) for the variant that divides a local y
# by every element.
# NOTE(review): the main: label, the .L2/.L3/.L4 branch-target label lines, the
# final ret and the phys_addr: label before the trailing .long are not present in
# this excerpt. No cdq or other edx setup is visible immediately before the idiv
# either — confirm against the full listing.
push ebp
push edi
push esi
push ebx
sub esp, 12
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0 # slot later incremented and compared with 20
mov DWORD PTR [esp+4], eax # saved copy of phys_addr
mov eax, 1 # eax = 1, matching the initial value of y
mov esi, DWORD PTR [esp]
mov edi, DWORD PTR [esp+4]
mov edx, DWORD PTR [edi+esi*4]
mov DWORD PTR [esp+8], edx
shr edx, 2 # presumably the division by sizeof(cell)
add edx, esi
xor esi, esi # esi is later compared with 30
lea edi, [edi+edx*4]
lea ebp, [edi+200] # 200 bytes past edi; 200 = 50 * 4
mov ebx, DWORD PTR [edi+esi*4]
shr ebx, 2 # presumably the second division by sizeof(cell)
add ebx, esi
sal ebx, 2
lea ecx, [edi+ebx] # ecx = cursor that advances by 4 per division
add ebx, ebp # ebx = value the cursor is compared against
idiv DWORD PTR [ecx] # divide by the element at the cursor
add ecx, 4
cmp ebx, ecx
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov edi, DWORD PTR [esp]
cmp edi, 20
jne .L4
add esp, 12
pop ebx
pop esi
pop edi
pop ebp
.long 74560 # 74560 == 0x12340, the initializer given to phys_addr
.L2 is innermost loop so code looks like expected - subarray and subsubarray are precomputed earlier.
So you may wonder - why is that when "y" is local all is ok and when global is not.
To be clear - "y" does not have to be declared in main. It could be made static like this
static cell y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;
or use namespace
namespace wtf{ cell y; }
const cell * __restrict__ phys_addr = (const cell*)0x12340;
and then refer to y as wtf::y;
Still good.
All condenses to aliasing. To see it let's change y to pointer first:
// Element type of the table that main() walks.
typedef int cell;
// y is now a global pointer; main() points it at a local before the loops.
cell * y;
// Base of the table, placed at the fixed address 0x12340.
const cell * phys_addr = (const cell*)0x12340;
// Same loop nest, but every update goes through the global pointer y.
int main() {
cell ylocal;
y = &ylocal;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// Read-modify-write through y on every iteration.
// NOTE(review): ylocal is never initialised before this first read.
*y /= subsubarray[k];
return *y;
No loop optimization again....
It may be assumed that y and phys_addr overlap - writing y may modify some memory cells so all dictionaries have to be calculated with most up to date data (const in phys_addr means only your pointer should not modify memory, not that it is globally readonly).
But if you "promise" that those addresses do not overlap optimization will come back.
// Element type of the table that main() walks.
typedef int cell;
// Both global pointers carry the __restrict__ qualifier (a compiler extension
// in C++), which is the only difference from the unqualified declarations.
cell * __restrict__ y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;
// Same loop nest, updating through the restrict-qualified pointer y.
int main() {
cell ylocal;
y = &ylocal;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// NOTE(review): ylocal is never initialised before this first read.
*y /= subsubarray[k];
return *y;
If you are using pointers, the compiler may not be able to prove that the addresses do not alias, and it will take the safe path. If you are 100% sure they don't alias, use restrict to inform it of that fact.
To give you context over the code, this is a dot product computation of two vectors using pointer arithmetic (looping with pointers as well). I have linked it in my main.cpp but for some reason, when the function is called (in this case, my ASM file), i get an access violation error. Here are the two files. Thank you for your help!
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <algorithm>
#include <iostream>
using namespace std;
extern "C" int dpp_pointerr(int *v, int *u, int n); //ASM FILE
void main(void) {
const int N = 10;
static int A[10];
static int B[10];
printf("Array A: ");
for (int i = 0; i < N; i++) { A[i] = rand() % 10; /*printf("%d ", A[i]); */ }
printf("\n\nArray B: ");
for (int j = 0; j < N; j++) { B[j] = rand() % 10;/* printf("%d ", B[j]);*/ }
int result2 = dpp_pointerr(A, B, N);
printf("\nResult after POINTER dot product: %d\n", result2);
__int64 ctr1 = 0, ctr2 = 0, freq = 0;
int acc = 0, i = 0;
if (QueryPerformanceCounter((LARGE_INTEGER *)&ctr1) != 0) {
/****************CODE TO BE TIMED HERE**********************/
//int result3= dot_product_index(A, B, N);
int result2 = dpp_pointerr(A, B, N);
QueryPerformanceCounter((LARGE_INTEGER *)&ctr2);
cout << "Start Value: " << ctr1 << endl;
cout << "End Value: " << ctr2 << endl;
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
// freq is number of counts per second. It approximates the CPU frequency
printf("QueryPerformanceCounter minimum resolution: 1/%I64u Seconds.\n", freq);
printf("ctr2 - ctr1: %f counts.\n", ((ctr2 - ctr1) * 1.0 / 1.0));
cout << "65536 Increments by 1 computation time: " << ((ctr2 - ctr1) * 1.0 / freq) << " seconds\n";
else {
DWORD dwError = GetLastError();
printf("Error value = %d", dwError);
cout << endl;
cout << "Press ENTER to finish";
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.11.25547.0
TITLE C:\Users\Patrick\source\repos\dot_product_legit\dot_product_legit\dpp_pointerr.cpp
include listing.inc
.model flat, C
PUBLIC dpp_pointerr
_result$ = -32 ; size = 4
_B_beg$ = -20 ; size = 4
_A_beg$ = -8 ; size = 4
_v$ = 8 ; size = 4
_u$ = 12 ; size = 4
_n$ = 16 ; size = 4
;-----------------------------------------------------------------------
; int dpp_pointerr(int *v, int *u, int n)
; Dot product of two int vectors, walking both with pointers.
; ABI:   32-bit cdecl (arguments on the stack, caller cleans up)
; In:    _v$[ebp] = v, _u$[ebp] = u, _n$[ebp] = n
; Out:   eax = v[0]*u[0] + ... + v[n-1]*u[n-1] (wraps on overflow);
;        0 when n <= 0
; Clobb: ecx, edx, flags (esi and edi are saved and restored)
;
; There is exactly one prologue, matched by the epilogue, so ret finds
; the real return address. The multiply takes the elements the cursors
; point at, never the cursor registers themselves.
;-----------------------------------------------------------------------
dpp_pointerr PROC
        push    ebp
        mov     ebp, esp
        push    esi                             ; callee-saved
        push    edi                             ; callee-saved

        mov     esi, DWORD PTR _v$[ebp]         ; esi = cursor into v
        mov     edi, DWORD PTR _u$[ebp]         ; edi = cursor into u
        mov     ecx, DWORD PTR _n$[ebp]         ; ecx = element count
        xor     eax, eax                        ; eax = running sum
        test    ecx, ecx
        jle     SHORT dpp_done                  ; nothing to do for n <= 0
        lea     ecx, DWORD PTR [esi+ecx*4]      ; ecx = one past the last element of v

dpp_next:
        mov     edx, DWORD PTR [esi]            ; edx = *v
        imul    edx, DWORD PTR [edi]            ; edx = *v * *u
        add     eax, edx
        add     esi, 4                          ; ++v
        add     edi, 4                          ; ++u
        cmp     esi, ecx
        jb      SHORT dpp_next                  ; unsigned compare: these are addresses

dpp_done:
        pop     edi
        pop     esi
        pop     ebp
        ret     0
dpp_pointerr ENDP
I'm trying to check the even numbers from an array inside a struct, but I don't think I wrote something right. When debugging (e.g. count = 3, v[] = {1,2,4}), after it reaches "cmp eax,[ebp+12]" and "je outt;", it jumps to outt: and that's it.
s is supposed to keep the sum of all even numbers, eax inside int suma(test *) is index for array, and edx keeps the sum before moving it in s
what am i doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
// Record handed to suma(): an int array plus a count field.
struct test {
int v[10]; // room for 10 values; main() fills the first `count` of them
short count; // number of entries main() reads into v (a 16-bit field)
// NOTE(review): the struct's closing "};" is not visible before the next line.
test a; // the instance main() fills in and passes to suma by address
int s = 6; // receives the value left in eax after main() calls suma
int suma(test *)
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
mov eax, edx;
return s;
int main()
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
cout << s;
return 0;