Less aggressive loop optimization when using printf than cout - c++

This question is a followup to this one:
Question on Undefined Behaviour (UB) and loop optimization
When using https://godbolt.org/ online compiler (g++ 7.2, x86_64) with -O2 setting, the loop condition gets optimized out when the loop contains std::cout but not with an identical loop using printf.
Any idea why? Both versions of the code are compiled with the C++ compiler. Both versions produce a UB warning with -O2, but no warning without -O2, even though the code still has undefined behaviour.
FWIW, I also tried the MIPS g++ compiler, that one does not seem to optimize out the loop condition even with std::cout code version and -O2.
I can provide the compiler outputs if necessary, but the std::cout version is quite long.
#include <stdio.h>
// Prints i and i*12345678 for i = 0..299.
// NOTE: with a 32-bit int, i*12345678 no longer fits from i = 174 onwards
// (174 * 12345678 = 2148147972 > INT_MAX), and signed integer overflow is
// undefined behaviour.
int main()
{
// The bound i < 300 is the loop condition the question is about.
for (int i = 0; i < 300; i++)
printf("%d %d", i, i*12345678);
}
/*#include <iostream>
int main()
{
for (int i = 0; i < 300; i++)
std::cout << i << " " << i * 12345678 << std::endl;
}*/
UPDATE: On suggestion from the comments, I removed the UB, then even the printf version removes the loop condition, instead jumping out of the loop when i is 11 (very unsurprising), see below:
#include <stdio.h>
// Variant without signed overflow: i*123 stays small and the explicit break
// ends the loop long before i reaches 300.
int main()
{
for (int i = 0; i < 300; i++) {
printf("%d %d", i, i*123);
// 10 * 123 == 1230 is not > 1230, so the first i that breaks is 11.
if (i * 123 > 1230) break;
}
}
// Generated assembly:
# Compiler output for the loop above. The first printf call is peeled out of
# the loop, and both "i < 300" and "i * 123 > 1230" have been folded into the
# single exit test "cmp ebx, 11".
# The format-string label is spelled .LC0 so that it matches the two
# "OFFSET FLAT:.LC0" references below.
.LC0:
.string "%d %d"
main:
push rbp                    # callee-saved; holds the next i*123 below
push rbx                    # callee-saved; holds i below
xor edx, edx                # printf arg 3: i*123 == 0
xor esi, esi                # printf arg 2: i == 0
mov edi, OFFSET FLAT:.LC0   # printf arg 1: format string
xor eax, eax                # al = 0: no vector-register arguments
sub rsp, 8                  # keep the stack 16-byte aligned for the calls
mov ebp, 123                # value of i*123 for the next iteration
xor ebx, ebx                # i = 0
call printf                 # peeled first iteration (i == 0)
.L2:
add ebx, 1                  # ++i
mov edx, ebp                # printf arg 3: i*123
xor eax, eax
mov esi, ebx                # printf arg 2: i
mov edi, OFFSET FLAT:.LC0
add ebp, 123                # advance i*123 by addition instead of multiplying
call printf
cmp ebx, 11                 # the break fires at i == 11
jne .L2
add rsp, 8
xor eax, eax                # return 0
pop rbx
pop rbp
ret

Related

When linking ASM to C++ file, there is an access violation

For context, this code computes the dot product of two vectors using pointer arithmetic (the loop also iterates with pointers). I have linked the assembly file with my main.cpp, but when the function defined in the ASM file is called, I get an access violation error. Here are the two files. Thank you for your help!
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <algorithm>
#include <iostream>
using namespace std;
extern "C" int dpp_pointerr(int *v, int *u, int n); //ASM FILE
void main(void) {
const int N = 10;
static int A[10];
static int B[10];
printf("Array A: ");
for (int i = 0; i < N; i++) { A[i] = rand() % 10; /*printf("%d ", A[i]); */ }
printf("\n\nArray B: ");
for (int j = 0; j < N; j++) { B[j] = rand() % 10;/* printf("%d ", B[j]);*/ }
printf("\n");
int result2 = dpp_pointerr(A, B, N);
printf("\nResult after POINTER dot product: %d\n", result2);
__int64 ctr1 = 0, ctr2 = 0, freq = 0;
int acc = 0, i = 0;
if (QueryPerformanceCounter((LARGE_INTEGER *)&ctr1) != 0) {
/****************CODE TO BE TIMED HERE**********************/
//int result3= dot_product_index(A, B, N);
int result2 = dpp_pointerr(A, B, N);
/**********************************************************/
QueryPerformanceCounter((LARGE_INTEGER *)&ctr2);
cout << "Start Value: " << ctr1 << endl;
cout << "End Value: " << ctr2 << endl;
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
// freq is number of counts per second. It approximates the CPU frequency
printf("QueryPerformanceCounter minimum resolution: 1/%I64u Seconds.\n", freq);
printf("ctr2 - ctr1: %f counts.\n", ((ctr2 - ctr1) * 1.0 / 1.0));
cout << "65536 Increments by 1 computation time: " << ((ctr2 - ctr1) * 1.0 / freq) << " seconds\n";
}
else {
DWORD dwError = GetLastError();
printf("Error value = %d", dwError);
}
cout << endl;
cout << "Press ENTER to finish";
system("pause");
}
ASM FILE
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.11.25547.0
TITLE C:\Users\Patrick\source\repos\dot_product_legit\dot_product_legit\dpp_pointerr.cpp
.686P
.XMM
include listing.inc
.model flat, C
PUBLIC dpp_pointerr
_TEXT SEGMENT
; Offsets relative to ebp: negative values are locals, positive values are
; the arguments pushed by the caller.
_result$ = -32 ; size = 4
_B_beg$ = -20 ; size = 4
_A_beg$ = -8 ; size = 4
_v$ = 8 ; size = 4
_u$ = 12 ; size = 4
_n$ = 16 ; size = 4
?dpp_pointerr@@YAHPAH0H@Z:
; int dpp_pointerr(int *v, int *u, int n)
; Returns v[0]*u[0] + ... + v[n-1]*u[n-1] in eax.
;
; The frame is set up exactly once. A second copy of the prologue would leave
; extra words on the stack that the single epilogue never removes, so "ret"
; would take a saved register as its return address.
dpp_pointerr PROC ; dot_product_pointer, COMDAT
; Line 2
push ebp
mov ebp, esp
sub esp, 228 ; 000000e4H
push ebx
push esi
push edi
; Fill the 228-byte local area with 0CCCCCCCCh (57 dwords).
lea edi, DWORD PTR [ebp-228]
mov ecx, 57 ; 00000039H
mov eax, -858993460 ; ccccccccH
rep stosd
; Line 5
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _v$[ebp]
mov DWORD PTR _A_beg$[ebp], ecx
; Line 6
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _u$[ebp]
mov DWORD PTR _B_beg$[ebp], ecx
; Line 8
mov DWORD PTR _result$[ebp], 0
; Line 11
mov eax, DWORD PTR _A_beg$[ebp] ; eax = cursor into v
mov ebx, DWORD PTR _B_beg$[ebp] ; ebx = cursor into u
mov ecx, DWORD PTR _n$[ebp]
mov edi, DWORD PTR _v$[ebp]
lea edi, DWORD PTR [edi+ecx*4] ; edi = one past the last element of v
mov esi, DWORD PTR _u$[ebp]
lea esi, DWORD PTR [esi+ecx*4] ; esi = one past the last element of u
jmp SHORT $LN4@dot_produc
$LN2@dot_produc:
add eax, 4
add ebx, 4
$LN4@dot_produc:
cmp eax, edi
jae SHORT $LN3@dot_produc
cmp ebx, esi
jae SHORT $LN3@dot_produc
; Line 12
; Multiply the elements the cursors point at, not the cursors themselves, and
; use edx as scratch so that eax keeps its value for the next comparison.
mov edx, DWORD PTR [eax]
imul edx, DWORD PTR [ebx]
add DWORD PTR _result$[ebp], edx
jmp SHORT $LN2@dot_produc
$LN3@dot_produc:
; Line 13
mov eax, DWORD PTR _result$[ebp]
; Line 14
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
dpp_pointerr ENDP ; dot_product_pointer
_TEXT ENDS
END

inline assembly - check if word is a palindrome

I am writing a simple program that uses inline assembly to check whether a given word is a palindrome. The problem is that it doesn't return correct answers. During debugging I found out that there's something wrong with the esi register: the value in al is correct ('a'), but the value in bl is not (0). I'm not sure what I'm doing wrong.
#include "stdafx.h"
#include <iostream>
#include <string>
using namespace std;
// Checks with MSVC inline assembly whether the string in s reads the same
// forwards and backwards, and prints x (1 if the comparison loop ran to the
// end, 0 if a mismatch was found).
int _tmain(int argc, _TCHAR* argv[])
{
char s[] = "arabara";
// NOTE(review): "arabara" has 7 letters; 8 is sizeof(s), which also counts the
// terminating '\0'. With 8, esi below starts on the terminator rather than on
// the last letter.
int sizeofstring = 8; // size of s[]
int x = 0;
int y = 1; //flag when is_palindrome
__asm
{
lea edi, s // edi -> first character
mov esi, edi
add esi, sizeofstring
dec esi // esi = s + sizeofstring - 1, intended to be the last character
mov ecx, sizeofstring
cmp ecx, 1
je is_palindrome //single char is always a palindrome
shr ecx, 1 // ecx = sizeofstring / 2 = number of pairs to compare
nextchar:
mov al, [edi] // character from the front
mov bl, [esi] // character from the back
cmp al, bl
jne stop // mismatch: leave x at 0
inc edi
dec esi
loop nextchar // decrement ecx and repeat while it is non-zero
is_palindrome:
mov eax, y
mov x, eax //change flag to 1
stop:
}
cout << x << endl; //should print 1 when palindrome
system("pause");
return 0;
}
You set sizeofstring to 8, but your string "arabara" is seven characters long.

Working on Array inside Struct , with Assembly

I'm trying to sum the even numbers of an array inside a struct, but I don't think I wrote it correctly. When debugging (e.g. with count = 3 and v[] = {1,2,4}), after it reaches "cmp eax,[ebp+12]" and "je outt;", it jumps to outt: and that's it.
s is supposed to hold the sum of all even numbers; inside int suma(test *), eax is the index into the array and edx accumulates the sum before it is moved into s.
What am I doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
// Input record: storage for up to 10 integers plus the number of entries in use.
struct test {
int v[10]; // element storage; first member of the struct
short count; // number of elements of v that main reads in (16-bit)
};
test a; // the single global instance that main fills and passes to suma
int s = 6; // main stores the value returned by suma here before printing it
int suma(test *)
{
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
loop:
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
outt:
mov eax, edx;
}
return s;
}
int main()
{
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
}
cout << s;
return 0;
}

Assembly language with C++

I have to write a program using C++ and assembly. The program must compute the average of an array.
The CPP file must read the data from the user and display the result. The array must hold real (floating-point) numbers.
The ASM file must compute the average of this array.
That is the .cpp file:
#include <iostream.h>
#define L 4
extern "C" float average(float* tab, int G);
// Reads L floats from standard input and prints the value returned by the
// external assembly routine average() for them.
int main()
{
    float tab[L] = {0};
    cout << "Enter array: \n";

    int filled = 0;
    while (filled < L) {
        cin >> tab[filled];
        ++filled;
    }

    const float avg = average(tab, L);
    cout << "Average value of entered array = " << avg;
    cout << "\nThe end of the programm\n";
    return 0;
}
And here is my assembly code:
.386
.model SMALL,c
PUBLIC average
.stack 100h
.data
.code
; float average(float* tab, int G)
; Intended to compute the arithmetic mean of the G floats at tab; the
; quotient is left in ST(0).
average PROC
push ebp
mov ebp, esp
push esi ; saved here, restored before returning
mov ecx, [ebp+12] ; ecx = second argument (element count G), used as loop counter
mov esi, [ebp+8] ; esi = first argument (address of the array)
finit ;initialise the coprocessor (FPU)
fldz ; ST(0) = 0.0, the running sum
sum:
; NOTE(review): ecx counts elements but is used below as a byte offset; each
; float occupies 4 bytes, so the address probably needs ecx*4 - confirm.
fadd dword ptr [esi+ecx-4] ;ST(0) = ST(0) + float at [esi+ecx-4]
loop sum ;decrement the loop counter and repeat while it is non-zero
fidiv dword ptr [ebp+12] ;ST(0) = ST(0) / G (integer memory operand)
pop esi ;restore esi saved on entry
pop ebp
; NOTE(review): this copies the just-restored esi into eax; the result is in
; ST(0), so the instruction looks unnecessary.
mov eax, esi
; NOTE(review): with the C calling convention the caller removes the
; arguments, so "ret 8" would remove them a second time - confirm.
ret 8
average ENDP
END
The result always is 2.422547e+198
Where is my mistake? Thank you!
Since it's an array of floats each taking 4 bytes you should multiply the index by 4. Also note that C calling convention mandates that the caller will free the arguments, as such your "ret 8" is wrong.
The mov eax, esi at the end is irrelevant.

Can C++ compilers optimize repeated virtual function calls on the same pointer? [duplicate]

This question already has answers here:
Hoisting the dynamic type out of a loop (a.k.a. doing Java the C++ way)
(4 answers)
Closed 10 years ago.
Suppose I have the following code
// Calls the same method through the same pointer 1000 times. The question is
// whether the vtable entry for virtualMethod is looked up once or on every
// iteration.
void f(PolymorphicType *p)
{
for (int i = 0; i < 1000; ++i)
{
p->virtualMethod(something);
}
}
Will the compiler's generated code dereference p's vtable entry for virtualMethod 1 or 1000 times? I am using Microsoft's compiler.
edit
here is the generated assembly for the real-world case I'm looking at. line->addPoint() is the virtual method of concern. I have no assembly experience, so I'm going over it slowly...
; 369 : for (int i = 0; i < numPts; ++i)
test ebx, ebx
je SHORT $LN1#RDS_SCANNE
lea edi, DWORD PTR [ecx+32]
npad 2
$LL3#RDS_SCANNE:
; 370 : {
; 371 : double *pts = pPoints[i].SystemXYZ;
; 372 : line->addPoint(pts[0], pts[1], pts[2]);
fld QWORD PTR [edi+8]
mov eax, DWORD PTR [esi]
mov edx, DWORD PTR [eax+16]
sub esp, 24 ; 00000018H
fstp QWORD PTR [esp+16]
mov ecx, esi
fld QWORD PTR [edi]
fstp QWORD PTR [esp+8]
fld QWORD PTR [edi-8]
fstp QWORD PTR [esp]
call edx
add edi, 96 ; 00000060H
dec ebx
jne SHORT $LL3#RDS_SCANNE
$LN314#RDS_SCANNE:
; 365 : }
In general, no, it is not possible. The function could destroy *this and placement-new some other object derived from the same base in that space.
Edit: even easier, the function could just change p. The compiler cannot possibly know who has the address of p, unless it is local to the optimization unit in question.
Impossible in general, but there are special cases that can be optimized, especially with inter-procedural analysis. VS2012 with full optimizations and whole-program optimization compiles this program:
#include <iostream>
using namespace std;
// Unnamed namespace: A, B and test cannot be named from other translation
// units.
namespace {
// Base class with a single virtual function.
struct A {
virtual void foo() { cout << "A::foo\n"; }
};
// Derived class that overrides A::foo.
struct B : public A {
virtual void foo() { cout << "B::foo\n"; }
};
// Calls foo() 100 times through a reference to the base class.
void test(A& a) {
for (int i = 0; i < 100; ++i)
a.foo();
}
}
// The only caller of test in this program; it always passes a B.
int main() {
B b;
test(b);
}
to:
01251221 mov esi,64h
01251226 jmp main+10h (01251230h)
01251228 lea esp,[esp]
0125122F nop
01251230 mov ecx,dword ptr ds:[1253044h]
01251236 mov edx,12531ACh
0125123B call std::operator<<<std::char_traits<char> > (012516B0h)
01251240 dec esi
01251241 jne main+10h (01251230h)
so it's effectively optimized the loop to:
for(int i = 0; i < 100; ++i)
cout << "B::foo()\n";