I have a problem with sum of elements of two vectors type double which are the same size. Code always returns 0.
#include <iostream>
using namespace std;
int main()
{
int n = 5;
double* tab = new double[n];
double* tab3 = new double[n];
for (size_t i = 0; i < n; i++)
{
tab[i] = 1;
tab3[i] = 1;
}
double sum;
__asm {
mov eax, n; //vector size
mov edi, tab; //first vector
mov esi, tab3; //second vector
fldz;
l:
fadd[edi + 8 * eax - 8];
fadd[esi + 8 * eax - 8];
dec eax;
jnz l;
fstp sum;
}
cout << sum;
}
Sadly i am not on windows, so i had to modify the code to use g++ instead of msvc, but i used intel syntax assembly too. During debugging it turned out that fadd instructions had no effect. I fixed it by adding qword ptr before the [edi + 8 * eax - 8] and [esi + 8 * eax - 8] to tell assembler to use pointers to an 8 byte value (since you are using double instead of float):
fadd qword ptr [edi + 8 * eax - 8];
fadd qword ptr [esi + 8 * eax - 8];
So you are looking for asm for this bit of C code, right? https://godbolt.org/z/vbdfEb94s
#include <cstddef>
double add(double *a, double *b, std::size_t len) {
double sum = 0;
while (len-- > 0) {
sum += *a++;
sum += *b++;
}
return sum;
}
I (meaning gcc) come up with this code for 64bit:
add(double*, double*, unsigned long):
xor eax, eax
xorps xmm0, xmm0
.L3:
cmp rdx, rax
je .L1
addsd xmm0, QWORD PTR [rdi+rax*8]
addsd xmm0, QWORD PTR [rsi+rax*8]
inc rax
jmp .L3
.L1:
ret
and this for 32bit i386:
add(double*, double*, unsigned int):
push ebp
xor eax, eax
fldz
mov ebp, esp
mov ecx, DWORD PTR [ebp+8]
mov edx, DWORD PTR [ebp+12]
.L3:
cmp DWORD PTR [ebp+16], eax
je .L1
fadd QWORD PTR [ecx+eax*8]
fadd QWORD PTR [edx+eax*8]
inc eax
jmp .L3
.L1:
pop ebp
ret
Related
Code 1:
int solution(vector<vector<int>>& arr){
int ans, m = arr.size(), n = arr[0].size(); // it is given that m is atleast 1
for(int i = 0; i < m; i++){
for(int j = 0; j < n; j++){
// do something
// i and j are only used for iterating over the array
}
}
return ans;
}
Code 2:
int solution(vector<vector<int>>& arr){
int i, j, ans, m = arr.size(), n = arr[0].size();
for(i = 0; i < m; i++){
for(j = 0; j < n; j++){
// do something
}
}
return ans;
}
I thought code2 uses less memory since code1 declares variable j in the for loop m times (This is not true as mentioned in comments and answer). But I was doubtful when I saw many coders in online coding platforms preferring to use code1 over code2.
I appreciate any explanation about the difference in memory used for the two codes.
As Andreas Brunnet suggested I tried compiling both codes in godbolt.org and as HolyBlackCat said the assembly code for both the codes is the same. Here's the code used and the corresponding assembly code (the compiler option is x86-64 gcc 11.2).
Code:
#include <vector>
int sum(std::vector<std::vector<int>> arr) {
int i, j, ans = 0, m = arr.size(), n = arr[0].size(); // i, j deleted in code2
for(i = 0; i < m; i++){ // int i = 0 in code2
for(j = 0; j < n; j++){ // int j = 0 in code2
ans += arr[i][j];
}
}
return ans;
}
and the assembly code for both the codes is:
sum(std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > >):
push rbp
mov rbp, rsp
sub rsp, 48
mov QWORD PTR [rbp-40], rdi
mov DWORD PTR [rbp-12], 0
mov rax, QWORD PTR [rbp-40]
mov rdi, rax
call std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > >::size() const
mov DWORD PTR [rbp-16], eax
mov rax, QWORD PTR [rbp-40]
mov esi, 0
mov rdi, rax
call std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > >::operator[](unsigned long)
mov rdi, rax
call std::vector<int, std::allocator<int> >::size() const
mov DWORD PTR [rbp-20], eax
mov DWORD PTR [rbp-4], 0
jmp .L2
.L5:
mov DWORD PTR [rbp-8], 0
jmp .L3
.L4:
mov eax, DWORD PTR [rbp-4]
movsx rdx, eax
mov rax, QWORD PTR [rbp-40]
mov rsi, rdx
mov rdi, rax
call std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > >::operator[](unsigned long)
mov rdx, rax
mov eax, DWORD PTR [rbp-8]
cdqe
mov rsi, rax
mov rdi, rdx
call std::vector<int, std::allocator<int> >::operator[](unsigned long)
mov eax, DWORD PTR [rax]
add DWORD PTR [rbp-12], eax
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-8]
cmp eax, DWORD PTR [rbp-20]
jl .L4
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-16]
jl .L5
mov eax, DWORD PTR [rbp-12]
leave
ret
and as I understand, lines 18,19,43,45,46,47 in assembly code correspond to line 5 of c++ code and lines 21,22,38,40,41,42 correspond to line 6 (Correct me if I am wrong). i.e.
18: mov DWORD PTR [rbp-4], 0
19: jmp .L2
43: add DWORD PTR [rbp-4], 1
44:.L2:
45: mov eax, DWORD PTR [rbp-4]
46: cmp eax, DWORD PTR [rbp-16]
47: jl .L5
is corresponding code of for((int) i = 0; i < m; i++) and
20:.L5:
21: mov DWORD PTR [rbp-8], 0
22: jmp .L3
38: add DWORD PTR [rbp-8], 1
39:.L3:
40: mov eax, DWORD PTR [rbp-8]
41: cmp eax, DWORD PTR [rbp-20]
42: jl .L4
correspond to for((int) j = 0; j < n; j++)
Clearly, the compiler assigns only one register for j in both codes. So, the memory used is the same.
https://leetcode.com/problems/number-of-provinces/
I was pretty excited when I solved this problem on my very first try, within only 20/30 minutes, though when I submitted my code, I ended up in 8.43 percentile. I looked at how the fastest solutions approached the problem, and low and behold, the sample top solution is nearly identical to my code, yet it runs 3x faster. I've been comparing the code and can't really point out a substantial enough difference. Both should be equally fast... Can anyone explain the why? If I'm not mistaken it's O(mn) performance in both cases.
The following is my code. It's pretty self-explanatory, so not sure heavy commenting would do any good.
class Solution {
public:
int findCircleNum(vector<vector<int>>& isConnected) {
int components = 0;
vector<bool> visited (isConnected.size(), false);
// go through each row
for (int i = 0; i < isConnected.size(); i++) {
// explore only unvisited items
if (!visited[i]) {
queue<int> q;
q.push(i);
components++;
while (!q.empty()) {
int node = q.front();
q.pop();
visited[node] = true;
// push all direct connections onto the queue so we explore them
for (int j = 0; j < isConnected[0].size(); j++) {
if (isConnected[node][j] == 1 && !visited[j]) {
q.push(j);
}
}
}
}
}
return components;
}
};
and the following is a sample top solution that runs 3x faster than my code.
class Solution {
public:
int findCircleNum(vector<vector<int>>& M) {
if (M.empty()) {
return 0;
}
int count = 0;
vector<bool> visited(M.size());
auto bfs = [&](int student) {
queue<int> q;
q.push(student);
visited[student] = true;
while (!q.empty()) {
auto current = q.front();
cout << "current " << current << endl;
q.pop();
for (int i = 0; i < M.size(); i++) {
if (M[current][i] == 1 and !visited[i]) {
visited[i] = true;
q.push(i);
}
}
}
};
for (int r = 0; r < M.size(); r++) {
if (visited[r] == false) {
count++;
bfs(r);
}
}
return count;
}
};
The difference is as far as I can [see][1] the placement of visited[i] = true;, which causes a few less memory access per iteration. Where the OP code needs to re-fetch the bool.
And there might be a data or control flow dependence between
visited[node] = true;
and
!visited[j]
That is not there in the Best code.
OP code inner loop
.L118:
mov rax, QWORD PTR [rsi+rbx]
cmp DWORD PTR [rax+rcx*4], 1
jne .L116
mov rax, rbp
mov r8, rcx
sal rax, cl
mov rcx, QWORD PTR [rsp+80]
shr r8, 6
and rax, QWORD PTR [rcx+r8*8]
jne .L116
mov rax, QWORD PTR [rsp+192]
sub rax, 4
cmp rdi, rax
je .L117
"Best" code
.L76:
mov rax, QWORD PTR [rsi+rbx]
cmp DWORD PTR [rax+rcx*4], 1
jne .L74
mov rax, QWORD PTR [r12]
mov rsi, rcx
shr rsi, 6
mov rax, QWORD PTR [rax]
lea rsi, [rax+rsi*8]
mov eax, 1
sal rax, cl
mov rcx, QWORD PTR [rsi]
test rcx, rax
jne .L74
or rax, rcx <------------ visited[i] = true;
mov QWORD PTR [rsi], rax
mov rax, QWORD PTR [rsp+96]
sub rax, 4
cmp r8, rax
je .L75
mov DWORD PTR [r8], edx
add r8, 4
mov QWORD PTR [rsp+80], r8
jmp .L74
[1]: https://godbolt.org/z/obfqf7
Godbolt Link: https://godbolt.org/g/Hv6MAL
typedef int cell;
cell y;
const cell *phys_addr = (const cell*)0x12340;
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
y = subsubarray[k];
}
}
}
}
It feels natural to expect the compiler to optimize the above code to something similar to:
int main() {
for (int i = 0; i < 20; i++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
for (int j = 0; j < 30; j++) {
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
for (int k = 0; k < 50; k++) {
y = subsubarray[k];
}
}
}
}
but the assembly generated by gcc 8.2 with -O3 -m32 as flags is:
push ebp
push edi
push esi
push ebx
sub esp, 8
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov ebp, eax
.L4:
xor esi, esi
.L3:
lea edi, [0+esi*4]
xor eax, eax
.L2:
mov edx, DWORD PTR [ebp+0]
mov ecx, DWORD PTR [esp+4]
shr edx, 2
add edx, DWORD PTR [esp]
lea ebx, [ecx+edx*4]
lea edx, [eax+esi]
add eax, 1
mov ecx, DWORD PTR [ebx+edi]
shr ecx, 2
add edx, ecx
mov edx, DWORD PTR [ebx+edx*4]
mov DWORD PTR y, edx
cmp eax, 50
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov eax, DWORD PTR [esp]
add ebp, 4
cmp eax, 20
jne .L4
add esp, 8
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
Why isn't the compiler moving the subarray and subsubarray calculation outside the inner loops?
random volatile does magic
I randomly added volatile to prevent DCE from getting rid of all the code and then somehow the loop invariants got hoisted out of the inner loops.
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
volatile cell y = subsubarray[k];
}
}
}
return 0;
}
This mostly wasn't because of y being a local variable since using std::cout << subsubarray[k]; prevented the optimization.
The assembly generated by gcc 8.2 with -O3 -m32 as flags for the aforementioned code is:
main:
push ebp
push edi
xor edi, edi
push esi
push ebx
sub esp, 20
mov ebp, DWORD PTR phys_addr
.L4:
mov eax, DWORD PTR [ebp+0+edi*4]
xor ecx, ecx
shr eax, 2
add eax, edi
lea ebx, [ebp+0+eax*4]
lea esi, [ebx+200]
.L3:
mov edx, DWORD PTR [ebx+ecx*4]
mov DWORD PTR [esp], ecx
shr edx, 2
add edx, ecx
sal edx, 2
lea eax, [ebx+edx]
add edx, esi
.L2:
mov ecx, DWORD PTR [eax]
add eax, 4
mov DWORD PTR [esp+16], ecx
cmp edx, eax
jne .L2
mov ecx, DWORD PTR [esp]
add ecx, 1
cmp ecx, 30
jne .L3
add edi, 1
cmp edi, 20
jne .L4
add esp, 20
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
The loop invariants are pushed out of the inner loops. What did the random volatile do to allow the GCC to optimize the invariants? The optimization does not happen when clang 6.0.0.
It is not about random volatile fixing your problem - problem is deeper.
As you already guessed problem indeed relates to "y"
Check this example:
typedef int cell;
const cell *phys_addr = (const cell*)0x12340;
int main() {
cell y = 1;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
y /= subsubarray[k];
}
}
}
return y;
}
I've used trick with division to avoid hard optimization (gcc can evaluate all loops and provide y directly in plain assignment; when using add, sub or multiply it will unroll innermost loop too - please play in godbolt to see how it looks)
Now disassembly looks like that:
https://godbolt.org/g/R1EGSb
main:
push ebp
push edi
push esi
push ebx
sub esp, 12
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov eax, 1
.L4:
mov esi, DWORD PTR [esp]
mov edi, DWORD PTR [esp+4]
mov edx, DWORD PTR [edi+esi*4]
mov DWORD PTR [esp+8], edx
shr edx, 2
add edx, esi
xor esi, esi
lea edi, [edi+edx*4]
lea ebp, [edi+200]
.L3:
mov ebx, DWORD PTR [edi+esi*4]
shr ebx, 2
add ebx, esi
sal ebx, 2
lea ecx, [edi+ebx]
add ebx, ebp
.L2:
cdq
idiv DWORD PTR [ecx]
add ecx, 4
cmp ebx, ecx
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov edi, DWORD PTR [esp]
cmp edi, 20
jne .L4
add esp, 12
pop ebx
pop esi
pop edi
pop ebp
ret
phys_addr:
.long 74560
.L2 is innermost loop so code looks like expected - subarray and subsubarray are precomputed earlier.
So you may wonder - why is that when "y" is local all is ok and when global is not.
To be clear - "y" does not have to be declared in main. It could be made static like this
static cell y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;
or use namespace
namespace wtf{ cell y; }
const cell * __restrict__ phys_addr = (const cell*)0x12340;
and than refer to y as wtf::y;
Still good.
All condenses to aliasing. To see it let's change y to pointer first:
typedef int cell;
cell * y;
const cell * phys_addr = (const cell*)0x12340;
int main() {
cell ylocal;
y = &ylocal;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
*y /= subsubarray[k];
}
}
}
return *y;
}
No loop optimization again....
It may be assumed that y and phys_addr overlap - writing y may modify some memory cells so all dictionaries have to be calculated with most up to date data (const in phys_addr means only your pointer should not modify memory, not that it is globally readonly).
But if you "promise" that those addresses do not overlap optimization will come back.
typedef int cell;
cell * __restrict__ y;
const cell * __restrict__ phys_addr = (const cell*)0x12340;
int main() {
cell ylocal;
y = &ylocal;
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
*y /= subsubarray[k];
}
}
}
return *y;
}
TL;DR;
If you are using pointers compilator may not be able to prove addresses do not alias and will use safe path. If you are 100% sure they don't use restrict to inform it about that fact.
To give you context over the code, this is a dot product computation of two vectors using pointer arithmetic (looping with pointers as well). I have linked it in my main.cpp but for some reason, when the function is called (in this case, my ASM file), i get an access violation error. Here are the two files. Thank you for your help!
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <algorithm>
#include <iostream>
using namespace std;
extern "C" int dpp_pointerr(int *v, int *u, int n); //ASM FILE
void main(void) {
const int N = 10;
static int A[10];
static int B[10];
printf("Array A: ");
for (int i = 0; i < N; i++) { A[i] = rand() % 10; /*printf("%d ", A[i]); */ }
printf("\n\nArray B: ");
for (int j = 0; j < N; j++) { B[j] = rand() % 10;/* printf("%d ", B[j]);*/ }
printf("\n");
int result2 = dpp_pointerr(A, B, N);
printf("\nResult after POINTER dot product: %d\n", result2);
__int64 ctr1 = 0, ctr2 = 0, freq = 0;
int acc = 0, i = 0;
if (QueryPerformanceCounter((LARGE_INTEGER *)&ctr1) != 0) {
/****************CODE TO BE TIMED HERE**********************/
//int result3= dot_product_index(A, B, N);
int result2 = dpp_pointerr(A, B, N);
/**********************************************************/
QueryPerformanceCounter((LARGE_INTEGER *)&ctr2);
cout << "Start Value: " << ctr1 << endl;
cout << "End Value: " << ctr2 << endl;
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
// freq is number of counts per second. It approximates the CPU frequency
printf("QueryPerformanceCounter minimum resolution: 1/%I64u Seconds.\n", freq);
printf("ctr2 - ctr1: %f counts.\n", ((ctr2 - ctr1) * 1.0 / 1.0));
cout << "65536 Increments by 1 computation time: " << ((ctr2 - ctr1) * 1.0 / freq) << " seconds\n";
}
else {
DWORD dwError = GetLastError();
printf("Error value = %d", dwError);
}
cout << endl;
cout << "Press ENTER to finish";
system("pause");
}
ASM FILE
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.11.25547.0
TITLE C:\Users\Patrick\source\repos\dot_product_legit\dot_product_legit\dpp_pointerr.cpp
.686P
.XMM
include listing.inc
.model flat, C
PUBLIC dpp_pointerr
_TEXT SEGMENT
_result$ = -32 ; size = 4
_B_beg$ = -20 ; size = 4
_A_beg$ = -8 ; size = 4
_v$ = 8 ; size = 4
_u$ = 12 ; size = 4
_n$ = 16 ; size = 4
?dpp_pointerr##YAHPAH0H#Z:
dpp_pointerr PROC ; dot_product_pointer, COMDAT
push ebp
mov ebp, esp
sub esp, 228 ; 000000e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-228]
mov ecx, 57 ; 00000039H
mov eax, -858993460 ; ccccccccH
rep stosd
mov DWORD PTR _result$[ebp], 0
; Line 11
; Line 2
push ebp
mov ebp, esp
sub esp, 228 ; 000000e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-228]
mov ecx, 57 ; 00000039H
mov eax, -858993460 ; ccccccccH
rep stosd
; Line 5
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _v$[ebp]
mov DWORD PTR _A_beg$[ebp], ecx
; Line 6
mov eax, 4
imul ecx, eax, 0
add ecx, DWORD PTR _u$[ebp]
mov DWORD PTR _B_beg$[ebp], ecx
; Line 8
mov DWORD PTR _result$[ebp], 0
; Line 11
mov eax, DWORD PTR _A_beg$[ebp]
mov ebx, DWORD PTR _B_beg$[ebp]
mov ecx, DWORD PTR _n$[ebp]
mov edi, DWORD PTR _v$[ebp]
lea edi, DWORD PTR [edi+ecx*4]
mov esi, DWORD PTR _u$[ebp]
lea esi, DWORD PTR [esi+ecx*4]
jmp SHORT $LN4#dot_produc
$LN2#dot_produc:
add eax, 4
add ebx, 4
$LN4#dot_produc:
cmp eax, edi
jae SHORT $LN3#dot_produc
cmp ebx, esi
jae SHORT $LN3#dot_produc
; Line 12
imul eax, ebx
add DWORD PTR _result$[ebp], eax
jmp SHORT $LN2#dot_produc
$LN3#dot_produc:
; Line 13
mov eax, DWORD PTR _result$[ebp]
; Line 14
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
dpp_pointerr ENDP ; dot_product_pointer
_TEXT ENDS
END
Im trying to check the even numbers from an array inside a struct, but i dont think i wrote something right. When debugging, (i.e. count = 3 v[] = {1,2,4}), after it reaches " cmp eax,[ebp+12] and je outt; " , it goes to outt: and thats it.
s is supposed to keep the sum of all even numbers, eax inside int suma(test *) is index for array, and edx keeps the sum before moving it in s
what am i doing wrong?
#include "stdafx.h"
#include <iostream>
using namespace std;
struct test {
int v[10];
short count;
};
test a;
int s = 6;
int suma(test *)
{
_asm {
mov eax, 0; // i for counting inside array
mov edx, 0; // sum of even elements
mov ebx, [ebp + 8]; // array v adress
loop:
cmp eax, [ebp + 12];
je outt;
mov ecx, [ebx + 4 * eax];
inc eax;
mov edi, ecx
and ecx, 1;
cmp ecx, 1;
je loop;
add edx, edi;
jmp loop;
outt:
mov eax, edx;
}
return s;
}
int main()
{
cin >> a.count;
for (int i = 0; i < a.count; i++)
cin >> a.v[i];
_asm {
LEA eax, a
push eax;
call suma;
add esp, 4;
mov s, eax;
}
cout << s;
return 0;
}