C++ external assembly: where is the error in my code?

main.cpp
// Calls the external LongRandom function, written in
// assembly language, that returns an unsigned 32-bit
// random integer. Compile in the Large memory model.
// Procedure called LongRandomArray that fills an array with 32-bit unsigned
// random integers
#include <iostream.h>
#include <conio.h>
extern "C" {
unsigned long LongRandom();
void LongRandomArray(unsigned long * buffer, unsigned count);
}
const int ARRAY_SIZE = 20;
int main()
{
// Allocate array storage and fill with 32-bit
// unsigned random integers.
unsigned long * rArray = new unsigned long[ARRAY_SIZE];
LongRandomArray(rArray,ARRAY_SIZE);
for(unsigned i = 0; i < 20; i++)
{
cout << rArray[i] << ',';
}
cout << endl;
getch();
return 0;
}
LongRandom & LongRandomArray procedure module (longrand.asm)
.model large
.386
Public _LongRandom
Public _LongRandomArray
.data
seed dd 12345678h
; Return an unsigned pseudo-random 32-bit integer
; in DX:AX, in the range 0 - FFFFFFFFh.
.code
_LongRandom proc far, C
    mov eax, 214013
    mul seed
    xor edx,edx
    add eax, 2531011
    mov seed, eax       ; save the seed for the next call
    shld edx,eax,16     ; copy upper 16 bits of EAX to DX
    ret
_LongRandom endp

_LongRandomArray proc far, C
    ARG bufferPtr:DWORD, count:WORD
    ; fill random array
    mov edi,bufferPtr
    mov cx, count
L1:
    call _LongRandom
    mov word ptr [edi],dx
    add edi,2
    mov word ptr [edi],ax
    add edi,2
    loop L1
    ret
_LongRandomArray endp
end

This code is based on a 16-bit example for MS-DOS from Kip Irvine's assembly book (6th ed.) and is explicitly written for Borland C++ 5.01 and TASM 4.0 (see chapter 13.4, "Linking to C/C++ in Real-Address Mode").
Pointers in 16-bit mode consist of a segment and an offset, usually written as segment:offset. This is not the real memory address, which is calculated from both parts by the processor. You cannot load segment:offset into a 32-bit register (EDI) and store a value to memory through it. So
...
mov edi,bufferPtr
...
mov word ptr [edi],dx
...
is wrong. You have to load the segment part of the pointer into a segment register, e.g. ES, and the offset part into an appropriate general-purpose 16-bit register, e.g. DI, and possibly use a segment override:
...
push es
les di,bufferPtr ; bufferPtr => ES:DI
...
mov word ptr es:[di],dx
...
pop es
...
The ARG directive replaces the name of the variable with the appropriate [bp+x] operand. For that to work you need a prologue (and an epilogue). TASM inserts the right instructions if the PROC header is written correctly, which is not the case here. Take a look at the following working function:
_LongRandomArray PROC C FAR
    ARG bufferPtr:DWORD, count:WORD
    push es
    les di,bufferPtr
    mov cx, count
L1:
    call _LongRandom
    mov word ptr es:[di],dx
    add di,2
    mov word ptr es:[di],ax
    add di,2
    loop L1
    pop es
    ret
_LongRandomArray ENDP
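For reference, here is roughly what TASM generates around that body. This expansion is a sketch based on the TASM manual's description of the C language specifier, not verified assembler output; with a FAR proc, the 4-byte far return address plus the saved BP put the first argument at [bp+6]:
_LongRandomArray:       ; expansion sketch, not verified TASM output
    push bp
    mov bp,sp           ; prologue inserted by TASM
    ; body: ARG resolves bufferPtr to [bp+6] (a DWORD)
    ;       and count to [bp+10] (a WORD)
    pop bp              ; epilogue inserted by TASM
    ret                 ; the caller removes the arguments (C convention)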
Compile your code with BCC (not BCC32):
BCC -ml main.cpp longrand.asm

Related

Translating C++ x86 Inline assembly code to C++

I've been struggling to convert this assembly code to C++.
It's a function from an old game that takes pixel data Stmp and, I believe, copies it to the destination void* dest:
void Function(int x, int y, int yl, void* Stmp, void* dest)
{
    unsigned long size = 1280 * 2;
    unsigned long j = yl;
    void* Dtmp = (void*)((char*)dest + y * size + (x * 2));
    _asm
    {
        push es;
        push ds;
        pop es;
        mov edx,Dtmp;
        mov esi,Stmp;
        mov ebx,j;
        xor eax,eax;
        xor ecx,ecx;
    loop_1:
        or bx,bx;
        jz exit_1;
        mov edi,edx;
    loop_2:
        cmp word ptr[esi],0xffff;
        jz exit_2;
        mov ax,[esi];
        add edi,eax;
        mov cx,[esi+2];
        add esi,4;
        shr ecx,2;
        jnc Next2;
        movsw;
    Next2:
        rep movsd;
        jmp loop_2;
    exit_2:
        add esi,2;
        add edx,size;
        dec bx;
        jmp loop_1;
    exit_1:
        pop es;
    };
}
This is as far as I've gotten (not sure if it's even correct):
while (j > 0)
{
    if (*stmp != 0xffff)
    {
    }
    ++stmp;
    dtmp += size;
    --j;
}
Any help is greatly appreciated. Thank you.
It saves/restores ES around setting it equal to DS so rep movsd will use the same segment for both load and store. That instruction is basically memcpy(edi, esi, ecx) but incrementing the pointers in EDI and ESI (by 4 * ecx). https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
In a flat memory model, you can totally ignore that. This code looks like it might have been written to run in 16-bit unreal mode, or possibly even real mode, hence the use of 16-bit registers all over the place.
Looks like it's loading some kind of records that tell it how many bytes to copy, and reading until the end of the record, at which point it looks for the next record there. There's an outer loop around that, looping through records.
The records look like this I think:
struct sprite_line {
    uint16_t skip_dstbytes, src_bytes;
    uint16_t src_data[]; // flexible array member, actual size unlimited but assumed to be a multiple of 2.
};
The inner loop is this:
;; char *dstp; // in EDI
;; struct sprite_line *p // in ESI
loop_2:
cmp word ptr[esi],0xffff ; while( p->skip_dstbytes != (uint16_t)-1 ) {
jz exit_2;
mov ax,[esi]; ; EAX was xor-zeroed earlier; some old CPUs maybe had slow movzx loads
add edi,eax; ; dstp += p->skip_dstbytes;
mov cx,[esi+2]; ; bytelen = p->src_bytes;
add esi,4; ; advance past the header to p->src_data
shr ecx,2; ; length in dwords = bytelen >> 2
jnc Next2;
movsw; ; one 16-bit (word) copy if bytelen >> 1 is odd, i.e. if last bit shifted out was a 1.
; The first bit shifted out isn't checked, so size is assumed to be a multiple of 2.
Next2:
rep movsd; ; copy in 4-byte chunks
Old CPUs (before Ivy Bridge) ran rep movsd faster than rep movsb; otherwise this code could just have used rep movsb for the whole copy.
or bx,bx;
jz exit_1;
That's an obsolete idiom dating back to the 8080: or bx,bx sets flags the same way as test bx,bx, and the jz here jumps out if BX was zero. So it's a while( bx != 0 ) {} loop, with a dec bx in it. It's an inefficient way to write a while (--bx) loop; a compiler would put a dec/jnz .top_of_loop at the bottom, with a test once outside the loop in case it needs to run zero times. See: Why are loops always compiled into "do...while" style (tail jump)?
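For contrast, a minimal sketch of the do-while shape a compiler would emit for the same outer loop (same registers as above):
    test bx,bx          ; run-zero-times check, done once outside the loop
    jz   exit_1
loop_1:
    ; ... loop body ...
    dec  bx
    jnz  loop_1         ; loop while --bx != 0
exit_1: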
Some people would say the or bx,bx / jz version is what a while loop looks like in asm, if they're picturing a totally naive translation from C to asm.
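Putting the pieces together, here is a hedged C++ rendering of the whole routine. The field names come from the sprite_line struct guessed above, so they are assumptions rather than the game's real names, and it assumes yl fits in 16 bits the way the asm's use of BX does:
#include <cstdint>
#include <cstring>

void Function(int x, int y, int yl, void* Stmp, void* dest)
{
    const unsigned long size = 1280 * 2;            // bytes per destination row
    char* row = (char*)dest + y * size + (x * 2);   // Dtmp in the asm
    const unsigned char* src = (const unsigned char*)Stmp;

    for (unsigned j = (uint16_t)yl; j != 0; --j) {  // outer loop_1, counted in BX
        char* dstp = row;                           // mov edi,edx
        for (;;) {                                  // inner loop_2
            uint16_t skip;
            std::memcpy(&skip, src, 2);             // p->skip_dstbytes
            if (skip == 0xFFFF)                     // end-of-line marker
                break;
            dstp += skip;                           // add edi,eax
            uint16_t len;
            std::memcpy(&len, src + 2, 2);          // p->src_bytes
            src += 4;                               // step past the record header
            size_t n = len & ~1u;                   // movsw + rep movsd copy len rounded
            std::memcpy(dstp, src, n);              //   down to a multiple of 2 bytes
            dstp += n;
            src += n;
        }
        src += 2;                                   // step over the 0xFFFF terminator
        row += size;                                // add edx,size: next destination row
    }
}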

finding average, min and max in assembly

I'm having a problem finding the average, min, and max of an array in assembly language. I created a simple array with C++ and created a test.asm file to pass it through. I figured out the average, but now it's the min and max I can't seem to figure out.
#include <iostream>
using namespace std;

extern "C"
int test(int*, int);

int main()
{
    const int SIZE = 7;
    int arr[SIZE] = { 1,2,3,4,5,6,7 };
    int val = test(arr, SIZE);
    cout << "The function test returned: " << val << endl;
    return 0;
}
This is my test.asm. It adds all the values, divides by the count, and returns 4 (the average).
.686
.model flat
.code
_test PROC              ; named _test because C automatically prepends an underscore; it is needed to interoperate
    push ebp
    mov ebp,esp         ; stack pointer to ebp
    mov ebx,[ebp+8]     ; address of first array element
    mov ecx,[ebp+12]
    mov ebp,0
    mov edx,0
    mov eax,0
loopMe:
    cmp ebp,ecx
    je allDone
    add eax,[ebx+edx]
    add edx,4
    add ebp,1
    jmp loopMe
allDone:
    mov edx,0
    div ecx
    pop ebp
    ret
_test ENDP
END
I am still trying to figure out how to find the min, since the max will be done in a similar way. I assume you use cmp to compare values, but everything I've tried so far hasn't been successful. I'm fairly new to assembly language and it's hard for me to grasp. Any help is appreciated.
Any help is appreciated
OK, so I will show you a refactored average function, even if you didn't ask for it directly. :)
Things you can learn from this:
simplified function prologue/epilogue when ebp is not modified in the code
the input array holds 32b int values, so for a correct average you should calculate a 64b sum and do a signed 64b division
subtle "tricks": using xor to zero a register, and using inc to add +1 (smaller code size)
handling a zero-sized array by returning a fake average of 0 (no crash)
addition of two 64b values composed from 32b registers/instructions
counting a human "index" (+1 => a direct cmp with size is possible), yet addressing 32b values (hence the *4 in the addressing)
renamed to getAverage
BTW, this is not optimized for performance; I tried to keep the source "simple", so it's easy to read and understand what it is doing.
_getAverage PROC
    ; avoiding `ebp` usage, so no need to save/set it
    ; ebx/esi/edi are callee-saved in cdecl, so preserve those
    push ebx
    push esi
    push edi
    mov ebx,[esp+16]    ; address of first array element
    mov ecx,[esp+20]    ; size of array
    xor esi,esi         ; array index 0
    ; 64b sum (edx:eax) = 0
    xor eax,eax
    cdq
    ; test for invalid input (zero sized array)
    jecxz zeroSizeArray ; arguments validation, returns 0 for 0 size
    ; here "0 < size", so no "index < size" test is needed for the first element
    ; "do { ... } while(index < size);" loop variant
sumLoop:
    ; sign-extend the value from array[esi] to 64b (edi = upper 32b)
    mov edi,[ebx+esi*4]
    sar edi,31
    ; edx:eax += edi:array[esi] (64b array value added to 64b sum)
    add eax,[ebx+esi*4]
    adc edx,edi
    ; next index, loop while index < size
    inc esi
    cmp esi,ecx
    jb sumLoop
    ; divide the 64b sum of integers by "size" to get the average value
    idiv ecx            ; signed (!) division (input array is signed "int")
    ; can't overflow (Divide-error), as the sum was accumulated
    ; from 32b values only, so EAX contains the full correct result
zeroSizeArray:
    pop edi
    pop esi
    pop ebx
    ret
_getAverage ENDP
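The min (and max) part of the question isn't covered above, so here is a minimal sketch in the same style. It is untested, assumes at least one element, and would be declared on the C++ side as extern "C" int findMin(int*, int); for max, swap jle for jge:
_findMin PROC
    push esi            ; esi is callee-saved in cdecl
    mov esi,[esp+8]     ; address of first array element
    mov ecx,[esp+12]    ; element count (assumed >= 1)
    mov eax,[esi]       ; current minimum = first element
    mov edx,1           ; index of the next element to examine
nextElem:
    cmp edx,ecx
    jae allScanned      ; all elements examined
    cmp eax,[esi+edx*4]
    jle keepCurrent     ; current min <= element: keep it
    mov eax,[esi+edx*4] ; found a smaller element: new minimum
keepCurrent:
    inc edx
    jmp nextElem
allScanned:
    pop esi
    ret                 ; minimum is returned in EAX
_findMin ENDP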

Can someone explain the meaning of malloc(20 * c | -(20 * (unsigned __int64)(unsigned int)c >> 32 != 0))

In decompiled code generated by IDA I see expressions like:
malloc(20 * c | -(20 * (unsigned __int64)(unsigned int)c >> 32 != 0))
malloc(6 * n | -(3 * (unsigned __int64)(unsigned int)(2 * n) >> 32 != 0))
Can someone explain the purpose of these calculations?
c and n are int (signed integer) values.
Update.
Original C++ code was compiled with MSVC for 32-bit platform.
Here's the assembly code for the second line of the decompiled C code above (the malloc(6 * ...) one):
mov ecx, [ebp+pThis]
mov [ecx+4], eax
mov eax, [ebp+pThis]
mov eax, [eax]
shl eax, 1
xor ecx, ecx
mov edx, 3
mul edx
seto cl
neg ecx
or ecx, eax
mov esi, esp
push ecx ; Size
call dword ptr ds:__imp__malloc
I'm guessing that the original source code used the C++ new operator to allocate an array and was compiled with Visual C++. As user3528438's answer indicates, this code is meant to prevent overflows. Specifically, it's a 32-bit unsigned saturating multiply: if the result of the multiplication would be greater than 4,294,967,295, the maximum value of a 32-bit unsigned number, the result is clamped, or "saturated", to that maximum.
Since Visual Studio 2005, Microsoft's C++ compiler has generated code to protect against overflows. For example, I can generate assembly code that could be decompiled into your examples by compiling the following with Visual C++:
#include <stdlib.h>

void *
operator new[](size_t n) {
    return malloc(n);
}

struct S {
    char a[20];
};

struct T {
    char a[6];
};

void
foo(int n, S **s, T **t) {
    *s = new S[n];
    *t = new T[n * 2];
}
Which, with Visual Studio 2015's compiler generates the following assembly code:
mov esi, DWORD PTR _n$[esp]
xor ecx, ecx
mov eax, esi
mov edx, 20 ; 00000014H
mul edx
seto cl
neg ecx
or ecx, eax
push ecx
call _malloc
mov ecx, DWORD PTR _s$[esp+4]
; Line 19
mov edx, 6
mov DWORD PTR [ecx], eax
xor ecx, ecx
lea eax, DWORD PTR [esi+esi]
mul edx
seto cl
neg ecx
or ecx, eax
push ecx
call _malloc
Most of the decompiled expression is actually meant to handle just one assembly statement. The assembly instruction seto cl sets CL to 1 if the previous MUL instruction overflows, otherwise it sets CL to 0. Similarly the expression 20 * (unsigned __int64)(unsigned int)c >> 32 != 0 evaluates to 1 if the result of 20 * c overflows, and evaluates to 0 otherwise.
If this overflow protection wasn't there and the result of 20 * c did actually overflow then the call to malloc would probably succeed, but allocate much less memory than the program intended. The program would then likely write past the end of the memory actually allocated and trash other bits of memory. This would amount to a buffer overrun, one that could be potentially exploited by hackers.
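For readability, the saturation idiom can be written out as a standalone helper. This is a sketch in portable C++ of what the decompiled expression computes, assuming the original build's 32-bit size_t:
#include <cstdint>

// Returns a * b, clamped to UINT32_MAX if the full product overflows 32 bits.
uint32_t mul_sat32(uint32_t a, uint32_t b)
{
    uint64_t wide = (uint64_t)a * b;                // full 64-bit product (the MUL)
    uint32_t mask = -(uint32_t)((wide >> 32) != 0); // 0xFFFFFFFF on overflow, else 0 (SETO/NEG)
    return (uint32_t)wide | mask;                   // the OR that saturates the result
}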
Since this code is decompiled from ASM, we can only guess what it actually does.
Let's first format it to figure out the precedence:
malloc(20 * c | -(20 * (unsigned __int64)(unsigned int)c >> 32 != 0))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//this is first evaluated, promoting c to
//64 bit unsigned int without doing sign
//extension, regardless of the type of c
malloc(20 * c | -(20 * (uint64_t)c >> 32 != 0))
^^^^^^^^^^^^^^^^
//then, multiply by 20, with uint64 result
malloc(20 * c | -(20 * (uint64_t)c >> 32 != 0))
^^^^^^^^^^^^^^^^^^^^^^^^^^^
//if 20c is greater than 2^32-1, then result is true,
//use -1 to generate a mask of 0xffffffff,
//bitwise operator | then masks 20c to 0xffffffff
//(2^32-1, the maximum of size_t, input type to malloc)
//regardless what 20c actually is
//if 20c is smaller than 2^32-1, then result is false,
//the mask is 0, bitwise operator | keeps the final
//input to malloc as 20c untouched
What are 20 and 6?
Those probably come from the common usage of
malloc(sizeof(Something)*count). Those two calls to malloc are probably made with sizeof(Something) and sizeof(SomethingElse) evaluated to 20 and 6 at compile time.
So what this code actually does:
My guess: it's trying to prevent sizeof(Something)*count from overflowing, which would let the malloc succeed and lead to a buffer overflow when the memory is used.
It does this by evaluating the product as a 64-bit unsigned int and testing it against 2^32-1: when the size is greater than 2^32-1, the input to malloc is set to a very large value that is guaranteed to fail (no 32-bit system can allocate 2^32-1 bytes of memory).
Can someone explain the purpose of these calculations?
It is important to understand that compiling changes the semantic meaning of code. Much unspecified behavior of the original code becomes specified by the compilation process.
IDA has no idea whether things the generated assembly code just happens to do are important or not. To be safe, it tries to perfectly replicate the behavior of the assembly code, even in cases that cannot possibly happen given the way the code is used.
Here, IDA is probably replicating the overflow characteristics that the conversion of types just happens to have on this platform. It can't just replicate the original C code because the original C code likely had unspecified behavior for some values of c or n, likely negative ones.
For example, say I write this C code: int f(unsigned j) { return j; }. My compiler will likely turn that into very simple assembly code giving whatever behavior for negative values of j that my platform just happens to give.
But if you decompile the generated assembly, you cannot decompile it to int f(unsigned j) { return j; } because that will not behave the same as my assembly code did on platforms with different overflow behavior. It could compile to code (on other platforms) that returns different values than my assembly code does for negative values of j.
So it is often literally impossible (in fact, incorrect) to decompile assembly back into the original C code; the result will often have these kinds of "portably replicate this platform's behavior" oddities.
My first thought was that it's rounding up to the nearest block size; forgive me, it isn't. What it's actually doing is calculating a multiple of c while simultaneously checking for overflow (a negative value):
#include <iostream>
#include <cstdint>

size_t foo(char c)
{
    return 20 * c | -(20 * (std::uint64_t)(unsigned int)c >> 32 != 0);
}

int main()
{
    using namespace std;
    for (char i = -4 ; i < 4 ; ++i)
    {
        cout << "input is: " << int(i) << ", result is " << foo(i) << endl;
    }
    return 0;
}
results:
input is: -4, result is 18446744073709551615
input is: -3, result is 18446744073709551615
input is: -2, result is 18446744073709551615
input is: -1, result is 18446744073709551615
input is: 0, result is 0
input is: 1, result is 20
input is: 2, result is 40
input is: 3, result is 60
To me the number 18446744073709551615 doesn't mean much at a glance. Only after seeing it expressed in hex did I go "ah". – Jongware
adding << hex:
input is: -1, result is ffffffffffffffff

x86 Intel Assembly and C++ - Stack around array corrupted

Error:
Run-Time Check Failure #2 - Stack around the variable 'arr' was corrupted.
This seems to be a common error on this forum; however, I was unable to find a question that had assembly code mixed into it. Basically, my program converts decimal to binary (16-bit representation). After completing the coding, everything seems to compute correctly and convert the decimal to binary without an issue; however, after the "Press any key to continue . . .", the error above pops up.
I do not believe the C++ code is causing the issue as it is very basic, and is there only to invoke the assembly function.
Again, the computation is correct as the program will produce the correct conversion (i.e: Decimal = 10, Binary Conversion: 0000000000001010), but just giving me the error at the end of the program.
C++ Code:
#include <iostream>
using namespace std;
extern"C" void decToBin(char[], int, int);
int main()
{
    //Initialize array and variables
    const int SIZE = 16;
    char arr[SIZE] = { NULL };
    int dec = 0;

    //Ask user for integer that they want to convert
    cout << "Please enter integer you want to convert to binary: ";
    cin >> dec;

    //Assembly function to convert integer
    decToBin(arr, dec, SIZE);
    cout << "The 16-bit binary representation of " << dec << " is: ";

    //Display the 16-bit binary conversion
    for (int i = 0; i < SIZE; i++)
        cout << arr[i];
    cout << endl;

    system("PAUSE");
    return 0;
}
Assembly Code:
.686
.model flat
.code
_decToBin PROC          ;Start of project
start:
    push ebp
    mov ebp,esp         ;Stack pointer to ebp
    mov eax,[ebp+8]     ;Address of first array element
    mov cx,[ebp+12]     ;Integer number being passed - copying onto 16 bit register
    mov edx,[ebp+16]    ;Size of array
loopme:                 ;Loop to fill in array
    mov ebx,0           ;Initializes ebx to store carry flag after shift
    cmp edx,0           ;Compare edx with 0 to see if we should continue
    je alldone
    shl cx,1            ;Shift the value to the left
    adc ebx,0           ;Check carry flag and add 1 if CF(CY) is set to 1 and stay at 0 if CF(CY) is 0
    add ebx,48          ;Since array is CHAR, adding 48 will give correct 0 or 1 instead of null
    mov [eax],ebx       ;Copy the 0's or 1's into the array location
    dec edx             ;Decrement the counter
    inc eax             ;Move the array up an index
    jmp loopme
alldone:
    pop ebp
    ret
_decToBin ENDP
END
I have no assembler to compile your code, but you write 32-bit values into a char[] at this line:
mov [eax],ebx ;Copy the 0's or 1's into the array location
So, the last write will update the memory locations arr[SIZE-1] to arr[SIZE+2].
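A minimal fix, as an untested sketch: store only the low byte of EBX so each digit occupies exactly one char:
    mov [eax],bl        ;Copy a single '0' or '1' character into the array
Since EAX holds the array pointer and BL is the low byte of EBX, only arr[0] through arr[SIZE-1] are written, and the stack memory after arr stays intact.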

Array to C++ function(using ASM) read error

I've got a problem. I pass a pointer to a function that replaces the last 3 elements of an array with the first 3. I have to use an unsigned char array, and it should work with ASM.
void AsmFlipVertical(unsigned char *arr); // forward declaration so main can call it

int main(int argc, char* argv[])
{
    unsigned char arr[24] = {
        1,2,3,4,5,6,
        7,8,9,10,11,12,
        13,14,15,16,17,18,
        19,20,21,22,23,24
    }; // example
    AsmFlipVertical(arr);
}

void AsmFlipVertical(unsigned char *arr)
{
    _asm
    {
        les esi,arr     ; esi = address of first elem
        mov eax,esi
        add eax,21
        mov edi,eax     ; edi = address of first elem + 21 (arr[21])
        cld
        mov ecx,3
        rep movsb
    }
}
movsb from esi to edi
I've got an error at "rep movsb". What's wrong? If I use this ASM code in the main function it's okay, but I have to use the ASM code in a function...
You should not use any instructions that affect the segment registers in flat memory models. So, replace les esi,arr with mov esi,arr.
The les esi, arr instruction is wrong (you don't want to change the ES register too). You should just use mov esi, arr.
(tested - works)
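Putting both answers together, the corrected function looks like this; a minimal sketch applying that fix, assuming 32-bit MSVC inline asm as in the question:
void AsmFlipVertical(unsigned char *arr)
{
    _asm
    {
        mov esi,arr     ; flat model: the whole pointer fits in one 32-bit register
        mov edi,esi
        add edi,21      ; edi = address of arr[21], the last 3 elements
        cld             ; copy forward
        mov ecx,3
        rep movsb       ; copy arr[0..2] over arr[21..23]
    }
}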