I have to check whether K-th bit of a number is set or not.
Example what I mean:
Input: N = 4, K = 0
Output: false
Explanation: Binary representation of 4 is b'100, in which the 0th bit from LSB is not set and therefore it returns false.
This code does not work when the condition includes the not-equal comparison:
// Intended to report whether bit k of n is set.
// NOTE: != binds more tightly than binary &, so the condition below is parsed
// as n & ((1 << k) != 0); for a valid k the comparison is 1, so this tests
// bit 0 of n instead of bit k.
bool checkKthBit(int n, int k)
{
if( n & (1 << k) != 0)
return true;
else
return false;
}
However, after removing the not equal operator the code works perfectly fine:
/* Returns true when bit k of n (counted from the LSB, starting at 0) is set. */
bool checkKthBit(int n, int k)
{
    const int mask = 1 << k;

    /* A non-zero AND result means the selected bit is present in n. */
    if (n & mask) {
        return true;
    }
    return false;
}
How is this happening?
If you put the equation into brackets it does what you want:
https://godbolt.org/z/enTcjhrzT
// Unparenthesised form: parsed as n & ((1<<k) != 0), because != binds more
// tightly than binary &.
bool checkA(int n, int k) {
if(n&(1<<k)!=0) return true; else return false;
}
// Mask test wrapped in parentheses, then compared against zero explicitly.
bool checkB(int n, int k) {
if((n&(1<<k))!=0) return true; else return false;
}
// Mask test wrapped in parentheses, relying on if() treating non-zero as true.
bool checkC(int n, int k) {
if((n&(1<<k))) return true; else return false;
}
This produces the following assembly. Notice that checkB and checkC are identical (correct brackets mean correct precedence, so the != operator can stay), whereas checkA evaluates the inequality first, ANDs that result with n, and uses that value for the return.
checkA(int, int):
mov eax, 1
mov ecx, esi
sal eax, cl
test eax, eax
setne al
movzx eax, al
and eax, edi
ret
checkB(int, int):
mov ecx, esi
sar edi, cl
mov eax, edi
and eax, 1
ret
checkC(int, int):
mov ecx, esi
sar edi, cl
mov eax, edi
and eax, 1
ret
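See the C precedence table, where binary (bitwise) & is precedence 8 (left to right) and != is precedence 7 (left to right), so != binds more tightly than &. (The & listed at precedence 2, right to left, is the unary address-of operator.)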
Reference from: https://en.cppreference.com/w/c/language/operator_precedence
If you are using C++, notice that the numbering in the precedence table is different (bitwise & is 11 and != is 10), but the relative order is the same, so != still binds more tightly than &:
Reference from: https://en.cppreference.com/w/cpp/language/operator_precedence
If I would assume GCC as the toolchain, if you do not provide any dialect then the default will be used:
The default, if no C language dialect options are given, is
-std=gnu17.
Or for C++:
The default, if no C++ language dialect options are given, is
-std=gnu++17.
GCC reference: https://gcc.gnu.org/onlinedocs/gcc/Standards.html
Broadly speaking, it's better to use more parentheses rather than fewer. Instead of depending on the precedence order and hoping that you and everybody who reads the code understand its caveats, it's considered safer to make the order explicit:
https://softwareengineering.stackexchange.com/questions/201175/should-i-use-parentheses-in-logical-statements-even-where-not-necessary
Related
I've been struggling trying to convert this assembly code to C++ code.
It's a function from an old game that takes pixel data Stmp, and I believe it places it to destination void* dest
// Copies yl rows of packed 16-bit data from Stmp into dest using inline x86
// string moves.
//
// The first destination row starts at byte offset y * size + x * 2 inside
// dest; each following row starts `size` bytes further on. Every source row
// is a sequence of records made of two 16-bit words (a destination skip in
// bytes and a byte count) followed by the data to copy, and the row ends at a
// word equal to 0xffff.
// NOTE(review): size = 1280 * 2 looks like a destination pitch of 1280 pixels
// at 2 bytes per pixel — confirm against the caller.
void Function(int x, int y, int yl, void* Stmp, void* dest)
{
unsigned long size = 1280 * 2;
unsigned long j = yl;
void* Dtmp = (void*)((char*)dest + y * size + (x * 2));
_asm
{
// Make ES equal to DS for the movs instructions below; ES is restored at exit_1.
push es;
push ds;
pop es;
// EDX = start of the current destination row, ESI = source cursor,
// EBX = remaining row count (only BX is tested and decremented).
mov edx,Dtmp;
mov esi,Stmp;
mov ebx,j;
// Clear EAX/ECX so the 16-bit loads into AX/CX below are zero-extended.
xor eax,eax;
xor ecx,ecx;
loop_1:
// Outer loop: leave when no rows remain.
or bx,bx;
jz exit_1;
mov edi,edx;
loop_2:
// Inner loop: process records until the 0xffff end-of-row marker.
cmp word ptr[esi],0xffff;
jz exit_2;
// First word: number of bytes to skip in the destination.
mov ax,[esi];
add edi,eax;
// Second word: number of bytes to copy; the data starts 4 bytes in.
mov cx,[esi+2];
add esi,4;
// ECX = count / 4; the carry holds bit 1 of the count, i.e. whether one
// extra 16-bit word has to be copied. Bit 0 of the count is ignored.
shr ecx,2;
jnc Next2;
movsw;
Next2:
rep movsd;
jmp loop_2;
exit_2:
// Step over the 0xffff marker and move to the next destination row.
add esi,2;
add edx,size;
dec bx;
jmp loop_1;
exit_1:
pop es;
};
}
That's where I've gotten as far to: (Not sure if it's even correct)
// Partial translation of the outer loop: runs while j is non-zero, advancing
// dtmp by size and decrementing j on every pass.
// NOTE(review): the declarations of stmp and dtmp are not shown here.
while (j > 0)
{
// NOTE(review): the assembly keeps looping here (jmp loop_2) until it reads
// 0xffff; an `if` executes its body at most once, and the body is still empty.
if (*stmp != 0xffff)
{
}
// NOTE(review): the assembly advances the source by 2 bytes at this point;
// ++stmp matches only if stmp points to a 16-bit type — confirm.
++stmp;
dtmp += size;
--j;
}
Any help is greatly appreciated. Thank you.
It saves / restores ES around setting it equal to DS so that rep movsd uses the same segment for its loads (DS:ESI) and its stores (ES:EDI). That instruction is basically memcpy(edi, esi, 4 * ecx), but it also increments the pointers in EDI and ESI (by 4 * ecx). https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
In a flat memory model, you can totally ignore that. This code looks like it might have been written to run in 16-bit unreal mode, or possibly even real mode, hence the use of 16-bit registers all over the place.
It looks like it's loading some kind of records that tell it how many bytes to copy, and reading until the end of the record, at which point it looks for the next record there. There's an outer loop around that, looping through records.
The records look like this I think:
// One record as consumed by the inner copy loop. A row is a sequence of these
// records, ended by a 16-bit 0xffff where the next skip_dstbytes would be.
struct sprite_line {
// skip_dstbytes: bytes added to the destination pointer before copying.
// src_bytes: number of bytes of src_data that follow and get copied.
uint16_t skip_dstbytes, src_bytes;
uint16_t src_data[]; // flexible array member, actual size unlimited but assumed to be a multiple of 2.
};
The inner loop is this:
;; char *dstp; // in EDI
;; struct sprite_line *p; // in ESI
loop_2:
cmp word ptr[esi],0xffff ; while( p->skip_dstbytes != (uint16_t)-1 ) {
jz exit_2;
mov ax,[esi]; ; EAX was xor-zeroed earlier; some old CPUs maybe had slow movzx loads
add edi,eax; ; dstp += p->skip_dstbytes;
mov cx,[esi+2]; ; bytelen = p->src_bytes;
add esi,4; ; ESI now points at p->src_data
shr ecx,2; ; length in dwords = bytelen >> 2
jnc Next2;
movsw; ; one 16-bit (word) copy if bytelen >> 1 is odd, i.e. if last bit shifted out was a 1.
; The first bit shifted out isn't checked, so size is assumed to be a multiple of 2.
Next2:
rep movsd; ; copy in 4-byte chunks
Old CPUs (before IvyBridge) had rep movsd faster than rep movsb, otherwise this code could just have done that.
or bx,bx;
jz exit_1;
That's an obsolete idiom that comes from 8080 for test bx,bx / jz, i.e. jump if BX was zero. So it's a while( bx != 0 ) {} loop. With dec bx in it. It's an inefficient way to write a while (--bx) loop; a compiler would put a dec/jnz .top_of_loop at the bottom, with a test once outside the loop in case it needs to run zero times. Why are loops always compiled into "do...while" style (tail jump)?
Some people would say that's what a while loop looks like in asm, if they're picturing totally naive translation from C to asm.
How much will it affect the performance if I use:
n>>1 instead of n/2
n&1 instead of n%2!=0
n<<3 instead of n*8
n++ instead of n+=1
and so on...
and if it does increase the performance then please explain why.
Any half decent compiler will optimize the two versions into the same thing. For example, GCC compiles this:
// Halving an unsigned value: division by 2 versus a right shift by 1.
unsigned int half1(unsigned int n) { return n / 2; }
unsigned int half2(unsigned int n) { return n >> 1; }
// Oddness test: remainder modulo 2 versus masking the lowest bit
// (the int result is converted to bool, so any non-zero value is true).
bool parity1(int n) { return n % 2; }
bool parity2(int n) { return n & 1; }
// Multiplying by 8: multiplication versus a left shift by 3.
int mult1(int n) { return n * 8; }
int mult2(int n) { return n << 3; }
// Incrementing through a reference: compound assignment versus post-increment.
void inc1(int& n) { n += 1; }
void inc2(int& n) { n++; }
to
half1(unsigned int):
mov eax, edi
shr eax
ret
half2(unsigned int):
mov eax, edi
shr eax
ret
parity1(int):
mov eax, edi
and eax, 1
ret
parity2(int):
mov eax, edi
and eax, 1
ret
mult1(int):
lea eax, [0+rdi*8]
ret
mult2(int):
lea eax, [0+rdi*8]
ret
inc1(int&):
add DWORD PTR [rdi], 1
ret
inc2(int&):
add DWORD PTR [rdi], 1
ret
One small caveat is that in the first example, if n could be negative (in case that it is signed and the compiler can't prove that it's nonnegative), then the division and the bitshift are not equivalent and the division needs some extra instructions. Other than that, compilers are smart and they'll optimize operations with constant operands, so use whichever version makes more sense logically and is more readable.
Strictly speaking, in most cases, yes.
This is because bit manipulation is a simpler operation for CPUs to perform: the circuitry in the ALU is much simpler and requires fewer discrete steps (clock cycles) to complete the operation.
As others have mentioned, any compiler worth a damn will automatically detect constant operands to certain arithmetic operations with bitwise analogs (like those in your examples) and will convert them to the appropriate bitwise operations under the hood.
Keep in mind that if the constant operand is replaced by a runtime value (for example n / d instead of n / 2), the compiler cannot make this substitution.
my code has a lot of patterns like
int a, b.....
bool c = x ? a >= b : a <= b;
and similarly for other inequality comparison operators. Is there a way to write this to achieve better performance/branchlessness for x86.
Please spare me the "Have you benchmarked your code? Is this really your bottleneck?" type of comment. I am asking for other ways to write this so that I can benchmark and test them.
EDIT:
bool x
Original expression:
x ? a >= b : a <= b
Branch-free equivalent expression without short-circuit evaluation:
!!x & a >= b | !x & a <= b
This is an example of a generic pattern without resorting to arithmetic trickery. Watch out for operator precedence; you may need parentheses for more complex examples.
Another way would be :
bool c = (2*x - 1) * (a - b) >= 0;
This generates a branch-less code here: https://godbolt.org/z/1nAp7G
#include <stdbool.h>
// Branch-free form of x ? a >= b : a <= b.
// (2*x - 1) is +1 when x is true and -1 when x is false, so the sign of
// (a - b) is either kept or flipped before the >= 0 test.
// NOTE(review): a - b and the product are int arithmetic and can overflow for
// operands of large magnitude.
bool foo(int a, int b, bool x)
{
return (2*x - 1) * (a - b) >= 0;
}
------------------------------------------
foo:
movzx edx, dl
sub edi, esi
lea eax, [rdx-1+rdx]
imul eax, edi
not eax
shr eax, 31
ret
Since you're just looking for equivalent expressions, this comes from patching #AlexanderZhang's comment:
(a==b) || (x != (a<b)) // x is the selector flag; true when a == b, otherwise a > b for x and a < b for !x
The way you currently have it is possibly unbeatable.
But for positive integral a and b and bool x you can use
a / b * x + b / a * !x
(You could adapt this, at the cost of extra cpu burn, by replacing a with a + 1 and similarly for b if you need to support zero.)
If a>=b, a-b will be positive and the first bit(sign bit) is 0. Otherwise a-b is negative and first bit is 1.
So we can simply "xor" the first bit of a-b and the value of x
// Index of the sign bit of an int (assumes 8-bit bytes).
constexpr auto shiftBit = sizeof(int)*8-1;
// Returns x XOR "a - b is negative".
// NOTE(review): for x == false and a == b this yields false, whereas
// x ? a >= b : a <= b yields true. a - b can also overflow, and >> on a
// negative int is only guaranteed to be an arithmetic shift since C++20.
bool foo(bool x, int a, int b){
return x ^ bool((a-b)>>shiftBit);
}
foo(bool, int, int):
sub esi, edx
mov eax, edi
shr esi, 31
xor eax, esi
ret
C++: How do i check if a character is between a given range of characters?
Say, if I have a string name.
I want to check if the first character of this string is between 'a' to 'n'.
How do I do it?
To do (name[0] == 'a') (name[0] == 'b')... would be too long...
If possible, I would like a solution that deals with ASCII values elegantly.
If you want to check whether or not the first character of your string is between 'a' and 'n', for instance, checking name[0] >= 'a' && name[0] <= 'n' should do the job properly.
Keep in mind, however, that if the first character of your string can also be a capital letter, you have to check (name[0] >= 'a' && name[0] <= 'n') || (name[0] >= 'A' && name[0] <= 'N') instead.
You can use std::all_of in combination with a lambda expression:
std::all_of(name.begin(), name.end(), [](char i) { return (i >= 'a' && i <= 'z'); });
Live demo
This is portable enough for most applications, since the character set is usually implemented following the ASCII conventions, as explained in §2.3/14:
The glyphs for the members of the basic source character set are intended to identify characters from the subset of ISO/IEC 10646 which corresponds to the ASCII character set. However, because the mapping from source file characters to the source character set (described in translation phase 1) is specified as implementation-defined, an implementation is required to document how the basic source characters are represented in source files.
The complexity of the above algorithm is O(n). The alternative (check every character to be one in the character range with k characters) is O(n*k), but at least you can be sure it's not implementation defined.
If you're sure the used character set on your platform(s) is ASCII, you can use something like :
if (std::all_of(name.begin(), name.end(), [](char c){return ((c >= 'a') && (c <= 'n'));}) ) {
// name contains only characters between 'a' and 'n' inclusive
}
Otherwise, something like this should do the trick :
if (name.find_first_not_of("abcdefghijklmn") == std::string::npos) {
// name contains only characters between 'a' and 'n' inclusive
}
An old fashioned portable method:
// Reports whether the letter c lies in the inclusive alphabetical range
// [range_start, range_end], without assuming that letters have contiguous
// character codes.
//
// c is lower-cased before the lookup. range_start and range_end are matched
// exactly as given, so they must be lower-case letters 'a'..'z'.
// Returns false when any of the three characters is not a letter of the
// alphabet, and (as before) when range_end does not come strictly after
// range_start.
bool is_in_range(char range_start, char range_end, char c)
{
    static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";
    // sizeof counts the terminating NUL; leave it out so '\0' never matches.
    const unsigned int letter_count = sizeof(alphabet) - 1;
    // Sentinel for "not found": previously a missing character silently
    // defaulted to position 0 and was treated as 'a'.
    const unsigned int not_found = letter_count;

    unsigned int start_position = not_found;
    unsigned int end_position = not_found;
    unsigned int character_position = not_found;

    // std::tolower requires an argument representable as unsigned char.
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));

    for (unsigned int i = 0; i < letter_count; ++i)
    {
        if (range_start == alphabet[i])
        {
            start_position = i;
        }
        if (range_end == alphabet[i])
        {
            end_position = i;
        }
        if (c == alphabet[i])
        {
            character_position = i;
        }
    }

    if (start_position == not_found || end_position == not_found
        || character_position == not_found)
    {
        return false;
    }
    if (end_position <= start_position)
    {
        return false;
    }
    return (character_position >= start_position)
        && (character_position <= end_position);
}
Loop through the string, check every character, and see whether it stays between a and n (inclusive) using str[i]>='a' and str[i]<='n'
For a contiguous range of characters you can:
/* Returns non-zero when start <= c < end (half-open range), using a single
 * unsigned comparison: values below start wrap around to large offsets. */
_Bool isbetween(int c, int start, int end){
    const unsigned offset = (unsigned)c - (unsigned)start;
    const unsigned width = (unsigned)(end - start);

    return offset < width;
}
To account for case, use tolower() and the lower case range:
/* ASCII-only lower-casing: sets bit 5 (value 32) when c is one of 'A'..'Z'
 * and returns every other value unchanged. */
static inline int tolower(int c){
    const unsigned is_upper = ((unsigned)c - 'A') < 26;

    return c | (int)(is_upper << 5);
}
//isbetween(tolower(x),'a','n');
For a non-contiguous range, you may need to create a mask. In this example, I will check for vowels (for brevity because there are only 5, but any combination in a range of 32 could be used or 64 with some modifications ...
in fact, a 64 bit mask on a 64 bit platform would eliminate the need for case handling).
/* Bit i is set when the letter 'a' + i is a vowel. */
static const unsigned vowel_mask = (1<<('a'-'a'))
|(1<<('e'-'a'))|(1<<('i'-'a'))|(1<<('o'-'a'))|(1<<('u'-'a'));
/* Returns non-zero when c is a vowel and 0 otherwise, without branching. */
int isvowel(int c){ //checks if c is a,A,e,E,i,I,o,O,u,U
    /* Fold case by setting bit 5, then take the offset from 'a'. Unsigned
     * arithmetic makes values below 'a' wrap around to large offsets. */
    unsigned x = ((unsigned)c | 32u) - 'a';
    /* Shifting by 32 or more is undefined, so the count is masked to 0..31.
     * For out-of-range x the shifted value is 0 and the result is still 0. */
    return (int)(((unsigned)(x < 32) << (x & 31u)) & vowel_mask);
}
Note that these implementations contain no branches; however the use of unsigned comparison may prevent automatic compiler vectorization (intel intrinsics, don't have unsigned compare) ... if that is your goal, you can use 2 &ed comparisons instead. This method may or may not work on non-ascii systems depending on the separation distance of the characters.
GCC
isvowel:
or edi, 32 # tmp95,
xor eax, eax # tmp97
sub edi, 97 # x,
cmp edi, 31 # x,
setbe al #, tmp97
shlx eax, eax, edi # tmp99, tmp97, x
and eax, 1065233 # tmp96,
ret
Clang
isvowel: # #isvowel
or edi, 32
add edi, -97
mov eax, 32
xor ecx, ecx
cmp edi, eax
setb cl
shlx eax, ecx, edi
and eax, 1065233
ret
ICC
isvowel:
xor eax, eax #15.26
or edi, 32 #14.23
add edi, -97 #14.27
cmp edi, 32 #15.26
setb al #15.26
shlx eax, eax, edi #15.23
and eax, 1065233 #15.26
ret #15.26
In addition to the standard stackoverflow license, this code is released to the Public Domain
Just out of curiosity. If I have something like:
if(x < 0)
x = 0;
if(x > some_maximum)
x = some_maximum;
return x;
Is there a way to not branch? This is c++.
Addendum: I mean no branch instructions in the assembly. It's a MIPS architecture.
There are bit-tricks to find the minimum or maximum of two numbers, so you could use those to find min(max(x, 0), some_maximum). From here:
y ^ ((x ^ y) & -(x < y)); // min(x, y)
x ^ ((x ^ y) & -(x < y)); // max(x, y)
As the source states though, it's probably faster to do it the normal way, despite the branch
This is going to be compiler- and processor-dependent, but if you use ?: it can be translated to a conditional move (at least on Intel-based processors) which does not use a branch.
x = x < 0 ? 0 : x;
x = x > max ? max : x;
This can use the CMOV instruction (see http://www.intel.com/software/products/documentation/vlin/mergedprojects/analyzer_ec/mergedprojects/reference_olh/mergedProjects/instructions/instruct32_hh/vc35.htm), whose purpose is to avoid branching (and thus branch prediction penalties).
Edit: this thread may be of interest to you. Benchmarks show that conditional moves will give you speed gains only on branches that are not very predictable, whereas highly predictable branches (such as that of a long-running loop) prefer the standard approach.
In C++17 you can use std::clamp
Defined in header <algorithm>
template<class T>
constexpr const T& clamp( const T& v, const T& lo, const T& hi ); (1) (since C++17)
template<class T, class Compare>
constexpr const T& clamp( const T& v, const T& lo, const T& hi, Compare comp ); (2) (since C++17)
If v compares less than lo, returns lo; otherwise if hi compares
less than v, returns hi; otherwise returns v. Uses operator< to
compare the values.
Same as (1), but uses comp to compare the values.
Using the ternary operator :)
return x < 0 ? 0 : x > some_maximum ? some_maximum : x; // clamps x to [0, some_maximum]
Depends on your architecture. For ARM, at least, the compiler would probably emit conditionally executed instructions and the resulting machine code wouldn't contain a branch. I can't think of a good way to make that explicit in the C program though.
If it's possible to limit to powers of 2 (non inclusive), then just go with
int newx = x & ((highest power of 2) - 1)
not quite sure to handle the (if x < 0 case) or the generic (x < n case)
For future problems like this, the bit hack page might be useful: http://graphics.stanford.edu/~seander/bithacks.html.
Since the bithack for min and max was already posted, here is a different one:
// CHAR_BIT is number of bits per byte.
// sign = 1 if x < 0, sign = 0 otherwise (according to the page above)
int sign = (int)((unsigned int)((int)x) >> (sizeof(int) * CHAR_BIT - 1));
int y = (1-sign)*x; // if x < 0, then y = 0, else y = x.
// Depending on arch, the below _might_ cause a branch.
// (on x64 it does not cause a branch, not sure about MIPS)
int z = !(y/some_maximum); // if 0 <= y < some_maximum, z = 1, else z = 0
int ret = z*y + (1-z)*some_maximum; // if z =1, then ret = y; else ret = some_maximum.
return ret;
I just tried it out, and it worked for the few test cases i had.
Here is the assembly code from my computer (intel arch) which shows no branches.
int cap(int x)
{
00F013A0 push ebp
00F013A1 mov ebp,esp
00F013A3 sub esp,0FCh
00F013A9 push ebx
00F013AA push esi
00F013AB push edi
00F013AC lea edi,[ebp-0FCh]
00F013B2 mov ecx,3Fh
00F013B7 mov eax,0CCCCCCCCh
00F013BC rep stos dword ptr es:[edi]
int some_maximum = 100;
00F013BE mov dword ptr [some_maximum],64h
// CHAR_BIT is number of bits per byte.
// sign = 1 if x < 0, sign = 0 otherwise (according to the page above)
int sign = (int)((unsigned int)((int)x) >> (sizeof(int) * CHAR_BIT - 1));
00F013C5 mov eax,dword ptr [x]
00F013C8 shr eax,1Fh
00F013CB mov dword ptr [sign],eax
int y = (1-sign)*x; // if x < 0, then y = 0, else y = x.
00F013CE mov eax,1
00F013D3 sub eax,dword ptr [sign]
00F013D6 imul eax,dword ptr [x]
00F013DA mov dword ptr [y],eax
// Depending on arch, the below _might_ cause a branch.
// (on x64 it does not cause a branch, not sure about MIPS)
int z = !(y/some_maximum); // if 0 <= y < some_maximum, z = 1, else z = 0
00F013DD mov eax,dword ptr [y]
00F013E0 cdq
00F013E1 idiv eax,dword ptr [some_maximum]
00F013E4 neg eax
00F013E6 sbb eax,eax
00F013E8 add eax,1
00F013EB mov dword ptr [z],eax
int ret = z*y + (1-z)*some_maximum; // if z =1, then ret = y; else ret = some_maximum.
00F013EE mov eax,dword ptr [z]
00F013F1 imul eax,dword ptr [y]
00F013F5 mov ecx,1
00F013FA sub ecx,dword ptr [z]
00F013FD imul ecx,dword ptr [some_maximum]
00F01401 add eax,ecx
00F01403 mov dword ptr [ret],eax
return ret;
00F01406 mov eax,dword ptr [ret]
}
00F01409 pop edi
00F0140A pop esi
00F0140B pop ebx
00F0140C mov esp,ebp
00F0140E pop ebp
00F0140F ret
x = min(max(x,0),100);
The branching is hidden away nicely inside functions with normal names.
Suggesting to create a clip_by template.
x = ((int)(x > some_maximum)) * some_maximum
+ ((int)(x > 0 && x <= some_maximum)) * x;