I have a fixed size boolean array of size 8. The default value of all elements in the array is false. There will be a number of truth values to fill between 1-8.
I want to distribute the truth values as far away from one another as possible. I also wish to be able to randomize the configuration. In this scenario the array wraps around so position 7 is "next to" position 0 in the array.
here are some examples for fill values. I didn't include all possibilities, but hopefully it gets my point across.
1: [1, 0, 0, 0, 0, 0, 0, 0] or [0, 1, 0, 0, 0, 0, 0, 0]
2: [1, 0, 0, 0, 1, 0, 0, 0] or [0, 1, 0, 0, 0, 1, 0, 0]
3: [1, 0, 0, 1, 0, 0, 1, 0] or [0, 1, 0, 0, 1, 0, 0, 1]
4: [1, 0, 1, 0, 1, 0, 1, 0] or [0, 1, 0, 1, 0, 1, 0, 1]
5: [1, 1, 0, 1, 1, 0, 1, 0]
6: [1, 1, 0, 1, 1, 1, 0, 1]
7: [1, 1, 1, 1, 1, 1, 1, 0]
8: [1, 1, 1, 1, 1, 1, 1, 1]
The closest solution I have come up with so far hasn't quite produced the results I'm looking for...
I seek to write it in c++ but here is a little pseudo-code of my algorithm so far...
not quite working out how I wanted
truths = randBetween(1, 8)
values = [0,0,0,0,0,0,0,0]
startPosition = randBetween(0, 7) //starting index
distance = 4
for(i = 0; i < truths; i++) {
pos = i + startPosition + (i * distance)
values[pos % 8] = 1
}
this is an example output from my current code. those marked with a star are incorrect.
[0, 0, 0, 0, 1, 0, 0, 0]
[0, 1, 0, 0, 1, 0, 0, 0]*
[0, 1, 0, 0, 1, 0, 1, 0]
[0, 1, 0, 1, 1, 0, 1, 0]*
[1, 1, 0, 1, 1, 0, 1, 0]
[1, 1, 0, 1, 1, 1, 1, 0]*
[1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1]
I'm looking for a simple way to distribute the truth values evenly throughout the array without having to code for special cases.
Check this out:
#include <cassert>
#include <vector>
#include <iostream>
#include <iomanip>
/**
* Generate an even spaced pattern of ones
* #param arr destination vector of ints
* #param onescnt the requested number of ones
*/
static inline
void gen(std::vector<int>& arr, size_t onescnt) {
const size_t len = arr.size();
const size_t zeroscnt = len - onescnt;
size_t ones = 1;
size_t zeros = 1;
for (size_t i = 0; i < len; ++i) {
if (ones * zeroscnt < zeros * onescnt) {
ones++;
arr[i] = 1;
} else {
zeros++;
arr[i] = 0;
}
}
}
static inline
size_t count(const std::vector<int>& arr, int el) {
size_t cnt = 0;
for (size_t i = 0; i < arr.size(); ++i) {
cnt += arr[i] == el;
}
return cnt;
}
static inline
void gen_print(size_t len, size_t onescnt) {
std::vector<int> arr(len);
gen(arr, onescnt);
std::cout << "gen_printf(" << std::setw(2) << len << ", " << std::setw(2) << onescnt << ") = {";
for (size_t i = 0; i < len; ++i) {
std::cout << arr[i] << ",";
}
std::cout << "}\n";
assert(count(arr, 1) == onescnt);
}
int main() {
for (int i = 0; i <= 8; ++i) {
gen_print(8, i);
}
for (int i = 0; i <= 30; ++i) {
gen_print(30, i);
}
return 0;
}
Generates:
gen_printf( 8, 0) = {0,0,0,0,0,0,0,0,}
gen_printf( 8, 1) = {0,0,0,0,0,0,0,1,}
gen_printf( 8, 2) = {0,0,0,1,0,0,0,1,}
gen_printf( 8, 3) = {0,1,0,0,1,0,0,1,}
gen_printf( 8, 4) = {0,1,0,1,0,1,0,1,}
gen_printf( 8, 5) = {1,0,1,1,0,1,0,1,}
gen_printf( 8, 6) = {1,1,0,1,1,1,0,1,}
gen_printf( 8, 7) = {1,1,1,1,1,1,0,1,}
gen_printf( 8, 8) = {1,1,1,1,1,1,1,1,}
gen_printf(30, 0) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,}
gen_printf(30, 1) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}
gen_printf(30, 2) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,}
gen_printf(30, 3) = {0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,}
gen_printf(30, 4) = {0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,}
gen_printf(30, 5) = {0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,}
gen_printf(30, 6) = {0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,}
gen_printf(30, 7) = {0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,}
gen_printf(30, 8) = {0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,}
gen_printf(30, 9) = {0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,}
gen_printf(30, 10) = {0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,}
gen_printf(30, 11) = {0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,}
gen_printf(30, 12) = {0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,}
gen_printf(30, 13) = {0,1,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0,1,}
gen_printf(30, 14) = {0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,}
gen_printf(30, 15) = {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,}
gen_printf(30, 16) = {1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,}
gen_printf(30, 17) = {1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,1,}
gen_printf(30, 18) = {1,0,1,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,0,1,}
gen_printf(30, 19) = {1,0,1,1,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,1,0,1,1,0,1,1,0,1,0,1,}
gen_printf(30, 20) = {1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,}
gen_printf(30, 21) = {1,1,0,1,1,0,1,1,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,1,0,1,1,0,1,}
gen_printf(30, 22) = {1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,}
gen_printf(30, 23) = {1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,}
gen_printf(30, 24) = {1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,}
gen_printf(30, 25) = {1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,}
gen_printf(30, 26) = {1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,}
gen_printf(30, 27) = {1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,}
gen_printf(30, 28) = {1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,}
gen_printf(30, 29) = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,}
gen_printf(30, 30) = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,}
#edit - better evenly spaced pattern.
Explanation:
So let's take an array of 8 ints and we want to have 5 ones. The ideal ratio of (ones / zeros) in a sequence with 8 elements and 5 ones, well would be (5 / 3). We will never approach such ratio, but we can try.
The idea is to loop through the array and remember the number of ones and zeros we have written in the array. If the ratio of (written ones / written zeros) is lower then the destination ratio (ones / zeros) we want to achieve, we need to put a one to the sequence. Otherwise we put zero in the sequence. The ratio changes and we make the decision next time. The idea is to pursue the ideal ratio of ones per zeros in each slice of the array.
A simple way to do this would be to round the ideal fractional positions.
truths = randBetween(1, 8)
values = [0,0,0,0,0,0,0,0]
offset = randBetween(0, 8 * truths - 1)
for(i = 0; i < truths; i++) {
pos = (offset + (i * 8)) / truths
values[pos % 8] = 1
}
This is an application of Bresenham's line-drawing algorithm. I use it not because it's fast on old hardware, but it places true values exactly.
#include <iostream>
#include <stdexcept>
#include <string>
#include <random>
int main(int argc, char **argv) {
try {
// Read the argument.
if(argc != 2) throw std::invalid_argument("one argument");
int dy = std::stoi(argv[1]);
if(dy < 0 || dy > 8) throw std::out_of_range("[0..8]");
int values[8] = {0};
// https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm
int dx = 8;
int delta = 2 * dy - dx; // Balance the line. Permute it up later.
for(int x = 0; x < dx; x++) {
if(delta > 0) {
values[x] = 1;
delta -= 2 * dx;
}
delta += 2 * dy;
}
for(int x = 0; x < dx; x++)
std::cout << (x ? ", " : "") << values[x];
std::cout << std::endl;
// Rotate the number by a random amount.
// I'm sure there is an easier way to do this.
// https://stackoverflow.com/questions/7560114/random-number-c-in-some-range
std::random_device rd; // obtain a random number from hardware
std::mt19937 eng(rd()); // seed the generator
std::uniform_int_distribution<> distr(0, dx - 1);
int rotate = distr(eng);
bool first = true;
int x = rotate;
do {
std::cout << (first ? "" : ", ") << values[x];
first = false;
x = (x + 1) % dx;
} while(x != rotate);
std::cout << std::endl;
} catch(const std::exception &e) {
std::cerr << "Something went wrong: " << e.what() << std::endl;
return 1;
}
return 0;
}
Once you have an exact solution, rotate it by a random amount.
0, 1, 0, 0, 1, 0, 1, 0
1, 0, 0, 1, 0, 0, 1, 0
You need to calculate distance dynamically. One element is clear, that can reside at arbitrary location
2 elements is clear, too, distance needs to be 4.
4 elements need a distance of 2
8 elements a distance of 1
More difficult are numbers that don't divide the array:
3 requires a distance of 2.66.
5 requires a distance of 1.6
7 requires a distance of 0.875
Errm... In general, if you have a distance of X.Y, you will have to place some of the elements at distances of X and some at distances of X + 1. X is simple, it will be the result of an integer division: 8 / numberOfElements. The remainder will determine how often you will have to switch to X + 1: 8 % numberOfElements. For 3, this will result in 2, too, so you will have 1x distance of 2 and 2x distance of 3:
[ 1 0 1 0 0 1 0 0 ]
2 3 3 (distance to very first 1)
For 5, you'll get: 8/5 = 1, 8%5 = 3, so: 2x distance of 1, 3x distance of 2
[ 1 1 1 0 1 0 1 0 ]
1 1 2 2 2
For 7 you'll get: 8/7 = 1, 8%7 = 1, so: 7x distance of 1, 1x distance of 2
[ 1 1 1 1 1 1 1 0 ]
1 1 1 1 1 1 2
That will work for arbitrary array length L:
L/n = minimum distance
L%n = number of times to apply minimum distance
L-L%n = number of times to apply minimum distance + 1
Mathematical metrics won't reveal any difference between first applying all smaller distances then all larger ones, human sense for aesthetics, though, might prefer if you alternate between larger and smaller as often as possible – or you apply the algorithm recursively (for larger array length), to get something like 2x2, 3x3, 2x2, 3x3 instead of 4x2 and 6x3.
I am fetching text from a utf-8 text file, and doing it by chunks to increase performance.
std::ifstream.read(myChunkBuff_str, myChunkBuff_str.length())
Here is a more detailed example
I am getting around 16 thousand characters with each chunk.
My next step is to convert this std::string into something that can allow me to work on these "complex characters" individually, thus converting that std::string into std::wstring.
I am using the following function for converting, taken from here:
#include <string>
#include <codecvt>
#include <locale>
std::string narrow (const std::wstring& wide_string)
{
std::wstring_convert <std::codecvt_utf8 <wchar_t>, wchar_t> convert;
return convert.to_bytes (wide_string);
}
std::wstring widen (const std::string& utf8_string)
{
std::wstring_convert <std::codecvt_utf8 <wchar_t>, wchar_t> convert;
return convert.from_bytes (utf8_string);
}
However, at its end of the chunk one of the Russian characters might be cut-off, and the conversion will fail, with an std::range_error exception.
For example, in UTF-8 "привет" takes 15 chars and "приве" takes 13 chars.
So, if my chunk was hypothetically 14, the 'т' would be partially missing, and the conversion would throw exception.
Question:
How to detect these partially-loaded character? ('т' in this case) This would allow me to convert without it, and perhaps shift the next chunk a bit earlier than planned, to include this problematic 'т' next time?
I don't want to try or catch around these functions, as try/catch might slow me down the program. It also doesn't tell me "how much of character was missing for the conversion to actually succeed".
I know about wstring_convert::converted() but it's not really useful if my program crashes before I get to it
You could do this using a couple of functions. UTF-8 has a way to detect the beginning of a multibyte character and (from the beginning) the size of the multibyte character.
So two functions:
// returns zero if this is the first byte of a UTF-8 char
// otherwise non-zero.
static unsigned is_continuation(char c)
{
return (c & 0b10000000) && !(c & 0b01000000);
}
// if c is the *first* byte of a UTF-8 multibyte character, returns
// the total number of bytes of the character.
static unsigned size(const unsigned char c)
{
constexpr static const char u8char_size[] =
{
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
return u8char_size[(unsigned char)c];
}
You could track back from the end of your buffer until is_continuation(c) is false. Then check if size(c) of the current UTF-8 char is longer than the end of the buffer.
Disclaimer - last time I looked these functions were working but have not used them in a while.
Edit: to add.
If you feel like doing th whole thing manually I may as well post the code to convert a UTF-8 multibyte character to a UTF-16 multibyte or a UTF-32 char.
UTF-32 Is easy:
// returns a UTF-32 char from a `UTF-8` multibyte
// character pointed to by cp
static char32_t char32(const char* cp)
{
auto sz = size(*cp); // function above
if(sz == 1)
return *cp;
char32_t c32 = (0b0111'1111 >> sz) & (*cp);
for(unsigned i = 1; i < sz; ++i)
c32 = (c32 << 6) | (cp[i] & 0b0011'1111);
return c32;
}
UTF-16 Is a little more tricky:
// UTF-16 characters can be 1 or 2 characters wide...
using char16_pair = std::array<char16_t, 2>;
// outputs a UTF-16 char in cp16 from a `UTF-8` multibyte
// character pointed to by cp
//
// returns the number of characters in this `UTF-16` character
// (1 or 2).
static unsigned char16(const char* cp, char16_pair& cp16)
{
char32_t c32 = char32(cp);
if(c32 < 0xD800 || (c32 > 0xDFFF && c32 < 0x10000))
{
cp16[0] = char16_t(c32);
cp16[1] = 0;
return 1;
}
c32 -= 0x010000;
cp16[0] = ((0b1111'1111'1100'0000'0000 & c32) >> 10) + 0xD800;
cp16[1] = ((0b0000'0000'0011'1111'1111 & c32) >> 00) + 0xDC00;
return 2;
}
Say I have a array A of 8 numbers, I have another array B of numbers to determine how many places should the number in A be shifted to right
A 3, 6, 7, 8, 1, 2, 3, 5
B 0, 1, 0, 0, 0, 0, 0, 0
0 means valid, 1 means this number should be 1 place after, the output array is should insert 0 between after 3, the output array C should be :
C: 3,0,6,7,8,1,2,3
Whether to insert 0 or something else is not important, the point is that all numbers after 3 got shifted by one place. The outbound numbers will not be in the array anymore.
Another example:
A 3, 6, 7, 8, 1, 2, 3, 5
B 0, 1, 0, 0, 2, 0, 0, 0
C 3, 0, 6, 7, 8, 0, 1, 2
.......................................
A 3, 6, 7, 8, 1, 2, 3, 5
B 0, 1, 0, 0, 1, 0, 0, 0
C 3, 0, 6, 7, 8, 1, 2, 3
I am thinking about using scan/prefix-sum or something similar to solve this problem. also this array is small that I should be able to fit the array in one warp (<32 numbers) and use shuffle instructions. Anyone has an idea?
One possible approach.
Due to the ambiguity of your shifting (0, 1, 0, 1, 0, 1, 1, 1 and 0, 1, 0 ,0 all produce the same data offset pattern, for example) it's not possible to just create a prefix sum of the shift pattern to produce the relative offset at each position. An observation we can make, however, is that a valid offset pattern will be created if each zero in the shift pattern gets replaced by the first non-zero shift value to its left:
0, 1, 0, 0 (shift pattern)
0, 1, 1, 1 (offset pattern)
or
0, 2, 0, 2 (shift pattern)
0, 2, 2, 2 (offset pattern)
So how to do this? Let's assume we have the second test case shift pattern:
0, 1, 0, 0, 2, 0, 0, 0
Our desired offset pattern would be:
0, 1, 1, 1, 2, 2, 2, 2
for a given shift pattern, create a binary value, where each bit is one if the value at the corresponding index into the shift pattern is zero, and zero otherwise. We can use a warp vote instruction, called __ballot() for this. Each lane will get the same value from the ballot:
1 0 1 1 0 1 1 1 (this is a single binary 8-bit value in this case)
Each warp lane will now take this value, and add a value to it which has a 1 bit at the warp lane position. Using lane 1 for the remainder of the example:
+ 0 0 0 0 0 0 1 0 (the only 1 bit in this value will be at the lane index)
= 1 0 1 1 1 0 0 1
We now take the result of step 2, and bitwise exclusive-OR with the result from step 1:
= 0 0 0 0 1 1 1 0
We now count the number of 1 bits in this value (there is a __popc() intrinsic for this), and subtract one from the result. So for the lane 1 example above, the result of this step would be 2, since there are 3 bits set. This gives use the distance to the first value to our left that is non-zero in the original shift pattern. So for the lane 1 example, the first non-zero value to the left of lane 1 is 2 lanes higher, i.e. lane 3.
For each lane, we use the result of step 4 to grab the appropriate offset value for that lane. We can process all lanes at once using a __shfl_down() warp shuffle instruction.
0, 1, 1, 1, 2, 2, 2, 2
Thus producing our desired "offset pattern".
Once we have the desired offset pattern, the process of having each warp lane use its offset value to appropriately shift its data item is straightforward.
Here is a fully worked example, using your 3 test cases. Steps 1-4 above are contained in the __device__ function mydelta. The remainder of the kernel is performing the step 5 shuffle, appropriately indexing into the data, and copying the data. Due to the usage of the warp shuffle instructions, we must compile this for a cc3.0 or higher GPU. (However, it would not be difficult to replace the warp shuffle instructions with other indexing code that would allow operation on cc2.0 or greater devices.) Also, due to the various intrinsics used, this function cannot work for more than 32 data items, but that was a prerequisite condition stated in your question.
$ cat t475.cu
#include <stdio.h>
#define DSIZE 8
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__device__ int mydelta(const int shift){
unsigned nz = __ballot(shift == 0);
unsigned mylane = (threadIdx.x & 31);
unsigned lanebit = 1<<mylane;
unsigned temp = nz + lanebit;
temp = nz ^ temp;
unsigned delta = __popc(temp);
return delta-1;
}
__global__ void mykernel(const int *data, const unsigned *shift, int *result, const int limit){ // limit <= 32
if (threadIdx.x < limit){
unsigned lshift = shift[(limit - 1) - threadIdx.x];
unsigned delta = mydelta(lshift);
unsigned myshift = __shfl_down(lshift, delta);
myshift = __shfl(myshift, ((limit -1) - threadIdx.x)); // reverse offset pattern
result[threadIdx.x] = 0;
if ((myshift + threadIdx.x) < limit)
result[threadIdx.x + myshift] = data[threadIdx.x];
}
}
int main(){
int A[DSIZE] = {3, 6, 7, 8, 1, 2, 3, 5};
unsigned tc1B[DSIZE] = {0, 1, 0, 0, 0, 0, 0, 0};
unsigned tc2B[DSIZE] = {0, 1, 0, 0, 2, 0, 0, 0};
unsigned tc3B[DSIZE] = {0, 1, 0, 0, 1, 0, 0, 0};
int *d_data, *d_result, *h_result;
unsigned *d_shift;
h_result = (int *)malloc(DSIZE*sizeof(int));
if (h_result == NULL) { printf("malloc fail\n"); return 1;}
cudaMalloc(&d_data, DSIZE*sizeof(int));
cudaMalloc(&d_shift, DSIZE*sizeof(unsigned));
cudaMalloc(&d_result, DSIZE*sizeof(int));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(d_data, A, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_shift, tc1B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMempcyH2D fail");
mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMempcyD2H fail");
printf("index: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", i);
printf("\nA: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", A[i]);
printf("\ntc1 B: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", tc1B[i]);
printf("\ntc1 C: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", h_result[i]);
cudaMemcpy(d_shift, tc2B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMempcyH2D fail");
mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMempcyD2H fail");
printf("\ntc2 B: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", tc2B[i]);
printf("\ntc2 C: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", h_result[i]);
cudaMemcpy(d_shift, tc3B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMempcyH2D fail");
mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMempcyD2H fail");
printf("\ntc3 B: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", tc3B[i]);
printf("\ntc2 C: ");
for (int i = 0; i < DSIZE; i++)
printf("%d, ", h_result[i]);
printf("\n");
return 0;
}
$ nvcc -arch=sm_35 -o t475 t475.cu
$ ./t475
index: 0, 1, 2, 3, 4, 5, 6, 7,
A: 3, 6, 7, 8, 1, 2, 3, 5,
tc1 B: 0, 1, 0, 0, 0, 0, 0, 0,
tc1 C: 3, 0, 6, 7, 8, 1, 2, 3,
tc2 B: 0, 1, 0, 0, 2, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 0, 1, 2,
tc3 B: 0, 1, 0, 0, 1, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 1, 2, 3,
$
Are there any efficient bitwise operations I can do to get the number of set bits that an integer ends with? For example 1110 = 10112 would be two trailing 1 bits. 810 = 10002 would be 0 trailing 1 bits.
Is there a better algorithm for this than a linear search? I'm implementing a randomized skip list and using random numbers to determine the maximum level of an element when inserting it. I am dealing with 32 bit integers in C++.
Edit: assembler is out of the question, I'm interested in a pure C++ solution.
Calculate ~i & (i + 1) and use the result as a lookup in a table with 32 entries. 1 means zero 1s, 2 means one 1, 4 means two 1s, and so on, except that 0 means 32 1s.
Taking the answer from Ignacio Vazquez-Abrams and completing it with the count rather than a table:
b = ~i & (i+1); // this gives a 1 to the left of the trailing 1's
b--; // this gets us just the trailing 1's that need counting
b = (b & 0x55555555) + ((b>>1) & 0x55555555); // 2 bit sums of 1 bit numbers
b = (b & 0x33333333) + ((b>>2) & 0x33333333); // 4 bit sums of 2 bit numbers
b = (b & 0x0f0f0f0f) + ((b>>4) & 0x0f0f0f0f); // 8 bit sums of 4 bit numbers
b = (b & 0x00ff00ff) + ((b>>8) & 0x00ff00ff); // 16 bit sums of 8 bit numbers
b = (b & 0x0000ffff) + ((b>>16) & 0x0000ffff); // sum of 16 bit numbers
at the end b will contain the count of 1's (the masks, adding and shifting count the 1's).
Unless I goofed of course. Test before use.
The Bit Twiddling Hacks page has a number of algorithms for counting trailing zeros. Any of them can be adapted by simply inverting your number first, and there are probably clever ways to alter the algorithms in place without doing that as well. On a modern CPU with cheap floating point operations the best is probably thus:
unsigned int v=~input; // find the number of trailing ones in input
int r; // the result goes here
float f = (float)(v & -v); // cast the least significant bit in v to a float
r = (*(uint32_t *)&f >> 23) - 0x7f;
if(r==-127) r=32;
GCC has __builtin_ctz and other compilers have their own intrinsics. Just protect it with an #ifdef:
#ifdef __GNUC__
int trailingones( uint32_t in ) {
return ~ in == 0? 32 : __builtin_ctz( ~ in );
}
#else
// portable implementation
#endif
On x86, this builtin will compile to one very fast instruction. Other platforms might be somewhat slower, but most have some kind of bit-counting functionality that will beat what you can do with pure C operators.
There may be better answers available, particularly if assembler isn't out of the question, but one viable solution would be to use a lookup table. It would have 256 entries, each returning the number of contiguous trailing 1 bits. Apply it to the lowest byte. If it's 8, apply to the next and keep count.
Implementing Steven Sudit's idea...
uint32_t n; // input value
uint8_t o; // number of trailing one bits in n
uint8_t trailing_ones[256] = {
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8};
uint8_t t;
do {
t=trailing_ones[n&255];
o+=t;
} while(t==8 && (n>>=8))
1 (best) to 4 (worst) (average 1.004) times (1 lookup + 1 comparison + 3 arithmetic operations) minus one arithmetic operation.
This code counts the number of trailing zero bits, taken from here (there's also a version that depends on the IEEE 32 bit floating point representation, but I wouldn't trust it, and the modulus/division approaches look really slick - also worth a try):
int CountTrailingZeroBits(unsigned int v) // 32 bit
{
unsigned int c = 32; // c will be the number of zero bits on the right
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
static const unsigned int S[] = {1, 2, 4, 8, 16}; // Our Magic Binary Numbers
for (int i = 4; i >= 0; --i) // unroll for more speed
{
if (v & B[i])
{
v <<= S[i];
c -= S[i];
}
}
if (v)
{
c--;
}
return c;
}
and then to count trailing ones:
int CountTrailingOneBits(unsigned int v)
{
return CountTrailingZeroBits(~v);
}
http://graphics.stanford.edu/~seander/bithacks.html might give you some inspiration.
Implementation based on Ignacio Vazquez-Abrams's answer
uint8_t trailing_ones(uint32_t i) {
return log2(~i & (i + 1));
}
Implementation of log2() is left as an exercise for the reader (see here)
Taking #phkahler's answer you can define the following preprocessor statement:
#define trailing_ones(x) __builtin_ctz(~x & (x + 1))
As you get a one left to all the prior ones, you can simply count the trailing zeros.
Blazingly fast ways to find the number of trailing 0's are given in Hacker's Delight.
You could complement your integer (or more generally, word) to find the number of trailing 1's.
I have this sample for you :
#include <stdio.h>
int trailbits ( unsigned int bits, bool zero )
{
int bitsize = sizeof(int) * 8;
int len = 0;
int trail = 0;
unsigned int compbits = bits;
if ( zero ) compbits = ~bits;
for ( ; bitsize; bitsize-- )
{
if ( compbits & 0x01 ) trail++;
else
{
if ( trail > 1 ) len++;
trail = 0;
}
compbits = compbits >> 1;
}
if ( trail > 1 ) len++;
return len;
}
void PrintBits ( unsigned int bits )
{
unsigned int pbit = 0x80000000;
for ( int len=0 ; len<32; len++ )
{
printf ( "%c ", pbit & bits ? '1' : '0' );
pbit = pbit >> 1;
}
printf ( "\n" );
}
void main(void)
{
unsigned int forbyte = 0x0CC00990;
PrintBits ( forbyte );
printf ( "Trailing ones is %d\n", trailbits ( forbyte, false ));
printf ( "Trailing zeros is %d\n", trailbits ( forbyte, true ));
}