Periodic Latency Spikes in Writing to Shared Memory on Linux - c++

I have the following code:
#pragma pack(4)
struct RECORD_HEADER {
uint64_t msgType;
uint64_t rdtsc;
};
struct BODY {
char content[488];
};
#pragma pack()
class SerializedRDTSC {
public:
typedef unsigned long long timeunit_t;
static timeunit_t start(void) {
unsigned cycles_high, cycles_low;
__asm__ __volatile__ ( "CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
}
static timeunit_t end(void) {
unsigned cycles_high, cycles_low;
__asm__ __volatile__( "RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
"%rbx", "%rcx", "%rdx");
return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
}
};
char* createSHM() noexcept {
const auto sharedMemHandle = shm_open("testing", O_RDWR | O_CREAT, 0666);
if (-1 == sharedMemHandle) {
std::cout << "failed to open named shared memory: " << std::endl;
return nullptr;
}
constexpr int32_t size = (1 << 26);
ftruncate(sharedMemHandle, size);
char* ptr = (char*) mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_SHARED, sharedMemHandle, 0);
if (MAP_FAILED == ptr) {
std::cout << errno << std::endl;
return nullptr;
}
const auto rc = fchmod(sharedMemHandle, 0666);
if (rc == -1) {
fprintf(stderr,
"Can't change permissions to 0666 on shared mem segment: %m\n");
fflush(stderr);
}
return ptr;
}
int main() {
BODY update;
srand(time(nullptr));
char* ptr = createSHM();
constexpr uint64_t n = 700;
constexpr uint64_t n2 = 10;
uint64_t m_data[n * n2];
memset(m_data, 0, sizeof(m_data));
uint64_t r = 0;
for (uint64_t i = 0; i < n; i++) {
for (uint64_t k = 0; k < n2; k++) {
// populate the header
const auto msgType = rand();
const auto rdtsc = rand();
// populate the struct randomly
uint32_t* tmp = reinterpret_cast<uint32_t*>(&update);
for (uint32_t j = 0; j < sizeof(BODY) / sizeof(uint32_t); j++) {
const uint32_t v = rand() % 32767;
tmp[j] = v;
}
// write the struct
const auto s = SerializedRDTSC::start();
memcpy(ptr, (char*)&msgType, sizeof(uint64_t));
ptr+= sizeof(uint64_t);
memcpy(ptr, (char*)&rdtsc, sizeof(uint64_t));
ptr+= sizeof(uint64_t);
memcpy(ptr, &update, sizeof(BODY));
ptr+= sizeof(BODY);
const auto e = SerializedRDTSC::end();
m_data[r++] = e - s;
}
usleep(249998);
}
for (uint32_t i = 0; i < r; i++) {
std::cout << i << "," << m_data[i] << std::endl;
}
}
And for some reason, there are periodic latency spike according to the output:
0 9408
1 210
2 162
3 176
4 172
5 164
6 172
7 8338
8 174
9 186
10 612
11 380
12 380
13 374
14 358
15 13610
16 190
17 186
18 164
19 168
20 246
21 196
22 170
23 5066
24 176
25 176
26 168
27 174
28 166
29 440
30 232
31 214
32 5128
33 180
34 178
35 172
36 174
37 184
38 170
39 162
40 5964
41 182
42 174
43 164
44 180
45 180
46 162
47 172
I already isolated the core and double-checked with htop to make sure no other processes were using the core.
My machine has an i7 CPU (nothing fancy).
And then I tried with an Xeon CPU. The pattern is about the same -- every 7-11 write, there was a spike.
With i7 CPU, I compiled with GCC 7.2 with c++17 and ran it on CentOS 7.3.
With Xeon CPU, I compiled with GCC 4.6 with c++0x and ran it on CentOS 6.5.
My questions are:
1. Why there were periodic latency spikes? (I checked with strace. And I don't see weird system call involved)
2. Any suggestion on how to investigate/understand the spike? More for my learning.
Thanks in advance!
P.S. Yes, some people object to use rdtsc to measure latency because temperature affects TSC. Tho, I don't see any better option as I don't have PTP, and clock_gettime() sometimes will have latency spikes too. If you have any suggestion, it is more than welcome :)

A memory page is 4K bytes. Every time you start writing on a new page, that page needs mapped into the process address space. Since the data you're writing every loop is 8 + 8 + 488 = 504 bytes, you'll get a spike every 8 or 9 time thru the loop.
Since the CPU can speculatively prefetch data from memory, the page fault for the 2nd page (which should occur on the 8th loop) occurs one loop earlier than expected, when the hardware prefetcher tries to access the page.

Related

C++ Value at Recursive Struct Pointer Keeps Changing

I am continuing my motion-planning algorithm with a recursive Node structure and a boost geometry rtree. However, I'm running into a couple of issues in which the value at a Node pointer keeps changing. Here is a MWE:
#include <algorithm>
#include <iostream>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <string>
#include <vector>
#include <set>
#include <boost/geometry.hpp>
#include <boost/geometry/geometry.hpp>
#include <boost/geometry/geometries/point.hpp>
#include <boost/geometry/geometries/box.hpp>
#include <boost/geometry/geometries/register/point.hpp>
#include <boost/geometry/index/rtree.hpp>
using namespace std;
namespace bg = boost::geometry;
namespace bgi = boost::geometry::index;
typedef bg::model::point<double, 2, bg::cs::cartesian> point;
struct Node
{
struct Node *parent;
point pos;
int id;
};
typedef pair<Node, unsigned int> ptval;
BOOST_GEOMETRY_REGISTER_POINT_2D_GET_SET(Node, double, bg::cs::cartesian, pos.get<0>, pos.get<1>, pos.set<0>, pos.set<1>);
double px(point& p) {
return pbruh.get<0>();
}
double nx(Node *n) {
return px(n->pos);
}
double py(point& p)
{
return pbruh.get<1>();
}
double ny(Node *n)
{
return py(n->pos);
}
int main() {
point start(0, 0), end(200, 200);
Node *root = new Node;
root->parent = NULL;
root->pos = start;
root->id = 0;
Node *last = root;
bgi::rtree<ptval, bgi::quadratic<16>> pts;
pts.insert(make_pair(*root, 0));
int c = 1; // running count
srand(time(nullptr)); // seed rng
vector<int> pars; // actual parent ids
pars.push_back(-1);
Node *save;
int marker = 500;
while (c<2000)
{
// generate random pos
double xr = ((double)rand() / RAND_MAX) * 200;
double yr = ((double)rand() / RAND_MAX) * 200;
point random(xr, yr);
vector<ptval> res;
pts.query(bgi::nearest(random, 1), back_inserter(res)); // get nearest point in rtree
Node *pnearest = &(res[0].first);
Node *pnew = new Node;
pnew->pos = random;
pnew->parent = pnearest; pnew->id = c; if (c == marker) { save = pnew; }
pars.push_back(pnearest->id);
if (c > marker && c < marker+50 && save->parent->id != pars[marker]) { cout << "CHANGED " << c << " " << pars[marker] << " " << save->parent->id << " " << save->parent << " " << nx(save->parent) << " " << ny(save->parent) << endl; }
pts.insert(make_pair(*pnew, c));
last = pnew;
c++;
}
cout << "ULTIMATE " << pars[marker] << " " << save->parent->id << endl;
return 0;
}
An example output is
CHANGED 501 416 118 0x257c1b0 134.416 103.623
CHANGED 502 416 412 0x257c1b0 187.164 150.841
CHANGED 503 416 190 0x257c1b0 176.128 162.548
CHANGED 504 416 212 0x257c1b0 68.16 167.425
CHANGED 505 416 487 0x257c1b0 0.701926 114.237
CHANGED 506 416 61 0x257c1b0 16.8645 91.7386
CHANGED 507 416 221 0x257c1b0 160.991 62.9841
CHANGED 508 416 439 0x257c1b0 65.627 130.284
CHANGED 509 416 203 0x257c1b0 146.312 189.367
CHANGED 510 416 140 0x257c1b0 164.946 30.2683
CHANGED 511 416 76 0x257c1b0 193.194 146.336
CHANGED 512 416 286 0x257c1b0 29.8898 124.509
CHANGED 513 416 14 0x257c1b0 88.4732 88.3816
CHANGED 514 416 340 0x257c1b0 179.907 93.4538
CHANGED 515 416 409 0x257c1b0 26.5389 94.4609
CHANGED 516 416 488 0x257c1b0 98.8983 12.36
CHANGED 517 416 256 0x257c1b0 141.984 180.651
CHANGED 518 416 256 0x257c1b0 141.984 180.651
CHANGED 519 416 256 0x257c1b0 141.984 180.651
CHANGED 520 416 256 0x257c1b0 141.984 180.651
CHANGED 521 416 256 0x257c1b0 141.984 180.651
CHANGED 522 416 256 0x257c1b0 141.984 180.651
CHANGED 523 416 256 0x257c1b0 141.984 180.651
CHANGED 524 416 256 0x257c1b0 141.984 180.651
CHANGED 525 416 256 0x257c1b0 141.984 180.651
CHANGED 526 416 256 0x257c1b0 141.984 180.651
CHANGED 527 416 256 0x257c1b0 141.984 180.651
CHANGED 528 416 256 0x257c1b0 141.984 180.651
CHANGED 529 416 256 0x257c1b0 141.984 180.651
CHANGED 530 416 256 0x257c1b0 141.984 180.651
CHANGED 531 416 256 0x257c1b0 141.984 180.651
CHANGED 532 416 256 0x257c1b0 141.984 180.651
CHANGED 533 416 256 0x257c1b0 141.984 180.651
CHANGED 534 416 256 0x257c1b0 141.984 180.651
CHANGED 535 416 256 0x257c1b0 141.984 180.651
CHANGED 536 416 256 0x257c1b0 141.984 180.651
CHANGED 537 416 256 0x257c1b0 141.984 180.651
CHANGED 538 416 256 0x257c1b0 141.984 180.651
CHANGED 539 416 256 0x257c1b0 141.984 180.651
CHANGED 540 416 256 0x257c1b0 141.984 180.651
CHANGED 541 416 256 0x257c1b0 141.984 180.651
CHANGED 542 416 256 0x257c1b0 141.984 180.651
CHANGED 543 416 256 0x257c1b0 141.984 180.651
CHANGED 544 416 256 0x257c1b0 141.984 180.651
CHANGED 545 416 256 0x257c1b0 141.984 180.651
CHANGED 546 416 256 0x257c1b0 141.984 180.651
CHANGED 547 416 256 0x257c1b0 141.984 180.651
CHANGED 548 416 256 0x257c1b0 141.984 180.651
CHANGED 549 416 256 0x257c1b0 141.984 180.651
ULTIMATE 416 599
As you can see, for all of these iterations, same->parent->id differs from the actual id of the parent (416) when same was processed on its iteration, and for some reason same->parent->id fluctuates on these first few iterations. However, the pointer address remains the same. What is happening in the heap memory that is causing these values to fluctuate and then stop but still be at the wrong address? This is very weird. I suspect it may have something to do with the Boost RTree as I store a direct reference to each Node in it and Boost may be doing some underlying operations, but that is just a theory. I may also just be a bit misguided on how pointers work.
Does anyone know how to fix the issue?
I'm assuming the missing functions are:
double px(point& p) { return p.get<0>(); }
double py(point& p) { return p.get<1>(); }
double nx(Node* n) { return px(n->pos); }
double ny(Node* n) { return py(n->pos); }
The first real issue I see is:
pts.insert(make_pair(*root, 0));
That inserts a copy of the object pointed to by root. Any references will not point to the copy. root is always leaked for no reason.
This is a better start that might actually work until the tree invalidates references:
pts.insert(std::make_pair(Node{nullptr, start, 0}, 0));
Node const* last = &pts.begin()->first;
Same here:
pts.insert(std::make_pair(*pnew, id));
last = pnew;
*pnew is always leaked, and you didn't want last to point to it. To "fix" it naievely, I'd use a helper function:
auto insert = [&pts](Node n) -> Node const* {
pts.insert(std::make_pair(n, n.id));
auto it = std::find_if(pts.begin(), pts.end(),
[](ptval const& p) { return p.second == n.id; });
return &it->first;
};
Node const* last = insert(Node{nullptr, start, 0});
As others have noted, this is definitely wrong:
ptval res;
pts.query(bgi::nearest(newpos, 1), &res);
Node* pnearest = &(res[0].first);
This by definition takes the address of the local copy of a ptval's Node element. Instead, let's use the same naive (probably enormously inefficient) approach:
auto find = [&pts](int id) -> Node const* {
auto it = std::find_if(pts.begin(), pts.end(),
[id](ptval const& p) { return p.second == id; });
return &it->first;
};
auto insert = [&, find](Node n) -> Node const* {
pts.insert(std::make_pair(n, n.id));
return find(n.id);
};
Now you can have:
Node const* last = insert(Node{nullptr, start, 0}); // insert root
As well as:
ptval res;
pts.query(bgi::nearest(newpos, 1), &res);
Node new_node{find(res.second), newpos, id};
if (id == marker) {
marker_parent = new_node.parent;
}
parents.push_back(new_node.parent->id);
Next up
srand(time(nullptr)); // seed rng
// generate random pos
double xr = ((double)rand() / RAND_MAX) * 200;
double yr = ((double)rand() / RAND_MAX) * 200;
Prefer standard library:
std::mt19937 prng{std::random_device{}()};
std::uniform_real_distribution<double> dist(0, 200);
// generate random pos
double xr = dist(prng);
double yr = dist(prng);
Next up
Node* save;
Prefer to initialize:
Node* save = nullptr;
Know your loops.
int c = 1; // running count
// ...
while (c < 2000) {
//...
c++;
}
Should just be
for (int id = 1; id<2000; ++id) {
//...
}
Note the naming as well.
Don't use using namespace std; (Why is "using namespace std;" considered bad practice?)
Applying all this leads me to:
Live On Compiler Explorer
//#include <algorithm>
#include <iostream>
//#include <cmath>
//#include <stdlib.h>
//#include <stdio.h>
//#include <string>
//#include <vector>
//#include <set>
#include <boost/geometry.hpp>
#include <boost/geometry/geometries/box.hpp>
#include <boost/geometry/geometries/point.hpp>
#include <boost/geometry/geometries/register/point.hpp>
#include <boost/geometry/geometry.hpp>
#include <boost/geometry/index/rtree.hpp>
#include <random>
namespace bg = boost::geometry;
namespace bgi = boost::geometry::index;
typedef bg::model::point<double, 2, bg::cs::cartesian> point;
struct Node {
Node const* parent = nullptr;
point pos = {};
int id = -1;
};
typedef std::pair<Node, int> ptval;
BOOST_GEOMETRY_REGISTER_POINT_2D_GET_SET(Node, double, bg::cs::cartesian,
pos.get<0>, pos.get<1>, pos.set<0>,
pos.set<1>)
double px(point const& p) { return p.get<0>(); }
double py(point const& p) { return p.get<1>(); }
double nx(Node const* n) { return px(n->pos); }
double ny(Node const* n) { return py(n->pos); }
int main(int argc, char** argv) {
point start(0, 0) /*, end(200, 200)*/;
bgi::rtree<ptval, bgi::quadratic<16>> pts;
auto find = [&pts](int id) -> Node const* {
auto it = std::find_if(pts.begin(), pts.end(),
[id](ptval const& p) { return p.second == id; });
return &it->first;
};
auto insert = [&, find](Node n) -> Node const* {
pts.insert(std::make_pair(n, n.id));
return find(n.id);
};
Node const* last = insert(Node{nullptr, start, 0}); // insert root
std::mt19937 prng{argc > 1 ? atoi(argv[1]) : std::random_device{}()};
std::uniform_real_distribution<double> dist(0, 200);
std::vector<int> parents{-1}; // actual parent ids
Node const* marker_parent = nullptr;
int marker = 500;
for (int id = 1; id < 2000; ++id) {
// generate random pos
point newpos(dist(prng), dist(prng));
ptval res;
pts.query(bgi::nearest(newpos, 1), &res);
Node new_node{find(res.second), newpos, id};
if (id == marker) {
marker_parent = new_node.parent;
}
parents.push_back(new_node.parent->id);
if (id > marker && id < marker + 50 &&
marker_parent->id != parents[marker]) {
std::cout << "CHANGED " << id << " " << parents[marker] << " "
<< marker_parent->id << " " << marker_parent
<< " " << nx(marker_parent) << " "
<< ny(marker_parent) << std::endl;
}
last = insert(new_node);
}
std::cout << "ULTIMATE " << parents[marker] << " ";
if (marker_parent)
std::cout << marker_parent->id << std::endl;
else
std::cout << std::endl;
}
Prints (with the seed fixed at 42 for online demo):
ULTIMATE 307 1622
The funny thing is, as far as I can see last is never used. So you can probably do with much simpler and more efficient:
auto insert = [&](Node n) { pts.insert(std::make_pair(n, n.id)); };
Isolating The Bug - Root Cause
Note above when I said "might actually work until the tree invalidates references". Sadly I was unable to find any documentation of reference stability/invalidation guarantees for rtree¹.
With different seeds we still get output like e.g. for seed 47:
CHANGED 501 189 38 0x617000004e30 26.0044 140.042
...
CHANGED 549 189 38 0x617000004e30 26.0044 140.042
ULTIMATE 189 474
Obviously, at least all the lines are identical (except for the running counter 501..549). Comparing seed 40:
CHANGED 517 108 500 0x560957ad1638 109.71 53.341
...
CHANGED 549 108 500 0x560957ad1638 109.71 53.341
ULTIMATE 108 590
Illustrates that sometimes the marker "randomly" changes during the game. The problem then, clearly, is that rtree can reallocate, invalidating Node references.
The number 517 is interesting to me for its relation to 16 in bgi::quadratic<16>. So, I thought to make a much more granular test instead of randomly monitoring the 500th "marker". Why not monitor all parents?
constexpr auto N = 2000;
std::vector<Node const*> parents(N, nullptr); // parents refs
std::vector<int> parent_ids(N, -1); // parent ids
for (int id = 1; id < N; ++id) {
// generate random pos
point newpos(dist(prng), dist(prng));
ptval res;
pts.query(bgi::nearest(newpos, 1), &res);
Node new_node{find(res.second), newpos, id};
parents[id] = new_node.parent;
parent_ids[id] = new_node.parent->id;
bool invalidated = !std::equal(
parents.begin(), parents.end(), parent_ids.begin(),
[](Node const* p, int id) { return !p || p->id == id; });
std::cout << "invalidated: " << std::boolalpha << invalidated << "\n";
//assert(!invalidated);
insert(new_node);
}
Now running this a few hundred times with random seeds shows a pattern of always invalidating after 16 nodes:
$ for a in {1..100}; do ./build/sotest; done | uniq -c | sort | uniq
16 invalidated: false
1983 invalidated: true
1 ULTIMATE 101 1393
1 ULTIMATE 103 262
1 ULTIMATE 103 5
1 ULTIMATE 104 536
1 ULTIMATE 108 1406
1 ULTIMATE 111 111
This confirms beyond any statistical doubt that reallocation is causing the references (i.e. the pointers) to become invalidated.
FIXING
I suggest keeping the parent references by id only. Interestingly, since id is already part of Node you can do without the duplication in std::pair by using a custom ``indexable<>or customIndexableGetter` template argument.
Let's store references to Nodes in the tree, instead, using a custom indexable:
using NodeRef = std::reference_wrapper<Node const>;
struct MyIndexable {
using value_type = NodeRef;
using result_type = Node const&;
result_type operator()(NodeRef r) const { return r; }
};
Now we can trivially build a tree from those:
bgi::rtree<NodeRef, bgi::quadratic<16>, MyIndexable> pts;
All we need is some stable storage for the referenced nodes:
std::deque<Node> storage; // reference stability adding/removing at either end
//std::list<Node> storage; // iterator and reference stability (except removed)
auto insert = [&](Node n) {
storage.push_back(std::move(n));
pts.insert(NodeRef(storage.back()));
};
The rest of the program can remain pretty much the same - except for a few respellings to accomodate NodeRef:
Live On Coliru
#undef NDEBUG
#include <iostream>
#include <boost/geometry.hpp>
#include <boost/geometry/geometries/box.hpp>
#include <boost/geometry/geometries/point.hpp>
#include <boost/geometry/geometries/register/point.hpp>
#include <boost/geometry/geometry.hpp>
#include <boost/geometry/index/rtree.hpp>
#include <random>
namespace bg = boost::geometry;
namespace bgi = boost::geometry::index;
typedef bg::model::point<double, 2, bg::cs::cartesian> point;
struct Node {
Node const* parent = nullptr;
point pos = {};
int id = -1;
};
BOOST_GEOMETRY_REGISTER_POINT_2D_GET_SET(Node, double, bg::cs::cartesian,
pos.get<0>, pos.get<1>, pos.set<0>,
pos.set<1>)
double px(point const& p) { return p.get<0>(); }
double py(point const& p) { return p.get<1>(); }
double nx(Node const* n) { return px(n->pos); }
double ny(Node const* n) { return py(n->pos); }
using NodeRef = std::reference_wrapper<Node const>;
struct MyIndexable {
using value_type = NodeRef;
using result_type = Node const&;
result_type operator()(NodeRef r) const { return r; }
};
int main(int argc, char** argv) {
bgi::rtree<NodeRef, bgi::quadratic<16>, MyIndexable> pts;
std::deque<Node> storage; // reference stability adding/removing at either end
//std::list<Node> storage; // iterator and reference stability (except removed)
auto insert = [&](Node n) {
storage.push_back(std::move(n));
pts.insert(NodeRef(storage.back()));
};
insert({nullptr, point(0, 0), 0}); // insert root
std::mt19937 prng{argc > 1 ? atoi(argv[1]) : std::random_device{}()};
std::uniform_real_distribution<double> dist(0, 200);
constexpr auto N = 2000;
std::vector<NodeRef> parents; // parents refs
std::vector<int> parent_ids; // parent ids
for (int id = 1; id < N; ++id) {
// generate random pos
point newpos(dist(prng), dist(prng));
std::vector<NodeRef> res;
pts.query(bgi::nearest(newpos, 1), back_inserter(res));
Node new_node{&res.front().get(), newpos, id};
parents.push_back(*new_node.parent);
parent_ids.push_back(new_node.parent->id);
bool invalidated =
!std::equal(parents.begin(), parents.end(), parent_ids.begin(),
[](Node const& p, int id) { return p.id == id; });
assert(!invalidated);
insert(std::move(new_node));
}
std::cout << "ULTIMATE " << parent_ids.at(500) << " "
<< parents.at(500).get().id << "\n";
}
Which never trips the assert, and produces stable output like:
for a in {1..50}; do ./a.out; done
ULTIMATE 41 41
ULTIMATE 365 365
ULTIMATE 219 219
ULTIMATE 193 193
ULTIMATE 448 448
ULTIMATE 331 331
ULTIMATE 227 227
ULTIMATE 234 234
ULTIMATE 300 300
ULTIMATE 227 227
ULTIMATE 243 243
ULTIMATE 248 248
ULTIMATE 233 233
ULTIMATE 143 143
ULTIMATE 39 39
ULTIMATE 488 488
ULTIMATE 493 493
ULTIMATE 9 9
ULTIMATE 212 212
ULTIMATE 338 338
ULTIMATE 141 141
ULTIMATE 356 356
ULTIMATE 147 147
ULTIMATE 376 376
ULTIMATE 76 76
ULTIMATE 450 450
ULTIMATE 272 272
ULTIMATE 34 34
ULTIMATE 492 492
ULTIMATE 478 478
ULTIMATE 84 84
ULTIMATE 416 416
ULTIMATE 222 222
ULTIMATE 457 457
ULTIMATE 95 95
ULTIMATE 446 446
ULTIMATE 233 233
ULTIMATE 480 480
ULTIMATE 265 265
ULTIMATE 415 415
ULTIMATE 289 289
ULTIMATE 121 121
ULTIMATE 344 344
ULTIMATE 110 110
ULTIMATE 429 429
ULTIMATE 31 31
ULTIMATE 344 344
ULTIMATE 172 172
ULTIMATE 20 20
ULTIMATE 394 394
Bonus
A version without the now-redundant sanity checks and unused code: Live On Coliru
¹ although there are some notes about iterator invalidation

high performance calculations and saving of the threads identificators

I write grid-stride loop to have High Performance Calculations, where large N, for example long long N 1<<36, or even more. From total grid I need only some indexes, which have to satisfy the define condition.
__global__ void Indexes(int *array, int N) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
while( index<N)
{
if (condition)
{....//do something to save index in array}
index += blockDim.x * gridDim.x;
}
}
Of course, it is possible use the Thrust, which allows to have both host and device arrays. But in this case obviously the calculation will be extremely ineffective, because need firstly to create a lot of non-needed elements, then to delete these.
What is the most effective way to save the indexes directly in array in device to pass in CPU?
If your output is relatively dense (i.e. a lot of indices and relatively few zeros), then the stream compaction approach suggested in comments is a good solution. There are a lot of ready-to-go stream compaction implementations which you can probably adapt to your purposes.
If your output is sparse, so you need to save relatively few indices for a lot of inputs, then stream compaction isn't such a great solution because it will waste a lot of GPU memory. In that case (and you can roughly estimate an upper bound of the number of output indices) something like this:
template <typename T>
struct Array
{
T* p;
int Nmax;
int* next;
Array() = default;
__host__ __device__
Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};
__device__
int append(T& val)
{
int pos = atomicAdd(next, 1);
if (pos > Nmax) {
atomicExch(next, Nmax);
return -1;
} else {
p[pos] = val;
return pos;
}
};
};
is probably more appropriate. Here, the idea is to use an atomically incremented position in the output array to keep track of where a thread should store its index. The code will signal if you fill the index array, and there will be information from which you can work out a restart strategy to stop the current kernel and then start from the last known index which you were able to store.
A complete example:
$ cat append.cu
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
namespace AppendArray
{
template <typename T>
struct Array
{
T* p;
int Nmax;
int* next;
Array() = default;
__host__ __device__
Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};
__device__
int append(T& val)
{
int pos = atomicAdd(next, 1);
if (pos > Nmax) {
atomicExch(next, Nmax);
return -1;
} else {
p[pos] = val;
return pos;
}
};
};
}
__global__
void kernelfind(int* input, int N, AppendArray::Array<int> indices)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
for(; idx < N; idx += gridDim.x*blockDim.x) {
if (input[idx] % 10000 == 0) {
if (indices.append(idx) < 0) return;
}
}
}
int main()
{
const int Ninputs = 1 << 20;
thrust::device_vector<int> inputs(Ninputs);
thrust::counting_iterator<int> vals(1);
thrust::copy(vals, vals + Ninputs, inputs.begin());
int* d_input = thrust::raw_pointer_cast(inputs.data());
int Nindices = Ninputs >> 12;
thrust::device_vector<int> indices(Nindices);
int* d_indices = thrust::raw_pointer_cast(indices.data());
int* pos; cudaMallocManaged(&pos, sizeof(int)); *pos = 0;
AppendArray::Array<int> index(d_indices, Nindices-1, pos);
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, kernelfind, 0, 0);
kernelfind<<<gridsize, blocksize>>>(d_input, Ninputs, index);
cudaDeviceSynchronize();
for(int i = 0; i < *pos; ++i) {
int idx = indices[i];
std::cout << i << " " << idx << " " << inputs[idx] << std::endl;
}
return 0;
}
$ nvcc -std=c++11 -arch=sm_52 -o append append.cu
$ ./append
0 9999 10000
1 19999 20000
2 29999 30000
3 39999 40000
4 49999 50000
5 69999 70000
6 79999 80000
7 59999 60000
8 89999 90000
9 109999 110000
10 99999 100000
11 119999 120000
12 139999 140000
13 129999 130000
14 149999 150000
15 159999 160000
16 169999 170000
17 189999 190000
18 179999 180000
19 199999 200000
20 209999 210000
21 219999 220000
22 239999 240000
23 249999 250000
24 229999 230000
25 279999 280000
26 269999 270000
27 259999 260000
28 319999 320000
29 329999 330000
30 289999 290000
31 299999 300000
32 339999 340000
33 349999 350000
34 309999 310000
35 359999 360000
36 379999 380000
37 399999 400000
38 409999 410000
39 369999 370000
40 429999 430000
41 419999 420000
42 389999 390000
43 439999 440000
44 459999 460000
45 489999 490000
46 479999 480000
47 449999 450000
48 509999 510000
49 539999 540000
50 469999 470000
51 499999 500000
52 569999 570000
53 549999 550000
54 519999 520000
55 589999 590000
56 529999 530000
57 559999 560000
58 619999 620000
59 579999 580000
60 629999 630000
61 669999 670000
62 599999 600000
63 609999 610000
64 699999 700000
65 639999 640000
66 649999 650000
67 719999 720000
68 659999 660000
69 679999 680000
70 749999 750000
71 709999 710000
72 689999 690000
73 729999 730000
74 779999 780000
75 799999 800000
76 809999 810000
77 739999 740000
78 849999 850000
79 759999 760000
80 829999 830000
81 789999 790000
82 769999 770000
83 859999 860000
84 889999 890000
85 879999 880000
86 819999 820000
87 929999 930000
88 869999 870000
89 839999 840000
90 909999 910000
91 939999 940000
92 969999 970000
93 899999 900000
94 979999 980000
95 959999 960000
96 949999 950000
97 1019999 1020000
98 1009999 1010000
99 989999 990000
100 1029999 1030000
101 919999 920000
102 1039999 1040000
103 999999 1000000

Strange behaviour when comparing in SSE

I don't get the error in my code. I try to compare a buffer of unsigned char values to a constant. Then I want to store 1 or 0 depending on the comparison. Here is my code (in a structure):
void operator()(const uint8* src, int32 swidth, int32 sheight, uint8* dst, uint8 value) {
uint8 t[16];
__m128i v_one = _mm_set1_epi8((uint8)1);
__m128i v_value = _mm_set1_epi8(value);
printf("value = %d\n", value);
SHOW(t, v_one);
SHOW(t, v_value);
std::cout << "****" << std::endl;
for (int32 i = 0; i < sheight; ++i) {
const uint8* sdata = src + i * swidth;
uint8* ddata = dst + i * swidth;
int32 j = 0;
for ( ; j <= swidth - 16; j += 16) {
__m128i s = _mm_load_si128((const __m128i*)(sdata + j));
__m128i mask = _mm_cmpgt_epi8(s, v_value);
SHOW(t, s);
SHOW(t, mask);
std::cout << std::endl;
}
}
}
My first line are what I would expect:
value = 100
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
But then my comparison are wrong:
214 100 199 203 232 50 85 195 70 141 121 160 93 130 242 233
0 0 0 0 0 0 0 0 0 0 255 0 0 0 0 0
And I really don't get where the mistakes are.
The SHOW macro is:
#define SHOW(t, r) \
_mm_storeu_si128((__m128i*)t, r); \
printf("%3d", (int32)t[0]); \
for (int32 k = 1; k < 16; ++k) \
printf(" %3d", (int32)t[k]); \
printf("\n")
You are comparing the elements in your s array with your value array.
All the values in the value array are 100.
You have a mix of values in your s array.
However, _mm_cmpgt_epi8 works on signed values and as these are bytes it considers values from -128 to +127.
So the only possible values that are > 100 are values in the range 101 to 127.
As you've only got 1 value in that range (121) thats the only one which has its mask set.
To see this, change uint8 t[16]; to int8 t[16]; and you should get a more expected result.

How to convert a HexBytes Array to a String in C/C++ on Arduino?

I realized converting a String into a hexarray now need to convert the new array into a new string,because the function Sha256.write needs a char, which would be the way?
char hexstring[] = "020000009ecb752aac3e6d2101163b36f3e6bd67d0c95be402918f2f00000000000000001036e4ee6f31bc9a690053320286d84fbfe4e5ee0594b4ab72339366b3ff1734270061536c89001900000000";
int i;
int n;
uint8_t bytearray[80];
Serial.println("Starting...");
char tmp[3];
tmp[2] = '\0';
int j = 0;
//GET ARRAY
for(i=0;i<strlen(hexstring);i+=2) {
tmp[0] = hexstring[i];
tmp[1] = hexstring[i+1];
bytearray[j] = strtol(tmp,0,16);
j+=1;
}
for(i=0;i<80;i+=1) {
Serial.println( bytearray[i]);
}
int _batchSize;
unsigned char hash[32];
SHA256_CTX ctx;
int idx;
Serial.println("SHA256...");
for(_batchSize = 100000; _batchSize > 0; _batchSize--){
bytearray[76] = nonce;
// Sha256.write(bytearray);
sha256_init(&ctx);
sha256_update(&ctx,bytearray,80);
sha256_final(&ctx,hash); //
sha256_init(&ctx);
sha256_update(&ctx,hash,32);
sha256_final(&ctx,hash); //are this corrent? i'm need in bytes too
// print_hash(hash);
int zeroBytes = 0;
for (int i = 31; i >= 28; i--, zeroBytes++)
if(hash[i] > 0)
break;
if(zeroBytes == 4){ // SOLUTION TRUE, NOW I'M NEED THIS IN STRING
printf("0x");
for (n = 0; n < 32; n++)
Serial.println(printf("%02x", hash[n])); //ERROR :(
}
//increase
if(++nonce == 4294967295)
nonce = 0;
}
}
}
output array on Serial port:
2
0
0
0
158
203
117
42
172
62
109
33
1
22
59
54
243
230
189
103
208
201
91
228
2
145
143
47
0
0
0
0
0
0
0
0
16
54
228
238
111
49
188
154
105
0
83
50
2
134
216
79
191
228
229
238
5
148
180
171
114
51
147
102
179
255
23
52
39
0
97
83
108
137
0
25
0
0
0
0
how to convert this to a hexstring char back?
UPDATED
this solutions works for me, thanks all!
void printHash(uint8_t* hash) {
int id;
for (id=0; id<32; id++) {
Serial.print("0123456789abcdef"[hash[id]>>4]);
Serial.print("0123456789abcdef"[hash[id]&0xf]);
}
Serial.println();
}
Skip to the section Addressing your code... at bottom for most relevant content
(this stuff up here is barely useful blither)
The purpose of your function:
Sha256.write((char *)bytearray);
I believe is to write more data to the running hash. (from this)
Therefore, I am not sure in the context of your question how to convert this to a hex-string char back? how this relates to the way you are using it.
Let me offer another approach for the sake of illustrating how you might go about returning the array of ints back into the form of a "hexadecimal string":
From Here
Here is a code fragment that will calculate the digest for the string "abc"
SHA256_CTX ctx;
u_int8_t results[SHA256_DIGEST_LENGTH];
char *buf;
int n;
buf = "abc";
n = strlen(buf);
SHA256_Init(&ctx);
SHA256_Update(&ctx, (u_int8_t *)buf, n);
SHA256_Final(results, &ctx);
/* Print the digest as one long hex value */
printf("0x");
for (n = 0; n < SHA256_DIGEST_LENGTH; n++)
printf("%02x", results[n]);
putchar('\n');
resulting in:
"0xba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad".
In this example The array I believe you want, is contained in u_int8_t results
There is not enough description in your post to be sure this will help, let me know in the comments, and I will try to address further questions.
Added after your edit:
Continuing from the example above, to put the array contents of results back into a string, you can do something like this:
char *newString;
newString = malloc(sizeof(char)*SHA256_DIGEST_LENGTH*2);
memset(newString, 0, sizeof(char)*SHA256_DIGEST_LENGTH*2);
strcat(newString, "0x");
for(i=0;i<SHA256_DIGEST_LENGTH;i++)
{
sprintf(newString, "%s%02x\n", newString, results[i]);
}
//use newString for stuff...
free(newString);
Addressing your code, and your question directly:
Your code block:
for(_batchSize = 100000; _batchSize > 0; _batchSize--){
bytearray[76] = _batchSize;
Sha256.write((char *)bytearray); //here are the error
}
is not necessary if all you want to do is to convert an array of int into a "hexadecimal string"
Your int array, defined as:
int bytearray[80];
Already contains all the necessary values at this point, as you illustrated with your latest edit. If you want to return this data to a "hexadecimal string" form, then this will do that for you: (replacing result with your bytearray)
char *newString;
newString = malloc(sizeof(char)*SHA256_DIGEST_LENGTH*2);//if these defines are not in your environment
memset(newString, 0, sizeof(char)*SHA256_DIGEST_LENGTH*2);//then replace them with appropriate value for length
strcat(newString, "0x");
for(i=0;i<sizeof(bytearray)/sizeof(bytearray[0]);i++)
{
sprintf(newString, "%s%02x\n", newString, bytearray[i]);
}
//use newString for stuff...
free(newString);

std::random_shuffle produce the same result even though srand(time(0)) called once

In a function, I want to generate a list of numbers in range:
(This function will be called only once when executing the program.)
void DataSet::finalize(double trainPercent, bool genValidData)
{
srand(time(0));
printf("%d\n", rand());
// indices = {0, 1, 2, 3, 4, ..., m_train.size()-1}
vector<size_t> indices(m_train.size());
for (size_t i = 0; i < indices.size(); i++)
indices[i] = i;
random_shuffle(indices.begin(), indices.end());
// Output
for (size_t i = 0; i < 10; i++)
printf("%ld ", indices[i]);
puts("");
}
The results are like:
850577673
246 239 7 102 41 201 288 23 1 237
After a few seconds:
856981140
246 239 7 102 41 201 288 23 1 237
And more:
857552578
246 239 7 102 41 201 288 23 1 237
Why the function rand() works properly but `random_shuffle' does not?
random_shuffle() isn't actually specified to use rand() and so srand() may not have any impact. If you want to be sure, you should use one of the C++11 forms, random_shuffle(b, e, RNG) or shuffle(b, e, uRNG).
An alternative would be to use random_shuffle(indices.begin(), indices.end(), rand()); because apparently your implementation of random_shuffle() is not using rand().