I'm working with long std::string (std::wstring) processing.
The length of the original string can be anywhere from 10 to 1 million characters, and I have a number of substring offsets. What I need is to concatenate multiple substrings of the original string together with some new strings.
Apart from using string append, like
auto str = new std::string();
str.append(original.substr(a,b)).append(newstr1).append(original.substr(c,d)).append.....
is there any more efficient way, for example working with pointers or string iterators?
Thanks.
UPDATE:
I got several pieces of feedback; except for rope I was able to test all the other methods.
The results are the following:
#include <string>
#include <iostream>
#include <chrono>
#include <ctime>
std::string GetSystemTimeEpoch(){
using namespace std::chrono;
auto now = system_clock::now();
time_point<system_clock> epoch;
microseconds us = duration_cast<microseconds>(now - epoch);
double epoch_time = (unsigned long long)us.count() / 1000000.0;
unsigned long long postfix = (unsigned long long)us.count() % 1000000;
std::time_t time = static_cast<time_t>(epoch_time);
std::tm tm = *std::localtime(&time);
char Buf[80];
std::strftime(Buf, sizeof(Buf), "%Y-%m-%dT%H:%M:%S", &tm);
std::string finaltime(Buf);
return finaltime.append(".").append(std::to_string(postfix));
}
#define TESTLENGTH1 1000000000
#define TESTLENGTH2 300000000
int main(){
std::string Str(TESTLENGTH2, 'c');
std::cout << GetSystemTimeEpoch() << " Begin of Method 1(replace)"<< std::endl;
for (size_t i = 0; i < Str.length(); i++){
Str.replace(i, 1, "d");
}
std::cout << GetSystemTimeEpoch() << " Begin of Method 2(append)" << std::endl;
std::string NewStr1;
for (size_t i = 0; i < Str.length(); i++){
NewStr1.append(Str.substr(i, 1));
}
std::cout << GetSystemTimeEpoch() << " Begin of Method 3(+=)" << std::endl;
std::string NewStr2;
for (size_t i = 0; i < Str.length(); i++){
NewStr2 += Str.substr(i, 1);
}
std::cout << GetSystemTimeEpoch() << " Begin of Method 4(reserve)" << std::endl;
std::string NewStr3;
NewStr3.reserve(TESTLENGTH2);
for (size_t i = 0; i < Str.length(); i++){
NewStr3 += Str.substr(i, 1);
}
std::cout << GetSystemTimeEpoch() << " End" << std::endl;
return 0;
}
===
2016-05-21T22:38:51.471000 Begin of Method 1(replace)
2016-05-21T22:38:58.972000 Begin of Method 2(append)
2016-05-21T22:39:14.429000 Begin of Method 3(+=)
2016-05-21T22:39:29.944000 Begin of Method 4(reserve)
2016-05-21T22:39:44.892000 End
Press any key to continue . . .
It seems the quickest way is not to concatenate at all but to replace in place instead (method 1).
Among the concatenation methods (2, 3, 4) there seems to be no difference.
I have not tested the SGI rope class since I could not find a beginner's document to start with :). If someone knows about it, please leave a sketch or complete this test case.
PS.
TESTLENGTH1 crashed for methods 2, 3 and 4.
PS2.
Testing environment: Win7 x64; VC++ 2013; target Win32, Release; i5 2 GHz, 8 GB RAM.
The original STL from SGI had a data-structure called a rope. This stored an array of subsequences, so the construction of your new sequence would be O(1).
See this answer. You can download the SGI STL from here.
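For reference, a minimal sketch of how a rope could be used for the original concatenation problem, assuming the libstdc++ SGI extension header <ext/rope> (type __gnu_cxx::crope) is available; this is a non-standard extension and the offsets below are made up for illustration:
#include <ext/rope>
#include <iostream>

int main() {
    __gnu_cxx::crope original(1000000, 'c');   // large source text
    __gnu_cxx::crope result;
    // Concatenating sub-ropes shares tree nodes instead of copying characters.
    result += original.substr(10, 20);
    result += "some new text";
    result += original.substr(500, 100);
    std::cout << result.size() << '\n';
    return 0;
}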
Regarding the tests: You should use a profiling tool or write a proper test case measuring execution time:
#include <string>
#include <iostream>
#include <chrono>
#include <random>
#include <algorithm> // std::copy, std::min
#include <climits>   // CHAR_MIN, CHAR_MAX
using std::chrono::system_clock;
#define TESTLENGTH 100000000
std::string random_string() {
std::random_device random_device;
std::mt19937 random_generator(random_device());
std::uniform_int_distribution<int> distribution(CHAR_MIN, CHAR_MAX); // char is not a valid result type for the distribution
std::string result(TESTLENGTH, 0);
for (auto& c : result)
c = static_cast<char>(distribution(random_generator));
return result;
}
void print_duration(const system_clock::time_point& start, const system_clock::time_point& stop) {
using namespace std::chrono;
auto duration = duration_cast<milliseconds>(stop - start);
std::cout << duration.count() << std::endl;
}
void utilize(const std::string& str)
{
static volatile char* result = new char[TESTLENGTH];
std::copy(str.begin(), str.begin() + std::min(str.size(), std::string::size_type(TESTLENGTH)), result);
}
int main(){
for(unsigned loop = 0; loop < 4; ++loop) {
std::cout << "Method 1(replace): "<< std::endl;
{
std::string Str = random_string();
auto start = system_clock::now();
std::string NewStr(TESTLENGTH, 0);
for (size_t i = 0; i < Str.length(); i++){
NewStr.replace(i, 1, 1, Str[i]);
}
auto stop = system_clock::now();
print_duration(start, stop);
utilize(NewStr);
}
std::cout << "Method 2(append)" << std::endl;;
{
std::string Str = random_string();
auto start = system_clock::now();
std::string NewStr;
for (size_t i = 0; i < Str.length(); i++){
NewStr.append(1, Str[i]);
}
auto stop = system_clock::now();
print_duration(start, stop);
utilize(NewStr);
}
std::cout << "Method 3(+=)" << std::endl;
{
std::string Str = random_string();
auto start = system_clock::now();
std::string NewStr;
for (size_t i = 0; i < Str.length(); i++){
NewStr += Str[i];
}
auto stop = system_clock::now();
print_duration(start, stop);
utilize(NewStr);
}
std::cout << "Method 4(reserve)" << std::endl;
{
std::string Str = random_string();
auto start = system_clock::now();
std::string NewStr;
NewStr.reserve(TESTLENGTH);
for (size_t i = 0; i < Str.length(); i++){
NewStr += Str[i];
}
auto stop = system_clock::now();
print_duration(start, stop);
utilize(NewStr);
}
}
return 0;
}
Notes:
The code does not reflect the original question (only single characters, not character sequences, are appended to the result string), but it is an improvement of the code shown in the question.
I made the replace approach comparable to the others.
To prevent needless overhead, the time measurement is kept to a minimum (it does not use the dreadful GetSystemTimeEpoch).
To avoid unwanted overhead, I dropped std::string::substr.
To prevent unwanted compiler optimizations:
The input is randomized
The result is utilized (by copying it to a volatile address)
To get a more reliable result the measurement is executed multiple times (maybe it should be more than 4).
Results:
Having g++ 4.8.4 with g++ -std=c++11 -O3 my measurement is:
Method 1(replace):
1766
Method 2(append)
1292
Method 3(+=)
684
Method 4(reserve)
628
Method 1(replace):
1766
Method 2(append)
1275
Method 3(+=)
678
Method 4(reserve)
572
Method 1(replace):
1768
Method 2(append)
1276
Method 3(+=)
678
Method 4(reserve)
559
Method 1(replace):
1767
Method 2(append)
1276
Method 3(+=)
682
Method 4(reserve)
579
Replacing append by push_back leads to the same performance as using operator +=.
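As a small illustration of that remark, a sketch of the push_back variant (same shape as Method 4):
#include <string>

int main() {
    const std::size_t TESTLENGTH = 100000000;
    std::string Str(TESTLENGTH, 'c');
    std::string NewStr;
    NewStr.reserve(TESTLENGTH);
    for (std::size_t i = 0; i < Str.length(); i++)
        NewStr.push_back(Str[i]);          // appends a single character, like += on a char
    return NewStr.size() == TESTLENGTH ? 0 : 1;
}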
The first option is to use Martin Bonner's method.
I am not really sure about it, but it is worth a try.
The second option is to use operator + (or +=).
It will make your code shorter (and maybe even faster).
Also, you said the length could be 10 to 1 million characters;
here is some good news:
according to string::max_size, the maximum length of a string is almost 429 million, so that is not something you should worry about.
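For the original use case (joining substrings of one big string with some new text), a minimal sketch combining reserve with the three-argument append overload, which avoids the temporary strings that substr would create; the offsets and sizes below are made up for illustration:
#include <string>
#include <iostream>

int main() {
    std::string original(1000000, 'c');
    std::string newstr1 = "inserted text";
    std::string result;
    result.reserve(20 + newstr1.size() + 30);      // rough upper bound of the final length
    result.append(original, 0, 20)                 // substring [0, 20) of the original
          .append(newstr1)                         // new text
          .append(original, 100, 30);              // substring [100, 130)
    std::cout << result.size() << '\n';
    return 0;
}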
Related
I want a program to find the duplicated elements in 2 arrays without using 2 nested loops.
I've tried 2 for loops but it takes too much time.
Here is what I have done:
for(j = 0; j < n; j++){
for(i = 0; i < m; i++){
if(arr1[i] == arr2[j]){
// function
} else if(arr1[i] != arr2[j]) {
// another function
}
}
}
Build a hash set from the elements of array1, then iterate over array2 to find duplicates.
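A minimal sketch of that idea using std::unordered_set; the array contents are made up for illustration:
#include <iostream>
#include <iterator>
#include <unordered_set>

int main() {
    int arr1[] = {1, 5, 7, 9, 5};
    int arr2[] = {3, 5, 9, 11};
    std::unordered_set<int> set1(std::begin(arr1), std::end(arr1)); // O(m) build
    for (int v : arr2)                                              // O(n) lookups on average
        if (set1.count(v))
            std::cout << v << " is in both arrays\n";
    return 0;
}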
This solution will show you 3 methods and measure the time that they need.
Your approach, using a nested loop
Using std::set_intersection
Using std::unordered_set
There are of course more possible solutions.
Please see:
#include <iostream>
#include <iterator>
#include <random>
#include <chrono>
#include <algorithm>
#include <unordered_set>
constexpr size_t ArraySize1 = 100000u;
constexpr size_t ArraySize2 = 150000u;
int main() {
int arr1[ArraySize1], arr2[ArraySize2];
// ---------------------------------------------------------------
// Create some random numbers and fill both arrays with it
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> distrib(1, 2000000000);
for (int& i : arr1) i = distrib(gen);
for (int& i : arr2) i = distrib(gen);
// ---------------------------------------------------------------
// Test algorithms
// 1. Nested loops
auto start = std::chrono::system_clock::now();
// ---
for (size_t k = 0; k < ArraySize1; ++k)
for (size_t i = 0; i < ArraySize2; ++i)
if (arr1[k] == arr2[i])
std::cout << arr1[k] << '\n';
// ---
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start);
std::cout << "Time with nested loops: " << elapsed.count() << " ms\n\n";
// 2. Set intersection
start = std::chrono::system_clock::now();
// ---
std::sort(std::begin(arr1), std::end(arr1));
std::sort(std::begin(arr2), std::end(arr2));
std::set_intersection(std::begin(arr1), std::end(arr1), std::begin(arr2), std::end(arr2), std::ostream_iterator<int>(std::cout, "\n"));
// ---
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start);
std::cout << "Time with set_intersection: " << elapsed.count() << " ms\n\n";
// 3. std::unordered_set
start = std::chrono::system_clock::now();
std::unordered_set<int> setArray1(std::begin(arr1),std::end(arr1));
for (const int i : arr2) {
if (setArray1.count(i)) {
std::cout << i << '\n';
}
}
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start);
std::cout << "Time with unordered set: " << elapsed.count() << " ms\n\n";
}
Use a bool "visited" array for array1, then check for duplicates while scanning array2 (this depends on the element values being bounded; see the sketch after this list).
Use a map or set from the C++ STL.
Use a Trie data structure (an advanced technique).
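A minimal sketch of the first option (the "visited" array), assuming the element values are non-negative and bounded by a known MAX_VALUE; that bound and the array contents are assumptions for illustration:
#include <iostream>
#include <vector>

int main() {
    constexpr int MAX_VALUE = 2000;           // assumed upper bound on element values
    int arr1[] = {1, 5, 7, 9, 5};
    int arr2[] = {3, 5, 9, 11};
    std::vector<bool> seen(MAX_VALUE, false);
    for (int v : arr1) seen[v] = true;        // mark every value present in arr1
    for (int v : arr2)
        if (seen[v]) std::cout << v << '\n';  // value also occurs in arr1 -> duplicate
    return 0;
}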
I would like something that can window a std::string object into partitions of length N - for example (using a function update):
int main() {
std::string s = "abcdefg";
update<2>(s);
return 0;
}
Calling the above should result in:
ab
bc
cd
de
ef
fg
I have the following version of the update function:
template<std::size_t size>
void update(std::string s) {
std::string result(size, '\0');
std::stringstream ss{s};
int iterations = s.length() - size;
for (int i = 0; i<iterations; i++) {
ss.read(&result[0], result.size());
std::cout << result << std::endl;
}
return;
}
but this skips the combinations whose initial character lies at an odd index (the number of combinations happens to be correct in my case, even though there is a repeat)
ab
cd
ef
gf
gf
A side note is that if there are any trailing characters then these should be omitted from the printed values (although I think this would be covered by the parameters of the for loop)
A final note is that I would like this to be as optimised as possible since I would typically be using strings of a very large length (>5M characters long) - my current solution may not be best for this so I am open to suggestions of alternative strategies.
With C++17 you can do it like this, which is way more readable:
void update(std::string_view s, int size) {
const int iterations = s.size() - size;
for (int i = 0; i <= iterations; i++) {
std::cout << s.substr(i, size) << "\n";
}
}
string_view is made exactly for this purpose: fast, read-only access to a string. string_view::substr has constant complexity, while string::substr is linear.
As a side note, besides what Nick mentioned, your code has a few other small problems:
std::endl flushes the stream, which heavily impacts performance. Here you could just use '\n' for a newline.
the return at the end is redundant; void functions do not require a return statement
what is the purpose of templating this? It will easily bloat your code without any measurable performance increase. Just pass N as a parameter.
also your main is declared as void and should be int (even more so as you do return a value at the end)
With range-v3, you might use sliding view:
std::string s = "abcdefg";
for (auto r : s | ranges::views::sliding(2)) {
std::cout << r << std::endl;
}
Demo
Your call to ss.read will always read size characters (two in your example) and then advance the stream position by the same amount, so you never re-read the previous character at the start of each line.
If you want to do it "your way" then you have to keep track of the last character separately.
#include <iostream>
#include <sstream>
template<std::size_t size>
void update(std::string s) {
std::string result(size, '\0');
char lastChar = '\0';
std::stringstream ss{s};
int iterations = s.length() - size;
int read = 0;
if (ss.readsome(&result[0], 1)) {
lastChar = result[0];
}
for (int i = 0; i < iterations; i++) {
if (read = ss.readsome(&result[0], size - 1)) {
std::cout << lastChar << result << std::endl;
lastChar = result[read - 1];
}
}
}
That being said, the above is definitely not the best approach performance-wise. You should be able to do all of this without any string streams or read functions, just by iterating over the string. Something like this:
#include <cstdio>   // fwrite, putchar
#include <string>
void update(const std::string& s, size_t size) {
size_t len = s.length();
for (size_t i = 1; i + size - 1 <= len; i += size - 1) {
fwrite(&s[i-1], size, 1, stdout);
putchar('\n');
}
}
I compared with Linux C regex library,
#include <iostream>
#include <chrono>
#include <regex.h>
int main()
{
const int count = 100000;
regex_t exp;
int rv = regcomp(&exp, R"_(([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?)_", REG_EXTENDED);
if (rv != 0) {
std::cout << "regcomp failed with " << rv << std::endl;
}
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < count; i++)
{
regmatch_t match;
const char *sz = "http://www.abc.com";
if (regexec(&exp, sz, 1, &match, 0) == 0) {
// std::cout << sz << " matches characters " << match.rm_so << " - " << match.rm_eo << std::endl;
} else {
// std::cout << sz << " does not match" << std::endl;
}
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << elapsed.count() << std::endl;
return 0;
}
The result is roughly 60-70 milliseconds on my testing machine.
Then I used libc++'s library,
#include <iostream>
#include <chrono>
#include <regex>
int main()
{
const int count = 100000;
std::regex rgx(R"_(([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?)_", std::regex_constants::extended);
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < count; i++)
{
std::cmatch match;
const char sz[] = "http://www.abc.com";
if (regex_search(sz, match, rgx)) {
} else {
}
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << "regex_search: " << elapsed.count() << std::endl;
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < count; i++)
{
const char sz[] = "http://www.abc.com";
if (regex_match(sz, rgx)) {
} else {
}
}
end = std::chrono::high_resolution_clock::now();
elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << "regex_match: " << elapsed.count() << std::endl;
return 0;
}
The result is roughly 2 seconds for both regex_search & regex_match. This is about 30 times slower than C's regex.h library.
Is there anything wrong with my comparison? Is C++'s regex library not suitable for high-performance cases?
I can understand it being slower because there's not much optimization in C++'s regex library yet, but 30 times slower is just too much.
Thanks.
Hi all,
Thanks for answering.
Sorry for my mistake: I was using [] for C too, but later I changed it and forgot to change the C++ code.
I made two changes:
I moved const char sz[] out of the loop for both C and C++.
I compiled with -O2 (I wasn't using any optimization before). The C library's implementation still takes around 60 milliseconds, but libc++'s regex now takes about 1 second for regex_search and 150 milliseconds for regex_match.
This is still slow, but not as bad as in the original comparison.
If you take a look at http://llvm.org/svn/llvm-project/libcxx/trunk/include/regex you'll see that this implementation of regex_match is layered on top of regex_search, and all overloads extract sub-expression match positions, even if only into local temporaries that are thrown away. regex_search uses a vector of __state objects that have .resize() called on them, so they are presumably vectors too: all heap allocations, and unnecessary when the sub-expression matches aren't wanted. They would, however, need to be tracked to support \1 etc. in Perl-style extensions to regular expressions; the old regcomp/regexec C functions didn't provide those extended features, so they never have to do this extra work. Of course it would be nice if the clang implementation checked, while compiling the regular expression, whether match tracking is needed and called leaner, faster matching functions when possible, but I guess they're just starting with support for the general case.
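As an aside (not part of the answer above): when sub-expression positions are not needed, one thing worth measuring is compiling the pattern with std::regex_constants::nosubs and calling regex_search without a match_results argument, which sidesteps some of the submatch bookkeeping described above. A sketch, with no guarantee that it helps on any particular library version:
#include <iostream>
#include <regex>

int main() {
    std::regex rgx(R"_(([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?)_",
                   std::regex_constants::extended | std::regex_constants::nosubs);
    const char* sz = "http://www.abc.com";
    // No std::cmatch passed, so no sub-expression positions are reported.
    std::cout << std::boolalpha << std::regex_search(sz, rgx) << '\n';
    return 0;
}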
The following two lines do not do the same thing!
const char sz1[] = "http://www.abc.com";
const char* sz2 = "http://www.abc.com";
That's already enough to make it an unfair test.
sz and match are loop-invariant; you should move them out of the loop (in both cases for sz).
In the second case sz is an initialised array instead of a pointer to a constant literal - that is an unfair and unnecessary difference. That said, if you move the declaration to before the loop as suggested, that should make little or no difference.
Although regex_search() is overloaded for const char*, that may internally cause construction of a std::string; to avoid that possibility you should test it with:
const std::string sz( "http://www.abc.com" ) ;
(again before the loop).
So test:
std::cmatch match;
const char* sz = "http://www.abc.com";
for (int i = 0; i < count; i++)
{
if (regex_search(sz, match, rgx)) {
} else {
}
}
and
std::cmatch match;
const std::string sz( "http://www.abc.com" );
for (int i = 0; i < count; i++)
{
if (regex_search(sz, match, rgx)) {
} else {
}
}
Is using a vector of boolean values slower than a dynamic bitset?
I just heard about boost's dynamic bitset, and I was wondering whether it is worth
the trouble. Can I just use a vector of boolean values instead?
A great deal here depends on how many Boolean values you're working with.
Both bitset and vector<bool> normally use a packed representation where a Boolean is stored as only a single bit.
On one hand, that imposes some overhead in the form of bit manipulation to access a single value.
On the other hand, that also means many more of your Booleans will fit in your cache.
If you're using a lot of Booleans (e.g., implementing a sieve of Eratosthenes) fitting more of them in the cache will almost always end up a net gain. The reduction in memory use will gain you a lot more than the bit manipulation loses.
Most of the arguments against std::vector<bool> come back to the fact that it is not a standard container (i.e., it does not meet the requirements for a container). IMO, this is mostly a question of expectations -- since it says vector, many people expect it to be a container (other types of vectors are), and they often react negatively to the fact that vector<bool> isn't a container.
If you're using the vector in a way that really requires it to be a container, then you probably want to use some other combination -- either deque<bool> or vector<char> can work fine. Think before you do that though -- there's a lot of (lousy, IMO) advice that vector<bool> should be avoided in general, with little or no explanation of why it should be avoided at all, or under what circumstances it makes a real difference to you.
Yes, there are situations where something else will work better. If you're in one of those situations, using something else is clearly a good idea. But, be sure you're really in one of those situations first. Anybody who tells you (for example) that "Herb says you should use vector<char>" without a lot of explanation about the tradeoffs involved should not be trusted.
Let's give a real example. Since it was mentioned in the comments, let's consider the Sieve of Eratosthenes:
#include <vector>
#include <iostream>
#include <iterator>
#include <chrono>
unsigned long primes = 0;
template <class bool_t>
unsigned long sieve(unsigned max) {
std::vector<bool_t> sieve(max, false);
sieve[0] = sieve[1] = true;
for (unsigned i = 2; i < max; i++){
if (!sieve[i]) {
++primes;
for (unsigned temp = 2 * i; temp < max; temp += i)
sieve[temp] = true;
}
}
return primes;
}
// Warning: auto return type will fail with older compilers
// Fine with g++ 5.1 and VC++ 2015 though.
//
template <class F>
auto timer(F f, int max) {
auto start = std::chrono::high_resolution_clock::now();
primes += f(max);
auto stop = std::chrono::high_resolution_clock::now();
return stop - start;
}
int main() {
using namespace std::chrono;
unsigned number = 100000000;
auto using_bool = timer(sieve<bool>, number);
auto using_char = timer(sieve<char>, number);
std::cout << "ignore: " << primes << "\n";
std::cout << "Time using bool: " << duration_cast<milliseconds>(using_bool).count() << "\n";
std::cout << "Time using char: " << duration_cast<milliseconds>(using_char).count() << "\n";
}
We've used a large enough array that we can expect a large portion of it to occupy main memory. I've also taken a little care to ensure that the only thing that changes between one invocation and the other is the use of a vector<char> vs. vector<bool>. Here are some results. First with VC++ 2015:
ignore: 34568730
Time using bool: 2623
Time using char: 3108
...then the time using g++ 5.1:
ignore: 34568730
Time using bool: 2359
Time using char: 3116
Obviously, the vector<bool> wins in both cases--by around 15% with VC++, and over 30% with gcc. Also note that in this case, I've chosen the size to show vector<char> in quite a favorable light. If, for example, I reduce number from 100000000 to 10000000, the time differential becomes much larger:
ignore: 3987474
Time using bool: 72
Time using char: 249
Although I haven't done a lot of work to confirm, I'd guess that in this case, the version using vector<bool> is saving enough space that the array fits entirely in the cache, while the vector<char> is large enough to overflow the cache, and involve a great deal of main memory access.
You should usually avoid std::vector<bool> because it is not a standard container. It's a packed version, so it breaks some valuable guarantees usually given by a vector. A valid alternative would be to use std::vector<char> which is what Herb Sutter recommends.
You can read more about it in his GotW on the subject.
Update:
As has been pointed out, vector<bool> can be used to good effect, as a packed representation improves locality on large data sets. It may very well be the fastest alternative depending on circumstances. However, I would still not recommend it by default since it breaks many of the promises established by std::vector, and the packing is a speed/memory tradeoff (though, as shown above, it can end up helping both).
If you choose to use it, I would do so after measuring it against vector<char> for your application. Even then, I'd recommend using a typedef to refer to it via a name which does not seem to make the guarantees which it does not hold.
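A minimal sketch of that typedef suggestion; the alias name is purely illustrative:
#include <vector>

using bit_flags = std::vector<bool>;   // name signals "packed bit flags", not a standard container

int main() {
    bit_flags visited(1000, false);
    visited[42] = true;
    return visited[42] ? 0 : 1;
}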
#include "boost/dynamic_bitset.hpp"
#include <chrono>
#include <functional> // std::bind
#include <iostream>
#include <random>
#include <vector>
int main(int, char*[])
{
auto gen = std::bind(std::uniform_int_distribution<>(0, 1), std::default_random_engine());
std::vector<char> randomValues(1000000);
for (char & randomValue : randomValues)
{
randomValue = static_cast<char>(gen());
}
// many accesses, few initializations
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 500; ++i)
{
std::vector<bool> test(1000000, false);
for (int j = 0; j < test.size(); ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << "Time taken1: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< " milliseconds" << std::endl;
auto start2 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 500; ++i)
{
boost::dynamic_bitset<> test(1000000, false);
for (int j = 0; j < test.size(); ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end2 = std::chrono::high_resolution_clock::now();
std::cout << "Time taken2: " << std::chrono::duration_cast<std::chrono::milliseconds>(end2 - start2).count()
<< " milliseconds" << std::endl;
auto start3 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 500; ++i)
{
std::vector<char> test(1000000, false);
for (int j = 0; j < test.size(); ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end3 = std::chrono::high_resolution_clock::now();
std::cout << "Time taken3: " << std::chrono::duration_cast<std::chrono::milliseconds>(end3 - start3).count()
<< " milliseconds" << std::endl;
// few accesses, many initializations
auto start4 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000000; ++i)
{
std::vector<bool> test(1000000, false);
for (int j = 0; j < 500; ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end4 = std::chrono::high_resolution_clock::now();
std::cout << "Time taken4: " << std::chrono::duration_cast<std::chrono::milliseconds>(end4 - start4).count()
<< " milliseconds" << std::endl;
auto start5 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000000; ++i)
{
boost::dynamic_bitset<> test(1000000, false);
for (int j = 0; j < 500; ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end5 = std::chrono::high_resolution_clock::now();
std::cout << "Time taken5: " << std::chrono::duration_cast<std::chrono::milliseconds>(end5 - start5).count()
<< " milliseconds" << std::endl;
auto start6 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000000; ++i)
{
std::vector<char> test(1000000, false);
for (int j = 0; j < 500; ++j)
{
test[j] = static_cast<bool>(randomValues[j]);
}
}
auto end6 = std::chrono::high_resolution_clock::now();
std::cout << "Time taken6: " << std::chrono::duration_cast<std::chrono::milliseconds>(end6 - start6).count()
<< " milliseconds" << std::endl;
return EXIT_SUCCESS;
}
Time taken1: 1821 milliseconds
Time taken2: 1722 milliseconds
Time taken3: 25 milliseconds
Time taken4: 1987 milliseconds
Time taken5: 1993 milliseconds
Time taken6: 10970 milliseconds
dynamic_bitset and std::vector<bool> perform about the same.
If you allocate many times but access each array you create only a few times, go for std::vector<bool>, because it has a lower allocation/initialization time.
If you allocate once and access many times, go for std::vector<char>, because of its faster access.
Also keep in mind that std::vector<bool> is NOT safe to use in multithreaded code, because writes to different bits may touch the same underlying byte.
It appears that the size of a dynamic bitset cannot be changed:
"The dynamic_bitset class is nearly identical to the std::bitset class. The difference is that the size of the dynamic_bitset (the number of bits) is specified at run-time during the construction of a dynamic_bitset object, whereas the size of a std::bitset is specified at compile-time through an integer template parameter." (from http://www.boost.org/doc/libs/1_36_0/libs/dynamic_bitset/dynamic_bitset.html)
As such, it should be slightly faster since it will have slightly less overhead than a vector, but you lose the ability to insert elements.
UPDATE: I just realized that the OP was asking about vector<bool> vs. bitset, and my answer does not answer that question, but I think I should leave it here: if you search for "c++ vector bool slow", you end up on this page.
vector<bool> is terribly slow. At least on my Arch Linux system (you can probably get a better implementation or something... but I was really surprised). If anybody has any suggestion as to why this is so slow, I'm all ears! (Sorry for the blunt beginning; here's the more professional part.)
I've written two implementations of the Sieve of Eratosthenes, and the 'close to the metal' C implementation is 10 times faster. sievec.c is the C implementation and sievestl.cpp is the C++ implementation. I compiled with make (implicit rules only, no makefile), and the results were 1.4 s for the C version and 12 s for the C++/STL version:
sievecmp % make -B sievec && time ./sievec 27
cc sievec.c -o sievec
aa 1056282
./sievec 27 1.44s user 0.01s system 100% cpu 1.455 total
and
sievecmp % make -B sievestl && time ./sievestl 27
g++ sievestl.cpp -o sievestl
1056282./sievestl 27 12.12s user 0.01s system 100% cpu 12.114 total
sievec.c is as follows:
#include <stdio.h>
#include <stdlib.h>
typedef unsigned long prime_t;
typedef unsigned long word_t;
#define LOG_WORD_SIZE 6
#define INDEX(i) ((i)>>(LOG_WORD_SIZE))
#define MASK(i) ((word_t)(1) << ((i)&(((word_t)(1)<<LOG_WORD_SIZE)-1)))
#define GET(p,i) (p[INDEX(i)]&MASK(i))
#define SET(p,i) (p[INDEX(i)]|=MASK(i))
#define RESET(p,i) (p[INDEX(i)]&=~MASK(i))
#define p2i(p) ((p)>>1) // (((p-2)>>1))
#define i2p(i) (((i)<<1)+1) // ((i)*2+3)
unsigned long find_next_zero(unsigned long from,
unsigned long *v,
size_t N){
size_t i;
for (i = from+1; i < N; i++) {
if(GET(v,i)==0) return i;
}
return -1;
}
int main(int argc, char *argv[])
{
size_t N = atoi(argv[1]);
N = 1lu<<N;
// printf("%u\n",N);
unsigned long *v = malloc(N/8);
for(size_t i = 0; i < N/64; i++) v[i]=0;
unsigned long p = 3;
unsigned long pp = p2i(p * p);
while( pp <= N){
for(unsigned long q = pp; q < N; q += p ){
SET(v,q);
}
p = p2i(p);
p = find_next_zero(p,v,N);
p = i2p(p);
pp = p2i(p * p);
}
unsigned long sum = 0;
for(unsigned long i = 0; i+2 < N; i++)
if(GET(v,i)==0 && GET(v,i+1)==0) {
unsigned long p = i2p(i);
// cout << p << ", " << p+2 << endl;
sum++;
}
printf("aa %lu\n",sum);
// free(v);
return 0;
}
sievestl.cpp is as follows:
#include <iostream>
#include <vector>
#include <sstream>
using namespace std;
inline unsigned long i2p(unsigned long i){return (i<<1)+1; }
inline unsigned long p2i(unsigned long p){return (p>>1); }
inline unsigned long find_next_zero(unsigned long from, vector<bool> v){
size_t N = v.size();
for (size_t i = from+1; i < N; i++) {
if(v[i]==0) return i;
}
return -1;
}
int main(int argc, char *argv[])
{
stringstream ss;
ss << argv[1];
size_t N;
ss >> N;
N = 1lu<<N;
// cout << N << endl;
vector<bool> v(N);
unsigned long p = 3;
unsigned long pp = p2i(p * p);
while( pp <= N){
for(unsigned long q = pp; q < N; q += p ){
v[q] = 1;
}
p = p2i(p);
p = find_next_zero(p,v);
p = i2p(p);
pp = p2i(p * p);
}
unsigned sum = 0;
for(unsigned long i = 0; i+2 < N; i++)
if(v[i]==0 and v[i+1]==0) {
unsigned long p = i2p(i);
// cout << p << ", " << p+2 << endl;
sum++;
}
cout << sum;
return 0;
}
Recently, I was asked in an interview to implement a string reverse function using threads. I came up with most of the solution below. Whether I got selected or not is a different story :-). I tried to run the solution below on my home PC running the Windows 8 Consumer Preview. The compiler is VC11 Beta.
The problem is that the multi-threaded code is always either as fast as, or about 1 millisecond slower than, the sequential code. The input I gave is a text file of size 32.4 MB. Is there a way to make the multi-threaded code faster? Or is the input too small to make any difference?
EDIT
I only wrote the void Reverse(char* str, int beg, int end, int rbegin, int rend); and
void CustomReverse(char* str); functions in the interview. All the other code was written at home.
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <thread>
#include <cassert>
#include <cstring>
#include <ctime>
template<typename Function>
void TimeIt(Function&& fun, const char* caption)
{
clock_t start = clock();
fun();
clock_t ticks = clock()-start;
std::cout << std::setw(30) << caption << ": " << (double)ticks/CLOCKS_PER_SEC << "\n";
}
void Reverse(char* str)
{
assert(str != NULL);
for ( int i = 0, j = strlen(str) - 1; i < j; ++i, --j)
{
if ( str[i] != str[j])
{
std::swap(str[i], str[j]);
}
}
}
void Reverse(char* str, int beg, int end, int rbegin, int rend)
{
for ( ; beg <= end && rbegin >= rend; ++beg, --rbegin)
{
if ( str[beg] != str[rbegin])
{
char temp = str[beg];
str[beg] = str[rbegin];
str[rbegin] = temp;
}
}
}
void CustomReverse(char* str)
{
int len = strlen(str);
const int MAX_THREADS = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
threads.reserve(MAX_THREADS);
const int CHUNK = len / MAX_THREADS > (4096) ? (4096) : len / MAX_THREADS;
/*std::cout << "len:" << len << "\n";
std::cout << "MAX_THREADS:" << MAX_THREADS << "\n";
std::cout << "CHUNK:" << CHUNK << "\n";*/
for ( int i = 0, j = len - 1; i < j; )
{
if (i + CHUNK < j && j - CHUNK > i )
{
for ( int k = 0; k < MAX_THREADS && (i + CHUNK < j && j - CHUNK > i ); ++k)
{
threads.push_back( std::thread([=, &str]() { Reverse(str, i,
i + CHUNK, j, j - CHUNK); }));
i += CHUNK + 1;
j -= CHUNK + 1;
}
for ( auto& th : threads)
{
th.join();
}
threads.clear();
}
else
{
char temp = str[i];
str[i] = str[j];
str[j] = temp;
i++;
j--;
}
}
}
void Write(std::ostream&& os, const std::string& str)
{
os << str << "\n";
}
void CustomReverseDemo(int argc, char** argv)
{
std::ifstream inpfile;
for ( int i = 0; i < argc; ++i)
std::cout << argv[i] << "\n";
inpfile.open(argv[1], std::ios::in);
std::ostringstream oss;
std::string line;
if (! inpfile.is_open())
{
return;
}
while (std::getline(inpfile, line))
{
oss << line;
}
std::string seq(oss.str());
std::string par(oss.str());
std::cout << "Reversing now\n";
TimeIt( [&] { CustomReverse(&par[0]); }, "Using parallel code\n");
TimeIt( [&] { Reverse(&seq[0]) ;}, "Using Sequential Code\n");
TimeIt( [&] { Reverse(&seq[0]) ;}, "Using Sequential Code\n");
TimeIt( [&] { CustomReverse(&par[0]); }, "Using parallel code\n");
Write(std::ofstream("sequential.txt"), seq);
Write(std::ofstream("Parallel.txt"), par);
}
int main(int argc, char* argv[])
{
CustomReverseDemo(argc, argv);
}
I found the code hard to comprehend, but I have found the following problems:
Your block size of 4096 is far too small to be worth a thread. Starting a thread might be about as costly as the actual operation.
You are fork-joining a lot (for every CHUNK * MAX_THREADS chars). This introduces a lot of unneeded join points (sequential parts) and overhead.
Partition the string statically into MAX_THREADS chunks and start MAX_THREADS threads (see the sketch below). There are more efficient ways to do it, but at least this will give you some speedup.
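A minimal sketch of that static partitioning, reversing a std::string in place; the function name partitioned_reverse and the chunking details are illustrative, not taken from the answer above:
#include <algorithm>
#include <string>
#include <thread>
#include <vector>

void partitioned_reverse(std::string& s) {
    const std::size_t half = s.size() / 2;                       // each swap handles one pair
    const unsigned n_threads = std::max(1u, std::thread::hardware_concurrency());
    const std::size_t chunk = (half + n_threads - 1) / n_threads;
    std::vector<std::thread> threads;
    for (unsigned t = 0; t < n_threads; ++t) {
        const std::size_t begin = t * chunk;
        const std::size_t end = std::min(begin + chunk, half);
        if (begin >= end) break;
        threads.emplace_back([&s, begin, end] {
            for (std::size_t i = begin; i < end; ++i)
                std::swap(s[i], s[s.size() - 1 - i]);            // mirror position from the back
        });
    }
    for (auto& th : threads) th.join();                          // single join point at the end
}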
I tried to write a program with the same functionality:
My attempt at "Reversing a string using threads"
I tested it on a 2-core processor with VC11 Beta and MinGW (gcc 4.8) on Windows 7.
Testing results:
VC11 Beta:
7 Mb file:
Debug
Simple reverse: 0.468
Async reverse : 0.275
Release
Simple reverse: 0.006
Async reverse : 0.014
98 Mb file:
Debug
Simple reverse: 5.982
Async reverse : 3.091
Release
Simple reverse: 0.063
Async reverse : 0.079
782 Mb file
Release
Simple reverse: 0.567
Async reverse : 0.689
Mingw:
782 Mb file
Release
Simple reverse: 0.583
Async reverse : 0.566
As you can see, the multi-threaded code wins only in the debug build. In a release build the compiler's optimizations make the single-threaded code just as fast (or faster).
So trust your compiler =)
While you are using all the new threading features, you aren't using the good old parts of the standard library, like std::string and iterators.
You shouldn't write the threading stuff yourself but instead use a parallel algorithms library which offers something like a parallel_for construct.
Your task can be simplified to this:
std::string str;
// fill string
// "iter" stands for the string's iterator type (std::string::iterator);
// parallel_for is whatever construct your parallel algorithms library provides.
auto worker = [&] (iter begin, iter end) {
for(auto it = begin; it != end; ++it) {
std::swap(*it, *(std::end(str) - std::distance(std::begin(str), it) - 1));
}
};
parallel_for(std::begin(str),
std::begin(str) + std::distance(std::begin(str), std::end(str)) / 2, worker);
Note that you need quite a big text file to gain a speed-up from this parallel approach. 34 MB might not be enough.
On small strings, effects like false sharing can have a negative impact on your performance.
Limiting the chunk size to 4096 does not make any sense.
Initialize once and then synchronize at the end; that should always be the pattern for parallel operations (think map/reduce).
Smaller things:
Checking whether the chars are identical is very bad for any kind of pipeline optimization. Just do the swap().
In the parallel and sequential versions you use different code for the swap. Why?
Starting at around 300 MB string size, I'm seeing that the multi-threaded version (TBB-based, see below) performs on average 3 times better than the serial version. Have to admit that for this 3x speedup it uses 12 real hardware cores :). I experimented a little with grain sizes (you can specify those for the TBB blocked_range object), but this did not make any significant impact; the default auto_partitioner seems to be able to partition the data almost optimally. The code I used:
tbb::parallel_for(tbb::blocked_range<size_t>(0, (int)str.length()/2), [&] (const tbb::blocked_range<size_t>& r) {
const size_t r_end = r.end();
for(size_t i = r.begin(); i < r_end; ++i) {
std::swap(*(std::begin(str) + i), *(std::end(str) - 1 - i));
}
});
Tested code
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
#include <string.h>
#include <stdio.h>
#include <memory.h>
#include <stdlib.h>
void strrev(char *p, char *q, int num)
{
for(int i=0;i < num ; ++i,--q, ++p)
*q = *p;
}
int main(int argc, char **argv)
{
char *str;
if(argc>1)
{
str = argv[1];
printf("String to be reversed %s\n", str);
}
else
{
return 0;
}
int length = strlen(str);
int N = 5;
char *rev_str = (char *)malloc(length+1);
rev_str[length] = '\0';
if (N>length)
{
N = length;
}
std::vector<std::thread> threads;
int begin=0, end=length-1, k = length/N;
for(int i=1; i <= N; ++i)
{
threads.emplace_back(strrev, &str[begin], &rev_str[end], k);
//strrev(&str[begin], &rev_str[end], k);
begin += k;
end -= k;
}
while (true)
{
if (end < 0 && begin > length-1)
{
break;
}
rev_str[end] = str[begin];
--end; ++begin;
}
for (auto& i: threads)
{
i.join();
}
printf("String after reversal %s\n", rev_str);
return 0;
}