Thrust execution policy issues kernel to default stream - concurrency

I am currently designing a short tutorial exhibiting various aspects and capabilities of the Thrust template library.
Unfortunately, there seems to be a problem in the code that I have written to show how to achieve copy/compute concurrency using CUDA streams.
My code could be found here, in the asynchronousLaunch directory:
https://github.com/gnthibault/Cuda_Thrust_Introduction/tree/master/AsynchronousLaunch
Here is an abstract of the code that generates the problem:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <vector>
#include <functional>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
//Cuda
#include <cuda_runtime.h>
//Local
#include "AsynchronousLaunch.cu.h"
// Demonstration driver: for each of two CUDA streams, repeatedly uploads one half of a host
// buffer to its own device vector, runs thrust::transform on that half, and downloads the
// result back into the same half of the host buffer.
int main( int argc, char* argv[] )
{
const size_t fullSize = 1024*1024*64;
const size_t halfSize = fullSize/2;
//Declare one host std::vector and initialize it with random values
//NOTE(review): normalRandomFunctor is not defined in this listing; presumably it comes from "AsynchronousLaunch.cu.h"
//NOTE(review): std::vector uses its default allocator here, so this buffer is presumably pageable (not pinned) host memory - confirm, because cudaMemcpyAsync generally needs pinned host memory to overlap with kernel execution
std::vector<float> hostVector( fullSize );
std::generate(hostVector.begin(), hostVector.end(), normalRandomFunctor<float>(0.f,1.f) );
//And two device vectors of half size, one per stream
thrust::device_vector<float> deviceVector0( halfSize );
thrust::device_vector<float> deviceVector1( halfSize );
//Declare and create two CUDA streams
cudaStream_t stream0, stream1;
cudaStreamCreate( &stream0 );
cudaStreamCreate( &stream1 );
//Now, we would like to perform an alternating copy/compute scheme (10 rounds)
for( int i = 0; i < 10; i++ )
{
//Wait for the previous round's copy to host on stream0 to finish before uploading to the device again
cudaStreamSynchronize(stream0);
//Warning: thrust::copy does not handle asynchronous behaviour for host/device copy, you must use cudaMemcpyAsync to do so
//Upload the first half of the host buffer on stream0
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector0.data()), thrust::raw_pointer_cast(hostVector.data()), halfSize*sizeof(float), cudaMemcpyHostToDevice, stream0);
//Same wait for stream1 before reusing the second half
cudaStreamSynchronize(stream1);
//Upload the second half on stream1; this second copy is most likely to occur sequentially after the first one
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector1.data()), thrust::raw_pointer_cast(hostVector.data())+halfSize, halfSize*sizeof(float), cudaMemcpyHostToDevice, stream1);
//Compute on device: apply computeFunctor element-wise, in place, to each half, requesting each half's own stream through the execution policy
thrust::transform( thrust::cuda::par.on(stream0), deviceVector0.begin(), deviceVector0.end(), deviceVector0.begin(), computeFunctor<float>() );
thrust::transform( thrust::cuda::par.on(stream1), deviceVector1.begin(), deviceVector1.end(), deviceVector1.begin(), computeFunctor<float>() );
//Copy each half back to its original position in the host buffer, on the same stream that processed it
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data()), thrust::raw_pointer_cast(deviceVector0.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data())+halfSize, thrust::raw_pointer_cast(deviceVector1.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream1);
}
//Full synchronize before exit, then release the streams
cudaDeviceSynchronize();
cudaStreamDestroy( stream0 );
cudaStreamDestroy( stream1 );
return EXIT_SUCCESS;
}
Here are the results of one instance of the program, observed through nvidia visual profile:
As you can see, the cudaMemcpy operations (in brown) are issued to streams 13 and 14, but the kernels generated by Thrust from thrust::transform are issued to the default stream (in blue in the capture).
By the way, I am using cuda toolkit version 7.0.28, with a GTX680 and gcc 4.8.2.
I would be grateful if someone could tell me what is wrong with my code.
Thank you in advance
Edit: here is the code that I consider as a solution:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <functional>
#include <vector>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
//Cuda
#include <cuda_runtime.h>
//Local definitions
// Unary functor used as an artificial, deliberately expensive workload for
// thrust::transform. It is callable from both host and device code.
template<typename T>
struct computeFunctor
{
    __host__ __device__
    computeFunctor() {}

    // Returns cos(in) summed 351 times: one initial evaluation plus 350 loop iterations.
    // The call operator is const-qualified because it does not touch any state; this lets
    // the functor be invoked through a const object or a const reference.
    __host__ __device__
    T operator()( T in ) const
    {
        //Naive functor that generates expensive but useless instructions
        T a = cos(in);
        for( int i = 0; i < 350; i++ )
        {
            a += cos(in);
        }
        return a;
    }
};
// Demonstrates two ways of issuing copy/compute work on several CUDA streams:
//   1. depth-first : for each stream, enqueue HtoD copy, transform, DtoH copy;
//   2. breadth-first: enqueue all HtoD copies, then all transforms, then all DtoH copies.
// The host buffer is pinned (cudaMallocHost) so that cudaMemcpyAsync can be asynchronous.
// Every CUDA runtime call is checked; the program exits with EXIT_FAILURE on the first error.
int main( int argc, char* argv[] )
{
    // Report the failing call and stop: continuing after a failed CUDA call would only
    // produce harder-to-understand errors later on.
    auto check = []( cudaError_t status, const char* what )
    {
        if( status != cudaSuccess )
        {
            std::cerr << what << " failed: " << cudaGetErrorString( status ) << std::endl;
            std::exit( EXIT_FAILURE );
        }
    };

    const size_t fullSize = 1024*1024*2;
    const size_t nbOfStrip = 4;
    const size_t stripSize = fullSize/nbOfStrip;
    const size_t stripBytes = stripSize*sizeof(float);

    //Allocate pinned host memory in order to use the asynchronous API, and fill it with ones.
    //The pointer starts as nullptr and the allocation is checked, so std::fill never runs on an invalid pointer.
    float* hostVector = nullptr;
    check( cudaMallocHost( &hostVector, fullSize*sizeof(float) ), "cudaMallocHost" );
    std::fill( hostVector, hostVector+fullSize, 1.0f );

    //One device vector of the same size; each stream works on its own strip of it
    thrust::device_vector<float> deviceVector( fullSize );
    float* const devicePtr = thrust::raw_pointer_cast( deviceVector.data() );

    //Create one CUDA stream per strip
    std::vector<cudaStream_t> vStream( nbOfStrip );
    for( auto& stream : vStream )
    {
        check( cudaStreamCreate( &stream ), "cudaStreamCreate" );
    }

    //First scheme: for each stream in turn, enqueue copyToDevice / compute / copyToHost
    for( int i = 0; i < 5; i++ )
    {
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            const size_t nextOffset = stripSize*(j+1);
            cudaStream_t stream = vStream[j];
            //Wait until the previous round on this stream has finished with the strip
            check( cudaStreamSynchronize( stream ), "cudaStreamSynchronize" );
            check( cudaMemcpyAsync( devicePtr+offset, hostVector+offset, stripBytes, cudaMemcpyHostToDevice, stream ), "cudaMemcpyAsync (HtoD)" );
            thrust::transform( thrust::cuda::par.on( stream ), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
            check( cudaMemcpyAsync( hostVector+offset, devicePtr+offset, stripBytes, cudaMemcpyDeviceToHost, stream ), "cudaMemcpyAsync (DtoH)" );
        }
    }
    //On devices that do not possess multiple copy-engine queues, this scheme serializes all commands even though they were issued to different streams.
    //Why? From the point of view of the copy engine, which is a single resource in this case, there is a true dependency between HtoD(n) and DtoH(n), which is fine,
    //but there is also a false dependency between DtoH(n) and HtoD(n+1), which precludes any copy/compute overlap.

    //Full synchronize before testing the second scheme
    check( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );

    //Second scheme: enqueue copyToDevice for every stream, then compute for every stream, then copyToHost for every stream
    for( int i = 0; i < 5; i++ )
    {
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            check( cudaStreamSynchronize( vStream[j] ), "cudaStreamSynchronize" );
        }
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            check( cudaMemcpyAsync( devicePtr+offset, hostVector+offset, stripBytes, cudaMemcpyHostToDevice, vStream[j] ), "cudaMemcpyAsync (HtoD)" );
        }
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            const size_t nextOffset = stripSize*(j+1);
            thrust::transform( thrust::cuda::par.on( vStream[j] ), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
        }
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            check( cudaMemcpyAsync( hostVector+offset, devicePtr+offset, stripBytes, cudaMemcpyDeviceToHost, vStream[j] ), "cudaMemcpyAsync (DtoH)" );
        }
    }
    //On devices that do not possess multiple queues in the copy engine, this scheme yields better results; on others, it should show nearly identical results.

    //Full synchronize before exit, then release streams and pinned memory
    check( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );
    for( auto& stream : vStream )
    {
        check( cudaStreamDestroy( stream ), "cudaStreamDestroy" );
    }
    check( cudaFreeHost( hostVector ), "cudaFreeHost" );
    return EXIT_SUCCESS;
}
Compiled using nvcc ./test.cu -o ./test.exe -std=c++11

There are 2 things I would point out. Both of these are (now) referenced in this related question/answer which you may wish to refer to.
The failure of thrust to issue the underlying kernels to non-default streams in this case seems to be related to this issue. It can be rectified (as covered in the comments to the question) by updating to the latest thrust version. Future CUDA versions (beyond 7) will probably include a fixed thrust as well. This is probably the central issue being discussed in this question.
The question seems to also suggest that one of the goals is overlap of copy and compute:
in order to show how to use copy/compute concurrency using cuda streams
but this won't be achievable, I don't think, with the code as currently crafted, even if item 1 above is fixed. Overlap of copy with compute operations requires the proper use of cuda streams on the copy operation (cudaMemcpyAsync) as well as a pinned host allocation. The code proposed in the question is lacking any use of a pinned host allocation (std::vector does not use a pinned allocator by default, AFAIK), and so I would not expect the cudaMemcpyAsync operation to overlap with any kernel activity, even if it should be otherwise possible. To rectify this, a pinned allocator should be used, and one such example is given here.
For completeness, the question is otherwise lacking an MCVE, which is expected for questions of this type. This makes it more difficult for others to attempt to test your issue, and is explicitly a close reason on SO. Yes, you provided a link to an external github repo, but this behavior is frowned on. The MCVE requirement explicitly states that the necessary pieces should be included in the question itself (not an external reference.) Since the only lacking piece, AFAICT, is "AsynchronousLaunch.cu.h", it seems like it would have been relatively straightforward to include this one additional piece in your question. The problem with external links is that when they break in the future, the question becomes less useful for future readers. (And, forcing others to navigate an external github repo looking for specific files is not conducive to getting help, in my opinion.)

Related

Is there a limit imposed by the standard library on the number of threads used by a subprocess launched with the system() command? (C++)

I use the thread library to create multiple threads in my C++ program, and in each thread I call an executable program using the system() command. The executable program is itself multithreaded.
So I want to ask whether there is a limit on the number of threads that the executable program called by system() can use in each thread. Is there any rule in the thread library or the standard library that limits thread usage for a sub-executable launched by the system() command?
Above is my question, If you have any question , you could read my code example below.
please ignore the progress_bar.h, it is nothing to do with my question, it's a head file which is used to show progress bar.
parallel.h is like:
#ifndef parallel_h
#define parallel_h
#include <vector>
#include <functional>
#include <atomic>
#include <thread>
#include "progress_bar.h"
//simple thread pool implementation
//updateFun should be thread-safe!
// Runs updateFun once for every element of scheduledTasks, distributing the
// elements over at most maxThreads worker threads, and blocks until all
// workers have finished.
//
//   scheduledTasks - the work items; each one is passed to updateFun exactly once.
//   updateFun      - callback invoked concurrently from several threads; it must be thread-safe.
//   maxThreads     - upper bound on the number of worker threads. The actual number is
//                    min(maxThreads, scheduledTasks.size()); with maxThreads == 0 no worker
//                    is started and no task is processed.
//   progressBar    - when true, progress is reported through ProgressPercent.
//                    NOTE(review): progress.advance() is called from every worker without
//                    locking here - confirm that ProgressPercent is thread-safe.
template <class T>
void processInParallel(const std::vector<T>& scheduledTasks,
                       std::function<void(const T&)> updateFun,
                       size_t maxThreads, bool progressBar)
{
    if (scheduledTasks.empty()) return;

    const size_t taskCount = scheduledTasks.size();
    std::atomic<size_t> nextJob(0);
    ProgressPercent progress(taskCount);
    if (progressBar) progress.advance(0);

    auto threadWorker = [&nextJob, &scheduledTasks, &updateFun,
                         &progress, progressBar, taskCount]()
    {
        for (;;)
        {
            // fetch_add hands every index to exactly one worker. The counter may run
            // past taskCount (once per worker), which simply signals "no work left".
            const size_t job = nextJob.fetch_add(1);
            if (job >= taskCount) return;

            updateFun(scheduledTasks[job]);
            if (progressBar) progress.advance();
        }
    };

    // Plain comparison instead of std::min: this header does not include <algorithm>.
    const size_t threadCount = (maxThreads < taskCount) ? maxThreads : taskCount;

    std::vector<std::thread> threads;
    threads.reserve(threadCount);
    for (size_t i = 0; i < threadCount; ++i)
    {
        threads.emplace_back(threadWorker);
    }
    for (std::thread& worker : threads)
    {
        worker.join();
    }
}
#endif /* parallel_h */
in main.cpp
#include "parallel.h"
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>
// Example driver: runs one external command per job id, five jobs on up to five threads.
int main(int argc, char** argv){
    const std::vector<int> jobids = {1,2,3,4,5};
    // The job id is not used by this placeholder command, so the parameter is left unnamed.
    const std::function<void(const int&)> testfunc = [](const int& /*jobid*/)
    {
        system("Call another executable program here!");
    };
    const size_t threadnum = 5;
    // processInParallel expects the task list first and the callback second.
    processInParallel(jobids, testfunc, threadnum, true);
}
can anyone give me an answer?
There are two main limits for number of threads in Linux.
One is a limit for total amount of threads in system that can be checked in:
/proc/sys/kernel/threads-max. This limit is system-wide and it gives you insight how many threads can be run by kernel.
It's worth to look also on /proc/sys/vm/max_map_count as it contains information about how many virtual memory areas a process can own.
The second one is indirect limit connected with amount of virtual memory:
number of threads = total virtual memory / (stack size*1024*1024)
As you can create new threads with custom value of stack size this limit can vary. Increasing process virtual memory or decreasing stack size for new threads can allow you to run more threads within single process. Check ulimit for more information.
If your distribution is systemd-based you'd like to look also on UserTasksMax setting.

How to get the memory used by a multidimensional vector

I am currently writing some code to create a neural network, and i am trying to make it as optimised as possible. I want to be able to get the amount of memory consumed by a object of type Network, since memory usage is very important in order to avoid cache misses. I tried using sizeof(), however this does not work, since, i assume, that vectors store the values on the heap, so the sizeof() function will just tell me the size of the pointers. Here is my code so far.
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
// Minimal stopwatch measuring elapsed wall-clock time in seconds.
// It is based on std::chrono::steady_clock, which is monotonic, so a measured
// interval can never be negative even if the system clock is adjusted meanwhile.
class Timer
{
private:
    std::chrono::time_point<std::chrono::steady_clock> start_time;
public:
    // Starts timing immediately unless auto_start is false; in that case
    // start_time keeps its default-constructed value until start() is called.
    Timer(bool auto_start=true)
    {
        if (auto_start)
        {
            start();
        }
    }
    // (Re)starts the measurement from the current instant.
    void start()
    {
        start_time = std::chrono::steady_clock::now();
    }
    // Returns the seconds elapsed since the last start(). It does not modify the
    // timer, hence the const qualifier.
    float get_duration() const
    {
        const std::chrono::duration<float> elapsed = std::chrono::steady_clock::now() - start_time;
        return elapsed.count();
    }
};
// Plain data holder for a neural network: weights, biases and activations,
// each paired with a container of the same shape for its derivatives.
// All members are public and start out empty.
struct Network
{
    // Nested-vector shorthands used by the members below.
    using Vector = std::vector<float>;
    using Matrix = std::vector<Vector>;
    using Tensor = std::vector<Matrix>;

    // Three levels of nesting for the weights and their derivatives.
    Tensor weights;
    Tensor deriv_weights;
    // Two levels of nesting for the biases and their derivatives.
    Matrix biases;
    Matrix deriv_biases;
    // Two levels of nesting for the activations and their derivatives.
    Matrix activations;
    Matrix deriv_activations;
};
// Builds a Network whose weights are filled with pseudo-random values in [0, 1].
//
//   layers - number of nodes per layer, input layer first. For every pair of
//            consecutive layers a weight block is created with layers[i + 1] rows,
//            each holding layers[i] values drawn from std::rand().
//
// Only `weights` is populated; the other Network members are left empty.
// With fewer than two layers there is nothing to connect, so an empty Network is
// returned. Without that guard `layers.size() - 1` would wrap around for an empty
// vector and `layers[0]` would read out of bounds.
Network create_network(std::vector<int> layers)
{
    Network network;
    if (layers.size() < 2)
    {
        return network;
    }

    const std::size_t connection_count = layers.size() - 1;
    network.weights.reserve(connection_count);
    int nodes_in_prev_layer = layers[0];
    for (std::size_t i = 0; i < connection_count; ++i)
    {
        const int nodes_in_layer = layers[i + 1];
        network.weights.push_back(std::vector<std::vector<float>>());
        std::vector<std::vector<float>>& layer_weights = network.weights.back();
        layer_weights.reserve(nodes_in_layer);
        for (int j = 0; j < nodes_in_layer; ++j)
        {
            layer_weights.push_back(std::vector<float>());
            std::vector<float>& node_weights = layer_weights.back();
            node_weights.reserve(nodes_in_prev_layer);
            for (int k = 0; k < nodes_in_prev_layer; ++k)
            {
                // Same draw order as before, so a given std::srand seed yields the same network.
                const float input_weight = float(std::rand()) / RAND_MAX;
                node_weights.push_back(input_weight);
            }
        }
        nodes_in_prev_layer = nodes_in_layer;
    }
    return network;
}
int main()
{
Timer timer;
Network network = create_network({784, 800, 16, 10});
std::cout << timer.get_duration() << std::endl;
std::cout << sizeof(network) << std::endl;
std::cin.get();
}
I've recently updated our production neural network code to AVX-512; it's definitely real-world production code. A key part of our optimizations is that each matrix is not a std::vector of std::vectors, but a 1D AVX-aligned array. Even without AVX alignment, we see a huge benefit in moving to a one-dimensional array backing each matrix. This means the memory access will be fully sequential, which is much faster. The size will then be (rows*cols)*sizeof(float).
We store the bias as the first full row. Commonly that's implemented by prefixing the input with a 1.0 element, but for our AVX code we use the bias as the starting values for the FMA (Fused Multiply-Add) operations. I.e. in pseudo-code result=bias; for(input:inputs) result+=(input*weight). This keeps the input also AVX-aligned.
Since each matrix is used in turn, you can safely have a std::vector<Matrix> layers.
As quote from https://stackoverflow.com/a/17254518/7588455:
Vector stores its elements in an internally-allocated memory array. You can do this:
sizeof(std::vector<int>) + (sizeof(int) * MyVector.size())
This will give you the size of the vector structure itself plus the size of all the ints in it, but it may not include whatever small overhead your memory allocator may impose. I'm not sure there's a platform-independent way to include that.
In your case only the actually internally-allocated memory array matters since you're just accessing these. Also be aware of how you're accessing the memory.
In order to write cache friendly code I highly recommend to read thru this SO post: https://stackoverflow.com/a/16699282/7588455

Truly asynchronous file IO in C++

I have a super fast M.2 drive. How fast is it? It doesn’t matter because I cannot utilize this speed anyway. That’s why I’m asking this question.
I have an app that needs a real lot of memory. So much that it won’t fit in RAM. Fortunately it is not needed all at once. Instead it is used to save intermediate results from computations.
Unfortunately the application is not able to write and reads this data fast enough. I tried using multiple reader and writer threads but it only made it worse (later I read that it is because of this).
So my question is: Is it possible to have truly asynchronous file IO in C++ to fully exploit those advertised gigabytes per second? If it is than how (in a cross platform way)?
You could also recommend a library that’s good with tasks like that if you know one because I believe that there is no point in reinventing the wheel.
Edit:
Here is code that shows how I do file IO in my program. It isn't from the mentioned program because it wouldn't be that minimal. This one ilustrates the problem nevertheless. Do not mind Windows.h. It is used only to set thread affinity. In the actual program I also set affinity , so that's why I included it.
#include <fstream>
#include <thread>
#include <memory>
#include <string>
#include <Windows.h> // for SetThreadAffinityMask()
// Writes `bytes` characters to the file "temp<num>" in the current directory.
// The i-th character written is char(i), i.e. the low-order bits of the index.
// Each character is sent to the stream with its own operator<< call.
void stress_write(unsigned bytes, int num)
{
std::ofstream out("temp" + std::to_string(num));
for (unsigned i = 0; i < bytes; ++i)
{
out << char(i);
}
}
void lock_thread(unsigned core_idx)
{
SetThreadAffinityMask(GetCurrentThread(), 1LL << core_idx);
}
// Stress test: pins the main thread to core 0, starts one writer thread per
// remaining hardware thread (each pinned to its own core), has every thread
// write a 1'000'000'000-byte file, and waits for all of them.
int main()
{
    std::ios_base::sync_with_stdio(false);
    lock_thread(0);

    // hardware_concurrency() may return 0 when the value cannot be determined;
    // subtracting 1 from that would wrap around to a huge unsigned number.
    const unsigned hw_threads = std::thread::hardware_concurrency();
    const unsigned worker_count = (hw_threads > 1) ? hw_threads - 1 : 0;

    std::unique_ptr<std::thread[]> threads = std::make_unique<std::thread[]>(worker_count);
    for (unsigned i = 0; i < worker_count; ++i)
    {
        // Worker i runs on core i + 1 and writes file "temp<i + 1>"; core 0 belongs to main.
        threads[i] = std::thread(
            [](unsigned idx) {
                lock_thread(idx);
                stress_write(1'000'000'000, idx);
            },
            i + 1
        );
    }
    // The main thread writes its own file while the workers run.
    stress_write(1'000'000'000, 0);
    for (unsigned i = 0; i < worker_count; ++i)
    {
        threads[i].join();
    }
}
As you can see its just plain old fstream. On my machine this uses 100% CPU, but only 7-9% disk (around 190MB/s). I am wondering if it could be increased.
The easiest thing to get (up to) a 10x speed up is to change this:
// Original version: writes `bytes` characters to the file "temp<num>",
// issuing one operator<< call on the stream per character.
void stress_write(unsigned bytes, int num)
{
std::ofstream out("temp" + std::to_string(num));
for (unsigned i = 0; i < bytes; ++i)
{
out << char(i);
}
}
to this:
// Writes `bytes` characters to the file "temp<num>"; the i-th character is
// char(i). The data is assembled in a fixed-size buffer and handed to the
// stream in blocks of up to chunk_size bytes, so the stream is called once per
// block instead of once per character.
//
// The loop tracks the number of bytes already written rather than a chunk
// count, so no intermediate value can exceed `bytes` and nothing overflows,
// whatever the value of `bytes`.
void stress_write(unsigned bytes, int num)
{
    constexpr unsigned chunk_size = (1u << 12u); // tune as needed
    std::ofstream out("temp" + std::to_string(num));
    char chunk_buff[chunk_size];
    for (unsigned written = 0; written < bytes; )
    {
        // Size of this block: a full chunk, or whatever is left for the last one.
        const unsigned remaining = bytes - written;
        const unsigned count = (remaining < chunk_size) ? remaining : chunk_size;
        for (unsigned j = 0; j < count; ++j)
        {
            chunk_buff[j] = char(written + j); // processing
        }
        out.write(chunk_buff, count);
        written += count;
    }
}
where we group writes up to 4096 bytes before sending to the std ofstream.
The streaming operations have a number of annoying, hard for compilers to elide, virtual calls that dominate performance when you are writing only a handful of bytes at a time.
By chunking data into larger pieces we make the vtable lookups rare enough that they no longer dominate.
See this SO post for more details as to why.
To get the last iota of performance, you may have to use something like boost.asio or access your platforms raw async file io libraries.
But when you are working at < 10% of the drive bandwidth while railing your CPU, aim at low hanging fruit first.
Chunking the I/O is indeed the most important optimization here and should suffice in most cases. However, the direct answer to the exact question asked about asynchronous IO is the following.
Boost::Asio added support for file operations in version 1.21.0. The interface is similar to the rest of Asio.
First, we need to create an object representing a file. The most common use cases would use either a random_access_file or a stream_file. In case of this example code, a streaming file is enough.
Reading is done through async_read_some, but the usual async_read helper function can be used to read a specific number of bytes at once.
If the operating system supports that, these operations do indeed run in the background and use little processor time. Both Windows and Linux do support this.

LRU Caching & Multithreading

I have already made a post some time ago to ask about a good design for LRU caching (in C++). You can find the question, the answer and some code there:
Better understanding the LRU algorithm
I have now tried to multi-thread this code (using pthread) and came with some really unexpected results. Before even attempting to use locking, I have created a system in which each thread accesses its own cache (see code). I run this code on a 4 cores processor. I tried to run it with 1 thread and 4 thread. When it runs on 1 thread I do 1 million lookups in the cache, on 4 threads, each threads does 250K lookups. I was expecting to get a time reduction with 4 threads but get the opposite. 1 threads runs in 2.2 seconds, 4 threads runs in more than 6 seconds?? I just can't make sense of this result.
Is something wrong with my code? Can this be explained somehow (thread management takes time). It would be great to have the feedback from experts. Thanks a lot -
I compile this code with: c++ -o cache cache.cpp -std=c++0x -O3 -lpthread
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>
#include <sys/time.h>
#include <list>
#include <cstdlib>
#include <cstdio>
#include <memory>
#include <list>
#include <unordered_map>
#include <stdint.h>
#include <iostream>
typedef uint32_t data_key_t;
using namespace std;
//using namespace std::tr1;
// One cached tile: a key plus a pointer to its float buffer.
class TileData
{
public:
// Key identifying this tile.
data_key_t theKey;
// Buffer pointer; initialised to NULL by the constructor and released in the destructor.
float *data;
// Edge length used to size the buffer (tileSize^3 floats).
static const uint32_t tileSize = 32;
// Declared only; no definition or use is visible in this listing.
static const uint32_t tileDataBlockSize;
TileData(const data_key_t &key) : theKey(key), data(NULL)
{
// NOTE(review): this line declares a NEW local variable named `data` that shadows the
// member. The member therefore stays NULL and the buffer allocated here is never
// stored in the object (so the destructor below never frees it).
float *data = new float [tileSize * tileSize * tileSize];
}
~TileData()
{
/* std::cerr << "delete " << theKey << std::endl; */
// Frees the buffer held by the member, if any.
if (data) delete [] data;
}
};
typedef shared_ptr<TileData> TileDataPtr; // automatic memory management!
// Creates a new TileData for the given key and returns it wrapped in a
// shared pointer. Despite the name, the visible code only constructs the
// object; no file access happens here.
TileDataPtr loadDataFromDisk(const data_key_t &theKey)
{
    TileDataPtr freshTile(new TileData(theKey));
    return freshTile;
}
// Least-recently-used cache of tiles.
// `linkedList` keeps the tiles ordered from most recently used (front) to least
// recently used (back); `hashMap` maps a key to its tile for lookup.
// The cache holds at most MAX_LRU_CACHE_SIZE tiles. The class does no locking.
class CacheLRU
{
public:
    list<TileDataPtr> linkedList;
    unordered_map<data_key_t, TileDataPtr> hashMap;

    // Counters are initialised in their declaration order (cacheMiss, then cacheHit).
    CacheLRU() : cacheMiss(0), cacheHit(0) {}

    // Returns the tile for theKey.
    // Hit : the tile is moved to the front of the list and cacheHit is incremented.
    // Miss: the tile is created through loadDataFromDisk, inserted at the front,
    //       cacheMiss is incremented, and the least recently used tile is evicted
    //       if the cache has grown beyond MAX_LRU_CACHE_SIZE.
    TileDataPtr getData(data_key_t theKey)
    {
        unordered_map<data_key_t, TileDataPtr>::const_iterator iter = hashMap.find(theKey);
        if (iter != hashMap.end()) {
            TileDataPtr ret = iter->second;
            linkedList.remove(ret);
            linkedList.push_front(ret);
            ++cacheHit;
            return ret;
        }

        ++cacheMiss;
        TileDataPtr ret = loadDataFromDisk(theKey);
        linkedList.push_front(ret);
        // Build the map entry directly: make_pair with explicit template arguments
        // does not accept lvalues in C++11.
        hashMap.insert(unordered_map<data_key_t, TileDataPtr>::value_type(theKey, ret));
        if (linkedList.size() > MAX_LRU_CACHE_SIZE) {
            // The victim is always the last element, so pop_back removes it
            // without scanning the whole list.
            const TileDataPtr dropMe = linkedList.back();
            hashMap.erase(dropMe->theKey);
            linkedList.pop_back();
        }
        return ret;
    }

    static const uint32_t MAX_LRU_CACHE_SIZE = 100;
    uint32_t cacheMiss, cacheHit;
};
int numThreads = 1;
// Thread entry point: creates a private CacheLRU, performs 1000000 / numThreads
// lookups with pseudo-random keys in [0, 300), and prints the time measured
// with clock() to stderr.
// The `data` argument is unused. Always returns NULL.
// NOTE(review): clock() reports processor time for the whole program, so with
// several threads running the printed value is not this thread's own time.
void *testCache(void *data)
{
    (void)data;
    // Measuring time before starting the lookups...
    double t = clock();
    const int lookups = (int)(1000000.f / numThreads);
    printf("Starting thread, lookups %d\n", lookups);
    CacheLRU cache;
    for (int i = 0; i < lookups; ++i) {
        int key = random() % 300;
        TileDataPtr tileDataPtr = cache.getData(key);
    }
    std::cerr << "Time (sec): " << (clock() - t) / CLOCKS_PER_SEC << std::endl;
    // A function returning void* must return a value; falling off the end is
    // undefined behaviour.
    return NULL;
}
// Starts numThreads threads running testCache, waits for all of them, and
// prints the elapsed wall-clock time as "seconds.microseconds".
// Returns 0 on success, 1 if a thread could not be created.
int main()
{
    // Heap array instead of a variable-length array, which is not standard C++.
    std::unique_ptr<pthread_t[]> thr(new pthread_t[numThreads]);
    struct timeval tv1, tv2;
    int status = 0;
    // Measuring time before starting the threads...
    gettimeofday(&tv1, NULL);
#if 0
    CacheLRU *c1 = new CacheLRU;
    (*testCache)(c1);
#else
    int started = 0;
    for (; started < numThreads; ++started) {
        if (pthread_create(&thr[started], NULL, testCache, (void*)NULL) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", started);
            status = 1;
            break;
        }
    }
    // Join only the threads that were actually created.
    for (int i = 0; i < started; ++i) {
        pthread_join(thr[i], NULL);
    }
#endif
    // Measuring time after threads finished...
    gettimeofday(&tv2, NULL);
    // Borrow one second when the microsecond part would otherwise be negative.
    if (tv1.tv_usec > tv2.tv_usec)
    {
        tv2.tv_sec--;
        tv2.tv_usec += 1000000;
    }
    // %06ld: the microseconds are the fractional part and must be zero-padded,
    // otherwise e.g. 2 s + 5000 us would print as "2.5000" instead of "2.005000".
    printf("Result - %ld.%06ld\n", (long)(tv2.tv_sec - tv1.tv_sec),
           (long)(tv2.tv_usec - tv1.tv_usec));
    return status;
}
A thousand apologies, by keeping debugging the code I realised I made a really bad beginner's mistake, if you look at that code:
TileData(const data_key_t &key) : theKey(key), data(NULL)
{
// The `float *` in front declares a new local variable that shadows the member `data`,
// so the member initialised to NULL above is never assigned.
float *data = new float [tileSize * tileSize * tileSize];
}
from the TileData class, where data is supposed to actually be a member variable of the class... So the right code should be:
// Corrected excerpt of TileData: the constructor now assigns the member `data`
// instead of declaring a local variable with the same name.
// NOTE(review): this is an excerpt - `theKey`, `tileSize` and `numAlloc` are used
// below but not declared in it; presumably they are declared in the full class / file.
class TileData
{
public:
float *data;
TileData(const data_key_t &key) : theKey(key), data(NULL)
{
// Assigns to the member (no type in front, so no new variable is declared).
data = new float [tileSize * tileSize * tileSize];
// Counts allocations; presumably a debugging counter - confirm where it is defined.
numAlloc++;
}
};
I am so sorry about that! It's a mistake I have done in the past, and I guess prototyping is great, but it sometimes lead to do such stupid mistakes.
I ran the code with 1 and 4 threads and do now see the speedup. 1 thread takes about 2.3 seconds, 4 threads takes 0.92 seconds.
Thanks all for your help, and sorry if I made you lose your time ;-)
I don't have a concrete answer yet. I can think of several possibilities. One is that testCache() is using random(), which is almost certainly implemented with a single global mutex. (Thus all of your threads are competing for the mutex, which is now ping-ponging between the caches.) ((That's assuming that random() is actually thread-safe on your system.))
Next, testCache() is accessing a CacheLRU which is implemented with unordered_maps and shared_ptrs. The unordered_maps, in particular, might be implemented with some kind of global mutex underneath that is causing all of your threads to compete for access.
To really diagnose what is going on here you should do something much simpler inside of testCache(). (First try just taking the sqrt() of an input variable 250K times (vs. 1M times). Then try linearly accessing a C array of size 250K (or 1M). Slowly build up to the complex thing you are currently doing.)
Another possibility has to do with the pthread_join. pthread_join doesn't return until all the threads are done. So if one is taking longer than the others, you are measuring the slowest one. Your computation here seems balanced, but perhaps your OS is doing something unexpected? (Like mapping several threads to one core (perhaps because you have a hyper-threaded processor?, or one thread is moving from one core to another in the middle of the run (perhaps because the OS thinks it is smart when it is not.)
This will be a bit of a "build it up" answer. I'm running your code on a Fedora 16 Linux system with a 4-core AMD cpu and 16GB of RAM.
I can confirm that I'm seeing similar "slower with more threads" behaviour. I removed the random function, which doesn't improve things at all.
I'm going to make some other minor changes.

Checking device_vector inside CUDA kernel doesn't work

I'm running CUDA 4.2 on Windows 7 64 bits in the Visual Studio 2010 Professional environment
First, I have the following code running:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
// One propagation step over two arrays of `count` ints.
//   d_bPtr     - output array; thread tid (0 < tid < count) writes tid into d_bPtr[tid].
//   count      - number of elements; threads with tid >= count do nothing.
//   d_bStopPtr - flag array; thread 0 clears d_bStopPtr[0], and every other thread clears
//                its own flag only if the flag of the element before it is already 0.
// Note that thread 0 does not write d_bPtr[0].
// The read of d_bStopPtr[tid-1] targets an element written by a different thread, and the
// kernel contains no synchronization, so how far the zeros advance in one launch is not
// fixed by this code.
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
// Flat global thread index for a 1D launch.
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid==0)
d_bStopPtr[tid]=0;
else if(tid<count)
{
d_bPtr[tid]=tid;
// only if the array cell before it is 0, then change it to 0 too
if (d_bStopPtr[tid-1]==0 )
d_bStopPtr[tid]=0;
}
}
// Host driver: relaunches the dosomething kernel until the last flag of d_bStop is 0,
// printing three elements of d_b before and after.
int main()
{
int count=100000;
// define the vectors: h_a receives copies of d_b; d_b starts as all 0, d_bStop as all 1
thrust::host_vector <int> h_a(count);
thrust::device_vector <int> d_b(count,0);
// raw device pointers for the kernel arguments
int* d_bPtr=thrust::raw_pointer_cast(&d_b[0]);
thrust::device_vector <int> d_bStop(count,1);
int* d_bStopPtr=thrust::raw_pointer_cast(&d_bStop[0]);
// get the device property of device 0
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// block size = maxThreadsDim[0] reported by the device; grid size = ceil(count / block size), capped at maxGridSize[0]
int threadsPerBlock = prop.maxThreadsDim[0];
int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
//copy device to host and print the initial values
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//run the kernel repeatedly; the loop condition reads the last element of the device vector from host code on every iteration
while(d_bStop[count-1])
{
dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr,count,d_bStopPtr);
}
//copy device back to host again and print the results
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//wait to see the console output
int x;
cin>>x;
return 0;
}
However, each time I need to check the while condition, but it is slow. So I'm thinking of checking the condition of this device vector inside the kernel, and change the code like this:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
// Variant that tries to do the whole propagation in a single launch.
// Thread 0 clears d_bStopPtr[0]; every other thread with tid < count loops until the LAST
// flag, d_bStopPtr[count-1], is 0, writing tid into d_bPtr[tid] and clearing its own flag
// once the flag before it is 0.
// NOTE(review): each thread's loop can only end after other threads (possibly in other
// blocks) have made progress, and nothing in this kernel guarantees that they get to run.
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
// Flat global thread index for a 1D launch.
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid==0)
d_bStopPtr[tid]=0;
else if(tid<count)
{
// if the last cell of the array is still not 0 yet, repeat
while(d_bStopPtr[count-1])
{
d_bPtr[tid]=tid;
// only if the array cell before it is 0, then change it to 0 too
if (d_bStopPtr[tid-1]==0 )
d_bStopPtr[tid]=0;
}
}
}
// Host driver for the single-launch variant: launches dosomething once and prints three
// elements of d_b before and after.
int main()
{
int count=100000;
// define the vectors: h_a receives copies of d_b; d_b starts as all 0, d_bStop as all 1
thrust::host_vector <int> h_a(count);
thrust::device_vector <int> d_b(count,0);
// raw device pointers for the kernel arguments
int* d_bPtr=thrust::raw_pointer_cast(&d_b[0]);
thrust::device_vector <int> d_bStop(count,1);
int* d_bStopPtr=thrust::raw_pointer_cast(&d_bStop[0]);
// get the device property of device 0
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// block size = maxThreadsDim[0] reported by the device; grid size = ceil(count / block size), capped at maxGridSize[0]
int threadsPerBlock = prop.maxThreadsDim[0];
int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
//copy device to host and print the initial values
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//run the kernel once; the repetition now happens inside the kernel
dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr,count,d_bStopPtr);
//copy device back to host again and print the results
thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//wait to see the console output
int x;
cin>>x;
return 0;
}
However, the second version always causes the graphic card and the computer to hang. Can you please help me with speeding up the first version? How to check the condition inside the kernel and then jump out and stop the kernel?
You are basically looking for global thread synchronous behavior. This is a no-no in GPU programming. Ideally each threadblock is independent, and can complete the work based on it's own data and processing. Creating threadblocks that depend on the results of other threadblocks to complete their work is creating the possibility of a deadlock condition. Suppose I have a GPU with 14 SMs (threadblock execution units), and suppose I create 100 threadblocks. Now suppose threadblocks 0-13 are waiting for threadblock 99 to release a lock (e.g. write a zero value to a particular location). Now suppose those first 14 threadblocks begin executing on the 14 SMs, perhaps looping, spinning on the lock value. There is no mechanism in the GPU to guarantee that threadblock 99 will execute first or even execute at all, if threadblocks 0-13 have the SMs tied up.
Let's not get into questions about "what about GMEM stalls that force eviction of threadblocks 0-13" because none of that guarantees that threadblock 99 will get priority to execute at any point. The only thing that guarantees that threadblock 99 will execute is the draining (i.e. completion) of other threadblocks. But if the other threadblocks are spinning, waiting for results from threadblock 99, that may never happen.
Good forward-compatible, scalable GPU code depends on independent parallel work. So you're advised to re-craft your algorithm to make the work you are trying to accomplish independent, at least at the inter-threadblock level.
If you must do global thread syncing, the kernel launch is the only truly guaranteed point for this, and thus your first approach is the working approach.
To help with this, it may be useful to study how reduction algorithms get implemented on a GPU. Various types of reductions have dependencies across all threads, but by creating intermediate results, we can break the work into independent pieces. The independent pieces can then be aggregated using a multi-kernel approach (or some other more advanced approaches) to speed up what amounts to a serial algorithm.
Your kernel doesn't actually do much. It sets one array equal to it's index, i.e. a[i] = i; and it sets the other array to all zeroes (although sequentially) b[i]=0;. To show an example of your first code "speeded up", you could do something like this:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
// Kernel: writes each element's own index into d_bPtr and clears the matching
// cell of d_bStopPtr to 0. A cell of d_bStopPtr is only cleared once the cell
// before it is already 0 (cell 0 is cleared unconditionally), so the flags are
// released one after another in index order.
//
// Parameters:
//   d_bPtr     - device array of at least `count` ints; d_bPtr[i] becomes i.
//   count      - number of elements to process in both arrays.
//   d_bStopPtr - device array of at least `count` ints; every visited cell
//                ends up 0.
//
// Each thread starts at its global index and advances by blockDim.x, so a
// single-block launch visits every index below `count` exactly once.
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
    // The flags are polled in a spin loop while other threads write them.
    // Access them through a volatile pointer so every read really goes to
    // memory; otherwise the compiler may cache or hoist the loads and the
    // loop could spin forever on a stale value.
    volatile int* stopFlags = d_bStopPtr;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < count)
    {
        d_bPtr[tid] = tid;
        while (stopFlags[tid] != 0)
        {
            // only if the array cell before it is 0, then change it to 0 too
            if (tid == 0)
                stopFlags[tid] = 0;
            else if (stopFlags[tid - 1] == 0)
                stopFlags[tid] = 0;
        }
        tid += blockDim.x;
    }
}
// Demo driver: allocates the two device arrays, prints three sample values of
// d_b before and after running the dosomething kernel, then waits for console
// input so the output stays visible.
// Returns 0 on success, 1 if the kernel launch or its execution failed.
int main()
{
    const int count = 100000;

    // define the vectors: d_b starts as all zeroes, d_bStop as all ones
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    int* d_bPtr = thrust::raw_pointer_cast(d_b.data());
    thrust::device_vector<int> d_bStop(count, 1);
    int* d_bStopPtr = thrust::raw_pointer_cast(d_bStop.data());

    // One block of 32 threads: the kernel advances each thread by blockDim.x,
    // so a single block walks over every index below count.
    const int threadsPerBlock = 32;
    const int blocksPerGrid = 1;

    // copy device to host and show the initial contents
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;

    // run the kernel
    dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);

    // A launch does not return an error itself: a bad configuration shows up
    // in cudaGetLastError(), an execution fault at the next synchronization.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        cerr<<"dosomething kernel failed: "<<cudaGetErrorString(err)<<endl;
        return 1;
    }

    // copy device back to host again and show the results
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;

    // wait to see the console output
    int x;
    cin>>x;
    return 0;
}
On my machine this speeds the execution time up from 10 secs to almost instantaneous (much less than 1 second). Note that this is not a great example of CUDA programming, because I am only launching one block of 32 threads. That's not enough to effectively utilize the machine. But the work done by your kernel is so trivial that I'm not sure what a good example would be. I could just create a kernel that sets one array to its index a[i]=i; and the other array to zero b[i]=0; all in parallel. That would be even faster, and we could use the whole machine that way.