Starting from the article "Gallery of Processor Cache Effects" by Igor Ostrovsky, I wanted to play with his examples on my own machine.
This is my code for the first example, which looks at how touching different cache lines affects running time:
#include <iostream>
#include <time.h>

using namespace std;

int main(int argc, char* argv[])
{
    int step = 1;
    const int length = 64 * 1024 * 1024;
    int* arr = new int[length];

    timespec t0, t1;
    clock_gettime(CLOCK_REALTIME, &t0);

    for (int i = 0; i < length; i += step)
        arr[i] *= 3;

    clock_gettime(CLOCK_REALTIME, &t1);

    long int duration = (t1.tv_nsec - t0.tv_nsec);
    if (duration < 0)
        duration = 1000000000 + duration;

    cout << step << ", " << duration / 1000 << endl;
    return 0;
}
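For reference, a slightly more robust way to compute the elapsed time is to combine tv_sec and tv_nsec, so the measurement also stays correct for runs longer than one second (same clock_gettime calls as above, just a different subtraction; this is only a sketch):

    long long duration_ns = (t1.tv_sec - t0.tv_sec) * 1000000000LL + (t1.tv_nsec - t0.tv_nsec);
    cout << step << ", " << duration_ns / 1000 << endl; // microseconds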
Using various values for step, I don't see the jump in the running time:
step, microseconds
1, 451725
2, 334981
3, 287679
4, 261813
5, 254265
6, 246077
16, 215035
32, 207410
64, 202526
128, 197089
256, 195154
I would expect to see something similar to the plot in the article, where the running time is roughly the same for all steps from 1 to 16. But from 16 onwards, my running time is halved each time I double the step.
I tested this on Ubuntu 13 with a Xeon X5450, compiling with g++ -O0.
Is something flawed in my code, or are the results actually OK?
Any insight on what I'm missing would be highly appreciated.
Since you want to observe the effect of the cache line size, I recommend cachegrind, part of the valgrind tool suite. Your approach is right, but the raw timings alone won't get you close to the result you are looking for.
#include <iostream>
#include <time.h>
#include <stdlib.h>

using namespace std;

int main(int argc, char* argv[])
{
    int step = atoi(argv[1]);
    const int length = 64 * 1024 * 1024;
    int* arr = new int[length];

    for (int i = 0; i < length; i += step)
        arr[i] *= 3;

    return 0;
}
Run valgrind --tool=cachegrind ./a.out $cacheline-size and you should see the cache statistics. After plotting them you will get the desired result with good accuracy. Happy experimenting!
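For example, running the program once per step value under cachegrind and comparing the reported D1 (L1 data cache) miss counts should make the cache-line effect visible even where the wall-clock timings above do not: the miss count stays roughly constant for steps up to 16 (one miss per 64-byte line, i.e. per 16 ints) and only starts to drop once the step grows beyond that.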
public class CacheLine {
    public static void main(String[] args) {
        CacheLine cacheLine = new CacheLine();
        cacheLine.startTesting();
    }

    private void startTesting() {
        byte[] array = new byte[128 * 1024];
        for (int testIndex = 0; testIndex < 10; testIndex++) {
            testMethod(array);
            System.out.println("--------- // ---------");
        }
    }

    private void testMethod(byte[] array) {
        for (int len = 8192; len <= array.length; len += 8192) {
            long t0 = System.nanoTime();
            for (int i = 0; i < 10000; i++) {
                for (int k = 0; k < len; k += 64) {
                    array[k] = 1;
                }
            }
            long dT = System.nanoTime() - t0;
            System.out.println("len: " + len / 1024 + " dT: " + dT + " dT/stepCount: " + (dT) / len);
        }
    }
}
This code helps you determine the L1 data cache size. You can read about it in more detail here: https://medium.com/#behzodbekqodirov/threading-in-java-194b7db6c1de#.kzt4w8eul
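If you want to stay in C++, a rough equivalent of the same probe might look like the sketch below (my own port, not from the linked article; buffer sizes and iteration counts mirror the Java version and are otherwise arbitrary). It touches one byte per 64-byte line over a growing working set; the time per touch should jump once the working set no longer fits in the L1 data cache:

#include <chrono>
#include <cstdio>
#include <vector>

int main() {
    std::vector<unsigned char> array(128 * 1024);            // working set grows up to 128 KiB
    for (int len = 8 * 1024; len <= int(array.size()); len += 8 * 1024) {
        auto t0 = std::chrono::steady_clock::now();
        for (int rep = 0; rep < 10000; ++rep)
            for (int k = 0; k < len; k += 64)                // touch one byte per 64-byte line
                array[k] = 1;
        long long ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                           std::chrono::steady_clock::now() - t0).count();
        std::printf("len: %3d KiB  per touch: %.2f ns\n",
                    len / 1024, double(ns) / (len / 64 * 10000.0));
    }
    std::printf("%d\n", array[0]);                           // keep the writes observable
}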
This is my first time using multi-threading to speed up a heavy calculation.
Background: The idea is to calculate a kernel covariance matrix by reading a list of 3D points x_test and computing the corresponding matrix, which has dimensions x_test.size() x x_test.size().
I already sped up the calculation by only computing the lower triangular matrix. Since all the entries are independent of each other, I tried to speed up the process further (x_test.size() = 27000 in my case) by splitting the calculation of the matrix entries row-wise, assigning a range of rows to each thread.
On a single core the calculation took about 280 seconds each time; on 4 cores it took 270-290 seconds.
main.cpp
#include <vector>
#include <thread>
// parse2DCsvFile() and inputPath are defined elsewhere; calculateKMatrixCpp() and seKernel() are shown below.

int main(int argc, char *argv[]) {
    double sigma0sq = 1;
    double lengthScale [] = {0.7633, 0.6937, 3.3307e+07};
    const std::vector<std::vector<double>> x_test = parse2DCsvFile(inputPath);

    /* Finding data slices of similar size */
    //This piece of code works, each thread is assigned roughly the same number of matrix entries
    int numElements = x_test.size()*x_test.size()/2;
    const int numThreads = 4;
    int elemsPerThread = numElements / numThreads;
    std::vector<int> indices;
    int j = 0;
    for(std::size_t i=1; i<x_test.size()+1; ++i){
        int prod = i*(i+1)/2 - j*(j+1)/2;
        if (prod > elemsPerThread) {
            i--;
            j = i;
            indices.push_back(i);
            if(indices.size() == numThreads-1)
                break;
        }
    }
    indices.insert(indices.begin(), 0);
    indices.push_back(x_test.size());

    /* Spreading calculations to multiple threads */
    std::vector<std::thread> threads;
    for(std::size_t i = 1; i < indices.size(); ++i){
        threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices.at(i-1), indices.at(i)));
    }
    for(auto & th: threads){
        th.join();
    }
    return 0;
}
As you can see, each thread performs the following calculations on the data assigned to it:
void calculateKMatrixCpp(const std::vector<std::vector<double>> xtest, double lengthScale[], double sigma0sq, int threadCounter, int start, int stop){
    char buffer[8192];
    std::ofstream out("lower_half_matrix_" + std::to_string(threadCounter) + ".csv");
    out.rdbuf()->pubsetbuf(buffer, 8192); // must not exceed sizeof(buffer)
    for(int i = start; i < stop; ++i){
        for(int j = 0; j < i+1; ++j){
            double kij = seKernel(xtest.at(i), xtest.at(j), lengthScale, sigma0sq);
            if (j!=0)
                out << ',';
            out << kij;
        }
        if(i != xtest.size()-1)
            out << '\n';
    }
    out.close();
}
and
double seKernel(const std::vector<double> x1, const std::vector<double> x2, double lengthScale[], double sigma0sq) {
    double sum(0);
    for(std::size_t i=0; i<x1.size(); i++){
        sum += pow((x1.at(i)-x2.at(i))/lengthScale[i], 2);
    }
    return sigma0sq*exp(-0.5*sum);
}
Aspects I considered
Locking due to simultaneous access to the data vector -> I don't pass a reference to the threads, but a copy of the data. I know this is not optimal in terms of RAM usage, but as far as I know it should prevent simultaneous data access, since every thread has its own copy.
Output -> every thread writes its part of the lower triangular matrix to its own file. My task manager doesn't indicate anywhere near full SSD utilization.
Compiler and machine
Windows 11
GNU GCC Compiler
Code::Blocks (although I don't think that should be of importance)
There are many details that can be improved in your code, but I think the two biggest issues are:
using vectors of vectors, which leads to fragmented data;
writing each piece of data to the file as soon as its value is computed.
The first point is easy to fix: use something like std::vector<std::array<double, 3>>. In the code below I use an alias to make it more readable:
using Point3D = std::array<double, 3>;
std::vector<Point3D> x_test;
The second point is slightly harder to address. I assume you wanted to write to the disk inside each thread because you couldn't manage to write to a shared buffer that you could then write to a file.
Here is a way to do exactly that:
void calculateKMatrixCpp(
    std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq,
    int threadCounter, int start, int stop, std::vector<double>& kMatrix
) {
    // ...
    double& kij = kMatrix[i * xtest.size() + j];
    kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
    // ...
}

// ...
threads.push_back(std::thread(
    calculateKMatrixCpp, x_test, lengthScale, sigma0sq,
    i, indices[i-1], indices[i], std::ref(kMatrix)
));
Here, kMatrix is the shared buffer and represents the whole matrix you are trying to compute. You need to pass it to the thread via std::ref. Each thread will write to a different location in that buffer, so there is no need for any mutex or other synchronization.
Once you make these changes and try to write kMatrix to the disk, you will realize that this is the part that takes the most time, by far.
Below is the full code I tried on my machine, and the computation time was about 2 seconds whereas the writing-to-file part took 300 seconds! No amount of multithreading can speed that up.
If you truly want to write all that data to the disk, you may have some luck with file mapping. Computing the exact size needed should be easy enough if all values have the same number of digits, and it looks like you could write the values with multithreading. I have never done anything like that, so I can't really say much more about it, but it looks to me like the fastest way to write multiple gigabytes of memory to the disk.
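Roughly, the idea would be something like the following untested POSIX sketch (on Windows the equivalents are CreateFileMapping/MapViewOfFile). It assumes fixed-width fields so that every value lands at a known byte offset; writeMapped and FIELD_WIDTH are just placeholder names for the example:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdio>
#include <cstring>

// Map a file of known size and format fixed-width values straight into it.
// Each thread could memcpy into its own disjoint offset range the same way.
int writeMapped(const char* path, const double* values, std::size_t count) {
    const std::size_t FIELD_WIDTH = 13;   // "%12.5e" plus a comma, assuming two-digit exponents
    const std::size_t fileSize = count * FIELD_WIDTH;
    int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) return -1;
    if (ftruncate(fd, fileSize) != 0) { close(fd); return -1; }
    char* map = static_cast<char*>(mmap(nullptr, fileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
    if (map == MAP_FAILED) { close(fd); return -1; }
    for (std::size_t i = 0; i < count; ++i) {
        char tmp[32];
        std::snprintf(tmp, sizeof tmp, "%12.5e,", values[i]); // always FIELD_WIDTH characters
        std::memcpy(map + i * FIELD_WIDTH, tmp, FIELD_WIDTH);
    }
    munmap(map, fileSize);
    close(fd);
    return 0;
}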
#include <vector>
#include <thread>
#include <iostream>
#include <string>
#include <cmath>
#include <array>
#include <random>
#include <fstream>
#include <chrono>
using Point3D = std::array<double, 3>;
auto generateSampleData() -> std::vector<Point3D> {
static std::minstd_rand g(std::random_device{}());
std::uniform_real_distribution<> d(-1.0, 1.0);
std::vector<Point3D> data;
data.reserve(27000);
for (auto i = 0; i < 27000; ++i) {
data.push_back({ d(g), d(g), d(g) });
}
return data;
}
double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
double sum = 0.0;
for (auto i = 0u; i < 3u; ++i) {
double distance = (x1[i] - x2[i]) / lengthScale[i];
sum += distance*distance;
}
return sigma0sq * std::exp(-0.5*sum);
}
void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::vector<double>& kMatrix) {
std::cout << "start of thread " << threadCounter << "\n" << std::flush;
for(int i = start; i < stop; ++i) {
for(int j = 0; j < i+1; ++j) {
double& kij = kMatrix[i * xtest.size() + j];
kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
}
}
std::cout << "end of thread " << threadCounter << "\n" << std::flush;
}
int main() {
double sigma0sq = 1;
Point3D lengthScale = {0.7633, 0.6937, 3.3307e+07};
const std::vector<Point3D> x_test = generateSampleData();
/* Finding data slices of similar size */
//This piece of code works, each thread is assigned roughly the same number of matrix entries
int numElements = x_test.size()*x_test.size()/2;
const int numThreads = 4;
int elemsPerThread = numElements / numThreads;
std::vector<int> indices;
int j = 0;
for(std::size_t i = 1; i < x_test.size()+1; ++i){
int prod = i*(i+1)/2 - j*(j+1)/2;
if (prod > elemsPerThread) {
i--;
j = i;
indices.push_back(i);
if(indices.size() == numThreads-1)
break;
}
}
indices.insert(indices.begin(), 0);
indices.push_back(x_test.size());
auto start = std::chrono::system_clock::now();
std::vector<double> kMatrix(x_test.size() * x_test.size(), 0.0);
std::vector<std::thread> threads;
for (std::size_t i = 1; i < indices.size(); ++i) {
threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::ref(kMatrix)));
}
for (auto& t : threads) {
t.join();
}
auto end = std::chrono::system_clock::now();
auto elapsed_seconds = std::chrono::duration<double>(end - start).count();
std::cout << "computation time: " << elapsed_seconds << "s" << std::endl;
start = std::chrono::system_clock::now();
constexpr int buffer_size = 131072;
char buffer[buffer_size];
std::ofstream out("matrix.csv");
out.rdbuf()->pubsetbuf(buffer, buffer_size);
for (int i = 0; i < x_test.size(); ++i) {
for (int j = 0; j < i + 1; ++j) {
if (j != 0) {
out << ',';
}
out << kMatrix[i * x_test.size() + j];
}
if (i != x_test.size() - 1) {
out << '\n';
}
}
end = std::chrono::system_clock::now();
elapsed_seconds = std::chrono::duration<double>(end - start).count();
std::cout << "writing time: " << elapsed_seconds << "s" << std::endl;
}
OK, I've written an implementation with optimized formatting.
Using @Nelfeal's code, a full run took around 250 seconds on my system, with the write time dominating by far; or rather, with the std::ofstream formatting taking most of the time.
I've written a C++20 version using std::format_to/std::format. The multi-threaded version takes around 25-40 seconds to complete all the computations, formatting, and writing. Run in a single thread, it takes around 70 seconds on my system. The same performance should be achievable with the {fmt} library on C++11/14/17.
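(With {fmt}, the change in the loop below would essentially be calling fmt::format_to(std::back_inserter(buffer), ...) instead of std::format_to, assuming the library is available on your toolchain.)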
Here is the code:
import <vector>;
import <thread>;
import <iostream>;
import <string>;
import <cmath>;
import <array>;
import <random>;
import <fstream>;
import <chrono>;
import <format>;
import <filesystem>;
using Point3D = std::array<double, 3>;
auto generateSampleData(Point3D scale) -> std::vector<Point3D>
{
static std::minstd_rand g(std::random_device{}());
std::uniform_real_distribution<> d(-1.0, 1.0);
std::vector<Point3D> data;
data.reserve(27000);
for (auto i = 0; i < 27000; ++i)
{
data.push_back({ d(g)* scale[0], d(g)* scale[1], d(g)* scale[2] });
}
return data;
}
double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
double sum = 0.0;
for (auto i = 0u; i < 3u; ++i) {
double distance = (x1[i] - x2[i]) / lengthScale[i];
sum += distance * distance;
}
return sigma0sq * std::exp(-0.5 * sum);
}
void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::filesystem::path localPath)
{
using namespace std::string_view_literals;
std::vector<char> buffer;
buffer.reserve(15'000);
std::ofstream out(localPath);
std::cout << std::format("starting thread {}: from {} to {}\n"sv, threadCounter, start, stop);
for (int i = start; i < stop; ++i)
{
for (int j = 0; j < i; ++j)
{
double kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
std::format_to(std::back_inserter(buffer), "{:.6g}, "sv, kij);
}
double kii = seKernel(xtest[i], xtest[i], lengthScale, sigma0sq);
std::format_to(std::back_inserter(buffer), "{:.6g}\n"sv, kii);
out.write(buffer.data(), buffer.size());
buffer.clear();
}
}
int main() {
double sigma0sq = 1;
Point3D lengthScale = { 0.7633, 0.6937, 3.3307e+07 };
const std::vector<Point3D> x_test = generateSampleData(lengthScale);
/* Finding data slices of similar size */
//This piece of code works, each thread is assigned roughly the same number of matrix entries
int numElements = x_test.size() * (x_test.size()+1) / 2;
const int numThreads = 3;
int elemsPerThread = numElements / numThreads;
std::vector<int> indices;
int j = 0;
for (std::size_t i = 1; i < x_test.size() + 1; ++i) {
int prod = i * (i + 1) / 2 - j * (j + 1) / 2;
if (prod > elemsPerThread) {
i--;
j = i;
indices.push_back(i);
if (indices.size() == numThreads - 1)
break;
}
}
indices.insert(indices.begin(), 0);
indices.push_back(x_test.size());
auto start = std::chrono::system_clock::now();
std::vector<std::thread> threads;
using namespace std::string_view_literals;
for (std::size_t i = 1; i < indices.size(); ++i)
{
threads.push_back(std::thread(calculateKMatrixCpp, std::ref(x_test), lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::format("./matrix_{}.csv"sv, i-1)));
}
for (auto& t : threads)
{
t.join();
}
auto end = std::chrono::system_clock::now();
auto elapsed_seconds = std::chrono::duration<double>(end - start);
std::cout << std::format("total elapsed time: {}"sv, elapsed_seconds);
return 0;
}
Note: I used 6 digits of precision here as it is the default for std::ofstream. More digits means more writing time to disk and lower performance.
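If you do need more digits, the format string is the only thing to change, e.g. "{:.9g}" instead of "{:.6g}"; in the std::ofstream-based version the equivalent would be out << std::setprecision(9) (from <iomanip>) before writing.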
I am new to CUDA. I am trying to solve the wave equation with an initial condition in the form of a Ricker wavelet. The performance of the code is 12 GFlops, although my GPU's peak is 3900. Why is my code so inefficient, and how can I fix it?
main.cu
#include <iostream>
#include <cmath>
#include "step.cu"
#include <cuda.h>
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
if (argc <= 3)
{
perror("Error in argc: argc<=3 (wait h, tau, C) \n");
exit(1);
}
char *eptr;
errno = 0;
long long int size,tmax;
double tau,cour,h,C, cour2;
h = std::strtod(argv[1], &eptr);
tau = std::strtod(argv[2], &eptr);
C = std::strtod(argv[3], &eptr);
tmax = 2000;
cour = C*tau/h;
cour2 = cour* cour;
size = 18*13*1024;
double *nxt_layer=nullptr;
double *layer_1=nullptr;
double *layer_2=nullptr;
double *rev_layer=nullptr;
dim3 blockSize = dim3(1024);
dim3 gridSize = dim3(size/blockSize.x);
float time;
cudaTimer timer;
cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_1, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_2, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
for (int i = 0; i < size; ++i)
{
layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
}
for (int i = 1; i < size/2; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
for (int i = size/2; i < size-1; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.25*0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
for (int i = 0; i < size-1; ++i)
{
layer_2[i] = layer_1[i];
layer_1[i] = nxt_layer[i];
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
timer.start();
for (double t = 0; t < tmax; t=t+tau)
{
step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
if (CHECK_ERROR(cudaDeviceSynchronize()))
throw(-1);
nxt_layer[size-1]=0;
nxt_layer[0]=0;
}
time = timer.stop();
for (int i = 0; i < size; ++i)
{
cout<<i*h<<" "<<nxt_layer[i]<<endl;
}
}
step.cu
inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
    return __fmaf_rd(cour2, layer_1_tmp[0]+layer_1_tmp[2], __fmaf_rd(2.0-2*cour2, layer_1_tmp[1], -layer_2_tmp));
}

__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
    int node = threadIdx.x + blockDim.x * blockIdx.x;
    if(node >= Nx-1 || node <= 0) return;

    double layer_1_tmp[3];
    layer_1_tmp[0] = layer_1[node-1];
    layer_1_tmp[1] = layer_1[node];
    layer_1_tmp[2] = layer_1[node+1];
    double layer_2_tmp = layer_2[node];

    if(node <= Nx/2)
    {
        tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
    }
    else
    {
        tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
    }
    layer_2[node] = layer_1[node];
    layer_1[node] = tmp_layer[node];
}
I calculate GFlops as
long long int perfomance = size*tmax/tau;
long long int perftime = 1000*perfomance/time;
double gflops =(8*perfomance/time)/1000000;
I would be grateful for any of your comments and tips.
In the kernel, each work-item does only a few multiplications and additions. That is negligible compared to the kernel launch overhead per CUDA thread and the memory access latency per layer_1 element. It's the equivalent of measuring a few nanoseconds within microseconds of kernel time. Try a clock() measurement around the compute() function calls. It would at least give you a "cycles per compute" figure, and from that you can estimate the total performance during the compute calls.
clock_t c1 = clock();
compute();
clock_t c2 = clock();
timings[node] = c2-c1;
Even this is not a true performance measurement, as it doesn't take pipelining into consideration when multiple compute calls are made one after another. You may add another compute call after the first one and gain even more performance thanks to pipelining and latency hiding.
Many (more consumer-oriented or semi-professional) graphics cards have much better single-precision than double-precision performance. The single-precision performance of the GTX 970, for example, is 32x its double-precision performance.
Change the data types used from double to float.
Please check out my code and the question below. Thanks!
Code:
#include <iostream>
#include <chrono>

using namespace std;

int bufferWriteIndex = 0;
float curSample = 0;
float damping[5] = { 1, 1, 1, 1, 1 };

float modeDampingTermsExp[5] = { 0.447604, 0.0497871, 0.00247875, 0.00012341, 1.37263e-05 };
float modeDampingTermsExp2[5] = { -0.803847, -3, -6, -9, -11.1962 };

int main(int argc, char** argv) {
    float subt = 0;
    int subWriteIndex = 0;

    auto now = std::chrono::high_resolution_clock::now();
    while (true) {
        curSample = 0;
        for (int i = 0; i < 5; i++) {
            //Slow version
            damping[i] = damping[i] * modeDampingTermsExp2[i];
            //Fast version
            //damping[i] = damping[i] * modeDampingTermsExp[i];

            float cosT = 2 * damping[i];
            for (int m = 0; m < 5; m++) {
                curSample += cosT;
            }
        }
        //t += tIncr;
        bufferWriteIndex++;

        //measure calculations per second
        auto elapsed = std::chrono::high_resolution_clock::now() - now;
        if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
            now = std::chrono::high_resolution_clock::now();
            int idx = bufferWriteIndex;
            cout << idx - subWriteIndex << endl;
            subWriteIndex = idx;
        }
    }
}
As you can see, I'm measuring the number of calculations, i.e. increments of bufferWriteIndex, per second.
Question:
Why is performance faster when using modeDampingTermsExp -
Program output:
12625671
12285846
12819392
11179072
12272587
11722863
12648955
vs using modeDampingTermsExp2 ?
1593620
1668170
1614495
1785965
1814576
1851797
1808568
1801945
It's about 10x faster. It seems like the numbers in those 2 arrays have an impact on calculation time. Why?
I am using Visual Studio 2019 with the following flags: /O2 /Oi /Ot /fp:fast
This is because you are hitting denormal numbers (also see this question).
You can get rid of denormals like so:
#include <cmath>

// [...]

for (int i = 0; i < 5; i++) {
    damping[i] = damping[i] * modeDampingTermsExp2[i];
    if (std::fpclassify(damping[i]) == FP_SUBNORMAL) {
        damping[i] = 0; // Treat denormals as 0.
    }

    float cosT = 2 * damping[i];
    for (int m = 0; m < 5; m++) {
        curSample += cosT;
    }
}
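If you prefer not to test each value, another option (a sketch assuming an x86/SSE target, which is what MSVC uses for float math on x64) is to enable flush-to-zero and denormals-are-zero for the thread, which avoids the extra branch entirely:

#include <xmmintrin.h>   // _MM_SET_FLUSH_ZERO_MODE
#include <pmmintrin.h>   // _MM_SET_DENORMALS_ZERO_MODE

// At the start of main(), before the loop:
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);          // results that would be denormal become 0
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  // denormal inputs are treated as 0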
I'm getting started in multithreaded programming, so please excuse me if the following seems obvious. I am adding multithreading to an image processing program and the speedup isn't exactly the one I expected.
I'm currently getting a speedup of about 4x on a CPU with 4 physical cores and hyperthreading (8 logical cores), so I'd like to know if this kind of speedup is expected. The only thing I can think of is that it may make sense if both hyperthreads of a single physical core have to share some sort of memory bus.
Being new to multithreading, it's not entirely clear to me whether this would be considered an I/O-bound program, considering that all memory is allocated in RAM (I understand that the virtual memory manager of my OS will be the one deciding to page this memory in or out of the heap). My machine has 16 GB of RAM, in case that helps decide whether paging/swapping can be an issue.
I've written a test program showcasing the serial case and two parallel cases, using QThreadPool and tbb::parallel_for.
The current program, as you can see, has no real operations other than setting a supposed image from black to white; this is done on purpose to establish the baseline before any real operations are applied to the image.
I'm attaching the program in the hope that someone can explain to me whether my quest for a roughly 8x speedup is a lost cause with this kind of processing algorithm. Note that I'm not interested in other kinds of optimizations such as SIMD, as my real concern is not just to make it faster, but to make it faster using purely multithreading, without getting into SSE or processor-cache-level optimizations.
#include <iostream>
#include <sys/time.h>
#include <vector>
#include <algorithm>   // std::find, std::min
#include <cstring>     // memset
#include <cstdlib>     // atoi
#include <QThreadPool>
#include "/usr/local/include/tbb/tbb.h"
#define LOG(x) (std::cout << x << std::endl)
struct col4
{
unsigned char r, g, b, a;
};
class QTileTask : public QRunnable
{
public:
void run()
{
for(uint32_t y = m_yStart; y < m_yEnd; y++)
{
int rowStart = y * m_width;
for(uint32_t x = m_xStart; x < m_xEnd; x++)
{
int index = rowStart + x;
m_pData[index].r = 255;
m_pData[index].g = 255;
m_pData[index].b = 255;
m_pData[index].a = 255;
}
}
}
col4* m_pData;
uint32_t m_xStart;
uint32_t m_yStart;
uint32_t m_xEnd;
uint32_t m_yEnd;
uint32_t m_width;
};
struct TBBTileTask
{
void operator()()
{
for(uint32_t y = m_yStart; y < m_yEnd; y++)
{
int rowStart = y * m_width;
for(uint32_t x = m_xStart; x < m_xEnd; x++)
{
int index = rowStart + x;
m_pData[index].r = 255;
m_pData[index].g = 255;
m_pData[index].b = 255;
m_pData[index].a = 255;
}
}
}
col4* m_pData;
uint32_t m_xStart;
uint32_t m_yStart;
uint32_t m_xEnd;
uint32_t m_yEnd;
uint32_t m_width;
};
struct TBBCaller
{
TBBCaller(std::vector<TBBTileTask>& t)
: m_tasks(t)
{}
TBBCaller(TBBCaller& e, tbb::split)
: m_tasks(e.m_tasks)
{}
void operator()(const tbb::blocked_range<size_t>& r) const
{
for (size_t i=r.begin();i!=r.end();++i)
m_tasks[i]();
}
std::vector<TBBTileTask>& m_tasks;
};
inline double getcurrenttime( void )
{
timeval t;
gettimeofday(&t, NULL);
return static_cast<double>(t.tv_sec)+(static_cast<double>(t.tv_usec) / 1000000.0);
}
char* getCmdOption(char ** begin, char ** end, const std::string & option)
{
char ** itr = std::find(begin, end, option);
if (itr != end && ++itr != end)
{
return *itr;
}
return 0;
}
bool cmdOptionExists(char** begin, char** end, const std::string& option)
{
return std::find(begin, end, option) != end;
}
void baselineSerial(col4* pData, int resolution)
{
double t = getcurrenttime();
for(int y = 0; y < resolution; y++)
{
int rowStart = y * resolution;
for(int x = 0; x < resolution; x++)
{
int index = rowStart + x;
pData[index].r = 255;
pData[index].g = 255;
pData[index].b = 255;
pData[index].a = 255;
}
}
LOG((getcurrenttime() - t) * 1000 << " ms. (Serial)");
}
void baselineParallelQt(col4* pData, int resolution, uint32_t tileSize)
{
double t = getcurrenttime();
QThreadPool pool;
for(int y = 0; y < resolution; y+=tileSize)
{
for(int x = 0; x < resolution; x+=tileSize)
{
uint32_t xEnd = std::min<uint32_t>(x+tileSize, resolution);
uint32_t yEnd = std::min<uint32_t>(y+tileSize, resolution);
QTileTask* t = new QTileTask;
t->m_pData = pData;
t->m_xStart = x;
t->m_yStart = y;
t->m_xEnd = xEnd;
t->m_yEnd = yEnd;
t->m_width = resolution;
pool.start(t);
}
}
pool.waitForDone();
LOG((getcurrenttime() - t) * 1000 << " ms. (QThreadPool)");
}
void baselineParallelTBB(col4* pData, int resolution, uint32_t tileSize)
{
double t = getcurrenttime();
std::vector<TBBTileTask> tasks;
for(int y = 0; y < resolution; y+=tileSize)
{
for(int x = 0; x < resolution; x+=tileSize)
{
uint32_t xEnd = std::min<uint32_t>(x+tileSize, resolution);
uint32_t yEnd = std::min<uint32_t>(y+tileSize, resolution);
TBBTileTask t;
t.m_pData = pData;
t.m_xStart = x;
t.m_yStart = y;
t.m_xEnd = xEnd;
t.m_yEnd = yEnd;
t.m_width = resolution;
tasks.push_back(t);
}
}
TBBCaller caller(tasks);
tbb::task_scheduler_init init;
tbb::parallel_for(tbb::blocked_range<size_t>(0, tasks.size()), caller);
LOG((getcurrenttime() - t) * 1000 << " ms. (TBB)");
}
int main(int argc, char** argv)
{
int resolution = 1;
uint32_t tileSize = 64;
char * pResText = getCmdOption(argv, argv + argc, "-r");
if (pResText)
{
resolution = atoi(pResText);
}
char * pTileSizeChr = getCmdOption(argv, argv + argc, "-b");
if (pTileSizeChr)
{
tileSize = atoi(pTileSizeChr);
}
if(resolution > 16)
resolution = 16;
resolution = resolution << 10;
uint32_t tileCount = resolution/tileSize + 1;
tileCount *= tileCount;
LOG("Resolution: " << resolution << " Tile Size: "<< tileSize);
LOG("Tile Count: " << tileCount);
uint64_t pixelCount = resolution*resolution;
col4* pData = new col4[pixelCount];
memset(pData, 0, sizeof(col4)*pixelCount);
baselineSerial(pData, resolution);
memset(pData, 0, sizeof(col4)*pixelCount);
baselineParallelQt(pData, resolution, tileSize);
memset(pData, 0, sizeof(col4)*pixelCount);
baselineParallelTBB(pData, resolution, tileSize);
delete[] pData;
return 0;
}
Yes, a 4x speedup is expected. Hyperthreading is a kind of time sharing implemented in hardware, so you can't expect to benefit from it when one thread is already using up all the superscalar pipelines available on the core, as is the case here. The other thread will necessarily have to wait.
You can expect an even lower speedup if your memory bus bandwidth is saturated by the threads before all available cores are busy. That usually happens if you have too many cores, like in this question:
Why doesn't this code scale linearly?
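To put some rough numbers on the memory-bandwidth point (a back-of-the-envelope estimate, not a measurement): at the maximum resolution of 16384x16384, the col4 buffer is 16384 * 16384 * 4 bytes = 1 GiB, and each benchmark pass writes every byte of it once. If the machine sustains on the order of 10-20 GB/s to main memory, that is roughly 50-100 ms per pass no matter how many threads issue the stores, so once a handful of threads can generate that much write traffic, adding more threads cannot help.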
I'm optimizing this function. I have tried everything I can think of, even SSE, and I modified the code to return from different positions to see which part takes the time; in the end I found that most of the time is spent on the boolean test below. Even if I replace all the code inside the if statement with a simple add operation, it still costs 6000 ms.
My platform is gcc 4.7.1 on an E5506 CPU. The inputs 'a' and 'b' are int arrays of size 1000, and 'asize' and 'bsize' are the corresponding array sizes. MATCH_MASK = 16383, and I run the function 100000 times to gather timing statistics. Is there a good way to attack this problem? Thank you!
if (aoffsets[i] && boffsets[i]) // this line costs most time
Code:
uint16_t aoffsets[DOUBLE_MATCH_MASK] = {0}; // important! or it will only be right on the first time
uint16_t* boffsets = aoffsets + MATCH_MASK;
uint8_t* seen = (uint8_t *)aoffsets;

auto fn_init_offsets = [](const int32_t* x, int n_size, uint16_t offsets[]) -> void
{
    for (int i = 0; i < n_size; ++i)
        offsets[MATCH_STRIP(x[i])] = i;
};
fn_init_offsets(a, asize, aoffsets);
fn_init_offsets(b, bsize, boffsets);

uint8_t topcount = 0;
int topoffset = 0;
{
    std::vector<uint8_t> count_vec(asize + bsize + 1, 0); // it's the fastest way already, very near to tls
    uint8_t* counts = &(count_vec[0]);
    //return aoffsets[0]; // cost 1375 ms
    for (int i = 0; i < MATCH_MASK; ++i)
    {
        if (aoffsets[i] && boffsets[i]) // this line costs most time
        {
            //++affsets[i]; // for test
            int offset = (aoffsets[i] -= boffsets[i]);
            if ((-n_maxoffset <= offset && offset <= n_maxoffset))
            {
                offset += bsize;
                uint8_t n_cur_count = ++counts[offset];
                if (n_cur_count > topcount)
                {
                    topcount = n_cur_count;
                    topoffset = offset;
                }
            }
        }
    }
}
return aoffsets[0]; // cost 6000ms
First, a memset(count_vec, 0, N) of a memory-aligned buffer wins over the std::vector initialization by 30%.
You can try the branchless expression (aoffsets[i] * boffsets[i]) (a short sketch follows below) and compute some of the possibly-unused expressions speculatively: offset = aoffsets[i] - boffsets[i]; offset + bsize; offset + n_maxoffset;.
Depending on the typical range of offset, one could be tempted to track the min/max of (offset + bsize) to restrict the memset(count_vec) needed for the next iteration: there is no need to clear values that are already zero.
As pointed out by philipp, it's good to interleave the operations; then again, one can read both aoffsets[i] and boffsets[i] simultaneously from a single uint32_t aboffset[N]. With some clever bit masking (that generates a change mask for aoffsets[i], aoffsets[i+1]) one could possibly handle 2 sets in parallel using 64-bit simulated SIMD in pure C (up to the histogram accumulation part).
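To illustrate the branchless test mentioned above, the loop could look something like this (an untested sketch; since the stored values are array indices below 1000, the product cannot overflow an int):

for (int i = 0; i < MATCH_MASK; ++i)
{
    // Non-zero product exactly when both offsets are non-zero; one test instead of
    // the short-circuiting &&, so only a single branch remains.
    if (aoffsets[i] * boffsets[i])
    {
        int offset = aoffsets[i] - boffsets[i];
        // ... same body as before ...
    }
}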
You can increase the speed of your program by reducing the cache misses: aoffsets[i] and boffsets[i] are relatively far away from each other in memory. By placing them next to each other, you speed up the program significantly. On my machine (E5400 CPU, VS2012) the execution time is reduced from 3.0 seconds to 2.3 seconds:
#include <vector>
#include <windows.h>
#include <iostream>
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
#define MATCH_MASK 16383
#define DOUBLE_MATCH_MASK (MATCH_MASK*2)
static const int MATCH_BITS = 14;
static const int MATCH_LEFT = (32 - MATCH_BITS);
#define MATCH_STRIP(x) ((uint32_t)(x) >> MATCH_LEFT)
static const int n_maxoffset = 1000;
uint16_t test(int32_t* a, int asize, int32_t* b, int bsize)
{
uint16_t offsets[DOUBLE_MATCH_MASK] = {0};
auto fn_init_offsets = [](const int32_t* x, int n_size, uint16_t offsets[])->void
{
for (int i = 0; i < n_size; ++i)
offsets[MATCH_STRIP(x[i])*2 /*important. leave space for other offsets*/] = i;
};
fn_init_offsets(a, asize, offsets);
fn_init_offsets(b, bsize, offsets+1);
uint8_t topcount = 0;
int topoffset = 0;
{
std::vector<uint8_t> count_vec(asize + bsize + 1, 0);
uint8_t* counts = &(count_vec[0]);
for (int i = 0; i < MATCH_MASK; i+=2)
{
if (offsets[i] && offsets[i+1])
{
int offset = (offsets[i] - offsets[i+1]); //NOTE: I removed the -= because otherwise offset would always be positive!
if ((-n_maxoffset <= offset && offset <= n_maxoffset))
{
offset += bsize;
uint8_t n_cur_count = ++counts[offset];
if (n_cur_count > topcount)
{
topcount = n_cur_count;
topoffset = offset;
}
}
}
}
}
return offsets[0];
}
int main(int argc, char* argv[])
{
const int sizes = 1000;
int32_t* a = new int32_t[sizes];
int32_t* b = new int32_t[sizes];
for (int i=0;i<sizes;i++)
{
a[i] = rand()*rand();
b[i] = rand()*rand();
}
//Variablen
LONGLONG g_Frequency, g_CurentCount, g_LastCount;
QueryPerformanceFrequency((LARGE_INTEGER*)&g_Frequency);
QueryPerformanceCounter((LARGE_INTEGER*)&g_CurentCount);
int sum = 0;
for (int i=0;i<100000;i++)
{
sum += test(a,sizes,b,sizes);
}
QueryPerformanceCounter((LARGE_INTEGER*)&g_LastCount);
double dTimeDiff = (((double)(g_LastCount-g_CurentCount))/((double)g_Frequency));
std::cout << "Result: " << sum << std::endl <<"time: " << dTimeDiff << std::endl;
delete[] a;
delete[] b;
return 0;
}
compared to your version of test().
#include <vector>
#include <windows.h>
#include <iostream>
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
#define MATCH_MASK 16383
#define DOUBLE_MATCH_MASK (MATCH_MASK*2)
static const int MATCH_BITS = 14;
static const int MATCH_LEFT = (32 - MATCH_BITS);
#define MATCH_STRIP(x) ((uint32_t)(x) >> MATCH_LEFT)
static const int n_maxoffset = 1000;
uint16_t test(int32_t* a, int asize, int32_t* b, int bsize)
{
uint16_t aoffsets[DOUBLE_MATCH_MASK] = {0}; // important! or it will only be right on the first time
uint16_t* boffsets = aoffsets + MATCH_MASK;
auto fn_init_offsets = [](const int32_t* x, int n_size, uint16_t offsets[])->void
{
for (int i = 0; i < n_size; ++i)
offsets[MATCH_STRIP(x[i])] = i;
};
fn_init_offsets(a, asize, aoffsets);
fn_init_offsets(b, bsize, boffsets);
uint8_t topcount = 0;
int topoffset = 0;
{
std::vector<uint8_t> count_vec(asize + bsize + 1, 0);
uint8_t* counts = &(count_vec[0]);
for (int i = 0; i < MATCH_MASK; ++i)
{
if (aoffsets[i] && boffsets[i])
{
int offset = (aoffsets[i] - boffsets[i]); //NOTE: I removed the -= because otherwise offset would always be positive!
if ((-n_maxoffset <= offset && offset <= n_maxoffset))
{
offset += bsize;
uint8_t n_cur_count = ++counts[offset];
if (n_cur_count > topcount)
{
topcount = n_cur_count;
topoffset = offset;
}
}
}
}
}
return aoffsets[0];
}
int main(int argc, char* argv[])
{
const int sizes = 1000;
int32_t* a = new int32_t[sizes];
int32_t* b = new int32_t[sizes];
for (int i=0;i<sizes;i++)
{
a[i] = rand()*rand();
b[i] = rand()*rand();
}
LONGLONG g_Frequency, g_CurentCount, g_LastCount;
QueryPerformanceFrequency((LARGE_INTEGER*)&g_Frequency);
QueryPerformanceCounter((LARGE_INTEGER*)&g_CurentCount);
int sum = 0;
for (int i=0;i<100000;i++)
{
sum += test(a,sizes,b,sizes);
}
QueryPerformanceCounter((LARGE_INTEGER*)&g_LastCount);
double dTimeDiff = (((double)(g_LastCount-g_CurentCount))/((double)g_Frequency));
std::cout << "Result: " << sum << std::endl <<"time: " << dTimeDiff << std::endl;
delete[] a;
delete[] b;
return 0;
}