radix select using cuda - c++

I have been working to develop a radix select using CUDA which utilizes k smallest element to sort given number of elements. The main idea behind this radix select is that is scans through 32 bit integer starting from its MSB to LSB. It partitions all 0 bit on left side and all 1 bit on the right side. The side with contains k smallest elements is solved recursively. My partition process works just fine but I am having problem dealing with recursive function calls. I am unable to stop the recursion. Please help me on that!
My kernel function looks like this: This is kernel.h
#include "header.h"
#define WARP_SIZE 32
#define BLOCK_SIZE 32
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N / WARP_SIZE;
int pivot;
__shared__ int DataPartition[BLOCK_SIZE];
__shared__ int DataBinary[WARP_SIZE];
for(int i = 0; i < NumWarps; i++)
{
if(LocWarpID >= firstidx && LocWarpID <=lastidx)
{
int r = d_DataIn[i * WARP_SIZE + LocWarpID];
int p = (r>>(31-bit))&1;
unsigned int B = __ballot(p);
unsigned int B_flip = ~B;
if(p==1)
{
int b = B << (32-LocWarpID);
int RightLoc = __popc(b);
DataPartition[lastidx - RightLoc] = r;
}
else
{
int b_flip = B_flip << (32 - LocWarpID);
int LeftLoc = __popc(b_flip);
DataPartition[LeftLoc] = r;
}
if(LocWarpID <= lastidx - __popc(B))
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
else
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
pivot = lastidx - __popc(B);
return pivot+1;
}
}
}
__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
if(firstidx == lastidx)
return *d_DataIn;
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
int length = q - firstidx;
if(k == length)
return *d_DataIn;
else if(k < length)
return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1);
else
return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1);
}
__global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}
Host code is main.cu and it looks like:
#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"
#define BLOCK_SIZE 32
using namespace std;
int main()
{
int N = 32;
thrust::host_vector<float>h_HostFloat(N);
thrust::counting_iterator <unsigned int> Numbers(0);
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(), RandomFloatNumbers(1.f, 100.f));
thrust::host_vector<int>h_HostInt(N);
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());
thrust::device_vector<float>d_DeviceFloat = h_HostFloat;
thrust::device_vector<int>d_DeviceInt(N);
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());
int *h_DataOut;
float *h_DataOut1;
int fsize = N * sizeof(float);
int size = N * sizeof(int);
h_DataOut = new int[size];
h_DataOut1 = new float[fsize];
int firstidx = 0;
int lastidx = BLOCK_SIZE-1;
int k = 20;
int bit = 1;
int NUM_BLOCKS = N / BLOCK_SIZE;
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);
WriteData(h_DataOut1, h_DataOut, 10, N);
return 0;
}
List of headers that I used:
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/generate.h>
#include "functor.h"
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
Another header file "functor.h" to convert floating point numbers to int type and to generate random floating numbers.
#include <thrust/random.h>
#include <sstream>
#include <fstream>
#include <iomanip>
struct RandomFloatNumbers
{
float a, b;
__host__ __device__
RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {};
__host__ __device__
float operator() (const unsigned int n) const{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a,b);
rng.discard(n);
return dist(rng);
}
};
struct FloatToInt
{
__host__ __device__
int operator() (const float &x)
const {
union {
float f_value;
int i_value;
} value;
value.f_value = x;
return value.i_value;
}
};
float IntToFloat(int &x)
{
union{
float f_value;
int i_value;
}value;
value.i_value = x;
return value.f_value;
}
bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
{
std::ofstream data;
std::stringstream file;
file << "out\\Partition_";
file << std::setfill('0') <<std::setw(2) << bit;
file << ".txt";
data.open((file.str()).c_str());
if(data.is_open() == false)
{
std::cout << "File is not open" << std::endl;
return false;
}
for(int i = 0; i < N; i++)
{
h_DataOut1[i] = IntToFloat(h_DataOut[i]);
//cout << h_HostFloat[i] << " \t" << h_DataOut1[i] << endl;
//std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
//data << bitshift[31-bit] << "\t" <<h_DataOut1[i] <<std::endl;
data << h_DataOut1[i] << std::endl;
}
data << std::endl;
data.close();
std::cout << "Partition=" <<bit <<"\n";
return true;
}

Per your request, I'm posting the code I used to investigate this and help me in studying your code.
#include <stdio.h>
#include <stdlib.h>
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){
int one = 0;
int valid = 0;
int my_one, my_zero;
if (partition[idx]){
valid = 1;
if(data[idx] & (1ULL<<(31-bit))) one=1;}
__syncthreads();
if (valid){
if (one){
my_one=1;
my_zero=0;}
else{
my_one=0;
my_zero=1;}
}
else{
my_one=0;
my_zero=0;}
ones[idx]=my_one;
zeroes[idx]=my_zero;
unsigned int warp_one = __popc(__ballot(my_one));
if (!(threadIdx.x & 31))
warp_ones[threadIdx.x>>5] = warp_one;
__syncthreads();
// reduce
for (int i = 16; i > 0; i>>=1){
if (threadIdx.x < i)
warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i];
__syncthreads();}
return warp_ones[0];
}
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){
__shared__ unsigned int loc_data[1024];
__shared__ unsigned int loc_ones[1024];
__shared__ unsigned int loc_zeroes[1024];
__shared__ unsigned int loc_warp_ones[32];
int l=0;
int bit = 0;
unsigned int u = n;
if (n<2){
if ((n == 1) && !(threadIdx.x)) *result = data[0];
return;}
loc_data[threadIdx.x] = data[threadIdx.x];
loc_ones[threadIdx.x] = (threadIdx.x<n)?1:0;
__syncthreads();
unsigned int *next = loc_ones;
do {
int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones);
if ((u-s) > m){
u = (u-s);
next = loc_zeroes;}
else{
l = (u-s);
next = loc_ones;}}
while ((u != l) && (bit<32));
if (next[threadIdx.x]) *result = loc_data[threadIdx.x];
}
int partition(unsigned int *data, int l, int u, int bit){
unsigned int *temp = (unsigned int *)malloc(((u-l)+1)*sizeof(unsigned int));
int pos = 0;
for (int i = l; i<=u; i++)
if(data[i] & (1ULL<<(31-bit))) temp[pos++] = data[i];
int result = u-pos;
for (int i = l; i<=u; i++)
if(!(data[i] & (1ULL<<(31-bit)))) temp[pos++] = data[i];
pos = 0;
for (int i = u; i>=l; i--)
data[i] = temp[pos++];
free(temp);
return result;
}
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){
if (l == u) return(data[l]);
if (bit > 32) {printf("radixselect fail!\n"); return 0;}
int s = partition(data, l, u, bit);
if (s>=m) return radixselect(data, l, s, m, bit+1);
return radixselect(data, s+1, u, m, bit+1);
}
int main(){
unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7};
unsigned int data1[8];
for (int i = 0; i<8; i++){
for (int j=0; j<8; j++) data1[j] = data[j];
printf("value[%d] = %d\n", i, radixselect(data1, 0, 7, i, 0));}
unsigned int *d_data;
cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int));
unsigned int h_result, *d_result;
cudaMalloc((void **)&d_result, sizeof(unsigned int));
cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 8; i++){
gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
printf("gpu result index %d = %d\n", i, h_result);
}
unsigned int data2[1024];
unsigned int data3[1024];
for (int i = 0; i < 1024; i++) data2[i] = rand();
cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 1024; i++){
for (int j = 0; j<1024; j++) data3[j] = data2[j];
unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0);
gpu_radixkernel<<<1,1024>>>(d_data, i, 1024, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %d, gpu: %d\n", i, cpuresult, h_result); return 1;}
}
printf("Finished\n");
return 0;
}
Here are some notes, in no particular order:
I got rid of all your thrust code, it's not doing anything useful as far as the radix select algorithm is concerned. I also find your casting of float to int curious. I haven't thought through the ramifications of trying to do a bitwise radix select in order on a sequence of exponent bits followed by a sequence of mantissa bits. It might work, (although I think if you include the sign bit, it definitely won't work) but again I don't think it's central to understanding the algorithm.
I included a host version that I wrote just to check my device results.
I'm pretty sure this algorithm will fail in some cases where there are duplicated elements. For example, if you hand it a vector of all zeroes, I think it will fail. I don't think it would be difficult to handle that case however.
my host version is recursive, but my device version is not. I don't see that recursion is that useful here, since the non-recursive form of the algorithm is easy to write as well, especially since there are at most 32 bits to travel through. Still, if you wanted to create a recursive device version, it should not be difficult, by incorporating the u,s, and l manipulation code inside the partition function.
I have dispensed with typical cuda error checking. However I recommend it.
I don't consider this to be a paragon of cuda programming. If you delve into for example a radix sort algorithm (such as here), you will see that it is pretty complex. A fast GPU radix select would look nothing like my code. I wrote my code to be analogous to the serial recursive partitioned radix sort, which is not the best way to do it on a massively parallel architecture.
Since radix select is not a sort, I attempted to write a device code that would do no data movement of the input data, since I considered this to be expensive and unnecessary. I do a single read from global memory for the data at the beginning of the kernel, and thereafter I do all work out of shared memory, and even in shared memory I am not re-arranging the data (as I do in my host version) so as to avoid the cost of data movement. Instead I keep flag arrays of ones and zeroes partitions, to feed to the next partitioning step. The data movement would involve a fair amount of uncoalesced and/or bank-conflicted traffic, whereas the flag arrays allow all accesses to be non-bank-conflicted.

Related

Undefined behavior with determinist procedure

I am currently trying to implement a "cave generation" as a 2D array following the "Game of Life" ideas. The idea is as follow:
I have a 2d vector of 0s and 1s (which respectively represent air and block) randomly generated with a uniform_real_distribution with density (here 0.45, so 45% of the array will be 1).
After this we iterate x times on the array. An iteration looks as follow:
First, we copy the array on a new one.
Second, we iterate on the old array as follow: We look at the number of blocks on the neighbourhood of the block we're at, and depending on two things we do this:
IF the current tile is air and has more than 4 blocks in its neighbourhood (-1,-1) to (1,1) excluding himself, change it to a block in the NEW ARRAY
IF the current tile is a block and has less than 3 blocks in its neighbourhood, change it to air in the NEW ARRAY
Copy the new array in the old array
The problem is, that EVEN when I seed my uniform law with a determinist seed, sometimes (1 time over 3), the map will be completely filled with blocks after two or three iterations. I have literally 0 idea of why after looking at my code for many hours, and this is why I am here. There is the code:
cavefactory.h
#ifndef CAVEFACTORY_H_
#define CAVEFACTORY_H_
#include <vector>
namespace cavegenerator {
// define cave_t as a 2d vector of integers
using cave_t = std::vector<std::vector<int>>;
// constants
namespace DEFAULT {
constexpr unsigned short int WIDTH = 64;
constexpr unsigned short int HEIGHT = 64;
constexpr float DENSITY = 0.45;
constexpr unsigned short int BIRTH_LIMIT = 4;
constexpr unsigned short int DEATH_LIMIT = 3;
} // namespace DEFAULT
class CaveFactory {
public:
CaveFactory(unsigned short int width = DEFAULT::WIDTH,
unsigned short int height = DEFAULT::HEIGHT,
float density = DEFAULT::DENSITY);
// makes a cave with the desired number of iterations and parameters
static cave_t MakeCave(unsigned short int width = DEFAULT::WIDTH,
unsigned short int height = DEFAULT::HEIGHT,
float density = DEFAULT::DENSITY,
int iterations = 3,
unsigned short int bl = DEFAULT::BIRTH_LIMIT,
unsigned short int dl = DEFAULT::DEATH_LIMIT);
// implemented in case of generalization of cave(more than two blocks)
bool isSolid(int i, int j);
cave_t getCave();
void Print();
void Iterate( unsigned short int bl = DEFAULT::BIRTH_LIMIT,
unsigned short int dl = DEFAULT::DEATH_LIMIT );
private:
cave_t cave_;
int NumberOfNeighbours(int i, int j);
void Initialize(float density = DEFAULT::DENSITY);
};
} // namespace cavegenerator
#endif // CAVEFACTORY_H_
cavefactory.cc
#include "cavefactory.h"
#include <random>
#include <iostream>
#include <ctime>
#include <algorithm>
namespace cavegenerator {
CaveFactory::CaveFactory(unsigned short int width, unsigned short int height, float density) {
cave_.resize(width);
for (auto &i : cave_) {
i.resize(height);
}
Initialize(density);
}
bool CaveFactory::isSolid(int i, int j) {
return (cave_[i][j] == 1);
}
int CaveFactory::NumberOfNeighbours(int x, int y) {
int num = 0;
for (int i = -1; i < 2; i++) {
for (int j = -1; j < 2; j++) {
if ( i == 0 && j == 0 ) continue; // we don't want to count ourselve
// if out of bounds, add a solid neighbour
if ( x + i >= (int)cave_.size() || x + i < 0 || y + j >= (int)cave_[i].size() || y + j < 0) {
++num;
} else if (isSolid(x+i, y+j)) {
++num;
}
}
}
return num;
}
cave_t CaveFactory::getCave() {
return cave_;
}
void CaveFactory::Print() {
for (auto &i : cave_) {
for (auto &j : i) {
std::cout << ((j==1) ? "x" : " ");
}
std::cout << "\n";
}
return;
}
cave_t CaveFactory::MakeCave(unsigned short int width,
unsigned short int height,
float density,
int iterations,
unsigned short int bl,
unsigned short int dl)
{
CaveFactory cave(width, height, density);
for (int i = 0; i < iterations; i++) {
cave.Iterate(bl, dl);
}
return cave.getCave();
}
// Initlialize the cave with the specified density
void CaveFactory::Initialize(float density) {
std::mt19937 rd(4);
std::uniform_real_distribution<float> roll(0, 1);
for (auto &i : cave_) {
for (auto &j : i) {
if (roll(rd) < density) {
j = 1;
} else {
j = 0;
}
}
}
}
// for each cell in the original cave, if the cell is solid:
// if the number of solid neighbours is under the death limit, we kill the block
// if the cell is air, if the number of solid blocks is above the birth limit we place a block
void CaveFactory::Iterate(unsigned short int bl, unsigned short int dl) {
cave_t new_cave = cave_;
for (int i = 0; i < (int)cave_.size(); i++) {
for (int j = 0; j < (int)cave_[0].size(); j++) {
int number_of_neighbours = NumberOfNeighbours(i, j);
if (isSolid(i, j) && number_of_neighbours < dl) {
new_cave[i][j] = 0;
} else if (!isSolid(i,j) && number_of_neighbours > bl) {
new_cave[i][j] = 1;
}
}
}
std::copy(new_cave.begin(), new_cave.end(), cave_.begin());
}
} // namespace cavegenerator
main.cc
#include <iostream>
#include <vector>
#include <random>
#include <ctime>
#include <windows.h>
#include "cavefactory.h"
int main() {
cavegenerator::CaveFactory caveEE;
caveEE.Print();
for(int i = 0; i < 15; i++) {
caveEE.Iterate();
Sleep(600);
system("cls");
caveEE.Print();
}
return 0;
}
I know windows.h is a bad habit, I just used it for debugging.
I hope someone can make me understand, maybe it's just a normal behavior I'm not aware of?
Thank you very much.
(int)cave_[i].size() in NumberOfNeighbours is incorrect, it should be (int)cave_[x+i].size() (or (int)cave_[0].size() since all rows and columns are equal size). When i equals -1 you have an out of bounds vector access and undefined behaviour.

All Strings of {A,C,T,G}

I'm currently working on a problem of solving the combination of the different length of {A,C,T,G}, from 1 letter to 6 letters.
For example:
------#=1------
1:A
2:C
3:G
4:T
------#=2------
1:AA
2:AC
3:AG
4:AT
5:CA
6:CC
7:CG
8:CT
9:GA
10:GC
11:GG
12:GT
13:TA
14:TC
15:TG
16:TT
------#=3------
1:AAA
2:AAC
3:AAG
4:AAT
5:ACA
.
.
.
Now I can only solve the combinations from 1 to 4 letters, and I have no idea how to solve the combination of {A,C,T,G} of 5 letters and 6 letters, which the length of the combination(5 and 6) is greater than the length of all the strings(4)....
Here is my code:
#include <iostream>
#include <cstdlib>
using namespace std;
void combinationUtil(char arr[], char data[], int start, int end, int index, int r);
void printCombination(char arr[], int n, int r)
{
char data[100];
combinationUtil(arr, data, 0, n-1, 0, r);
}
void combinationUtil(char arr[], char data[], int start, int end, int index, int r)
{
if (index == r) {
for (int j=0; j<r; j++)
cout << data[j];
cout << endl;
return;
}
for (int i=start; i<=end && end-i+1 >= r-index; i++)
{
data[index] = arr[i];
combinationUtil(arr, data, i+1, end, index+1, r);
}
}
int main()
{
char arr[] = {'A','T','C','G'};
int n = sizeof(arr)/sizeof(arr[0]);
printCombination(arr, n, 1);
printCombination(arr, n, 2);
printCombination(arr, n, 3);
printCombination(arr, n, 4);
printCombination(arr, n, 5);
system("pause");
}
At least assuming I understand what you want, this is pretty easy to solve by treating it as counting from 0 to some limit in base 4, with the "digits" displayed as "A", "C", "G" and "T".
#include <string>
#include <iostream>
std::string cvt(unsigned input, unsigned len) {
std::string ret;
static const char letters[] = "ACGT";
for (int i=0; i<len; i++) {
ret.push_back(letters[input%4]);
input /= 4;
}
return ret;
}
int main() {
unsigned limit = 1;
unsigned length = 4;
for (int i=0; i<4; i++)
limit *= 4;
for (int i=0; i<limit; i++)
std::cout << cvt(i, length) << "\n";
}
Your code assumes each letter can only be used once. That's why you are getting no results for both sequences of length 5 and 6. Rewrite your function as the following and do not pass as start or end value to it:
void combinationUtil(char arr[], char data[], int index, int r)
{
if (index == r)
{
for (int j=0; j<r; ++j)
cout << data[j];
cout << endl;
return;
}
for (int i=0; i<r; ++i)
{
data[index] = arr[i];
combinationUtil(arr, data, index+1, r);
}
}

Weird crashing program while debug runs smoothly (Eclipse C++)

I'm writing a program for my algorithmic math class at university and I'm using Win 7 (x64), Eclipse Oxygen.1a Release (4.7.1a) with MinGW 6.3.0.
Whenever I build and run the program it crashes with windows claiming 'Abgabe3.exe stopped working' but when trying to find the problem using the debugger and breakpoints I step trough the whole program and it finishes without errors...
I stripped everything not used by the problematic function and copied everything into a seperate file and the exact problem occurs.
Maybe somebody has a clue what happened at my side. ^^
#include <math.h> /* pow, sqrt */
#include <iostream> /* cin, cout */
#include <new> /* new */
#include <string> /* string */
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
using namespace std;
void NORM(double* res, double* x, int n){
res[0] = 0.0;
for(int i = 0; i < n; i++){
res[0] += pow(x[i], 2);
}
res[0] = sqrt(res[0]);
}
void initRand(double* x, int n){
srand (time(NULL) * rand());
for(int i = 0; i < n; i++){
x[i] = (((double) rand()) / ((double) RAND_MAX));
}
}
void createArray(double* &x, int n){
if (n > 0){
x = new double[n];
initRand(x, n);
}
}
void printArray(double* x, int n){
if (x != NULL){
cout<<"(\n";
for(int i = 0; i < n; i++){
if(i+1 == n) cout<<x[i];
else if ((i % 5) == 0) cout<<x[i];
else if ( ((i+1) % 5) == 0 ){
cout<<", "<<x[i]<<"\n";
}
else {
cout<<", "<<x[i];
}
}
cout<<"\n)\n";
}
else cout<<"\nError: pointer = NULL\n";
}
unsigned long long int bin(unsigned int n, unsigned int k){
unsigned long long res = 1;
if(k == 0) return 1;
else if( n >= k){
for(unsigned long long int i = 1; i <= k; i++){
res *= (n + 1 - i) / i;
}
}
else return 0;
return res;
}
void newArray(double** x, unsigned int v, unsigned int n){
for(unsigned int i = 0; i < v; i++){
double* ptr = x[i];
createArray(ptr,n);
x[i] = ptr;
}
}
void experiment(double** vektorArray){
unsigned int n = 10, v = 20;
cout<<"Dimension n = "<<n<<"\nAnzahl Versuche v = "<<v<<endl;
//Erstellen der Vektoren
cout<<"Erstellen - starte\n";
vektorArray = new double*[n];
newArray(vektorArray, v, n);
cout<<"Erstellen - fertig\n";
for(unsigned int i = 0; i < v; i++){
if(i%10 == 0) printArray(vektorArray[i], n);
}
}
int main(int argc, char** argv){
double** vektorArray = NULL;
experiment(vektorArray);
return 0;
}
vektorArray = new double*[n];
created an array of size n, but
void newArray(double** x, unsigned int v, unsigned int n)
{
for (unsigned int i = 0; i < v; i++)
{
double* ptr = x[i];
createArray(ptr, n);
x[i] = ptr;
}
}
and
for (unsigned int i = 0; i < v; i++)
{
if (i % 10 == 0)
printArray(vektorArray[i], n);
}
index that array with v. Looks like you got your variables crossed. Strongly recommend giving variables better, more descriptive names to help make this more obvious.

understanding the logic of Rotating a N x N matrix

Hi I have started solving C++ questions. One among them is rotating a N x N matrix to a 90 degree clockwise.
below is the code link, that i'm referring to. I had never solved matrix problems in C++/any.
http://www.geeksforgeeks.org/turn-an-image-by-90-degree/
#include <stdio.h>
#include <stdlib.h>
void displayMatrix(unsigned int const *p, unsigned int row, unsigned int col);
void rotate(unsigned int *pS, unsigned int *pD, unsigned int row, unsigned int col);
int main()
{
// declarations
unsigned int image[][4] = {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}};
unsigned int *pSource;
unsigned int *pDestination;
unsigned int m, n;
// setting initial values and memory allocation
m = 3, n = 4, pSource = (unsigned int *)image;
pDestination = (unsigned int *)malloc(sizeof(int)*m*n);
// process each buffer
displayMatrix(pSource, m, n);
rotate(pSource, pDestination, m, n);
displayMatrix(pDestination, n, m);
free(pDestination);
getchar();
return 0;
}
void displayMatrix(unsigned int const *p, unsigned int r, unsigned int c)
{
unsigned int row, col;
printf("\n\n");
for(row = 0; row < r; row++)
{
for(col = 0; col < c; col++)
{
printf("%d\t", *(p + row * c + col)); // what is this??? couldnt understand this logic?
}
printf("\n");
}
printf("\n\n");
}
void rotate(unsigned int *pS, unsigned int *pD, unsigned int row, unsigned int col)
{
unsigned int r, c;
for(r = 0; r < row; r++)
{
for(c = 0; c < col; c++)
{
*(pD + c * row + (row - r - 1)) = *(pS + r * col + c); // not understanding this logic as well.
}
}
}
could any one please explain more about this logic. I'm not able to resolve few places in the above problems that i have mentioned in the code itself.
Also please let me know the time and space complexity detailed..Thanks in advance.
The code relies on a two-dimensional array being contiguously stored and treats it as one-dimensional.
The line
*(pD + c * row + (row - r - 1)) = *(pS + r * col + c);
is equivalent to
pD[c][row-r-1] = pS[r][c];

Count the number of component wise comparisons in quicksort algorithm.

I'm trying to count the number of comparisons my quicksort algorithm makes for an array size of 500. I know that the best case for quicksort with partition is nlogn-n+1. So for an array size of 500, the best case number of component wise comparisons would be about 3983. However, when I run my code, I'm getting 2400 comparisons or so, depending on the array the random function generates. Am I counting the number of component wise comparisons wrong? Please help.
#include <iostream>
#include <string>
#include <stdlib.h>
using namespace std;
int count_500 = 0;
int partition(int *S,int l, int u);
void swap(int &val1, int &val2);
void Quicksort(int S[],int low, int hi);
void exchange(int list[], int p, int q);
int median_of_3(int list[], int p, int r);
void Quicksort_M3(int S[], int low, int hi);
int main()
{
int S1_500[500];
int S2_500[500];
int S3_500[500];
int S1_200[200];
int S2_200[200];
int S3_200[200];
int S1_8[8];
int S2_8[8];
int S3_8[8];
srand ( time(NULL) );
for(int i=0; i<500; i++)
{
S1_500[i] = rand()%1000;
S2_500[i] = rand()%1000;
S3_500[i] = rand()%1000;
}
for(int i=0; i<200; i++)
{
S1_200[i] = rand()%500;
S2_200[i] = rand()%500;
S3_200[i] = rand()%500;
}
for(int i=0; i<8; i++)
{
S1_8[i] = rand()%100;
S2_8[i] = rand()%100;
S3_8[i] = rand()%100;
}
Quicksort(S1_500,0,499);
for(int i=0; i<500; i++)
{
cout << S1_500[i] << endl;
}
cout << "Number of component wise comparisons is: " << count_500 << endl;
}
int partition(int *S,int l, int u)
{
int x = S[l];
int j = l;
for(int i=l+1; i<=u; i++)
{
if(S[i] < x)
{
count_500++; // Count the component wise comparison
j++;
swap(S[i],S[j]);
}
}
int p = j;
swap(S[l],S[p]);
return p;
}
void swap(int &val1, int &val2)
{
int temp = val1;
val1 = val2;
val2 = temp;
}
void Quicksort(int S[],int low, int hi)
{
if (low < hi)
{
int p = partition(S,low,hi);
Quicksort(S,low,p-1);
Quicksort(S,p+1,hi);
}
}
You want the count_500++; outside the if statement. You're only counting the comparisons, where the result is true.
Change
if(S[i] < x)
{
count_500++; // Count the component wise comparison
...
}
to
count_500++; // Count the component wise comparison
if(S[i] < x)
{
...
}