C++ simple trick with an array pointer improves performance - c++

I have found a strange behavior in my heap sort routine (see below).
void hpsort(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0) { data[0] = rra; break; }
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
If I do a benchmark calling this routine like this
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);
it takes X seconds. but if I add just a single line
void hpsort(unsigned int n, double *data)
{
++data;
and do benchmark as
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);
it takes about 0.96X seconds (i.e. 4% faster). This performance difference is stable from run to run.
It feels like g++ compiler does bounds checking in the first case, while in the second case I can cheat it somehow. But I never heard that bounds checking is done for C arrays...
Any ideas why I get this strange difference in performance?
p.s. compilation is done with g++ -O2. by the way, changing unsigned long to long int also decreases performance by about 3 to 4%.
p.p.s. the "Defined Behavior" version also shows performance improvement
void hpsort(unsigned int n, double *data)
{
--data;
and benchmark as
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);
p.p.p.s. Performance comparison
Size of array Faster Slower
10 1.46 1.60
100 1.41 1.62
1000 1.84 1.96
10000 1.78 1.87
100000 1.72 1.80
1000000 1.76 1.83
10000000 1.98 2.03
here is my code for hpsort.cpp
void hpsort1(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
void hpsort2(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
--data;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
and here is my benchmarking code heapsort-benchmark.cpp
#include <vector>
#include <alloca.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
using namespace std;
void hpsort1(unsigned long n, double *data);
void hpsort2(unsigned long n, double *data);
typedef double element_t;
typedef void(*Test)(element_t*, element_t*, int);
const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];
vector<double> result_times; // results are pushed into this vector
clock_t start_time;
void do_row(int size) // print results for given size of processed array
{
printf("%10d \t", size);
for (int i=0; i<result_times.size(); ++i) printf("%.2f\t", result_times[i]);
printf("\n");
result_times.clear();
}
inline void start_timer() { start_time = clock(); }
inline double timer()
{
clock_t end_time = clock();
return (end_time - start_time)/double(CLOCKS_PER_SEC);
}
void run(Test f, element_t* first, element_t* last, int number_of_times)
{
start_timer();
while (number_of_times-- > 0) f(first,last,number_of_times);
result_times.push_back(timer());
}
void random_shuffle(double *first, double *last)
{
size_t i, j, n;
double tmp;
n = last-first;
srand((unsigned int)0);
for (i=n-1; i>0; --i)
{
j = rand() % (i+1);
tmp = first[i];
first[i] = first[j];
first[j] = tmp;
}
return;
}
void hpsort1_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort1(num_elements, array);
free(array);
}
void hpsort2_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort2(num_elements, array+1);
free(array);
}
void initialize(element_t* first, element_t* last)
{
element_t x = 0.;
while (first != last) { *first++ = x; x += 1.; }
}
double logtwo(double x) { return log(x)/log((double) 2.0); }
int number_of_tests(int size)
{
double n = (double)size;
double largest_n = (double)largest_size;
return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
}
void run_tests(int size)
{
const int n = number_of_tests(size);
element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
element_t* buffer_end = &buffer[size];
initialize(buffer, buffer + size); // fill in the elements
for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
//random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array
run(hpsort2_test, buffer, buffer_end, n);
run(hpsort1_test, buffer, buffer_end, n);
do_row(size);
free(buffer);
}
int main()
{
const int n = sizeof(sizes)/sizeof(int);
for (int i = 0; i < n; ++i) run_tests(sizes[i]);
}
I compile and run it as
g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o
./heapsort-benchmark
The first column will be faster version

Unable to get consistent results like OP.
IMO OP's small differences are not part of the difference in code, but part an artifact of testing.
void hpsort(unsigned long n, double *data) {
unsigned long i, ir, j, l;
double rra;
...
}
void hpsort1(unsigned long n, double *data) {
--data;
unsigned long i, ir, j, l;
double rra;
...
}
Test code
#include <time.h>
#include <stdlib.h>
void test(const char *s, int code, size_t n) {
srand(0);
double* array = (double*) malloc(sizeof(double) * n * 2);
// make 2 copies of same random data
for (size_t i = 0; i < n; i++) {
array[i] = rand();
array[i+n] = array[i];
}
double dt0;
double dt1;
clock_t c0 = clock();
clock_t c1,c2;
if (code) {
hpsort1(n, array + 1);
c1 = clock();
hpsort(n, &array[n]);
c2 = clock();
dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
} else {
hpsort(n, array);
c1 = clock();
hpsort1(n, &array[n]+1);
c2 = clock();
dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
}
free(array);
const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
printf("%s %f %2s %f Diff:% f%%\n", s, dt0, cmp, dt1, 100*(dt1-dt0)/dt0);
}
int main() {
//srand((unsigned) time(0));
size_t n = 3000000;
for (int i=0; i<10; i++) {
test("heap first", 0, n);
test("heap1 first", 1, n);
fflush(stdout);
}
}
Output
heap first 1.263000 > 1.201000 Diff:-4.908947%
heap1 first 1.295000 < 1.326000 Diff: 2.393822%
heap first 1.342000 > 1.295000 Diff:-3.502235%
heap1 first 1.279000 < 1.295000 Diff: 1.250977%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.294000 Diff:-0.077220%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.295000 > 1.279000 Diff:-1.235521%
heap first 1.263000 < 1.295000 Diff: 2.533650%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.263000 Diff:-2.471042%
heap1 first 1.295000 < 1.310000 Diff: 1.158301%
heap first 1.310000 < 1.326000 Diff: 1.221374%
heap1 first 1.326000 < 1.342000 Diff: 1.206637%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.264000 < 1.295000 Diff: 2.452532%
heap first 1.279000 > 1.264000 Diff:-1.172791%
heap1 first 1.279000 > 1.264000 Diff:-1.172791%

Related

Reason for Segmentation Fault During Depth First Search on Tree of Large Size C++

I'm trying to solve https://open.kattis.com/problems/rootedsubtrees and part of the solution requires finding the minimum distance between any 2 nodes on the tree. To do this, I'm using Lowest Common Ancestor as a subroutine. Part of my LCA code uses a DFS to traverse the tree. Somehow, running this code on a line graph of size 200000 leads to a segmentation fault during the DFS section of the code.
#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
typedef vector<int> vi;
#define fast_cin() \
ios_base::sync_with_stdio(false); \
cin.tie(NULL); \
cout.tie(NULL);
int n, q, idx;
vector<int> adjlist[200009];
vector<int> L, E,
H; // depth at traversal index, node at traversal index, first traversal index of node
void dfs(int cur, int depth) {
cout << "dfs " << cur << " " << idx << endl;
H[cur] = idx;
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
dfs(nxt, depth + 1);
E[idx] = cur; // backtrack to current node
L[idx++] = depth;
}
}
class SparseTable { // OOP style
private:
vi A, P2, L2;
vector<vi> SpT; // the Sparse Table
public:
SparseTable() {} // default constructor
SparseTable(vi &initialA) { // pre-processing routine
A = initialA;
int n = (int)A.size();
int L2_n = (int)log2(n) + 1;
P2.assign(L2_n, 0);
L2.assign(1 << L2_n, 0);
for (int i = 0; i <= L2_n; ++i) {
P2[i] = (1 << i); // to speed up 2^i
L2[(1 << i)] = i; // to speed up log_2(i)
}
for (int i = 2; i < P2[L2_n]; ++i)
if (L2[i] == 0) L2[i] = L2[i - 1]; // to fill in the blanks
// the initialization phase
SpT = vector<vi>(L2[n] + 1, vi(n));
for (int j = 0; j < n; ++j) SpT[0][j] = j; // RMQ of sub array [j..j]
// the two nested loops below have overall time complexity = O(n log n)
for (int i = 1; P2[i] <= n; ++i) // for all i s.t. 2^i <= n
for (int j = 0; j + P2[i] - 1 < n; ++j) { // for all valid j
int x = SpT[i - 1][j]; // [j..j+2^(i-1)-1]
int y = SpT[i - 1][j + P2[i - 1]]; // [j+2^(i-1)..j+2^i-1]
SpT[i][j] = A[x] <= A[y] ? x : y;
}
}
int RMQ(int i, int j) {
int k = L2[j - i + 1]; // 2^k <= (j-i+1)
int x = SpT[k][i]; // covers [i..i+2^k-1]
int y = SpT[k][j - P2[k] + 1]; // covers [j-2^k+1..j]
return A[x] <= A[y] ? x : y;
}
};
int LCA(int u, int v, SparseTable &SpT) {
if (H[u] > H[v]) swap(u, v);
return E[SpT.RMQ(H[u], H[v])];
}
int APSP(int u, int v, SparseTable &SpT) {
int ancestor = LCA(u, v, SpT);
return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}
int main() {
fast_cin();
cin >> n >> q;
L.assign(2 * (n + 9), 0);
E.assign(2 * (n + 9), 0);
H.assign(n + 9, -1);
idx = 0;
int u, v;
for (int i = 0; i < n - 1; i++) {
cin >> u >> v;
u--;
v--;
adjlist[u].emplace_back(v);
adjlist[v].emplace_back(u);
}
dfs(0, 0);
SparseTable SpT(L);
ll d;
while (q--) {
cin >> u >> v;
u--;
v--;
d = (ll) APSP(u, v, SpT) + 1;
cout << (ll) n - d + (d) * (d + 1) / 2 << endl;
}
return 0;
}
Using the following Python Code to generate the input of a large line graph
n = 200000
q = 1
print(n, q)
for i in range(1, n):
print(i, i+1)
print(1, 200000)
I get the following last few lines of output before my program crashes.
.
.
.
dfs 174494 174494
dfs 174495 174495
dfs 174496 174496
dfs 174497 174497
dfs 174498 174498
Segmentation fault (core dumped)
Is the problem an issue of exhausting stack space with the recursion or something else?
You posted a lot of code, but here is one obvious error in the SparseMatrix class:
std::vector<int> P2;
//...
P2.assign(L2_n, 0);
for (int i = 0; i <= L2_n; ++i)
{
P2[i] = (1 << i); // <-- Out of bounds access when i == L2_n
To show you the error, change that line of code to this:
P2.at(i) = (1 << i); // <-- Out of bounds access when i == L2_n
You will now get a std::out_of_range exception thrown.
If you write a loop using <=, that loop will be considered suspicious, since a lot of off-by-one and buffer overrun errors occur with loop conditions written this way.
I believe stack exhaustion was the main problem in running the code on my machine. I re-implemented the DFS in an iterative fashion.
stack<tuple<int, int, bool>> st; // cur, depth, first_time
st.push ({0, 0, 1});
while (!st.empty()) {
auto [cur, depth, first_time] = st.top();
st.pop();
if (first_time){
H[cur] = idx;
}
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
st.push({cur, depth, 0});
st.push({nxt, depth+1, 1});
break;
}
}
and my code was able to run the large testcase on my machine.
I'm not sure is this is relevant to the original question, but after this change, the code still flagged a run-time error on the online judge and I eventually realized that the issue was that the sparse table was using too much memory, so I fixed that by avoiding wasted declared but not used memory spaces in rows of the sparse table. Then the online judge deemed it as being too slow. So I reverted the DFS code back to the recursive version, and it was accepted. Note that the accepted solution actually crashes on my machine when running the large testcase... I guess my machine has a more limited stack space than the online grader.
The accepted solution is here
#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
typedef vector<int> vi;
#define fast_cin() \
ios_base::sync_with_stdio(false); \
cin.tie(NULL); \
cout.tie(NULL);
int n, q, idx;
vector<int> adjlist[(int)2e5 + 9];
vector<int> L, E,
H; // depth at traversal index, node at traversal index, first traversal index of node
void dfs(int cur, int depth) {
H[cur] = idx;
E[idx] = cur;
L[idx++] = depth;
for (int &nxt : adjlist[cur]) {
if (H[nxt] != -1) continue;
dfs(nxt, depth + 1);
E[idx] = cur; // backtrack to current node
L[idx++] = depth;
}
}
class SparseTable { // OOP style
private:
vi A, P2, L2;
vector<vi> SpT; // the Sparse Table
public:
SparseTable() {} // default constructor
SparseTable(vi &initialA) { // pre-processing routine
A = initialA;
int n = (int)A.size();
int L2_n = (int)log2(n) + 1;
P2.assign(L2_n + 1, 0);
L2.assign((1 << L2_n) + 1, 0);
for (int i = 0; i <= L2_n; ++i) {
P2[i] = (1 << i); // to speed up 2^i
L2[(1 << i)] = i; // to speed up log_2(i)
}
for (int i = 2; i < P2[L2_n]; ++i)
if (L2[i] == 0) L2[i] = L2[i - 1]; // to fill in the blanks
// the initialization phase
SpT = vector<vi>(L2[n] + 1, vi());
SpT[0] = vi(n, 0);
for (int j = 0; j < n; ++j) SpT[0][j] = j; // RMQ of sub array [j..j]
// the two nested loops below have overall time complexity = O(n log n)
for (int i = 1; P2[i] <= n; ++i) { // for all i s.t. 2^i <= n
SpT[i] = vi(n + 1 - P2[i]); // initialize SpT[i]
for (int j = 0; j + P2[i] - 1 < n; ++j) { // for all valid j
int x = SpT[i - 1][j]; // [j..j+2^(i-1)-1]
int y = SpT[i - 1][j + P2[i - 1]]; // [j+2^(i-1)..j+2^i-1]
SpT[i][j] = A[x] <= A[y] ? x : y;
}
}
}
int RMQ(int i, int j) {
int k = L2[j - i + 1]; // 2^k <= (j-i+1)
int x = SpT[k][i]; // covers [i..i+2^k-1]
int y = SpT[k][j - P2[k] + 1]; // covers [j-2^k+1..j]
return A[x] <= A[y] ? x : y;
}
};
int LCA(int u, int v, SparseTable &SpT) {
if (H[u] > H[v]) swap(u, v);
return E[SpT.RMQ(H[u], H[v])];
}
int APSP(int u, int v, SparseTable &SpT) {
int ancestor = LCA(u, v, SpT);
return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}
int main() {
fast_cin();
cin >> n >> q;
L.assign(2 * (n), 0);
E.assign(2 * (n), 0);
H.assign(n, -1);
idx = 0;
int u, v;
for (int i = 0; i < n - 1; i++) {
cin >> u >> v;
u--;
v--;
adjlist[u].emplace_back(v);
adjlist[v].emplace_back(u);
}
dfs(n - 1, 0);
SparseTable SpT(L);
ll d;
while (q--) {
cin >> u >> v;
u--;
v--;
d = (ll)APSP(u, v, SpT) + 1LL;
cout << (ll)n - d + (d) * (d + 1) / (ll)2 << endl;
}
return 0;
}

How to call ** var in C/C++ 10.5 chapter numerical recipes

I have a problem with calling this function:
void powell(float p[], float **xi, int n,
float ftol, int *iter, float *fret,
float (*func)(float []))
I don't know which argument must be under **xi to run my code.
Whole function below:
void powell(float p[], float** xi, int n, float ftol, int* iter, float* fret, float (*func)(float[]))
{
void linmin(float p[], float xi[], int n, float* fret, float (*func)(float[]));
int i, ibig, j;
float del, fp, fptt, t, *pt, *ptt, *xit;
pt = vector(1, n);
ptt = vector(1, n);
xit = vector(1, n);
*fret = (*func)(p);
for (j = 1; j <= n; j++)
pt[j] = p[j];
for (*iter = 1;; ++(*iter)) {
fp = (*fret);
ibig = 0;
del = 0.0;
for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++)
xit[j] = xi[j][i];
fptt = (*fret);
linmin(p, xit, n, fret, func);
if (fptt - (*fret) > del) {
del = fptt - (*fret);
ibig = i;
}
}
if (2.0 * (fp - (*fret)) <= ftol * (fabs(fp) + fabs(*fret)) + TINY) {
free_vector(xit, 1, n);
free_vector(ptt, 1, n);
free_vector(pt, 1, n);
return;
}
if (*iter == ITMAX)
nrerror("powell exceeding maximum iterations.");
for (j = 1; j <= n; j++) {
ptt[j] = 2.0 * p[j] - pt[j];
xit[j] = p[j] - pt[j];
pt[j] = p[j];
}
fptt = (*func)(ptt);
if (fptt < fp) {
t = 2.0 * (fp - 2.0 * (*fret) + fptt) * SQR(fp - (*fret) - del) - del * SQR(fp - fptt);
if (t < 0.0) {
linmin(p, xit, n, fret, func);
for (j = 1; j <= n; j++) {
xi[j][ibig] = xi[j][n];
xi[j][n] = xit[j];
}
}
}
}
}
Thanks in advance.
A double pointer means that the function wants the address of a pointer.
void my_function(int **p_pointer)
{
*p_pointer = new int[42];
}
int main(void)
{
int * pointer = nullptr;
my_function(&pointer);
return 0;
}
In C++, the double pointer can be avoided by using reference:
void another_function(int *& pointer)
{
pointer = new int [256];
}
int main(void)
{
int p = nullptr;
another_function(p);
return 0;
}
One of the primary concerns with pointers is that they can point to anywhere, a defined location or not. Testing a pointer for validity is complex because it depends on the range (or ranges) that are valid for the current platform. With references, the reference is valid, by definition, so no validity checks need to be performed.

How is numpy so fast?

I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N) and dtype=float32, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy broadcasting, this simply writes as:
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
#define X(i, j) _X[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
Compiling using gcc 7.3.0 with -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
float* pairwise_sub_better( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
float* pairwise_sub_simd( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficials:
float* pairwise_sub_blocks( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some efforts with some optimized C++ code, I can't come anywhere close the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>
#define X(i, j) _x[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
float* pairwise_sub_better( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
float* pairwise_sub_simd( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
float* pairwise_sub_blocks( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
int main()
{
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
{
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
}
printf("Best of 5 = %f ms\n", best / 1000);
free( _x );
return 0;
}
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine:
Implementation
Time
numpy
88 ms
C++ naive
194 ms
C++ better
195 ms
C++ SIMD
178 ms
C++ blocked
258 ms
C++ blocked (gcc 8.3.1)
217 ms
As pointed out by some of the comments numpy uses SIMD in its implementation and it does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation then I get a better time compared to numpy even with the scaler version(that is the one without any optimizations).
Also in terms of SIMD and why your implementation does not perform much better than the scaler is because your memory access patterns are not ideal for SIMD usage - you do memcopy and you load into SIMD registers from locations that are far apart from each other - e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the SIMD prefetcher.
There is also a mistake in how you load the SIMD registers(if I understood correctly what you're trying to compute): a 256 bit SIMD register can load 8 single-precision floating-point numbers 8 * 32 = 256, but in your loop you jump k by "256/sizeof(float)" which is 256/4 = 64; _x and _res are float pointers and the SIMD intrinsics expect also float pointers as arguments so instead of reading all elements from those lines every 8 floats you read them every 64 floats.
The computation can be optimized further by changing the access patterns but also by observing that you repeat some computations: e.g. when iterating with line0 as a base you compute line0 - line1 but at some future time, when iterating with line1 as a base, you need to compute line1 - line0 which is basically -(line0 - line1), that is for each line after line0 a lot of results could be reused from previous computations.
A lot of times SIMD usage or parallelization requires one to change how data is accessed or reasoned about in order to provide meaningful improvements.
Here is what I have done as a first step based on your initial implementation and it is faster than the numpy(don't mind the OpenMP stuff as it's not how its supposed to be done, I just wanted to see how it behaves trying the naive way).
C++
Time scaler version: 55 ms
Time SIMD version: 53 ms
**Time SIMD 2 version: 33 ms**
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms
Python numpy
>> best of 5 = 88.794 ms
#include <cstdlib>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>
using namespace std;
float* pairwise_sub_naive (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++)
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
return output;
}
float* pairwise_sub_simd (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_simd_2 (const float* input, float* output, int n)
{
float* line_buffer = (float*) aligned_alloc(32, n * sizeof(float));
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
}
memcpy(output + outidx * n, line_buffer, n);
}
}
return output;
}
float* pairwise_sub_simd_3 (const float* input, float* output, int n)
{
for (int i = 0; i < n; i++)
{
const int idxi = i * n;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
for (int j = 0; j < n; j++)
{
const int idxj = j * n;
const int outidx = (idxi + j) * n;
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx + k, _mm256_sub_ps( A, B ));
}
}
}
return output;
}
float* pairwise_sub_openmp (const float* input, float* output, int n)
{
int i, j;
#pragma omp parallel for private(j)
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
const int idxi = i * n;
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
{
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
}
}
}
/*for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
for (int k = 0; k < n; k++)
{
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
}
}
}*/
return output;
}
int main ()
{
constexpr size_t n = 512;
constexpr size_t input_size = n * n;
constexpr size_t output_size = n * n * n;
float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));
iota(input, input + input_size, float(0.0));
fill(output, output + output_size, float(0.0));
iota(input_simd, input_simd + input_size, float(0.0));
fill(output_simd, output_simd + output_size, float(0.0));
iota(input_par, input_par + input_size, float(0.0));
fill(output_par, output_par + output_size, float(0.0));
std::chrono::milliseconds best_scaler{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_naive(input, output, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_scaler)
{
best_scaler = duration;
}
}
cout << "Time scaler version: " << best_scaler.count() << " ms\n";
std::chrono::milliseconds best_simd{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd)
{
best_simd = duration;
}
}
cout << "Time SIMD version: " << best_simd.count() << " ms\n";
std::chrono::milliseconds best_simd_2{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_2(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_2)
{
best_simd_2 = duration;
}
}
cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";
std::chrono::milliseconds best_simd_3{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_3(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_3)
{
best_simd_3 = duration;
}
}
cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";
std::chrono::milliseconds best_par{100000};
for (int i = 0; i < 5; ++i)
{
auto start = chrono::high_resolution_clock::now();
pairwise_sub_openmp(input_par, output_par, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_par)
{
best_par = duration;
}
}
cout << "Time OpenMP version: " << best_par.count() << " ms\n";
cout << "Verification\n";
if (equal(output, output + output_size, output_simd))
{
cout << "PASSED\n";
}
else
{
cout << "FAILED\n";
}
return 0;
}
Edit: Small correction as there was a wrong call related to the second version of SIMD implementation.
As you can see now, the second implementation is the fastest as it behaves the best from the point of view of the locality of reference of the cache. Examples 2 and 3 of SIMD implementations are there to illustrate for you how changing memory access patterns to influence the performance of your SIMD optimizations.
To summarize(knowing that I'm far from being complete in my advice) be mindful of your memory access patterns and of the loads and stores to\from the SIMD unit; the SIMD is a different hardware unit inside the processor's core so there is a penalty in shuffling data back and forth, hence when you load a register from memory try to do as many operations as possible with that data and do not be too eager to store it back(of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available and if you load too many then they will "spill", that is they will be stored back to temporary locations in main memory behind the scenes killing all your gains. SIMD optimization, it's a true balance act!
There is some effort to put a cross-platform intrinsics wrapper into the standard(I developed myself a closed source one in my glorious past) and even it's far from being complete, it's worth taking a look at(read the accompanying papers if you're truly interested to learn how SIMD works).
https://github.com/VcDevel/std-simd
This is a complement to the answer posted by #celakev .
I think I finally got to understand what exactly was the issue. The issue was not about allocating the memory in the main function that does the computation.
What was actually taking time is to access new (fresh) memory. I believe that the malloc call returns pages of memory which are virtual, i.e. that does not corresponds to actual physical memory -- until it is explicitly accessed. What actually takes time is the process of allocating physical memory on the fly (which I think is OS-level) when it is accessed in the function code.
Here is a proof. Consider the two following trivial functions:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
float* just_alloc( size_t N )
{
return (float*) aligned_alloc( 32, sizeof(float)*N );
}
void just_fill( float* _arr, size_t N )
{
for (size_t i = 0; i < N; i++)
_arr[i] = 1;
}
#define Time( code_to_benchmark, cleanup_code ) \
do { \
double best = 9e9; \
for( int i = 0; i < 5; i++) { \
struct timespec start, stop; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
code_to_benchmark; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
printf("Time[%d] = %f ms\n", i, t); \
if (t < best) best = t; \
cleanup_code; \
} \
printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
} while(0)
int main()
{
const size_t N = 512;
Time( float* arr = just_alloc(N*N*N), free(arr) );
float* arr = just_alloc(N*N*N);
Time( just_fill(arr, N*N*N), ; );
free(arr);
return 0;
}
I get the following timings, which I now detail for each of the calls:
Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms
Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms
As you can see, allocating memory is blazingly fast, but the first time that the memory is accessed, it is 5 times slower than the other times. So, basically the reason that my code was slow was because i was each time reallocating fresh memory that had no physical address yet. (Correct me if I'm wrong but I think that's the gist of it!)
A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ a high-level algebra manipulation capability and use SIMD under the hood. Just like numpy.
Here is the implementation
#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
#include <Eigen/Dense>
auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
for (int k = 0; k < input.cols(); ++k)
output[k] = input
// subtract matrix with repeated k-th column
- input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
}
int main() {
constexpr size_t n = 512;
// allocate input and output
Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
std::vector<Eigen::MatrixXf> output(n);
std::chrono::milliseconds best_eigen{100000};
for (int i = 0; i < 5; ++i) {
auto start = std::chrono::high_resolution_clock::now();
pairwise_eigen(input, output);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
if (duration < best_eigen)
best_eigen = duration;
}
std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";
return 0;
}
The full benchmark tests suggested by #celavek on my system are
Time scaler version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms
Time Eigen version: 76 ms
Numpy >> best of 5 = 118.489 ms
Whit Eigen there is still a noticeable improvement with respect to Numpy, but not so impressive compared to the "raw" implementations (there is certainly some overhead).
An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines
// inside the pairwise method
for (int k = 0; k < input.cols(); ++k)
output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
// at allocation time
std::vector<Eigen::MatrixXf> output(n, input);
This pushes the best of 5 down to 60 ms.

Corrupted memory issue when deleting allocated memory

I am trying to store a sparse vector using a bit mask. I allocate a char* to represent the bit mask. However, when I delete [] the mask, I get a memory corruption error. Upon investigation, I'm seeing that it's because I'm freeing memory that I'm not supposed to. This is confusing, since I don't see how this could be the case.
When I run this on my case, it prints out "ALLOCATED" and "DEALLOCATING" but nothing further.
void set_i_bit(char* mask, int i) {
int field_num = floor(i/8);
int bit_num = i %8;
mask[field_num] = (1 << bit_num) | mask[field_num];
}
int write_sparse_with_bitmask(vector<float> arr, ofstream* fout) {
int mx_sz = arr.size() - 1;
float tol = 0.5;
char* mask = 0;
for(int i = arr.size() -1; i>=0; i-=1) {
if (fabs(arr[i]) > tol) break;
mx_sz = i;
}
int sprse_cnt = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) < tol) sprse_cnt++;
}
int bitmask_sz = ceil(mx_sz/8);
if (sprse_cnt*sizeof(int16_t) + sizeof(int16_t) > bitmask_sz) {
cout<<"ALLOCATED"<<endl;
mask = new char[bitmask_sz];
for (int i =0; i<bitmask_sz; i++) mask[i] = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > coef_tol) {
set_i_bit(mask, i);
}
}
}
else {
bitmask_sz = 0;
}
uint16_t sz = mx_sz + 1;
uint16_t bt_msk = bitmask_sz + 1;
char flag = 0;
if (bitmask_sz > 0) {
flag = flag | 1;
}
fout->write((char*)&sz, sizeof(uint16_t));
fout->write((char*)&flag, sizeof(char));
int w_size = sizeof(uint16_t) + sizeof(char);
if (flag & 1) {
fout->write((char*)&bt_msk, sizeof(uint16_t));
fout->write(mask, sizeof(char)*bt_msk);
cout<<"DEALLOCATING"<<endl;
delete [] mask;
cout<<"THIS DOESN'T PRINT"<<endl;
w_size += sizeof(uint16_t) + sizeof(char)*bt_msk;
}
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > tol || !(flag & 1)) {
int16_t vl = arr[i];
fout->write((char*) &vl, sizeof(int16_t));
w_size += sizeof(int16_t);
}
}
return w_size;
}

C++ Not Counting white beands

I need some help. I'm writing a code in C++ that will ultimately take a random string passed in, and it will do a break at every point in the string, and it will count the number of colors to the right and left of the break (r, b, and w). Here's the catch, the w can be either r or b when it breaks or when the strong passes it ultimately making it a hybrid. My problem is when the break is implemented and there is a w immediately to the left or right I can't get the program to go find the fist b or r. Can anyone help me?
#include <stdio.h>
#include "P2Library.h"
void doubleNecklace(char neck[], char doubleNeck[], int size);
int findMaxBeads(char neck2[], int size);
#define SIZE 7
void main(void)
{
char necklace[SIZE];
char necklace2[2 * SIZE];
int brk;
int maxBeads;
int leftI, rightI, leftCount = 0, rightCount=0, totalCount, maxCount = 0;
char leftColor, rightColor;
initNecklace(necklace, SIZE);
doubleNecklace(necklace, necklace2, SIZE);
maxBeads = findMaxBeads(necklace2, SIZE * 2);
checkAnswer(necklace, SIZE, maxBeads);
printf("The max number of beads is %d\n", maxBeads);
}
int findMaxBeads(char neck2[], int size)
{
int brk;
int maxBeads;
int leftI, rightI, leftCount = 0, rightCount=0, totalCount, maxCount = 0;
char leftColor, rightColor;
for(brk = 0; brk < 2 * SIZE - 1; brk++)
{
leftCount = rightCount = 0;
rightI = brk;
rightColor = neck2[rightI];
if(rightI == 'w')
{
while(rightI == 'w')
{
rightI++;
}
rightColor = neck2[rightI];
}
rightI = brk;
while(neck2[rightI] == rightColor || neck2[rightI] == 'w')
{
rightCount++;
rightI++;
}
if(brk > 0)
{
leftI = brk - 1;
leftColor = neck2[leftI];
if(leftI == 'w')
{
while(leftI == 'w')
{
leftI--;
}
leftColor = neck2[leftI];
}
leftI = brk - 1;
while(leftI >= 0 && neck2[leftI] == leftColor || neck2[leftI] == 'w')
{
leftCount++;
leftI--;
}
}
totalCount = leftCount + rightCount;
if(totalCount > maxCount)
{
maxCount = totalCount;
}
}
return maxCount;
}
void doubleNecklace(char neck[], char doubleNeck[], int size)
{
int i;
for(i = 0; i < size; i++)
{
doubleNeck[i] = neck[i];
doubleNeck[i+size] = neck[i];
}
}
I didn't study the code in detail, but something is not symmetric: in the for loop, the "left" code has an if but the "right" code doesn't. Maybe you should remove that -1 in the for condition and add it as an if for the "right" code:
for(brk = 0; brk < 2 * SIZE; brk++)
{
leftCount = rightCount = 0;
if (brk < 2 * SIZE - 1)
{
rightI = brk;
rightColor = neck2[rightI];
//...
}
if(brk > 0)
{
leftI = brk - 1;
leftColor = neck2[leftI];
//...
}
//...
Just guessing, though... :-/
Maybe you should even change those < for <=.