AVX2 instruction interrupt in Visual Studio 2013 - c++

Here's the c++ code:
#include <stdio.h>
#include <iostream>
#include <immintrin.h>
using namespace std;
// Builds a 64-byte buffer (32 ones followed by 32 zeros), loads each half into
// a 256-bit integer vector, prints both vectors byte by byte, and finally
// prints the bytes of their XOR.
int main(int argc, char* argv[]) {
    char buffer[100];
    for (int pos = 0; pos < 64; ++pos)
        buffer[pos] = (pos < 32) ? 1 : 0;

    __m256i x = _mm256_loadu_si256((__m256i *)(buffer));
    __m256i y = _mm256_loadu_si256((__m256i *)(buffer + 32));

    for (int lane = 0; lane < 32; ++lane)
        cout << (int)x.m256i_i8[lane];
    cout << endl;

    for (int lane = 0; lane < 32; ++lane)
        cout << (int)y.m256i_i8[lane];
    cout << endl;

    // The XOR is computed only after x and y have been printed.
    __m256i z = _mm256_xor_si256(x, y);
    for (int lane = 0; lane < 32; ++lane)
        cout << (int)z.m256i_i8[lane];
    cout << endl;

    return 0;
}
When I run this code in VS2013, the program is interrupted at the XOR operation.
I assumed my CPU supports AVX2, since x and y are loaded and printed successfully.
Can somebody tell me how to fix this, please?

Related

C++ 2D array, problem outputting more columns

How come no matter what I do for rows and columns my columns won't go above two? It should be 8x8 adding the 8 numbers together 8 times. I don't know what I'm doing wrong. Thank you
#include <iostream>
#include <iomanip>
#include <cstdlib>
using namespace std;
// Fills array1 and array2 with random values, adds them element-wise into
// `addition`, and prints each pair together with its sum.
// NOTE(review): every loop stops at index 6 (i < 7, j < 7), so only a 7x7
// region of the 8x8 arrays is ever touched.
// NOTE(review): the printing loops and `return 0;` are nested inside the outer
// loop that computes `addition`, so the function returns during that loop's
// first iteration, when only row 0 of `addition` has been computed.
int main() {
srand(time(0));
int array1[8][8];
int array2[8][8];
int addition[8][8];
// Fill array1 with values in [0, 5].
for (int i = 0; i < 7; i++) {
for (int j = 0; j < 7; j++)
array1[i][j] = rand() % 6;
}
// Fill array2 with values in [0, 7].
for (int i = 0;i < 7; i++) {
for (int j = 0; j < 7; j++) {
array2[i][j] = rand() % 8;
}
}
// Bare block: it opens a new scope and has no other effect.
{
for (int i = 0; i < 7; i++) {
for (int j = 0; j < 7; j++) {
addition[i][j] = array1[i][j] + array2[i][j];
}
// The outer `i` loop opened above is still open here: the brace before this
// comment closed only the inner `j` loop.
for (int i = 0; i < 7; i++) {
for (int j = 0; j < 7; j++) {
cout << array1[i][j];
cout << " " << array2[i][j];
cout << " " << endl;
cout << "both previous numbers added together = " << addition[i][j] << endl;
}
}
// Executed inside the first iteration of the outer addition loop.
return 0;
}
}
}
Hi, look carefully at the code structure.
You had some extra brackets.
This is the correct structure:
#include <iostream>
//using namespace std; - use this only in simple projects (my opinion).
// Fills two 8x8 arrays with random values, stores their element-wise sum in a
// third array, then prints every pair of values together with its sum.
int main()
{
    const int kSize = 8;
    srand(time(0));
    int array1[kSize][kSize];
    int array2[kSize][kSize];
    int addition[kSize][kSize];

    // array1 is filled completely before array2 so the rand() sequence is
    // consumed in the same order as before.
    for (int row = 0; row < kSize; ++row)
        for (int col = 0; col < kSize; ++col)
            array1[row][col] = rand() % 6;

    for (int row = 0; row < kSize; ++row)
        for (int col = 0; col < kSize; ++col)
            array2[row][col] = rand() % 8;

    //{ - you dont need this here
    for (int row = 0; row < kSize; ++row)
        for (int col = 0; col < kSize; ++col)
            addition[row][col] = array1[row][col] + array2[row][col];

    for (int row = 0; row < kSize; ++row)
    {
        for (int col = 0; col < kSize; ++col)
        {
            std::cout << array1[row][col] << " " << array2[row][col] << " " << std::endl;
            std::cout << "both previous numbers added together = " << addition[row][col] << std::endl;
        }
    }
    //} - and you don't need this here
    return 0;
}
Compare this example with your code to see your mistake: the code just wasn't structured properly.
Your code's logic is correct. The mistake lies in the improper bracket structure of the for loops. I have corrected your code and marked the mistakes with comments.
#include <iomanip>
#include <cstdlib>
#include <iostream> //include this header to use "cout"
using namespace std;
// Fills two arrays with random values, adds them element-wise and prints each
// pair of values together with its sum.
// Only indices 0..6 are used, so a 7x7 region of the 8x8 arrays is processed.
int main() {
    srand(time(0));
    int array1[8][8];
    int array2[8][8];
    int addition[8][8];
    for (int i = 0; i < 7; i++) {
        for (int j = 0; j < 7; j++)
            array1[i][j] = rand() % 6;
    }
    for (int i = 0; i < 7; i++) {
        for (int j = 0; j < 7; j++) {
            array2[i][j] = rand() % 8;
        }
    }
    for (int i = 0; i < 7; i++) {
        for (int j = 0; j < 7; j++) {
            addition[i][j] = array1[i][j] + array2[i][j];
        }
    } //add this bracket
    for (int i = 0; i < 7; i++) {
        for (int j = 0; j < 7; j++) {
            cout << array1[i][j];
            cout << " " << array2[i][j];
            cout << " " << endl;
            cout << "both previous numbers added together = " << addition[i][j] << endl;
            //} remove this bracket
        }
    }
    // Return only after both print loops have finished; a return placed inside
    // the outer print loop would stop the program after the first row.
    return 0;
}
Also, to add on, if you want an 8x8 matrix use i<8 and j<8 everywhere in the code. Here you have used i<7 and j<7 which means you get a 7x7 matrix as result.
Logic:
For i=0 to i<7 you have => 0,1,2,3,4,5,6 (stopping at 6 because 6<7 is true and 7<7 is false). So from 0 to 6 there are 7 elements in total.
Hope this helps! :)

C++ 'unresolved errors' using Intel's oneAPI compiler, possibly involved with the Math Kernel Library?

I'm trying to run the following code on an Ubuntu machine:
/**
For compiling -->
export GOMP_CPU_AFFINITY='0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,260,264,268'
export OMP_NUM_THREADS=68
set MKL_NUM_THREADS = 68
icc -std=gnu++98 -O3 -qopenmp -xhost -ansi-alias -ipo -AVX512 mkl_2d_heat_fftw_P.cpp -o mkl_2d_heat_fftw_P -lm -mkl
For running -->
* ./mkl_2d_heat_fftw_P N T numThreads
* Example: ./mkl_2d_heat_fftw_P 1000 100000 1
*/
#include <iostream>
#include <vector>
#include <algorithm>
#include <cstring>
#include <complex.h>
#include "mkl_service.h"
#include "mkl_dfti.h"
#include <string>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <sys/time.h>
#include <cstdio>
#include <omp.h>
// #include <cilk/cilk.h>
// #include <cilk/cilk_api.h>
// #include "cilktime.h"
#ifdef USE_PAPI
#include <papi.h>
#include "papilib.h"
#endif
#ifdef POLYBENCH
#include <polybench.h>
#endif
using namespace std;
// Shorthand for a vector of doubles and for a matrix stored as a vector of rows.
typedef vector<double> vd;
typedef vector<vector<double> > vvd;
// PB abbreviates push_back; SZ(x) is x.size() cast to int.
// NOTE(review): the argument of SZ is not parenthesized, so pass only simple
// expressions such as a variable or an indexed element.
#define PB push_back
#define SZ(x) (int)x.size()
// Side length of the statically allocated grids a1 and a2.
#define MAXN 8010
// Problem parameters; main assigns them from the command-line arguments.
int T, N, N_THREADS;
// initialize() fills the grids with values of the form rand() % BASE.
const int BASE = 1024;
// Grids used by initialize() and by the iterative reference check in verify().
double a1[MAXN][MAXN], a2[MAXN][MAXN];
// double *forward_input_buffer, *backward_output_buffer;
// double complex *forward_output_buffer, *backward_input_buffer;
// Complex work arrays; initialize() allocates N * N elements for each.
double complex *a_complex, *odd_mults, *input_complex;
// Staging buffers handed to the DFTI forward/backward compute calls;
// initialize() allocates N * N elements for each.
double *mkl_forward_input_buffer, *mkl_backward_output_buffer;
double complex *mkl_forward_output_buffer, *mkl_backward_input_buffer;
// Prints the elements of `a` on one line after an "array: " prefix, ends the
// line and flushes cout.
template<class T> void out(const vector<T> &a) {
    cout << "array: ";
    for (int idx = 0; idx < SZ(a); ++idx)
        cout << a[idx] << " ";
    cout << endl;
    cout.flush();
}
// Returns the current time of day in milliseconds, as reported by
// gettimeofday (seconds * 1000 plus microseconds / 1000).
long getTime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    return now.tv_sec * 1000 + now.tv_usec / 1000;
}
// Zero-pads matrix `v` (a vvd) in place so that it has at least `rows` rows
// and every row gains (cols - width) trailing zeros, where `width` is the
// length of the first row.
//   v    : matrix to grow; never shrunk.
//   rows : minimum number of rows after padding.
//   cols : target number of columns, measured against the first row.
// An empty matrix is treated as having width 0 instead of reading v[0], which
// would be undefined behaviour.
void pad_matrix(std::vector<std::vector<double> > &v, int rows, int cols){
    const int width = v.empty() ? 0 : static_cast<int>(v[0].size());
    const int pad_rows = rows - static_cast<int>(v.size());
    const int pad_cols = cols - width;
    // New rows start at the current width; they are widened together with the
    // existing rows below.
    if (pad_rows > 0)
        v.insert(v.end(), pad_rows, std::vector<double>(width, 0.0));
    if (pad_cols > 0){
        for (std::size_t i = 0; i < v.size(); i++)
            v[i].insert(v[i].end(), pad_cols, 0.0);
    }
}
// Replaces `input` with three back-to-back copies of itself, then appends
// zeros to whichever of `input` / `formula` is shorter until both vectors
// have the same length.
void pad_vectors(vd &input, vd &formula)
{
    vd tripled;
    tripled.reserve(input.size() * 3);
    for (int copy = 0; copy < 3; ++copy)
        tripled.insert(tripled.end(), input.begin(), input.end());
    input.swap(tripled);

    if (SZ(input) < SZ(formula))
        input.resize(formula.size(), 0.0);
    else
        formula.resize(input.size(), 0.0);
}
// Prints `msg` followed by ": ", then the matrix one row per line (elements
// separated by a space), then a blank line.
//   v   : matrix to print (a vvd); rows may have different lengths.
//   msg : label printed before the matrix.
// Both arguments are taken by const reference so that a debug print does not
// deep-copy the whole matrix and the label.
void print_matrix(const std::vector<std::vector<double> > &v, const std::string &msg){
    std::cout << msg << ": " << std::endl;
    for (std::size_t i = 0; i < v.size(); i++){
        for (std::size_t j = 0; j < v[i].size(); j++)
            std::cout << v[i][j] << " ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Prints `msg` followed by ": ", then the n x n matrix stored row-major in
// `v` (element (i, j) at v[i*n + j]), one row per line, then a blank line.
//   v   : pointer to at least n * n doubles.
//   n   : side length of the matrix.
//   msg : label printed before the matrix; taken by const reference to avoid
//         copying the string on every call.
void print_matrix_arr(double *v, int n, const std::string &msg){
    std::cout << msg << ": " << std::endl;
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << v[i*n + j] << " ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
void print_complex_matrix(double complex* input_buffer1, double complex* input_buffer2, int n, string msg){
cout << msg << ": " << endl;
for (int i = 0; i < n * n; i++){
if (i % n == 0)
cout << endl;
printf("ratio:%f\t%f%+fi\t \t%f%+fi\n", crealf(input_buffer1[i])/crealf(input_buffer2[i]), crealf(input_buffer1[i]), cimagf(input_buffer1[i]), crealf(input_buffer2[i]), cimagf(input_buffer2[i]));
// cout << (*input_buffer[i]).real() << " " << (*input_buffer[i]).imag() << ",\t";
}
cout << endl;
}
// Prints `msg` followed by ": " and the elements of `v` on a single line,
// each element followed by a space.
//   v   : vector to print (a vd).
//   msg : label printed before the elements.
// Both arguments are taken by const reference so printing does not copy them.
void print_vector(const std::vector<double> &v, const std::string &msg){
    std::cout << msg << ": ";
    for (std::size_t i = 0; i < v.size(); i++)
        std::cout << v[i] << " ";
    std::cout << std::endl;
}
// fftw_plan plan_forward, plan_backward;
// DFTI descriptors: my_desc1_handle is passed to DftiComputeForward and
// my_desc2_handle to DftiComputeBackward. Both are created in mkl_init() and
// released in mkl_destroy().
DFTI_DESCRIPTOR_HANDLE my_desc1_handle = NULL, my_desc2_handle = NULL;
// double mkl_forward_input_buffer[MAXN * MAXN], mkl_backward_output_buffer[MAXN * MAXN];
// double complex mkl_forward_output_buffer[MAXN * MAXN], mkl_backward_input_buffer[MAXN * MAXN];
// Forward DFT of a real-valued matrix.
//   v             : input matrix; it may be smaller than n x n, in which case
//                   it is placed in the top-left corner and zero-padded.
//   output_buffer : receives n * n complex values copied from the transform
//                   output buffer.
//   n             : side length of the transform.
// The whole n x n staging buffer is written on every call. Previously only the
// cells covered by `v` were written, so a matrix smaller than n x n was
// transformed together with whatever the buffer already contained
// (uninitialized memory after malloc, or data from an earlier call).
// Rows or columns of `v` beyond n are ignored so no write leaves the buffer.
void mkl_fft_forward(vvd &v, double complex *output_buffer, int n)
{
    const int sz_i = SZ(v);
    // Width is taken from the first row; an empty matrix has width 0.
    const int sz_j = v.empty() ? 0 : SZ(v[0]);
#pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++){
            const bool inside = (i < sz_i) && (j < sz_j);
            mkl_forward_input_buffer[i*n + j] = inside ? v[i][j] : 0.0;
            mkl_forward_output_buffer[i*n + j] = 0.0;
        }
    // print_matrix_arr(mkl_forward_input_buffer, n, "input bufer");
    DftiComputeForward(my_desc1_handle, mkl_forward_input_buffer, mkl_forward_output_buffer);
#pragma omp parallel for
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++){
            output_buffer[i*n + j] = mkl_forward_output_buffer[i*n + j];
        }
    }
}
// Inverse DFT of complex input array
void mkl_fft_backward(double complex* input_buffer, vvd &output, int n)
{
#pragma omp parallel for
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
{
mkl_backward_input_buffer[i*n + j] = input_buffer[i*n + j];
mkl_backward_output_buffer[i*n + j] = 0.0;
}
DftiComputeBackward(my_desc2_handle, mkl_backward_input_buffer, mkl_backward_output_buffer);
#pragma omp parallel for
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
output[i][j] = mkl_backward_output_buffer[i*n + j]/(n * n * 1.0);
}
// Applies the kernel `a_real` to `input` T times in the frequency domain and
// writes the outcome to `result`.
//   a_real : kernel matrix; transformed into the global a_complex array.
//   input  : data matrix; transformed into the global input_complex array.
//   result : receives the inverse transform of the element-wise product.
// Steps: transform the kernel, raise every element of the transform to the
// power T by repeated squaring (odd_mults accumulates the factors split off
// when the remaining exponent is odd), multiply element-wise with the
// transform of `input`, and transform back. Returns immediately, leaving
// `result` untouched, when T is 0. main() rotates the returned matrix
// afterwards.
void convolution_fftw_2d(vvd &a_real, vvd &input, vvd &result)
{
    if (T == 0)
        return;

    const int n_formula = N;
    const int cells = n_formula * n_formula;
    mkl_fft_forward(a_real, a_complex, n_formula);

    // ---- repeated squaring: a_complex[i] becomes a_complex[i]^T ----
    bool have_odd_part = false; // true once odd_mults holds valid data
    for (int remaining = T; remaining > 1; remaining /= 2){
        if (remaining % 2 != 0){
            if (have_odd_part){
#pragma omp parallel for
                for (int i = 0; i < cells; i++)
                    odd_mults[i] = odd_mults[i] * a_complex[i];
            } else {
#pragma omp parallel for
                for (int i = 0; i < cells; i++)
                    odd_mults[i] = a_complex[i];
                have_odd_part = true;
            }
        }
#pragma omp parallel for
        for (int i = 0; i < cells; i++)
            a_complex[i] = a_complex[i] * a_complex[i];
    }
    if (have_odd_part){
#pragma omp parallel for
        for (int i = 0; i < cells; i++)
            a_complex[i] = a_complex[i] * odd_mults[i];
    }
    // ---- end of repeated squaring ----

    mkl_fft_forward(input, input_complex, N);

    // The product is stored back into a_complex, so no extra array is needed.
#pragma omp parallel for
    for (int i = 0; i < N * N; i++){
        a_complex[i] = input_complex[i] * a_complex[i];
    }

    mkl_fft_backward(a_complex, result, N);
    return;
}
void mkl_init(int n)
{
MKL_LONG status;
MKL_LONG len[2] = {n, n};
len[0] = n; len[1] = n;
status = DftiCreateDescriptor(&my_desc1_handle, DFTI_DOUBLE, DFTI_REAL, 2, len);
status = DftiSetValue(my_desc1_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiSetValue(my_desc1_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
status = DftiSetValue( my_desc1_handle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT );
status = DftiCommitDescriptor(my_desc1_handle);
status = DftiCreateDescriptor(&my_desc2_handle, DFTI_DOUBLE, DFTI_REAL, 2, len);
status = DftiSetValue(my_desc2_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
status = DftiSetValue(my_desc2_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiSetValue( my_desc2_handle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT );
status = DftiCommitDescriptor(my_desc2_handle);
}
void initialize(){
mkl_init(N);
// forward_input_buffer = fftw_alloc_real(N * N);
// backward_output_buffer = fftw_alloc_real(N * N);
// forward_output_buffer = fftw_alloc_complex(N * N);
// backward_input_buffer = fftw_alloc_complex(N * N);
a_complex = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
odd_mults = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
input_complex = (double complex *)malloc(sizeof(double complex) * N * N); //fftw_alloc_complex(N);
mkl_forward_input_buffer = (double *)malloc(sizeof(double) * N * N);
mkl_backward_output_buffer = (double *)malloc(sizeof(double) * N * N);
mkl_forward_output_buffer = (double complex *)malloc(sizeof(double complex) * N * N);
mkl_backward_input_buffer = (double complex *)malloc(sizeof(double complex) * N * N);
for (int i = 0; i < N+2; ++i)
for (int j = 0; j < N+2; j++)
a1[i][j] = a2[i][j] = 1.0 * (rand() % BASE);
}
// Releases both DFTI descriptors and frees every buffer that initialize()
// allocated with malloc. The statuses returned by DftiFreeDescriptor are not
// inspected.
void mkl_destroy(){
    DftiFreeDescriptor(&my_desc1_handle);
    DftiFreeDescriptor(&my_desc2_handle);

    void *buffers[] = {
        a_complex,
        odd_mults,
        input_complex,
        mkl_forward_input_buffer,
        mkl_backward_output_buffer,
        mkl_forward_output_buffer,
        mkl_backward_input_buffer
    };
    const int count = (int)(sizeof(buffers) / sizeof(buffers[0]));
    for (int b = 0; b < count; ++b)
        free(buffers[b]);
}
#define getIdx(i, N) ((i + N) % N)
bool verify(vvd result){
for (int t = 0; t < T; ++t) {
// cout << "t: " << t << endl;
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; j++){
// a2[i] = 0.125 * (a1[i+1] - 2.0 * a1[i] + a1[i-1]);
// cout << i << " " << j << " : " << getIdx(i -1, N) << " " << getIdx(i + 1, N) << " " << getIdx(j - 1, N) << " " << getIdx(j + 1, N) << endl;
// a2[i][j] = a1[getIdx(i - 1, N)][getIdx(j, N)] + a1[getIdx(i, N)][getIdx(j + 1, N)]
// + a1[getIdx(i + 1, N)][getIdx(j, N)] + a1[getIdx(i, N)][getIdx(j - 1, N)];
a2[i][j] = 0.125*a1[getIdx(i - 1, N)][getIdx(j, N)] + 0.125*a1[getIdx(i, N)][getIdx(j + 1, N)]
+ 0.125*a1[getIdx(i + 1, N)][getIdx(j, N)] + 0.125*a1[getIdx(i, N)][getIdx(j - 1, N)]
+ (-2.0*(0.125*2.0) + 1.0)*a1[i][j];
}
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; j++)
a1[i][j] = a2[i][j];
}
// cout << "Final Answer (iter): ";
// for (int i = 0; i < N; i++){
// for (int j = 0; j < N; j++)
// cout << a1[i][j] << " ";
// cout << endl;
// }
// cout << endl;
int cnt = 0;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
if (fabs (a1[i][j] - result[i][j]) > 1e-8)
cnt++;
cout << "Number of Mismatched Cell: " << cnt << endl;
return 0;
}
// Benchmark driver. Usage: <program> N T numThreads
// Parses the three arguments, initializes the global state, runs the
// FFT-based stencil computation once while timing it, rotates the result, and
// prints "N,T,numThreads,seconds". Returns 1 when fewer than three arguments
// are given.
int main(int argc, char *argv[])
{
    if (argc < 4){
        cout << "Enter: N T numThreads" << endl;
        return 1;
    }

    // All three arguments are present from here on.
    int n = atoi(argv[1]);
    int t = atoi(argv[2]);
    int numThreads = atoi(argv[3]);
    omp_set_num_threads(numThreads);

    N = n;
    T = t;
    N_THREADS = numThreads;
    initialize();
#ifdef USE_PAPI
    papi_init();
#endif

    // 3x3 stencil coefficients, copied into a vector-of-vectors kernel.
    int sz_formula = 3;
    double formula[3][3] = {{0, 0.125, 0},
                            {0.125, (-2.0*(0.125*2.0) + 1.0), 0.125},
                            {0, 0.125, 0}};
    vvd a(sz_formula, vd(sz_formula));
    for (int r = 0; r < sz_formula; r++)
        for (int c = 0; c < sz_formula; c++)
            a[r][c] = formula[r][c];

    // The input is the top-left n x n part of the randomly filled grid a1.
    vvd input(n, vd(n)), result(n, vd(n,0.0));
    for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++)
            input[r][c] = a1[r][c];

    long start = getTime();
#ifdef POLYBENCH
    /* Start timer. */
    polybench_start_instruments;
#endif
    convolution_fftw_2d(a, input, result);
#ifdef POLYBENCH
    /* Stop and print timer. */
    polybench_stop_instruments;
    polybench_print_instruments;
#endif
    long end = getTime();

    // The result must be rotated by (T mod N) indices in both dimensions.
    vvd rotated_result(n, vd(n, 0.0));
    for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++)
            rotated_result[r][c] = result[(r+(t%n)) % n][(c+(t%n)) % n];
    // print_matrix(rotated_result, "rotated");

    cout << N << "," << T << "," << numThreads << "," << (end - start) / 1000.0 << endl;
    mkl_destroy();
#ifdef USE_PAPI
    countTotalMiss(p);
    PAPI_shutdown();
    delete threadcounter;
    for (int i = 0; i < p; i++) delete l2miss[i];
    delete l2miss;
    delete errstring;
    delete EventSet;
    delete eventCode;
#endif

    // Timing bracket for the iterative check, which is currently disabled.
    long start_iter = getTime();
    // verify(rotated_result);
    long end_iter = getTime();
    // cout << "Time (Iter): " << end_iter - start_iter << endl;
    return 0;
}
I'm pretty new to running C++ overall (I'm most familiar with Python, which is an interpreted language), so I'm not sure how it all works. This was pre-made code that I have to test before I make my own. I'm on Ubuntu 20.04 and to begin with, I'm not sure how to work everything. I have Visual Studio Code installed, and I'm trying to run the command icc -std=gnu++98 -O3 -qopenmp -xhost -ansi-alias -ipo -AVX512 mkl_2d_heat_fftw_P.cpp -o mkl_2d_heat_fftw_P -lm -mkl through the terminal. I'm not even sure if this is right, but I'm getting the following error messages:
ipo: warning #11021 (6 times): unresolved DftiFreeDescriptor, DftiCommitDescriptor, DftiSetValue, DftiCreateDescriptor_d_md, DftiComputeBackward, DftiComputeForward
ld: cannot find (3 times): -lmkl_intel_lp64, -lmkl_intel_thread, -lmkl_core.
These are eventually meant to run on supercomputers at my institution, but is the compiling command wrong for my local computer? If it helps, I'm running this on a VirtualBox on Windows 10 Pro (I tried running Ubuntu 20.04 LTS on Hyper-V but I never figured out how to successfully connect it to the Internet because whenever I made the VM's switch my WiFi one, it made my computer overall have no Internet access).
Looking at it, it seems like the Math Kernel Library is the problem, though I'm not too sure. Did I mess up the installation of the oneAPI Base Toolkit? I have every oneAPI Toolkit installed (and all of their features too, including FPGA support).
Or am I just compiling it wrong as the script described listed in the top comment is for a supercomputer (I'm not sure what to do with the first three lines, like export GOMP_CPU_AFFINITY, export OMP_NUM_THREADS, and set MKL_NUM_THREADS).
Thanks for any help in advance!

Error in my code - Boolean Truth Table

I am currently working on a program that prints a 5-variable truth table. I am using a 2D array. My code currently produces the table, but then reports an error: "Stack around the variable 'table' was corrupted." Any help?
#include <iostream>
using namespace std;
// Builds and prints a truth table for 5 variables: row i holds the five low
// bits of i, least significant bit first.
// NOTE(review): `table` is declared as 5 rows of 32 columns, but both loops
// index it as table[i][j] with i running up to 31, so the first index exceeds
// the declared bound of 5 for every i >= 5.
int main() {
bool table[5][32];
// Fill: bit j of i is stored at table[i][j].
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 5; j++) {
table[i][j] = ((i >> j)& 1);
}
}
// Print one row per value of i.
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 5; j++) {
cout << table[i][j] << " ";
}
cout << endl;
}
return 0;
}
This is homework, so I would like to understand it, not just have an answer.
The index is wrong. Only table[0] to table[4] are available, so accessing table[5] to table[31] is illegal.
Try this:
#include <iostream>
using namespace std;
// Builds and prints a truth table for 5 variables: row `value` holds the five
// low bits of `value`, least significant bit first.
int main() {
    bool table[32][5]; // swap 32 and 5

    for (int value = 0; value < 32; ++value)
        for (int bit = 0; bit < 5; ++bit)
            table[value][bit] = ((value >> bit) & 1) != 0;

    for (int value = 0; value < 32; ++value) {
        for (int bit = 0; bit < 5; ++bit)
            cout << table[value][bit] << " ";
        cout << endl;
    }
    return 0;
}
There is an attempt to read out-of-bounds values from the array.
If you need a 5x32 matrix, use the code below:
for (int i = 0; i < 5; i++) { // 32-> 5
for (int j = 0; j < 32; j++) { // 5->32
If you need a 32x5 matrix, then change the declaration as below:
bool table[32][5]; //it was table[5][32];

Joining threads in a loop - Conversion error

So I'm trying to join threads in a for loop but it's giving me the error:
invalid conversion from 'pthread_t* {aka long unsigned int*}' to
'pthread_t {aka long unsigned int}'.
The code is below; any help would be greatly appreciated!
Thanks in advance!
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
int threadArray[10][10];
int arrayVar[2];
using namespace std;
// Thread entry point. Reads the row range [arrayVar[0], arrayVar[1]) from the
// global arrayVar, computes (x * 2 + 4) * 2 + 4 for every element of those
// rows of threadArray into a local array, then prints the computed rows.
// `arg` is not used. Always returns NULL.
void *calc(void *arg){
    const int firstRow = arrayVar[0];
    const int lastRow = arrayVar[1];
    int transformed[10][10];

    for (int r = firstRow; r < lastRow; ++r)
        for (int c = 0; c < 10; ++c)
            transformed[r][c] = (threadArray[r][c] * 2 + 4) * 2 + 4;

    for (int r = firstRow; r < lastRow; ++r){
        for (int c = 0; c < 10; ++c)
            cout << transformed[r][c] << " ";
        cout << endl;
    }
    return NULL;
}
// Reads a 10x10 integer grid from "numbers.txt" into threadArray, prints it,
// then picks a random rows-per-thread value (1, 2 or 5) and starts
// 10 / noOfRows threads running calc(), each meant to handle one band of rows.
int main(){
int rc;
int start = 0;
int end;
ifstream numFile;
numFile.open("numbers.txt");
// The grid is parsed and printed only when the file could be opened.
if (numFile.is_open()){
for (int row = 0; row < 10; row++){
std::string line;
std::getline(numFile, line);
std::stringstream iss(line);
// Values on a line are separated by single spaces.
for (int col = 0; col < 10; col++){
std::string num;
std::getline(iss, num, ' ');
std::stringstream converter(num);
converter >> threadArray[row][col];
}
}
cout << "Original 2D Array" << endl << endl;
for (int i = 0; i < 10; i++){
for (int j = 0; j < 10; j++){
cout << threadArray[i][j] << " ";
}
cout << endl;
}
cout << endl;
}
srand (time(NULL) );
const int rowArray[3] = {1, 2, 5};
int arrayIndex = rand() % 3;
int noOfRows = (rowArray[arrayIndex]);
end = noOfRows;
int noOfThreads = 10 / noOfRows;
// Array sized by a run-time value.
pthread_t threads[noOfThreads];
// NOTE(review): this is an expression statement, not a declaration; it names
// index 2 of the two-element global arrayVar and stores nothing.
arrayVar[2];
cout << "2D Array Altered" << endl << endl;
for (int t = 0; t < noOfThreads; t++){
// The row range is handed over through the global arrayVar, which calc()
// reads when it starts.
// NOTE(review): arrayVar is overwritten on the next iteration with no
// synchronization visible here, so a thread may read a later range.
arrayVar[0] = start;
arrayVar[1] = end;
rc = pthread_create(&threads[t], NULL, calc, NULL);
start = start + noOfRows;
end = end + noOfRows;
}
for (int t = 0; t < noOfThreads; t++){
// NOTE(review): this passes a pthread_t* where pthread_join takes a pthread_t
// by value; this is the call the quoted conversion error refers to.
rc = pthread_join(&threads[t], NULL);
}
pthread_exit(NULL);
}
I think threads[t] is actually just the thread's ID; it's an integer, and you should pass it in by value:
pthread_join(threads[t], NULL)

Multiplication with AVX

This is my first time using AVX. I'm trying to perform a simple multiplication on double-precision numbers, but not all of the results are correct.
Only the first 4 results are right; the others are garbage.
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <time.h>
using namespace std;
// Allocates two 64-byte-aligned arrays of 8 doubles, fills `a` with random
// values (printing each) and `b` with zeros, squares elements of `a` into `b`
// using AVX, prints `b`, and frees both arrays.
int main() {
double *a, *b; // data pointers
double *pA,*pB; // work pointer
__m256d rA_AVX, rB_AVX; // variables for AVX
const int vector_size = 8;
a = (double*) _mm_malloc (vector_size*sizeof(double),64);
b = (double*) _mm_malloc (vector_size*sizeof(double),64);
for(int i = 0; i < vector_size; i++) {
a[i] = (rand() % 48);
b[i] = 0.0f;
cout << a[i] << endl;
}
// NOTE(review): the step is 8 while vector_size is 8, so this loop body runs
// exactly once. A 256-bit __m256d holds four doubles, so the single
// load/multiply/store below covers a[0..3] and b[0..3] only.
for (int i = 0; i < vector_size; i += 8)
{
// The work pointers are reset to the start of the arrays on every iteration,
// so the increments at the end of the body never affect a later load.
pA = a;
pB = b;
rA_AVX = _mm256_load_pd(pA);
rB_AVX = _mm256_mul_pd(rA_AVX,rA_AVX);
_mm256_store_pd(pB,rB_AVX);
pA += 8;
pB += 8;
}
for (int i=0; i<vector_size; i++){
cout << endl << b[i] << endl;
}
_mm_free(a);
_mm_free(b);
system("PAUSE");
return 0;
}