I have two __m256i vectors, filled with 32 8-bit integers. Something like this:
// NOTE: the initializer {2} sets only element 0; the remaining 31 elements are zero-initialized.
__int8 *a0 = new __int8[32] {2};
// Likewise: a1[0] == 3 and a1[1..31] == 0.
__int8 *a1 = new __int8[32] {3};
// Unaligned 256-bit loads of the 32 bytes behind each pointer.
__m256i v0 = _mm256_loadu_si256((__m256i*)a0);
__m256i v1 = _mm256_loadu_si256((__m256i*)a1);
How can I multiply these vectors, using something like _mm256_mul_epi8(v0, v1) (which does not exist) or any other way?
I want 2 vectors of results, because the output element width is twice the input element width. Or something that works similarly to _mm_mul_epu32 would be ok, using only the even input elements (0, 2, 4, etc.)
You want the result separated into two vectors, so here is my suggestion for your question. I've tried to keep it clear, simple and practical:
#include <stdio.h>
#include <x86intrin.h>
void _mm256_print_epi8(__m256i );
void _mm256_print_epi16(__m256i );
void _mm256_mul_epi8(__m256i , __m256i , __m256i* , __m256i* );
int main()
{
char a0[32] = {1, 2, 3, -4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, -24, 25, 26, 27, 28, 29, 30, 31, 32};
char a1[32] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -13, 14, 15, 16, 17, 18, 19, -20, 21, 22, 23, 24, -25, 26, 27, 28, 29, 30, 31, 32, 33};
__m256i v0 = _mm256_loadu_si256((__m256i*) &a0[0]);
__m256i v1 = _mm256_loadu_si256((__m256i*) &a1[0]);
__m256i r0, r1;//for 16 bit results
_mm256_mul_epi8(v0, v1, &r0, &r1);
printf("\nv0 = ");_mm256_print_epi8(v0);
printf("\nv1 = ");_mm256_print_epi8(v1);
printf("\nr0 = ");_mm256_print_epi16(r0);
printf("\nr1 = ");_mm256_print_epi16(r1);
printf("\nfinished\n");
return 0;
}
// Element-wise widening multiply of two vectors of 32 signed 8-bit integers.
//   v0, v1 : 8-bit input vectors (32 elements each)
//   r0     : receives the 16-bit products of elements 0..15
//   r1     : receives the 16-bit products of elements 16..31
// Each input half is sign-extended to 16 bits before multiplying, so the low
// 16 bits kept by _mm256_mullo_epi16 hold the exact product (|a*b| <= 128*128).
// NOTE: identifiers starting with an underscore at file scope are reserved for
// the implementation; the name is kept so existing callers continue to work.
void _mm256_mul_epi8(__m256i v0, __m256i v1, __m256i* r0, __m256i* r1)
{
    // Low 128-bit lane: the cast is free (no instruction is generated).
    const __m256i lo0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(v0));
    const __m256i lo1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(v1));
    *r0 = _mm256_mullo_epi16(lo0, lo1);

    // High 128-bit lane: use the AVX2 integer extract rather than the
    // floating-point-domain _mm256_extractf128_si256.
    const __m256i hi0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(v0, 1));
    const __m256i hi1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(v1, 1));
    *r1 = _mm256_mullo_epi16(hi0, hi1);
}
/* Prints the 32 bytes of vec as char values, each formatted as " %3i,". */
void _mm256_print_epi8(__m256i vec)
{
    char lanes[32];
    const char *p;

    _mm256_storeu_si256((__m256i*)lanes, vec);
    for (p = lanes; p != lanes + 32; ++p)
        printf(" %3i,", *p);
}
/* Prints the 16 short elements of vec, each formatted as " %3i,". */
void _mm256_print_epi16(__m256i vec)
{
    short lanes[16];
    const short *p;

    _mm256_storeu_si256((__m256i*)lanes, vec);
    for (p = lanes; p != lanes + 16; ++p)
        printf(" %3i,", *p);
}
The output is:
[martin@mrt Stack over flow]$ gcc -O2 -march=native mul_epi8.c -o out
[martin@mrt Stack over flow]$ ./out
v0 = 1, 2, 3, -4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, -24, 25, 26, 27, 28, 29, 30, 31, 32,
v1 = 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -13, 14, 15, 16, 17, 18, 19, -20, 21, 22, 23, 24, -25, 26, 27, 28, 29, 30, 31, 32, 33,
r0 = 2, 6, 12, -20, 30, 42, 56, 72, 90, -110, 132, -156, 182, 210, 240, 272,
r1 = 306, 342, -380, 420, 462, 506, 552, 600, 650, 702, 756, 812, 870, 930, 992, 1056,
finished
[martin@mrt Stack over flow]$
NOTE: I've commented the intermediate results tmp0 and tmp1 in the recommended code.
In addition, as Peter suggested in the comments (with a godbolt link): if your inputs come from memory and you don't already have the elements in 256-bit vectors, you can load 128 bits at a time and use this code:
#include <immintrin.h>
// v0 and v1 each hold sixteen 8-bit elements. Both are widened to 16 bits with
// _mm256_cvtepi8_epi16 and multiplied; the sixteen 16-bit products are returned.
__m256i mul_epi8_to_16(__m128i v0, __m128i v1)
{
    return _mm256_mullo_epi16(_mm256_cvtepi8_epi16(v0), _mm256_cvtepi8_epi16(v1));
}
__m256i mul_epi8_to_16_memsrc(char *__restrict a, char *__restrict b){
__m128i v0 = _mm_loadu_si128((__m128i*) a);
__m128i v1 = _mm_loadu_si128((__m128i*) b);
return mul_epi8_to_16(v0, v1);
}
/* Minimal driver for the memory-source variant; the result is computed but not printed. */
int main()
{
    char lhs_bytes[32] = {1, 2, 3, -4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, -24, 25, 26, 27, 28, 29, 30, 31, 32};
    char rhs_bytes[32] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -13, 14, 15, 16, 17, 18, 19, -20, 21, 22, 23, 24, -25, 26, 27, 28, 29, 30, 31, 32, 33};
    __m256i products = mul_epi8_to_16_memsrc(lhs_bytes, rhs_bytes);
}
I wrote a code for calculating the max subarray using brute force method. My code reads a number of arrays from an input file and returns the output file, which contains the max subarray and the sum value.
Everything works fine except the first max subarray on the output file always contains a really large number at the end, which gets added to the sum value. The subsequent sub-arrays don't have this problem. I've included an example at the bottom of this post.
I can't figure out where I went wrong. Any help would be greatly appreciated!
Here is the function that runs the algorithm and prints it to output file:
// Algorithm 1 (brute force): finds the contiguous subarray of a[0..size-1] with
// the largest sum and appends the subarray and its sum to the file `filename`.
// For every right end, the left end is walked backwards while a running sum is
// kept; only a strictly larger sum replaces the current best.
void a1(int a[], int size, string filename){
    int bestSum = a[0];
    int first = 0;
    int last = 0;
    for (int hi = 0; hi < size; ++hi) {
        int running = 0;
        for (int lo = hi; lo >= 0; --lo) {
            running += a[lo];
            if (running <= bestSum) continue;
            bestSum = running;
            first = lo;
            last = hi;
        }
    }

    // append the result to the output file
    ofstream output(filename.c_str(), ios::out | ios::app);
    output << "\nMax sum array: ";
    for (int i = first; i < last; ++i) {
        output << a[i] << ", ";
    }
    output << a[last];
    output << "\nMax sum value: " << bestSum << "\n";
    output.close();
}
Here is the main file:
// Reads bracketed, comma-separated integer arrays ("[1, 2, -3]") from
// MSS_Problems.txt and runs a1() on each one, appending to MSS_Results.txt.
//
// The parser works directly on the input stream with formatted extraction, so
// whitespace, line breaks inside an array and trailing characters after ']'
// (for example '\r' from Windows line endings) are skipped. The previous
// eof()/in_avail() based loop counted one extra element whenever an extraction
// failed, which showed up as a garbage value at the end of the subarray.
int main() {
    const int kMaxElements = 50;
    int a[kMaxElements];
    string outputfile = "MSS_Results.txt";
    //print title
    ofstream output;
    output.open(outputfile.c_str(), ios::out | ios::app);
    output << "Algorithm 1:\n";
    output.close();
    //read file and run a1
    ifstream inputFile;
    inputFile.open("MSS_Problems.txt");
    char c;
    while (inputFile >> c) {
        if (c != '[') continue;            // skip everything up to the next array start
        int size = 0;
        bool overflow = false;
        for (;;) {
            int value;
            if (inputFile >> value) {
                if (size < kMaxElements) a[size++] = value;
                else overflow = true;      // never write past the end of a[]
            } else {
                inputFile.clear();         // not a number (e.g. "[]"): recover and look at the separator
            }
            // the next non-space character is either ',' (more values) or ']' (done)
            if (!(inputFile >> c) || c == ']') break;
        }
        // arrays that do not fit into a[] are skipped rather than silently truncated
        if (size > 0 && !overflow) a1(a, size, outputfile);
    }
    inputFile.close();
    return 0;
}
Example of input file:
[1, 2, 4, -1, 4, -10, 4, -19, 18, -1, -3, -4, 11, 3, -20, 19, -33, 50, 66, -22, -4, -55, 91, 100, -102, 9, 10, 19, -10, 10, 11, 11, -10, -18, 50, 90]
[12, 12, 14, -88, -1, 45, 6, 8, -33, 2, 8, -9, -33, -8, -23, -77, -89, 1, 9, 10, 92, 87]
[565, 78, 33, 9, 10, 84, 71, -4, -22, -55, -10, 76, -9, -9, -11, 76, 89, 11, 10, -33, 9]
[2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Example of output file:
Algorithm 1:
Max sum array: 50, 66, -22, -4, -55, 91, 100, -102, 9, 10, 19, -10, 10, 11, 11, -10, -18, 50, 90, 3897136
Max sum value: 3897432
Max sum array: 1, 9, 10, 92, 87, 91
Max sum value: 290
Max sum array: 565, 78, 33, 9, 10, 84, 71, -4, -22, -55, -10, 76, -9, -9, -11, 76, 89, 11, 10, -33, 9, 87
Max sum value: 1055
Max sum array: 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 11
Max sum value: 103
As you can see, for the first array, there is a 3897136 that does not belong to the original array.
If I delete the first line from the input, the input looks like this:
[12, 12, 14, -88, -1, 45, 6, 8, -33, 2, 8, -9, -33, -8, -23, -77, -89, 1, 9, 10, 92, 87]
[565, 78, 33, 9, 10, 84, 71, -4, -22, -55, -10, 76, -9, -9, -11, 76, 89, 11, 10, -33, 9]
[2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Now my output looks something like this:
Algorithm 1:
Max sum array: 1, 9, 10, 92, 87, 624
Max sum value: 823
Max sum array: 565, 78, 33, 9, 10, 84, 71, -4, -22, -55, -10, 76, -9, -9, -11, 76, 89, 11, 10, -33, 9, 87
Max sum value: 1055
Max sum array: 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
Max sum value: 92
I initialized the array incorrectly, which is why it sometimes gave me a garbage number at the end. To initialize properly, I simply changed
a[100];
to
a[100] = {0}
and that fixed the problem of abnormally large numbers at the end of the array.
I then moved a[100] = {0} into the while loop where the code reads the input file. That seems to have fixed the new issue of reading wrong elements into the array.
Final unresolved issue: 0 at the end of array.
Will update once I solve that.
Since the whole problem is to find the maximum subarray, the "large number" at the end has to be added to produce the correct result.
In the first example that you provided, all the numbers were positive.
This means that the maximum sum subarray will actually be the sum of all the array elements.
Your algorithm part is OK;
Basically, I took my C++ code (which works correctly) and rewrote it in CUDA (I have no experience with CUDA). One part of the code (the solve() kernel) is not working correctly and I really don't know why.
So my first question is: what exactly does the "unspecified launch failure" error during cudaMemcpy mean, and why is it happening in my code?
My second question is: why do the variables backup_ans and ans differ when they compute the same thing?
#include "stdio.h"
#include <algorithm>
// Table of 1024 SPRP bases; prime() selects one entry by hashing its argument.
// Filled from the host with cudaMemcpyToSymbol in main().
__device__ unsigned int primes[1024];
// Exclusive upper bound of the searched range: 2^32, i.e. the number of 32-bit unsigned integers.
__device__ long long n = 1ll<<32; // #unsigned_integers
// Bucket index of x: drops the lowest bit and keeps the remainder modulo 1024.
__device__ int hashh(long long x) {
    const long long halved = x>>1;
    return halved%1024;
}
// Modular exponentiation: returns (x^e) % n using square-and-multiply.
// (Despite the name this is a modular power, not a modular product.)
// Products are formed in 64-bit arithmetic, so the result is only exact while
// n <= 2^32: every operand is then < 2^32 and operand*operand fits in 64 bits.
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
    unsigned long long ans = 1;
    x %= n; // reduce the base first so the first x*x cannot overflow for a large x
    while(e>0) {
        if(e&1) ans = (ans*x)%n; // this bit of the exponent is set: multiply it in
        x = (x*x)%n;
        e>>=1;
    }
    return ans;
}
// Strong-probable-prime test: returns 1 if n is a strong probable prime to
// base a, otherwise 0.
// n must be ODD.
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
    // write n-1 as odd * 2^twos
    unsigned long long odd = n-1;
    int twos = 0;
    for(; odd%2==0; odd>>=1) ++twos;

    unsigned long long cur = mulmod(a,odd,n);
    if(cur==1) return 1;
    // look for n-1 among a^odd, a^(2*odd), ..., a^(2^(twos-1)*odd)
    for(int round=0;round<twos;++round) {
        if(cur==n-1) return 1;
        cur=(cur*cur)%n;
    }
    return 0;
}
// Single-base test: hashes x into the primes[] table to pick a base and
// returns the result of is_SPRP for x with that base.
__device__ int prime(long long x) {
    const long long slot = (((long long)0xAFF7B4*x)>>7)%1024;
    const unsigned long long base = (unsigned long long)primes[slot];
    return is_SPRP(base,(unsigned long long)x);
}
// Copies to `out` every odd integer i in this thread's range (and below n) that
//   - is not divisible by 3, 5 or 7,
//   - is rejected by prime(i) (the author's comment calls these COMPOSITE), and
//   - has hashh(i) == 0.
// out: destination array for the selected numbers
// c:   running count of elements stored in out; atomicAdd on it reserves the
//      output range for each flush
// 335545 is the number of consecutive integers each thread scans; the author describes it
// as a magic constant to distribute all integers equally over all 400*32 threads
__global__ void find(unsigned int *out,unsigned int *c) {
unsigned int buff[4096]; // per-thread staging buffer, flushed to out when full
int local_c = 0;         // number of valid entries currently in buff
// this thread scans [b, e); the first thread starts at 121
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
long long e = b+335545;
if(b%2==0) ++b; // start on an odd number; the loop then steps by 2
for(long long i=b;i<e && i<n;i+=2) {
if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
if(hashh(i)==0) {
buff[local_c++]=(unsigned int)i;
if(local_c==4096) {
// buffer full: reserve local_c slots in out and copy the buffer there
int start = atomicAdd(c,local_c);
for(int i=0;i<local_c;++i) out[i+start]=buff[i];
local_c=0;
}
}
}
// final flush of whatever is left in the buffer
int start = atomicAdd(c,local_c);
for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}
// Finds the bases for which NO element of input is a strong probable prime.
// The loop below tests bases 2..32; bit (base-2) of the 32-bit result is set when the base qualifies.
// input:  numbers to test
// count:  pointer to the number of elements in input
// backup: receives each thread's partial result, indexed by global thread id
// ans:    every block ANDs its combined result into *ans (atomicAnd)
// NOTE(review): s has 32 entries and is indexed by threadIdx.x, so this kernel
// relies on blockDim.x <= 32.
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
__shared__ unsigned int s[32]; // one partial result per thread of the block
// each thread handles the index range [b, e) of input, dif elements at most
unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
unsigned int e = b+dif>(*count)?(*count):b+dif; // clamp the end to count
unsigned int mysol = 0;
for(long long i = 2; i<33; ++i) {
int sol = 1;
// each thread does its part
for(unsigned int j = b; j<e ; ++j) {
// if some element is SPRP to base i, this base is rejected
if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
sol=0;
break;
}
}
// if all elements passed, record base i in mysol
// (a thread whose range is empty keeps sol==1 for every base)
if(sol==1) mysol|=1<<(i-2);
}
s[threadIdx.x] = mysol;
// save this thread's result
backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
__syncthreads(); // all s[] entries must be written before thread 0 reads them
// thread 0 combines the block's results and merges them into ans
if(threadIdx.x==0) {
unsigned int global_sol = ~0;
for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
atomicAnd(ans,global_sol);
}
}
// Host driver: uploads the base table, runs find() to collect candidate numbers,
// sorts them on the host, runs solve() on them, and finally compares the
// device-side result (ans) with the AND of all per-thread results (backup_ans).
int main(void) {
// number of blocks & thread for solve
const int blocks = 400;
const int threads = 32;
// host copy of the 1024-entry base table that is uploaded into the device symbol `primes`
unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 
34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };
// upload the table (1024 entries * 4 bytes) into the device symbol `primes`
printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));
/*-----*/
// allocate buffers: dev_input holds up to 1<<23 numbers, dev_count is a single counter
unsigned int *dev_input,*dev_count;
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23))));
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,4)));
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,4)));
// NOTE(review): the result of the kernel launch and of cudaDeviceSynchronize is not
// checked here, so a failure of find() only shows up in a later API call.
find<<<400,32>>>(dev_input,dev_count);
cudaDeviceSynchronize();
unsigned int count;
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost)));
// sort found elements just to make debugging easier, it is not necessary
// NOTE(review): backup_numbers has room for 1000000 elements but count is not checked against that.
unsigned int *backup_numbers = new unsigned int[1000000];
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;
printf("\nsize: %u\n",count);
// allocate buffers: one partial result per thread (dev_backup) and the combined result (dev_ans)
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads)));
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4)));
// start with all bits set so that the atomicAnd in solve() can only clear bits
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4)));
solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
cudaDeviceSynchronize();
unsigned int ans,*backup;
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost)));
backup = new unsigned int[400*32];
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;
// compute global result using the backed-up per-thread results
// the author expects backup_ans and ans to be identical, and reports that they are not
// NOTE(review): this loop reads all blocks*threads entries of backup; entries that the
// launched threads did not write (e.g. with a smaller launch such as <<<1,1>>>) are uninitialized.
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans);
printf("%u\n",backup[48]);
delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}
All code except the solve() kernel works as intended. The solve() kernel produces wrong results (backup_ans and ans differ), and it also gives me the "unspecified launch failure" error on the last two cudaMemcpy calls.
When i run solve<<<1,1>>>(...) i got
ans: 134816642 backup_ans 432501552
but when i run solve<<<400,32>>>(...) it gives me
ans: 134816642 backup_ans 0
(correct answer should be 0)
In all situations it should compute backup_ans=ans=0
Any advice what i am doing wrong would be helpful.
Code for generating primes.bin
#include <cstdlib>
#include <stdio.h>
using namespace std;
// Exclusive upper bound of the sieve: 2^32.
const unsigned long long n = 1ll<<32;
// Not referenced by the code shown here.
const int buffer_size = 2000000;
typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;
// Bitmap over the odd numbers: number x lives in byte x/16, bit (x&15)>>1.
// A set bit marks x as composite (see prime() and eratosten_sieve()).
uch *primes;
// Returns non-zero if x is 2 or an odd number whose composite bit is still
// clear in the bitmap, and 0 otherwise. The non-zero value is the bit mask,
// not necessarily 1.
int prime(long long x) {
    if(x==2) return 1;
    if(x%2==0) return 0;
    const long long byteIndex = x/16;
    const long long bitIndex = (x&15)>>1;
    const int mask = 1<<bitIndex;
    return mask&(~(primes[byteIndex]));
}
// Sieve of Eratosthenes over the odd numbers below n: for every p that is still
// marked prime, sets the composite bit of p*p, p*p+2p, p*p+4p, ...
void eratosten_sieve(void) {
    for(long long p=3;p*p<n;++p) {
        if(!prime(p)) continue;
        const long long step = p<<1; // skip the even multiples
        for(long long m=p*p;m<n;m+=step) {
            primes[m/16]|=(1<<((m&15)>>1));
        }
    }
}
int main(void) {
primes = new uch[(n/16)+1];
for(long long i=0;i<(n/16)+1;++i) primes[i]=0;
printf("generating\n");
eratosten_sieve();
int l = n/16 +1;
printf("writing\n");
FILE *f = fopen("primes.bin","wb");
fwrite(primes,1,l,f);
fclose(f);
printf("done\n");
delete[] primes;
}
PS: i am compiling it by nvcc -arch compute_11
CUDA Driver Version / Runtime Version 5.5 / 5.5
CUDA Capability Major/Minor version number: 1.1
Total amount of global memory: 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, ( 8) CUDA Cores/MP: 112 CUDA Cores
GPU Clock rate: 1500 MHz (1.50 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 256-bit
Maximum Texture Dimension Size (x,y,z) 1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(8192, 8192), 512 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per multiprocessor: 768
Maximum number of threads per block: 512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 1)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 256 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS
OK, you are out of memory. It took me a while to figure out because I was not thinking about the large static allocation:
__device__ unsigned char primes[(1<<28)+1];
Normally when folks are out of memory, they discover it on a cudaMalloc operation. In your case, your GPU has 1GB of memory, and I am guessing you are also hosting a display on it (you didn't answer that question). Take a look at how much free memory there is in the nvidia-smi -a output, it will look something like this:
FB Memory Usage
Total : 1535 MiB
Used : 3 MiB
Free : 1532 MiB
Your numbers will be smaller - the Free line is what we care about.
Your dynamic allocations (ie. from cudaMalloc) are allocating about 350MB. But the kernel launch brings the static allocation into play, and then your total footprint rises to over 700MB (2^28 is over 250MB). If you have a display running on that GPU, it will consume some of the 1GB of memory, leaving you with not enough to run a kernel that requires 700MB.
If you want to run on that GPU, see if you can pare your problem size down somehow.
And it's always good to do proper cuda error checking, but apart from this issue, your code seems to run with no errors for me on devices with more memory.
I have two questions,
Q1.
The code is below:
(* orgtable: for i = 1..36, a row of eight node numbers derived from i *)
orgtable = Table[{i, node2 = i + 1, node3 = node2 + 6, node4 = node3 - 1,
node5 = i + 18, node6 = node5 + 1, node7 = node6 + 6,
node8 = node7 - 1}, {i, 1, 36}
];
(* modtable: orgtable without rows 6, 12, ..., 36 *)
modtable = Drop[orgtable, {6, 36, 6}];
(* finaltable: blocks of n = 5 rows taken at an offset of n + m = 15 rows, flattened back into one list of rows *)
finaltable = With[{n = 5, m = 10},Flatten[Partition[modtable, n, n + m, 1, {}], 1]]
The first piece of code gives me an original table, the second one gives me a modified table, and the third yields the final table.
The output of the final table looks like this:
{{1, 2, 8, 7, 19, 20, 26, 25}, {2, 3, 9, 8, 20, 21, 27, 26},
{3, 4, 10, 9, 21, 22, 28, 27}, {4, 5, 11, 10, 22, 23, 29, 28},
{5, 6, 12,11, 23, 24, 30, 29}, {19, 20, 26, 25, 37, 38, 44,43},
{20, 21, 27,26, 38, 39, 45, 44}, {21, 22, 28, 27, 39, 40, 46, 45},
{22, 23, 29,28, 40,41, 47, 46}, {23, 24, 30, 29, 41, 42, 48, 47}}
But I want to add a counter to the final table, so that my output looks like the example below. The counter increases by 1 for each row, and in this example it starts at 200:
{{200,1, 2, 8, 7, 19, 20, 26, 25}, {201,2, 3, 9, 8, 20, 21, 27, 26},
{202,3, 4,10, 9, 21,22, 28, 27}, {203,4, 5, 11, 10, 22, 23, 29, 28},
{204,5, 6, 12,11, 23, 24, 30, 29} and so on
As you can see from the desired output the count is present for each element and increases by one
Now question number two:
(* Rows of {counter, xcord, ycord, value} for xcord, ycord = 0, 5, ..., 200. *)
(* ++mycounter increments before use, so the first row is numbered 101. *)
mycounter = 100;
tryone =
TableForm[
Flatten[
Table[{++mycounter, xcord, ycord,
(* fixed: the closing parenthesis after "xcord - 90" was missing, which left the brackets unbalanced *)
(150*(Sin[((xcord - 90)*2*3.14)/180]^2)*
(Sin[((ycord - 45)*2*3.14)/180]^2)
) + 20
}, {xcord, 0, 200, 5}, {ycord, 0, 200, 5}
], 1
]
]
In the above example, I have successfully implemented a counter which is starting from 100 and incrementing by 1 and it gives me an output
100 0 0 20.03
101 0 5 20.04 and so on..
But now I want to use the Transpose function on this, since I want to transpose the value presented but at the same time I don't want to transpose the "my counter".
(* Same table as before, but transposed before flattening. The counter is *)
(* generated inside Table, so Transpose reorders the counter values together with the rows. *)
mycounter = 100;
secondtry=
TableForm[
Flatten[
Transpose[
Table[{++mycounter, xcord, ycord,
(150*(Sin[((xcord - 90)*2*3.14)/180]^2)*
(Sin[((ycord - 45)*2*3.14)/180]^2)
) +20}, {xcord, 0, 200, 5}, {ycord, 0, 200, 5}
]
], 1
]
]
But as you can see the Transpose function transposes also the "mycounter" which I do not want. How do you prevent the transpose function from working on "mycounter" but work on the rest of it?
Any other idea of implementing a counter in the above code is also welcome.
Removed answer to first question as I probably didn't understand what you wanted.
As to the second question: I'm not sure whether I fully understand you here. If the counter belongs to the coordinate set the output should be left as it is, how awkward it may look. If the counter column is simply a line counter of the final output you could put in after you have done your flattening just like before.
But in this case, it seems the Transpose is fully superfluous. It suffices to switch the order of the indices of your table. If you do that you can leave your counter as it is:
(* Instead of transposing, the two iterators are swapped so that ycord is the outer loop. *)
(* mycounter++ returns the value before incrementing, so numbering starts at 100. *)
mycounter = 100;
secondtry =
Flatten[
Table[{mycounter++, xcord,ycord,
(150*(Sin[((xcord - 90)*2*3.14)/180]^2)*
(Sin[((ycord - 45)*2*3.14)/180]^2)
) + 20},
{ycord,0, 200, 5}, {xcord, 0, 200, 5} (* order switched here *)
], 1
]
A few notes: I removed the TableForm from your assignment. This is generally only used for printing and not for data that gets assigned to a variable. If you want to do an assignment and want to see the result at the same time you could try something like
(myVar = Table[...{...},{...}] ) //TableForm
Also note that you don't have to multiply by 3.14/180 to convert degrees to radians. Mathematica has a built-in quantity named Degree for that (if you use the shortcut esc deg esc you will have a nice degree symbol instead). It looks like you are multiplying with 2 pi/180 for this conversion. If that was your intention, it was incorrect. The conversion is either 2 pi/360 or pi/180. ((xcord - 90)*2*3.14)/180 should then be written as (xcord - 90)Degree.
Question 1 :
(* Prepends a running counter to every row: the counter column is added as the first row of the transposed table. *)
(* Range[k] + 199 yields 200, 201, ..., so numbering starts at 200 as requested. *)
(* The pure function is applied with prefix @ (the character had been garbled to #). *)
Transpose[Prepend[Transpose[#], Range[Length[#]] + 199]] & @
{{1, 2, 8, 7, 19, 20, 26, 25}, {2, 3, 9, 8, 20, 21, 27, 26}, {3, 4,
10, 9, 21, 22, 28, 27}, {4, 5, 11, 10, 22, 23, 29, 28}, {5, 6, 12,
11, 23, 24, 30, 29}, {19, 20, 26, 25, 37, 38, 44, 43}, {20, 21, 27,
26, 38, 39, 45, 44}, {21, 22, 28, 27, 39, 40, 46, 45}, {22, 23,
29, 28, 40, 41, 47, 46}, {23, 24, 30, 29, 41, 42, 48, 47}}
Question2:
(* Builds the table without a counter, flattens it one level, prepends a counter column *)
(* (100, 101, ...) and partitions the rows back into groups of Length[mat]. *)
(* Both function applications use prefix @ (the character had been garbled to #). *)
Function[mat,
Partition[
Transpose[Prepend[Transpose[#], Range[Length[#]] + 99]] & @
Flatten[mat, 1], Length[mat]]] @
Table[{xcord,
ycord, (150*(Sin[((xcord - 90)*2*3.14)/
180]^2)*(Sin[((ycord - 45)*2*3.14)/180]^2)
) + 20
}, {xcord, 0, 200, 50}, {ycord, 0, 200, 50}
]
Create the rest of the table without the counter, create a suitable n*1 matrix of the index using Range, and then use MapThread with the inner function Join to put the two together.
Your finaltable could also be produced from modtable using Table as follows:
(* Deletes rows i + j (i = 5, 20, ...; j = 1..10) from modtable, keeping the first 5 rows of every 15. *)
(* Prefix applications use @ (the character had been garbled to #). *)
finaltableAlt = Delete[#, Transpose@{Flatten@Table[i + j, {i, 5, (
Length[#] - 10), 15}, {j, 10}]}] & @ modtable
Another possibility for numbering:
(* Numbers the rows 200, 201, ...: #2[[1]] is the row position, #1 the row itself. *)
(* Flatten is applied with prefix @ (the character had been garbled to #). *)
MapIndexed[Flatten@{#2[[1]] + 199, #1} &, finaltableAlt]