Can anyone see where I made a mistake here? I know the algorithm properly decrypts its own encrypted data; however, most of the encrypted output is not correct according to the test vectors in the RC6 paper.
// hexlify(string) turns a string into its hex representation: hexlify("AB") -> "4142"
// unhexlify(string) turns a string into its ASCII representation: unhexlify("4142") -> "AB"
// uint128_t is my own version of uint128, and I'm pretty sure the math is correct
// little_end(string, base) flips a string by bytes to get the little endian version of the string
// ROL/ROR(int, rotate x bits, bitsize of input int) does bitwise rotation
class RC6{
private:
unsigned int w, r, b, lgw;
std::vector <uint32_t> S;
uint128_t mod;
std::string mode;
void keygen(std::string KEY){
uint64_t p, q;
rc_pq(w, p, q);
KEY = hexlify(KEY);
unsigned int u = (unsigned int) ceil(w / 8.);
unsigned int c = (unsigned int) ceil(float(b) / u);
while ((KEY.size() >> 1) % u != 0)
KEY += zero;
std::vector <uint32_t> L;
for(unsigned int x = 0; x < c; x++)
L.push_back(toint(little_end(KEY.substr(2 * u * x, 2 * u), 16), 16));
S.push_back(p);
for(unsigned int i = 0; i < 2 * r + 3; i++)
S.push_back((S[i] + q) % mod);
uint32_t A = 0, B = 0, i = 0, j = 0;
uint32_t v = 3 * std::max(c, 2 * r + 4);
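// key-mixing loop: 3 * max(c, 2r + 4) passes over S and L, as in the RC6 key schedule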
for(unsigned int s = 1; s < v + 1; s++){
A = S[i] = ROL((S[i] + A + B) % mod, 3, w);
B = L[j] = ROL((L[j] + A + B) % mod, (A + B) % w, w);
i = (i + 1) % (2 * r + 4);
j = (j + 1) % c;
}
}
public:
RC6(std::string KEY, std::string MODE, unsigned int W = 32, unsigned int R = 20, unsigned int B = 16){
w = W;
r = R;
b = B;
mod = uint128_t(1) << w;
lgw = (unsigned int) log2(w);
mode = MODE;
keygen(KEY);
}
std::string run(std::string DATA){
DATA = hexlify(DATA);
uint32_t A = toint(little_end(DATA.substr(0, 8), 16), 16);
uint32_t B = toint(little_end(DATA.substr(8, 8), 16), 16);
uint32_t C = toint(little_end(DATA.substr(16, 8), 16), 16);
uint32_t D = toint(little_end(DATA.substr(24, 8), 16), 16);
if (mode == "e"){
B += S[0];
D += S[1];
for(unsigned int i = 1; i < r + 1; i++){
uint64_t t = ROL((uint64_t) ((B * (2 * B + 1)) % mod), lgw, w);
uint64_t u = ROL((uint64_t) ((D * (2 * D + 1)) % mod), lgw, w);
A = ROL(A ^ t, u % w, w) + S[2 * i];
C = ROL(C ^ u, t % w, w) + S[2 * i + 1];
uint64_t temp = A; A = B % mod; B = C % mod; C = D % mod; D = temp % mod;
}
A += S[2 * r + 2];
C += S[2 * r + 3];
}
else{
C -= S[2 * r + 3];
A -= S[2 * r + 2];
for(int i = r; i > 0; i--){
uint64_t temp = D; D = C % mod; C = B % mod; B = A % mod; A = temp % mod;
uint64_t u = ROL((uint64_t) ((D * (2 * D + 1)) % mod), lgw, w);
uint64_t t = ROL((uint64_t) ((B * (2 * B + 1)) % mod), lgw, w);
C = ROR((C - S[2 * i + 1]) % mod, t % w, w) ^ u;
A = ROR((A - S[2 * i]) % mod, u % w, w) ^ t;
}
D -= S[1];
B -= S[0];
}
w >>= 2;
return unhexlify(little_end(makehex(A % mod, w)) + little_end(makehex(B % mod, w)) + little_end(makehex(C % mod, w)) + little_end(makehex(D % mod, w)));
}
};
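For reference, this is roughly how I drive it against the vectors below; the hex strings are unhexlify'd first, since the class hexlifies its inputs internally (a sketch, exact plumbing omitted):
// illustrative driver, assuming the hexlify/unhexlify helpers described above are visible here
std::string key  = unhexlify("0123456789abcdef0112233445566778899aabbccddeeff0");  // 24-byte key
std::string data = unhexlify("02132435465768798a9bacbdcedfe0f1");

RC6 enc(key, "e");                        // b is left at its default of 16
std::string ct = hexlify(enc.run(data));  // should come out as 688329d019e505041e52e92af95291d4

RC6 dec(key, "d");
std::string pt = hexlify(dec.run(unhexlify(ct)));  // round-trips back to the plaintext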
Of these test vectors, only the first two are correct; the rest are not.
data = "00000000000000000000000000000000";
key = "00000000000000000000000000000000";
ciphertext = "8fc3a53656b1f778c129df4e9848a41e";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778";
ciphertext = "524e192f4715c6231f51f6367ea43f18";
data = "00000000000000000000000000000000";
key = "000000000000000000000000000000000000000000000000";
ciphertext = "6cd61bcb190b30384e8a3f168690ae82";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778899aabbccddeeff0";
ciphertext = "688329d019e505041e52e92af95291d4";
data = "00000000000000000000000000000000";
key = "0000000000000000000000000000000000000000000000000000000000000000";
ciphertext = "8f5fbd0510d15fa893fa3fda6e857ec2";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778899aabbccddeeff01032547698badcfe";
ciphertext = "c8241816f0d7e48920ad16a1674e5d48";
Did I mess up a uint somewhere? A wrong little-endian conversion?
I think I figured it out. Can anyone corroborate? I believe the errors come from leaving b = 16 by default, so the 24- and 32-byte keys get truncated. My hard drive is dead, or I would have tested this already.
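If that is the problem, the fix would simply be to derive b from the key itself instead of hard-coding 16, something like this (untested, for the reason above):
// hypothetical constructor tweak: take b from the key length so 24- and 32-byte keys aren't truncated
RC6(std::string KEY, std::string MODE, unsigned int W = 32, unsigned int R = 20){
    w = W;
    r = R;
    b = KEY.size();   // key length in bytes (16, 24, or 32 for the paper's vectors)
    mod = uint128_t(1) << w;
    lgw = (unsigned int) log2(w);
    mode = MODE;
    keygen(KEY);
}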
I'm currently writing a program for YUV420SP => RGB/BGR color space conversion, following the floating-point formula, without any SIMD or multi-threading optimization.
The function's input data is of type unsigned char, and the final result is also unsigned char. For the intermediate variables, the formula itself requires float (the expressions on the right of the =), but for the float => unsigned char conversion there are two choices: one is using float r, g, b, the other is int r, g, b:
unsigned char y = 223; // mock for getting y value
unsigned char u = 200; // mock for getting u value
unsigned char v = 200; // mock for getting v value
unsigned char* rgb0 = (unsigned char*)malloc(MAXN); // for saving the final result
// the YUV=>RGB color conversion
float r, g, b; // [!! choice1 !!] if using this line, code run slower
int r, g, b; // [!! choice2 !!] if using this line, code run much faster
y = std::max(16, (int)y_ptr0[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
What confuses me is that in the actual test (converting a 7680x4320 image), float r, g, b is much slower than int r, g, b, on both the Linux x86 and Android ARMv8 platforms.
The full code for the color conversion is:
#include <limits.h>
#include <math.h> // for round()
inline uchar saturate_uchar(int v)
{
return (uchar)((unsigned int)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
inline uchar saturate_uchar(float v)
{
int iv = round(v);
return saturate_uchar(iv);
}
template<int u_idx, int b_idx>
void yuv420sp2rgb_naive(
const uchar* y_plane, int height, int width, int y_linebytes,
const uchar* uv_plane, int uv_linebytes,
uchar* rgb, int rgb_linebytes,
const Option& opt
)
{
/// param checking
assert (y_plane!=NULL && uv_plane!=NULL && rgb!=NULL);
/// neon-specific param checking
assert (width>=2 && height>=2);
int w = width;
int h = height;
for (int i=0; i <= h-2; i+=2)
{
const unsigned char* y_ptr0 = y_plane + i * y_linebytes;
const unsigned char* y_ptr1 = y_ptr0 + y_linebytes;
unsigned char* rgb0 = rgb + i * rgb_linebytes;
unsigned char* rgb1 = rgb0+ rgb_linebytes;
const unsigned char* uv_ptr = uv_plane + (i/2) * uv_linebytes;
for (size_t j=0; j <= width-2; j += 2)
{
int y;
float r, g, b; // choice1
//int r, g, b; // choice2
// R = 1.164(Y - 16) + 1.596(V - 128)
// G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
// B = 1.164(Y - 16) + 2.018(U - 128)
int u = uv_ptr[u_idx];
int v = uv_ptr[1 - u_idx];
// y00
y = std::max(16, (int)y_ptr0[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
// y01
y = std::max(16, (int)y_ptr0[1]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
// y10
y = std::max(16, (int)y_ptr1[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb1[2-b_idx] = saturate_uchar(r);
rgb1[1] = saturate_uchar(g);
rgb1[b_idx] = saturate_uchar(b);
rgb1 += 3;
// y11
y = std::max(16, (int)y_ptr1[1]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb1[2-b_idx] = saturate_uchar(r);
rgb1[1] = saturate_uchar(g);
rgb1[b_idx] = saturate_uchar(b);
rgb1 += 3;
y_ptr0 += 2;
y_ptr1 += 2;
uv_ptr += 2;
}
}
}
platform     choice           time cost
linux x64    float r, g, b    140 ms
linux x64    int r, g, b      107 ms
armv8        float r, g, b    152 ms
armv8        int r, g, b      111 ms
Question: why does changing the type of r, g, b from float to int boost the speed so much?
I have an RGBA image in binary format (.raw) and I am trying to convert it to YCbCr using C++. However, the converted image, when viewed using ffplay, appears green. What could I be doing wrong? Here is code that reproduces the problem. The input image looks like this: https://drive.google.com/file/d/1oDswYmmSV0pfNe-u8Do06WWVu2v1B-rg/view?usp=sharing and a snapshot of the converted image is https://drive.google.com/file/d/1G8Rut3CXILqbmlGrFQsnushy2CLKu40w/view?usp=sharing. The input RGBA .raw image can be obtained here: https://drive.google.com/file/d/19JhMjRdibGCgaUsE6DBGAXGTRiT2bmTM/view?usp=sharing
#include <fstream>
#include <iostream>
#include <vector>
#include <array>
typedef unsigned char byte;
int main(){
std::ifstream infile;
std::ofstream outfile;
const unsigned width = 1280;
const unsigned height = 720;
std::vector<std::array<byte, 4>> imageBuffer;
std::vector<std::array<byte, 3>> output;
imageBuffer.resize(width*height);
output.resize(width*height);
infile.open("input.raw", std::ios::binary);
if(infile){
infile.read(reinterpret_cast<char*>(&imageBuffer[0]), width*height*4*sizeof(char));
}
for (unsigned y=0; y<height; ++y){
for(unsigned x=0; x<width; ++x){
byte R, G, B, A;
R = imageBuffer[y*width + x][0];
G = imageBuffer[y*width + x][1];
B = imageBuffer[y*width + x][2];
byte Y, Cb, Cr;
Y = 0.257*R + 0.504*G + 0.098*B + 16;
Cb = -0.148*R - 0.291*G + 0.439*B + 128;
Cr = 0.439*R - 0.368*G - 0.071*B + 128;
output[y*width + x][0] = Y;
output[y*width + x][1] = Cb;
output[y*width + x][2] = Cr;
}
}
std::ofstream os("output444.yuv", std::ios::binary);
if(!os)
return false;
os.write(reinterpret_cast<char*>(&output[0]), 1280*720*3*sizeof(char));}
Your code is fine for YUV_4_4_4 8-bit-Packed.
You can view it with YUView: https://github.com/IENT/YUView/releases
and select the matching settings (packed YUV 4:4:4, 8-bit, at the image's resolution). It will display just fine.
However, if you are seeing it green or with otherwise wrong colours, it means the program reading it is expecting a different format. Most likely it is expecting a planar format, which means you need to write all the Y bytes first, then all the Cb bytes, then all the Cr bytes.
So it'd look like (YCbCr_4_4_4_Planar):
YYYY
YYYY
YYYY
CbCbCbCb
CbCbCbCb
CrCrCrCr
CrCrCrCr
instead of packed which looks like (Your code above = YCbCr_4_4_4_Packed/Interleaved):
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
YCbCrYCbCrYCbCr
Below I wrote some code that can handle multiple formats. It takes a RAW image and converts it to any of:
YUV_4_2_2_PLANAR,
YUV_4_2_2_PACKED,
YUV_4_4_4_PLANAR,
YUV_4_4_4_PACKED,
//
// main.cpp
// RAW-To-YUV-Conversion
//
// Created by Brandon on 2021-08-06.
//
#include <iostream>
#include <fstream>
#include <utility>
#include <memory>
#include <vector>
void RGBToYUV(std::uint8_t R, std::uint8_t G, std::uint8_t B, std::uint8_t& Y, std::uint8_t& U, std::uint8_t& V)
{
Y = 0.257 * R + 0.504 * G + 0.098 * B + 16;
U = -0.148 * R - 0.291 * G + 0.439 * B + 128;
V = 0.439 * R - 0.368 * G - 0.071 * B + 128;
}
//void RGBToYUV(std::uint8_t R, std::uint8_t G, std::uint8_t B, std::uint8_t &Y, std::uint8_t &U, std::uint8_t &V)
//{
// #define RGB2Y(r, g, b) (uint8_t)(((66 * (r) + 129 * (g) + 25 * (b) + 128) >> 8) + 16)
// #define RGB2U(r, g, b) (uint8_t)(((-38 * (r) - 74 * (g) + 112 * (b) + 128) >> 8) + 128)
// #define RGB2V(r, g, b) (uint8_t)(((112 * (r) - 94 * (g) - 18 * (b) + 128) >> 8) + 128)
//
// Y = RGB2Y((int)R, (int)G, (int)B);
// U = RGB2U((int)R, (int)G, (int)B);
// V = RGB2V((int)R, (int)G, (int)B);
//}
enum Format
{
YUV_4_2_2_PLANAR,
YUV_4_2_2_PACKED,
YUV_4_4_4_PLANAR,
YUV_4_4_4_PACKED,
};
class RawImage
{
private:
std::unique_ptr<std::uint8_t[]> pixels; // array form, so the unique_ptr uses delete[]
std::uint32_t width, height;
std::uint16_t bpp;
public:
RawImage(const char* path, std::uint32_t width, std::uint32_t height);
~RawImage() {}
void SaveYUV(const char* path, Format format);
};
RawImage::RawImage(const char* path, std::uint32_t width, std::uint32_t height) : pixels(nullptr), width(width), height(height), bpp(32)
{
std::ifstream file(path, std::ios::in | std::ios::binary);
if (file)
{
std::size_t size = width * height * 4;
file.seekg(0, std::ios::beg);
pixels.reset(new std::uint8_t[size]);
file.read(reinterpret_cast<char*>(pixels.get()), size);
}
}
void RawImage::SaveYUV(const char* path, Format format)
{
std::ofstream file(path, std::ios::out | std::ios::binary);
if (file)
{
if (format == Format::YUV_4_2_2_PLANAR)
{
std::unique_ptr<std::uint8_t[]> y_plane{new std::uint8_t[width * height]};
std::unique_ptr<std::uint8_t[]> u_plane{new std::uint8_t[(width * height) >> 1]};
std::unique_ptr<std::uint8_t[]> v_plane{new std::uint8_t[(width * height) >> 1]};
std::uint8_t* in = pixels.get();
std::uint8_t* y_plane_ptr = y_plane.get();
std::uint8_t* u_plane_ptr = u_plane.get();
std::uint8_t* v_plane_ptr = v_plane.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; j += 2)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y1 = 0;
std::uint8_t U1 = 0;
std::uint8_t V1 = 0;
std::uint8_t Y2 = 0;
std::uint8_t U2 = 0;
std::uint8_t V2 = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y1, U1, V1);
RGBToYUV(in[in_pos + 4], in[in_pos + 5], in[in_pos + 6], Y2, U2, V2);
std::uint8_t U3 = (U1 + U2 + 1) >> 1;
std::uint8_t V3 = (V1 + V2 + 1) >> 1;
*y_plane_ptr++ = Y1;
*y_plane_ptr++ = Y2;
*u_plane_ptr++ = U3;
*v_plane_ptr++ = V3;
}
}
file.write(reinterpret_cast<char*>(y_plane.get()), width * height);
file.write(reinterpret_cast<char*>(u_plane.get()), (width * height) >> 1);
file.write(reinterpret_cast<char*>(v_plane.get()), (width * height) >> 1);
}
else if (format == Format::YUV_4_2_2_PACKED)
{
std::size_t size = width * height * 2;
std::unique_ptr<std::uint8_t[]> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; j += 2)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y1 = 0;
std::uint8_t U1 = 0;
std::uint8_t V1 = 0;
std::uint8_t Y2 = 0;
std::uint8_t U2 = 0;
std::uint8_t V2 = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y1, U1, V1);
RGBToYUV(in[in_pos + 4], in[in_pos + 5], in[in_pos + 6], Y2, U2, V2);
std::uint8_t U3 = (U1 + U2 + 1) >> 1;
std::uint8_t V3 = (V1 + V2 + 1) >> 1;
std::size_t out_pos = i * (width * 2) + 2 * j;
out[out_pos + 0] = Y1;
out[out_pos + 1] = U3;
out[out_pos + 2] = Y2;
out[out_pos + 3] = V3;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
else if (format == Format::YUV_4_4_4_PLANAR)
{
std::size_t size = width * height * 3;
std::unique_ptr<std::uint8_t[]> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; ++j)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y = 0;
std::uint8_t U = 0;
std::uint8_t V = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y, U, V);
std::size_t y_pos = i * width + j;
std::size_t u_pos = y_pos + (width * height);
std::size_t v_pos = y_pos + (width * height * 2);
out[y_pos] = Y;
out[u_pos] = U;
out[v_pos] = V;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
else if (format == Format::YUV_4_4_4_PACKED)
{
std::size_t size = width * height * 3;
std::unique_ptr<std::uint8_t[]> buffer{new std::uint8_t[size]};
std::uint8_t* in = pixels.get();
std::uint8_t* out = buffer.get();
for (std::uint32_t i = 0; i < height; ++i)
{
for (std::uint32_t j = 0; j < width; ++j)
{
std::uint32_t offset = 4;
std::size_t in_pos = i * (width * offset) + offset * j;
std::uint8_t Y = 0;
std::uint8_t U = 0;
std::uint8_t V = 0;
RGBToYUV(in[in_pos + 0], in[in_pos + 1], in[in_pos + 2], Y, U, V);
std::size_t out_pos = i * (width * 3) + 3 * j;
out[out_pos + 0] = Y;
out[out_pos + 1] = U;
out[out_pos + 2] = V;
}
}
file.write(reinterpret_cast<char*>(buffer.get()), size);
}
}
}
int main(int argc, const char * argv[]) {
RawImage img{"/Users/brandon/Downloads/input.raw", 1280, 720};
img.SaveYUV("/Users/brandon/Downloads/output.yuv", Format::YUV_4_4_4_PACKED);
return 0;
}
You are overwriting the same byte here:
output[y*width + x][0] = Y;
output[y*width + x][0] = Cb;
output[y*width + x][0] = Cr;
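Presumably the intent was for each component to go to its own channel, i.e.:
output[y*width + x][0] = Y;
output[y*width + x][1] = Cb;
output[y*width + x][2] = Cr;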
I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start. I checked a lot of resources on the web, but I didn't have much success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.
Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
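As an aside, one way to time each variant and sanity-check it against the reference is a small harness like the sketch below (BenchAndCheck is a made-up name, not part of the code being discussed):
#include <chrono>

// Sketch of a benchmarking helper: runs one implementation repeatedly, checks the interior
// pixels against a reference result, and returns the average time per call in µs.
// (The SSE variant can differ from the reference by 1 per channel due to its different
// rounding, hence the <= 1 tolerance.)
double BenchAndCheck(void (*fn)(const Mat &, Mat &), const Mat &image_in, const Mat &ref, int iters = 10)
{
    Mat out(image_in.rows, image_in.cols, image_in.type(), Scalar(0, 0, 0));
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i)
        fn(image_in, out);
    auto t1 = std::chrono::steady_clock::now();
    Rect interior(1, 1, image_in.cols - 2, image_in.rows - 2);
    CV_Assert(norm(ref(interior), out(interior), NORM_INF) <= 1);
    return std::chrono::duration<double, std::micro>(t1 - t0).count() / iters;
}
// usage (sketch): Mat ref = image.clone(); Mean_3_3_ref(image, ref);
//                 double us = BenchAndCheck(Mean_3_3_SSE, image, ref);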
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
{
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
const __m128i v = _mm_loadu_si128((__m128i *)(src + offset));
vh = _mm_unpacklo_epi8(v, _mm_setzero_si128());
vl = _mm_unpackhi_epi8(v, _mm_setzero_si128());
}
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
__m128i v = _mm_packus_epi16(vh, vl);
_mm_storeu_si128((__m128i *)(dest + offset), v);
}
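// ShiftL/ShiftR use PALIGNR to stitch two adjacent 8-lane (16-bit) column-sum vectors together,
// shifted by SHIFT lanes, giving the column sums of the pixel one position to the right/left
// (SHIFT = number of channels)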
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
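// _mm_mulhrs_epi16 computes round((a * b) / 32768), so multiplying the 16-bit sums by
// round(32768 / kScale) below divides each 9-pixel sum by 9 with rounding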
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
vsuml_1 = _mm_set1_epi16(0);
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
const int channels = image_in.channels();
switch (channels)
{
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
break;
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
break;
default:
throw("Unsupported format.");
break;
}
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased maintainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.
I'm attempting to get unit tests working with Meson, and am getting:
ϰ ninja
[4/4] Linking target test/crypto/crypto_tests.
FAILED: test/crypto/crypto_tests
clang++-6.0 -o test/crypto/crypto_tests 'test/crypto/test#crypto##crypto_tests#exe/.._.._src_platform_encoding_Endian.c.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._src_crypto_Sha1.c.o' 'test/crypto/test#crypto##crypto_tests#exe/Sha1Tests.cpp.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._subprojects_googletest-release-1.8.0_googletest_src_gtest-all.cc.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._subprojects_googletest-release-1.8.0_googletest_src_gtest_main.cc.o' -Wl,--no-undefined -Wl,--as-needed -pthread
test/crypto/test#crypto##crypto_tests#exe/Sha1Tests.cpp.o: In function `sha_one_hash_simple_Test::TestBody()':
/home/kfc/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:46: undefined reference to `br_sha1_init(br_sha1_context*)'
/home/kfc/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:47: undefined reference to `br_sha1_update(br_sha1_context*, void const*, unsigned long)'
/home/kfc/ecoan/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:48: undefined reference to `br_sha1_out(br_sha1_context const*, void*)'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
ninja: build stopped: subcommand failed.
However, I have declared those functions in Sha1.h:
#ifndef _CRYPTO_SHA1_H
#define _CRYPTO_SHA1_H
#include <stddef.h>
#include <string.h>
#include "../BuildSwitches.h"
#include "../types/BaseTypes.h"
/**
* Symbolic identifier for SHA-1.
*/
#define br_sha1_ID 2
/**
* SHA-1 output size (in bytes).
*/
#define br_sha1_SIZE 20
/**
* SHA-1 context.
*
* Fields are not supposed to be accessed by user code.
*/
typedef struct {
unsigned char buf[64];
UINT64 count;
UINT32 val[5];
} br_sha1_context;
/**
* SHA-1 context initialisation.
*
* This function initialises or resets a context for a new SHA-1
* computation.
*
* ctx: pointer to the context structure.
*/
void br_sha1_init(br_sha1_context *ctx);
/**
* Inject some data bytes in a running SHA-1 computation.
*
* The provided context is updated with some data bytes. If the number
* of bytes (`len`) is zero, then the data pointer (`data`) is ignored
* and may be `NULL`, and this function does nothing.
*
* ctx: pointer to the context structure.
* data: pointer to the injected data.
* len: injected data length (in bytes).
*/
void br_sha1_update(br_sha1_context *ctx, const void *data, size_t len);
/**
* Compute SHA-1 output.
*
* The SHA-1 output for the concatenation of all bytes injected in the
* provided context since the last initialisation or reset call, is
* computed and written in the buffer pointed to by `out`. The context
* itself is not modified, so extra bytes may be injected afterwards
* to continue that computation.
*
* ctx: pointer to the context structure.
* out: destination buffer for the hash output.
*/
void br_sha1_out(const br_sha1_context *ctx, void *out);
/**
* Save SHA-1 running state.
*
* The running state for SHA-1 (output of the last internal block
* processing) is written in the buffer pointed to by `out`. The
* number of bytes injected since the last initialisation or reset
* call is returned. The context is not modified.
*
* ctx: pointer to the context structure.
* out: destination buffer for the running state.
*
* returns: the injected total byte length.
*/
UINT64 br_sha1_state(const br_sha1_context *ctx, void *out);
/**
* Restore SHA-1 running state.
*
* The running state for SHA-1 is set to the provided values.
*
* ctx: pointer to the context structure.
* stb: source buffer for the running state.
* count: the injected total byte length.
*/
void br_sha1_set_state(br_sha1_context *ctx, const void *stb, UINT64 count);
void br_sha1_round(const unsigned char *buf, UINT32 *val);
extern const UINT32 br_sha1_IV[];
#endif // _CRYPTO_SHA1_H
They are defined, with the same signatures, in Sha1.c:
#include "../platform/encoding/Endian.h"
#include "./Sha1.h"
#define F(B, C, D) ((((C) ^ (D)) & (B)) ^ (D))
#define G(B, C, D) ((B) ^ (C) ^ (D))
#define H(B, C, D) (((D) & (C)) | (((D) | (C)) & (B)))
#define I(B, C, D) G(B, C, D)
#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define K1 ((UINT32)0x5A827999)
#define K2 ((UINT32)0x6ED9EBA1)
#define K3 ((UINT32)0x8F1BBCDC)
#define K4 ((UINT32)0xCA62C1D6)
const UINT32 br_sha1_IV[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
void br_sha1_round(const unsigned char *buf, UINT32 *val) {
UINT32 m[80];
UINT32 a, b, c, d, e;
int i;
a = val[0];
b = val[1];
c = val[2];
d = val[3];
e = val[4];
br_range_dec32be(m, 16, buf);
for (i = 16; i < 80; i ++) {
UINT32 x = m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16];
m[i] = ROTL(x, 1);
}
for (i = 0; i < 20; i += 5) {
e += ROTL(a, 5) + F(b, c, d) + K1 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + F(a, b, c) + K1 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + F(e, a, b) + K1 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + F(d, e, a) + K1 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + F(c, d, e) + K1 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 20; i < 40; i += 5) {
e += ROTL(a, 5) + G(b, c, d) + K2 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + G(a, b, c) + K2 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + G(e, a, b) + K2 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + G(d, e, a) + K2 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + G(c, d, e) + K2 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 40; i < 60; i += 5) {
e += ROTL(a, 5) + H(b, c, d) + K3 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + H(a, b, c) + K3 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + H(e, a, b) + K3 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + H(d, e, a) + K3 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + H(c, d, e) + K3 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 60; i < 80; i += 5) {
e += ROTL(a, 5) + I(b, c, d) + K4 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + I(a, b, c) + K4 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + I(e, a, b) + K4 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + I(d, e, a) + K4 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + I(c, d, e) + K4 + m[i + 4]; c = ROTL(c, 30);
}
val[0] += a;
val[1] += b;
val[2] += c;
val[3] += d;
val[4] += e;
}
void br_sha1_init(br_sha1_context *ctx) {
memcpy(ctx->val, br_sha1_IV, sizeof ctx->val);
ctx->count = 0;
}
void br_sha1_update(br_sha1_context *cc, const void *data, size_t len) {
const unsigned char *buf;
size_t ptr;
buf = (const unsigned char *) data;
ptr = (size_t)cc->count & 63;
while (len > 0) {
size_t clen;
clen = 64 - ptr;
if (clen > len) {
clen = len;
}
memcpy(cc->buf + ptr, buf, clen);
ptr += clen;
buf += clen;
len -= clen;
cc->count += (UINT64)clen;
if (ptr == 64) {
br_sha1_round(cc->buf, cc->val);
ptr = 0;
}
}
}
void br_sha1_out(const br_sha1_context *cc, void *dst) {
unsigned char buf[64];
UINT32 val[5];
size_t ptr;
ptr = (size_t)cc->count & 63;
memcpy(buf, cc->buf, ptr);
memcpy(val, cc->val, sizeof val);
buf[ptr ++] = 0x80;
if (ptr > 56) {
memset(buf + ptr, 0, 64 - ptr);
br_sha1_round(buf, val);
memset(buf, 0, 56);
} else {
memset(buf + ptr, 0, 56 - ptr);
}
br_enc64be(buf + 56, cc->count << 3);
br_sha1_round(buf, val);
br_range_enc32be(dst, val, 5);
}
UINT64 br_sha1_state(const br_sha1_context *cc, void *dst) {
br_range_enc32be(dst, cc->val, 5);
return cc->count;
}
void br_sha1_set_state(br_sha1_context *cc, const void *stb, UINT64 count) {
br_range_dec32be(cc->val, 5, stb);
cc->count = count;
}
I then have a test file that attempts to test the SHA-1 functions:
#include <gtest/gtest.h>
#include "../../src/crypto/Sha1.h"
static size_t hextobin(unsigned char *dst, const char *src) {
size_t num;
unsigned acc;
int z;
num = 0;
z = 0;
acc = 0;
while (*src != 0) {
int c = *src ++;
if (c >= '0' && c <= '9') {
c -= '0';
} else if (c >= 'A' && c <= 'F') {
c -= ('A' - 10);
} else if (c >= 'a' && c <= 'f') {
c -= ('a' - 10);
} else {
continue;
}
if (z) {
*dst ++ = (acc << 4) + c;
num ++;
} else {
acc = c;
}
z = !z;
}
return num;
}
TEST(sha_one, hash_simple) {
unsigned char ref[br_sha1_SIZE];
hextobin(ref, (const char *) "a9993e364706816aba3e25717850c26c9cd0d89d");
unsigned char res[br_sha1_SIZE];
const char *data = (const char *)"abc";
br_sha1_context mc;
size_t n;
n = strlen(data);
br_sha1_init(&mc);
br_sha1_update(&mc, data, n);
br_sha1_out(&mc, res);
ASSERT_EQ(res, ref);
}
However, when using a meson.build file like:
crypto_srcs = [
'../../src/platform/encoding/Endian.c',
'../../src/crypto/Sha1.c',
'Sha1Tests.cpp',
]
e = executable('crypto_tests', sources : crypto_srcs, dependencies : gtest_dep)
test('crypto tests', e)
I'm getting the above error saying it can't resolve the references to those functions (yet it seems to resolve the struct defined in that header file just fine, which is extra confusing to me). So I then viewed the symbols in the object file itself to see whether the functions were actually defined:
ϰ nm ./builddir/src/src##magmatpm#sta/crypto_Sha1.c.o
0000000000000dc0 t br_enc32be
0000000000000ce0 t br_enc64be
U br_range_dec32be
U br_range_enc32be
0000000000000a80 T br_sha1_init
0000000000000000 R br_sha1_IV
0000000000000ba0 T br_sha1_out
0000000000000000 T br_sha1_round
0000000000000d70 T br_sha1_set_state
0000000000000d30 T br_sha1_state
0000000000000ac0 T br_sha1_update
U memcpy
U memset
This shows the functions defined inside the object file, so I'm a bit confused about how I'm getting this error at all.
As I noted in a comment:
You compiled the code in Sha1.c with a C compiler; the names you show aren't the mangled names that the C++ compiler wants. You need to compile the code with the C++ compiler (rename the source if need be), or you need to tell the C++ code that the functions are defined extern "C".
One way to do the latter is to use:
extern "C" {
#include "Sha1.h"
}
in the C++ code that uses the C code.
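Another common way to do it (shown here as a sketch to adapt to your header) is to put the guard inside Sha1.h itself, so every C++ translation unit that includes it gets the C linkage automatically:
/* In Sha1.h: give the declarations C linkage when the header is included from C++ */
#ifdef __cplusplus
extern "C" {
#endif

void br_sha1_init(br_sha1_context *ctx);
void br_sha1_update(br_sha1_context *ctx, const void *data, size_t len);
void br_sha1_out(const br_sha1_context *ctx, void *out);
/* ... the remaining br_sha1_* declarations ... */

#ifdef __cplusplus
}
#endif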
I am trying to make an integer-factoring calculator that runs on a CUDA device; below is first the sequential version and then my attempt at a parallel version.
It runs without error, but for some reason it does not give the result back. I have been trying to get this to work for two weeks now, but can't find the error!
Serial version
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int f(int x, int c, int n);
int gcd(unsigned int u, unsigned int v);
int main ()
{
clock_t start = clock();
srand ( time(NULL) );
int x = 1;
int y = 2;
int d = 1;
int c = rand() % 100;
int n = 323;
if(n % y == 0)
d = y;
while(d == 1)
{
x = f(x, c, n);
y = f(f(y, c, n), c, n);
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
if(d == n)
{
printf("\nd == n");
c = 0;
while(c == 0 || c == -2)
c = rand() % 100;
x = 2;
y = 2;
}
}
int d2 = n/d;
printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);
printf("\nResult: %d", d);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}
int f(int x, int c, int n)
{
return (int)(pow((float)x, 2) + c) % n;
}
int gcd(unsigned int u, unsigned int v){
int shift;
/* GCD(0, x) := x */
if (u == 0 || v == 0)
return u | v;
/* Let shift := lg K, where K is the greatest power of 2
   dividing both u and v. */
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
/* From here on, u is always odd. */
do {
while ((v & 1) == 0) /* Loop X */
v >>= 1;
/* Now u and v are both odd, so diff(u, v) is even.
   Let u = min(u, v), v = diff(u, v)/2. */
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
Parallel version
#define threads 512
#define MaxBlocks 65535
#define RunningTheads (512*100)
__device__ int gcd(unsigned int u, unsigned int v)
{
int shift;
if (u == 0 || v == 0)
return u | v;
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
do {
while ((v & 1) == 0)
v >>= 1;
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
__device__ bool cuda_found;
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
int index = blockIdx.x * threads + threadIdx.x;
int x = 1;
int y = 2;
int d = 4;
int c = cArray[index];
while(d == 1 && !cuda_found)
{
x = (int)(pow((float)x, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
}
if(d != 1 && !cuda_found)
{
cuda_found = true;
outr = &d;
}
}
int main ()
{
int n = 323;
int cArray[RunningTheads];
cArray[0] = 1;
for(int i = 1; i < RunningTheads-1; i++)
{
cArray[i] = i+2;
}
int dresult = 0;
int *dev_cArray;
int *dev_result;
HANDLE_ERROR(cudaMalloc((void**)&dev_cArray, RunningTheads*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_cArray, cArray, RunningTheads*sizeof(int), cudaMemcpyHostToDevice));
int TotalBlocks = ceil((float)RunningTheads/(float)threads);
if(TotalBlocks > MaxBlocks)
TotalBlocks = MaxBlocks;
printf("Blocks: %d\n", TotalBlocks);
printf("Threads: %d\n\n", threads);
cudaKernal<<<TotalBlocks,threads>>>(dev_cArray, n, dev_result);
HANDLE_ERROR(cudaMemcpy(&dresult, dev_result, sizeof(int), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(dev_cArray));
HANDLE_ERROR(cudaFree(dev_result));
if(dresult == 0)
dresult = 1;
int d2 = n/dresult;
printf("\nResult: %d", dresult);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}
Let's have a look at your kernel code:
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
int index = blockIdx.x * threads + threadIdx.x;
int x = 1;
int y = 2;
int d = 4;
int c = cArray[index];
while(d == 1 && !cuda_found) // always false because d is always 4
{
x = (int)(pow((float)x, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n); // never writes to d because the loop won't
// be executed
}
if(d != 1 && !cuda_found) // may be true if cuda_found was initialized
// with false
{
cuda_found = true; // Memory race here.
outr = &d; // you are changing the address that outr
// points to; the host code does not see this
// change. your cudaMemcpy dev -> host will copy
// back exactly the same values that were
// uploaded by cudaMemcpy host -> dev.
// if you want *outr to receive d, then write:
// *outr = d;
}
}
One of the problems is that you don't return the result. In your code you just change outr, which has local scope in your kernel function (i.e. the change is not seen outside the function). You should write *outr = d; to change the value of the memory that outr points to.
Also, I'm not sure whether CUDA initializes __device__ global variables to zero. Are you sure cuda_found always starts out as false?
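A minimal sketch of both fixes (explicitly initialising the flag from the host, and storing the value rather than the address in the kernel); cudaMemcpyToSymbol is the standard way to set a __device__ variable from host code:
// host side, before the kernel launch: make sure the __device__ flag starts out false
bool h_found = false;
HANDLE_ERROR(cudaMemcpyToSymbol(cuda_found, &h_found, sizeof(bool)));

// device side, at the end of the kernel: write through the pointer instead of reseating it
if(d != 1 && !cuda_found)
{
    cuda_found = true;  // still racy, but every thread that reaches this point has a valid divisor
    *outr = d;          // the cudaMemcpy back to the host now sees the result
}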