I'm attempting to set up unit tests with Meson, and am getting:
ϰ ninja
[4/4] Linking target test/crypto/crypto_tests.
FAILED: test/crypto/crypto_tests
clang++-6.0 -o test/crypto/crypto_tests 'test/crypto/test#crypto##crypto_tests#exe/.._.._src_platform_encoding_Endian.c.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._src_crypto_Sha1.c.o' 'test/crypto/test#crypto##crypto_tests#exe/Sha1Tests.cpp.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._subprojects_googletest-release-1.8.0_googletest_src_gtest-all.cc.o' 'test/crypto/test#crypto##crypto_tests#exe/.._.._subprojects_googletest-release-1.8.0_googletest_src_gtest_main.cc.o' -Wl,--no-undefined -Wl,--as-needed -pthread
test/crypto/test#crypto##crypto_tests#exe/Sha1Tests.cpp.o: In function `sha_one_hash_simple_Test::TestBody()':
/home/kfc/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:46: undefined reference to `br_sha1_init(br_sha1_context*)'
/home/kfc/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:47: undefined reference to `br_sha1_update(br_sha1_context*, void const*, unsigned long)'
/home/kfc/molten/magma/builddir/../test/crypto/Sha1Tests.cpp:48: undefined reference to `br_sha1_out(br_sha1_context const*, void*)'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
ninja: build stopped: subcommand failed.
However, I have declared those functions in Sha1.h:
#ifndef _CRYPTO_SHA1_H
#define _CRYPTO_SHA1_H
#include <stddef.h>
#include <string.h>
#include "../BuildSwitches.h"
#include "../types/BaseTypes.h"
/**
* Symbolic identifier for SHA-1.
*/
#define br_sha1_ID 2
/**
* SHA-1 output size (in bytes).
*/
#define br_sha1_SIZE 20
/**
* SHA-1 context.
*
* Fields are not supposed to be accessed by user code.
*/
typedef struct {
unsigned char buf[64];
UINT64 count;
UINT32 val[5];
} br_sha1_context;
/**
* SHA-1 context initialisation.
*
* This function initialises or resets a context for a new SHA-1
* computation.
*
* ctx: pointer to the context structure.
*/
void br_sha1_init(br_sha1_context *ctx);
/**
* Inject some data bytes in a running SHA-1 computation.
*
* The provided context is updated with some data bytes. If the number
* of bytes (`len`) is zero, then the data pointer (`data`) is ignored
* and may be `NULL`, and this function does nothing.
*
* ctx: pointer to the context structure.
* data: pointer to the injected data.
* len: injected data length (in bytes).
*/
void br_sha1_update(br_sha1_context *ctx, const void *data, size_t len);
/**
* Compute SHA-1 output.
*
* The SHA-1 output for the concatenation of all bytes injected in the
* provided context since the last initialisation or reset call, is
* computed and written in the buffer pointed to by `out`. The context
* itself is not modified, so extra bytes may be injected afterwards
* to continue that computation.
*
* ctx: pointer to the context structure.
* out: destination buffer for the hash output.
*/
void br_sha1_out(const br_sha1_context *ctx, void *out);
/**
* Save SHA-1 running state.
*
* The running state for SHA-1 (output of the last internal block
* processing) is written in the buffer pointed to by `out`. The
* number of bytes injected since the last initialisation or reset
* call is returned. The context is not modified.
*
* ctx: pointer to the context structure.
* out: destination buffer for the running state.
*
* returns: the injected total byte length.
*/
UINT64 br_sha1_state(const br_sha1_context *ctx, void *out);
/**
* Restore SHA-1 running state.
*
* The running state for SHA-1 is set to the provided values.
*
* ctx: pointer to the context structure.
* stb: source buffer for the running state.
* count: the injected total byte length.
*/
void br_sha1_set_state(br_sha1_context *ctx, const void *stb, UINT64 count);
void br_sha1_round(const unsigned char *buf, UINT32 *val);
extern const UINT32 br_sha1_IV[];
#endif // _CRYPTO_SHA1_H
The matching definitions are in Sha1.c:
#include "../platform/encoding/Endian.h"
#include "./Sha1.h"
#define F(B, C, D) ((((C) ^ (D)) & (B)) ^ (D))
#define G(B, C, D) ((B) ^ (C) ^ (D))
#define H(B, C, D) (((D) & (C)) | (((D) | (C)) & (B)))
#define I(B, C, D) G(B, C, D)
#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define K1 ((UINT32)0x5A827999)
#define K2 ((UINT32)0x6ED9EBA1)
#define K3 ((UINT32)0x8F1BBCDC)
#define K4 ((UINT32)0xCA62C1D6)
const UINT32 br_sha1_IV[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
void br_sha1_round(const unsigned char *buf, UINT32 *val) {
UINT32 m[80];
UINT32 a, b, c, d, e;
int i;
a = val[0];
b = val[1];
c = val[2];
d = val[3];
e = val[4];
br_range_dec32be(m, 16, buf);
for (i = 16; i < 80; i ++) {
UINT32 x = m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16];
m[i] = ROTL(x, 1);
}
for (i = 0; i < 20; i += 5) {
e += ROTL(a, 5) + F(b, c, d) + K1 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + F(a, b, c) + K1 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + F(e, a, b) + K1 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + F(d, e, a) + K1 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + F(c, d, e) + K1 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 20; i < 40; i += 5) {
e += ROTL(a, 5) + G(b, c, d) + K2 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + G(a, b, c) + K2 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + G(e, a, b) + K2 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + G(d, e, a) + K2 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + G(c, d, e) + K2 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 40; i < 60; i += 5) {
e += ROTL(a, 5) + H(b, c, d) + K3 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + H(a, b, c) + K3 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + H(e, a, b) + K3 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + H(d, e, a) + K3 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + H(c, d, e) + K3 + m[i + 4]; c = ROTL(c, 30);
}
for (i = 60; i < 80; i += 5) {
e += ROTL(a, 5) + I(b, c, d) + K4 + m[i + 0]; b = ROTL(b, 30);
d += ROTL(e, 5) + I(a, b, c) + K4 + m[i + 1]; a = ROTL(a, 30);
c += ROTL(d, 5) + I(e, a, b) + K4 + m[i + 2]; e = ROTL(e, 30);
b += ROTL(c, 5) + I(d, e, a) + K4 + m[i + 3]; d = ROTL(d, 30);
a += ROTL(b, 5) + I(c, d, e) + K4 + m[i + 4]; c = ROTL(c, 30);
}
val[0] += a;
val[1] += b;
val[2] += c;
val[3] += d;
val[4] += e;
}
void br_sha1_init(br_sha1_context *ctx) {
memcpy(ctx->val, br_sha1_IV, sizeof ctx->val);
ctx->count = 0;
}
void br_sha1_update(br_sha1_context *cc, const void *data, size_t len) {
const unsigned char *buf;
size_t ptr;
buf = (const unsigned char *) data;
ptr = (size_t)cc->count & 63;
while (len > 0) {
size_t clen;
clen = 64 - ptr;
if (clen > len) {
clen = len;
}
memcpy(cc->buf + ptr, buf, clen);
ptr += clen;
buf += clen;
len -= clen;
cc->count += (UINT64)clen;
if (ptr == 64) {
br_sha1_round(cc->buf, cc->val);
ptr = 0;
}
}
}
void br_sha1_out(const br_sha1_context *cc, void *dst) {
unsigned char buf[64];
UINT32 val[5];
size_t ptr;
ptr = (size_t)cc->count & 63;
memcpy(buf, cc->buf, ptr);
memcpy(val, cc->val, sizeof val);
buf[ptr ++] = 0x80;
if (ptr > 56) {
memset(buf + ptr, 0, 64 - ptr);
br_sha1_round(buf, val);
memset(buf, 0, 56);
} else {
memset(buf + ptr, 0, 56 - ptr);
}
br_enc64be(buf + 56, cc->count << 3);
br_sha1_round(buf, val);
br_range_enc32be(dst, val, 5);
}
UINT64 br_sha1_state(const br_sha1_context *cc, void *dst) {
br_range_enc32be(dst, cc->val, 5);
return cc->count;
}
void br_sha1_set_state(br_sha1_context *cc, const void *stb, UINT64 count) {
br_range_dec32be(cc->val, 5, stb);
cc->count = count;
}
I then have a test file that exercises the SHA-1 functions:
#include <gtest/gtest.h>
#include "../../src/crypto/Sha1.h"
static size_t hextobin(unsigned char *dst, const char *src) {
size_t num;
unsigned acc;
int z;
num = 0;
z = 0;
acc = 0;
while (*src != 0) {
int c = *src ++;
if (c >= '0' && c <= '9') {
c -= '0';
} else if (c >= 'A' && c <= 'F') {
c -= ('A' - 10);
} else if (c >= 'a' && c <= 'f') {
c -= ('a' - 10);
} else {
continue;
}
if (z) {
*dst ++ = (acc << 4) + c;
num ++;
} else {
acc = c;
}
z = !z;
}
return num;
}
TEST(sha_one, hash_simple) {
unsigned char ref[br_sha1_SIZE];
hextobin(ref, (const char *) "a9993e364706816aba3e25717850c26c9cd0d89d");
unsigned char res[br_sha1_SIZE];
const char *data = (const char *)"abc";
br_sha1_context mc;
size_t n;
n = strlen(data);
br_sha1_init(&mc);
br_sha1_update(&mc, data, n);
br_sha1_out(&mc, res);
ASSERT_EQ(0, memcmp(res, ref, br_sha1_SIZE)); // compare buffer contents, not array addresses
}
However, when using a meson.build file like:
crypto_srcs = [
'../../src/platform/encoding/Endian.c',
'../../src/crypto/Sha1.c',
'Sha1Tests.cpp',
]
e = executable('crypto_tests', sources : crypto_srcs, dependencies : gtest_dep)
test('crypto tests', e)
I'm getting the above error saying it can't find those functions (yet it seems to resolve the struct defined in that same header just fine, which is extra confusing to me). So I then inspected the symbols in the object file to see whether the functions were actually defined:
ϰ nm ./builddir/src/src##magmatpm#sta/crypto_Sha1.c.o
0000000000000dc0 t br_enc32be
0000000000000ce0 t br_enc64be
U br_range_dec32be
U br_range_enc32be
0000000000000a80 T br_sha1_init
0000000000000000 R br_sha1_IV
0000000000000ba0 T br_sha1_out
0000000000000000 T br_sha1_round
0000000000000d70 T br_sha1_set_state
0000000000000d30 T br_sha1_state
0000000000000ac0 T br_sha1_update
U memcpy
U memset
This shows the functions defined inside the object file, so I'm a bit confused about how I'm getting this error at all.
As I noted in a comment:
You compiled the code in Sha1.c with a C compiler; the names you show aren't the mangled names that the C++ compiler wants. You need to compile the code with the C++ compiler (rename the source if need be), or you need to tell the C++ code that the functions are defined extern "C".
One way to do the latter is to use:
extern "C" {
#include "Sha1.h"
}
in the C++ code that uses the C code.
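Another conventional option is to make the header itself C++-aware, so every consumer (C or C++) gets the right linkage automatically. A minimal sketch of the guard added to Sha1.h (all but one declaration elided for brevity):
#ifdef __cplusplus
extern "C" {
#endif
void br_sha1_init(br_sha1_context *ctx);
/* ... the remaining br_sha1_* declarations ... */
#ifdef __cplusplus
}
#endif
With the guard inside the header, the C++ test file keeps its plain #include and the C sources are unaffected, because the extern "C" block is invisible to the C compiler.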
Related
I'm currently writing a program for YUV420SP => RGB/BGR color space conversion, following the floating-point formula, without any SIMD or multi-threading optimization.
The function's input data is unsigned char, and the final result is also unsigned char. The formula itself requires float for the expressions on the right of the =, but for the float => unsigned char conversion there are two choices for the intermediate variables: one is float r, g, b, the other is int r, g, b:
unsigned char y = 223; // mock for getting y value
unsigned char u = 200; // mock for getting u value
unsigned char v = 200; // mock for getting v value
unsigned char* rgb0 = (unsigned char*)malloc(MAXN); // for finally result saving
// the YUV=>RGB color conversion
float r, g, b; // [!! choice1 !!] if using this line, code run slower
int r, g, b; // [!! choice2 !!] if using this line, code run much faster
y = std::max(16, (int)y_ptr0[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
What confuses me is that, in an actual test (converting a 7680x4320 image), float r, g, b is much slower than int r, g, b, on both Linux x86 and Android ARMv8 platforms.
The full code for the color conversion is:
#include <limits.h>
#include <cmath>     // round
#include <cassert>   // assert
#include <algorithm> // std::max
typedef unsigned char uchar;
struct Option {}; // stand-in for the project's option type, elided in the question
inline uchar saturate_uchar(int v)
{
return (uchar)((unsigned int)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
inline uchar saturate_uchar(float v)
{
int iv = (int)std::round(v);
return saturate_uchar(iv);
}
template<int u_idx, int b_idx>
void yuv420sp2rgb_naive(
const uchar* y_plane, int height, int width, int y_linebytes,
const uchar* uv_plane, int uv_linebytes,
uchar* rgb, int rgb_linebytes,
const Option& opt
)
{
/// param checking
assert (y_plane!=NULL && uv_plane!=NULL && rgb!=NULL);
/// neon-specific param checking
assert (width>=2 && height>=2);
int w = width;
int h = height;
for (int i=0; i <= h-2; i+=2)
{
const unsigned char* y_ptr0 = y_plane + i * y_linebytes;
const unsigned char* y_ptr1 = y_ptr0 + y_linebytes;
unsigned char* rgb0 = rgb + i * rgb_linebytes;
unsigned char* rgb1 = rgb0+ rgb_linebytes;
const unsigned char* uv_ptr = uv_plane + (i/2) * uv_linebytes;
for (size_t j=0; j <= width-2; j += 2)
{
int y;
float r, g, b; // choice1
//int r, g, b; // choice2
// R = 1.164(Y - 16) + 1.596(V - 128)
// G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
// B = 1.164(Y - 16) + 2.018(U - 128)
int u = uv_ptr[u_idx];
int v = uv_ptr[1 - u_idx];
// y00
y = std::max(16, (int)y_ptr0[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
// y01
y = std::max(16, (int)y_ptr0[1]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb0[2-b_idx] = saturate_uchar(r);
rgb0[1] = saturate_uchar(g);
rgb0[b_idx] = saturate_uchar(b);
rgb0 += 3;
// y10
y = std::max(16, (int)y_ptr1[0]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb1[2-b_idx] = saturate_uchar(r);
rgb1[1] = saturate_uchar(g);
rgb1[b_idx] = saturate_uchar(b);
rgb1 += 3;
// y11
y = std::max(16, (int)y_ptr1[1]);
r = 1.164 * (y - 16) + 1.596 * (v - 128);
g = 1.164 * (y - 16) - 0.813 * (v - 128) - 0.391 * (u - 128);
b = 1.164 * (y - 16) + 2.018 * (u - 128);
rgb1[2-b_idx] = saturate_uchar(r);
rgb1[1] = saturate_uchar(g);
rgb1[b_idx] = saturate_uchar(b);
rgb1 += 3;
y_ptr0 += 2;
y_ptr1 += 2;
uv_ptr += 2;
}
}
}
platform  | choice        | time cost
linux x64 | float r, g, b | 140 ms
linux x64 | int r, g, b   | 107 ms
armv8     | float r, g, b | 152 ms
armv8     | int r, g, b   | 111 ms
Question: why does changing the type of r, g, b from float to int boost speed so much?
I need to convert an image from BGR to YUV420p, and I first used OpenCV to do so.
Mat img = imread("1.bmp");
Mat yuvImg;
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);
The result of it is normal. However, my image is too big: its resolution is almost 6400 * 2000.
I find that converting BGR to YUV420p with the OpenCV API cvtColor costs too much time.
So I decided to do the conversion myself and speed it up with CUDA.
Here is the CPU code:
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
I tested bgr_to_yuv420p(...) and the result is also normal.
Then I sped it up with CUDA.
Here is all my code, including the kernel function and the test function.
#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
int main(void)
{
Mat srcImage = imread("1.bmp");
imshow("srcImage", srcImage);
const uint imgheight = srcImage.rows;
const uint imgwidth = srcImage.cols;
Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));
//input and output
uchar3 *d_in;
unsigned char *d_out;
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, srcImage.data, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(nv12Image.data, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
imshow("nv12",nv12Image);
imwrite("cuda.bmp",nv12Image);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
The CUDA code runs, but the result is not normal. The Y plane of the YUV420p output is fine, but something is wrong with U and V. I think the problem is here in __global__ void bgr2yuv420p(...):
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I have tried a lot but still cannot solve it. I have also found little code about converting RGB to YUV420p; most code is about converting YUV420p to RGB. Has anyone run into the same problem, or can anyone give me some advice?
Thanks, Robert Crovella. Here is my update 1.
I followed Robert Crovella's advice and changed the kernel function like this:
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
}
}
I tested the new kernel with excitement, but the result is still not normal.
Here is my result image with the updated kernel function.
yuv420p image converted by myself
Then the normal result image converted by opencv api is here.
yuv420p image converted by opencv api
As we can see, the difference between the two images is in U and V. I have already changed the indexing of U and V in the kernel function, i.e.
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;
}
I thought it would work, but it does not. Any other advice, Robert Crovella?
Edit: The solution is Robert Crovella's latest answer. I have double-checked it and it is really perfect.
There are a variety of issues:
the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical. Yes, this matters.
Your CPU code uses planar Y,U,V storage: Y has its own plane, U has its own plane, and V has its own plane. Your GPU code uses the semi-planar (NV12) format: Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the output of those two codes could never match identically.
IMO, there is no need to drag OpenCV into this.
Your UV offset calculation in the kernel (GPU) code was broken. The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point on, it is not correct to use row_num*imgwidth to index by row into the UV planar region; the UV region has only half as many rows.
In your GPU kernel, you had the U,V ordering reversed; you were effectively writing VUVUVUVU...
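To make the storage difference concrete, here is a sketch of how one 4x2 frame lays out in memory in each format (illustrative only):
/* I420 (planar), 4x2 frame:       Y Y Y Y Y Y Y Y | U U | V V   three separate planes  */
/* NV12 (semi-planar), same frame: Y Y Y Y Y Y Y Y | U V U V     Y + interleaved UV     */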
My recommendation would be to start by harmonizing the calculation differences and storage order/format. The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:
$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we
// are using the same conversion calculations in host and device
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++)
if (odata[i] != cdata[i]) {
std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl;
return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Any time you are having trouble with CUDA code, I recommend:
1. Proper CUDA error checking
2. Running your code with cuda-memcheck
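For reference, a minimal error-checking helper in the spirit of that recommendation might look like this (the macro name is illustrative, not from any SDK; needs <cstdio> and <cstdlib>):
#define CHECK_CUDA(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(1); \
    } } while (0)
Wrapping each cudaMalloc/cudaMemcpy call in CHECK_CUDA(), and checking cudaGetLastError() after each kernel launch, surfaces failures at the call site instead of as corrupted output later.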
EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}
void bgr_to_yuv420sp(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int yIndex = 0;
int uIndex = frameSize;
int R, G, B;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
yuv420p[yIndex++] = bgr2y(R,G,B);
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex] = bgr2u(R,G,B);
yuv420p[uIndex+1] = bgr2v(R,G,B);
uIndex+=2;
}
}
}
}
void bgr_to_yuv420p(unsigned char* yuv420p, unsigned char* bgr, int width, int height)
{
if (yuv420p == NULL || bgr== NULL)
return;
int frameSize = width*height;
int chromaSize = frameSize / 4;
int yIndex = 0;
int uIndex = frameSize;
int vIndex = frameSize + chromaSize;
int R, G, B, Y, U, V;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
B = bgr[(i * width + j) * 3 + 0];
G = bgr[(i * width + j) * 3 + 1];
R = bgr[(i * width + j) * 3 + 2];
//BGR to YUV
Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
if (i % 2 == 0 && j % 2 == 0)
{
yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
}
}
}
}
//kernel function to convert bgr to yuv420sp
__global__ void bgr2yuv420sp(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
d_out[uv_offset] = bgr2u(r,g,b);
d_out[uv_offset+1] = bgr2v(r,g,b);
}
}
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 * d_in, unsigned char * d_out,
uint imgheight, uint imgwidth)
{
int col_num = blockIdx.x*blockDim.x+threadIdx.x;
int row_num = blockIdx.y*blockDim.y+threadIdx.y;
if ((row_num < imgheight) && (col_num < imgwidth))
{
// uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
int global_offset = row_num*imgwidth+col_num;
int r,g,b;
r = int(d_in[global_offset].z);
g = int (d_in[global_offset].y);
b = int (d_in[global_offset].x);
d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
if(((threadIdx.x & 1) == 0) && ((threadIdx.y & 1) == 0)){
int u_offset = imgwidth*imgheight+((row_num>>1)*(imgwidth>>1))+(col_num>>1);
d_out[u_offset] = bgr2u(r,g,b);
int v_offset = u_offset+((imgheight>>1)*(imgwidth>>1));
d_out[v_offset] = bgr2v(r,g,b);
}
}
}
int main(void)
{
const uint imgheight = 1000;
const uint imgwidth = 1500;
//input and output
uchar3 *d_in;
unsigned char *d_out;
uchar3 *idata = new uchar3[imgheight*imgwidth];
unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
uchar3 pix;
for (int i = 0; i < imgheight*imgwidth; i++){
pix.x = (rand()%30)+40;
pix.y = (rand()%30)+40;
pix.z = (rand()%30)+40;
idata[i] = pix;}
bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
// malloc memo in gpu
cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);
//copy image from cpu to gpu
cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
//run kernel function
bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
cudaDeviceSynchronize();
//copy yuv420p from gpu to cpu
cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
for (int i = 0; i < (imgwidth*imgheight*3/2); i++)
if (odata[i] != cdata[i]) {
std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl;
return 0;}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.
I'm writing a C++ webcam viewer using video4linux. I need RGB24 output (interleaved R8G8B8) for display. I'm able to get video input from almost all low-resolution webcams using YUYV, GREY8 or RGB24. But I also need input from high-resolution webcams, which use MJPEG compression when a high framerate is needed.
I'm able to get the MJPEG stream using V4L2_PIX_FMT_MJPEG as the pixel format, but the received framebuffer is compressed.
How can I quickly convert it to RGB24?
Can I use libjpeg for this?
The quickest solution I've found is decode_jpeg_raw from mjpegtools, which decodes JPEG data to planar YUV420. The conversion from YUV420 to RGB24 is then done by this function:
inline int clip(int value) {
return (value > 255) ? 255 : (value < 0) ? 0 : value;
}
static void yuv420_to_rgb24(
/* luminance (source) */const uint8_t* const y
/* u chrominance (source) */, const uint8_t* u
/* v chrominance (source) */, const uint8_t* v
/* rgb interleaved (destination) */, uint8_t* const dst
/* destination buffer size: width*height*3 */, int const size
/* image width */, int const width) {
const int lineSize = width * 3;
uint8_t* r1 = dst;
uint8_t* g1 = r1 + 1;
uint8_t* b1 = r1 + 2;
uint8_t* r2 = r1 + lineSize;
uint8_t* g2 = r2 + 1;
uint8_t* b2 = r2 + 2;
const uint8_t* y1 = y;
const uint8_t* y2 = y + width;
uint8_t* const end = r1 + size;
int c1 = 0;
int c2 = 0;
int e = 0;
int d = 0;
while (r1 != end) {
uint8_t* const lineEnd = r2;
/* line by line */
while (r1 != lineEnd) {
/* first pixel */
c1 = *y1 - 16;
c2 = *y2 - 16;
d = *u - 128;
e = *v - 128;
*r1 = clip(c1 + ((454 * e) >> 8));
*g1 = clip(c1 - ((88 * e + 183 * d) >> 8));
*b1 = clip(c1 + ((359 * d) >> 8));
*r2 = clip(c2 + ((454 * e) >> 8));
*g2 = clip(c2 - ((88 * e + 183 * d) >> 8));
*b2 = clip(c2 + ((359 * d) >> 8));
r1 += 3;
g1 += 3;
b1 += 3;
r2 += 3;
g2 += 3;
b2 += 3;
++y1;
++y2;
/* second pixel */
c1 = *y1 - 16;
c2 = *y2 - 16;
d = *u - 128;
e = *v - 128;
*r1 = clip(c1 + ((454 * e) >> 8));
*g1 = clip(c1 - ((88 * e + 183 * d) >> 8));
*b1 = clip(c1 + ((359 * d) >> 8));
*r2 = clip(c2 + ((454 * e) >> 8));
*g2 = clip(c2 - ((88 * e + 183 * d) >> 8));
*b2 = clip(c2 + ((359 * d) >> 8));
r1 += 3;
g1 += 3;
b1 += 3;
r2 += 3;
g2 += 3;
b2 += 3;
++y1;
++y2;
++u;
++v;
}
r1 += lineSize;
g1 += lineSize;
b1 += lineSize;
r2 += lineSize;
g2 += lineSize;
b2 += lineSize;
y1 += width;
y2 += width;
}
}
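For reference, a hypothetical call site, assuming the decoder produced one contiguous planar YUV420 buffer yuv420 for a width x height frame (the buffer layout and names here are my assumptions, not the mjpegtools API; needs <vector> and <cstdint>):
const uint8_t* yp = yuv420;                           // Y plane: width*height bytes
const uint8_t* up = yp + width * height;              // U plane: (width/2)*(height/2) bytes
const uint8_t* vp = up + (width / 2) * (height / 2);  // V plane: same size as U
std::vector<uint8_t> rgb(width * height * 3);
yuv420_to_rgb24(yp, up, vp, rgb.data(), width * height * 3, width);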
Yes, you can use libjpeg for this, but usually the output of libjpeg is in YUV420 or YUV422.
You might instead use this code: http://mxhaard.free.fr/spca50x/Download/gspcav1-20071224.tar.gz (check the decoder source; there's a small JPEG decoder that works well and handles color conversion directly, so the output is in RGB888).
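If you do go the libjpeg route, a minimal sketch of decompressing one frame straight to RGB24 could look like this (jpeg_mem_src needs libjpeg 8+ or libjpeg-turbo; error handling is left to the default handler, which exits on fatal errors):
#include <stdio.h>
#include <jpeglib.h>
/* rgb must have room for output_width * output_height * 3 bytes */
int frame_to_rgb24(const unsigned char *jpg, unsigned long jpg_len, unsigned char *rgb)
{
    struct jpeg_decompress_struct cinfo;
    struct jpeg_error_mgr jerr;
    cinfo.err = jpeg_std_error(&jerr);
    jpeg_create_decompress(&cinfo);
    jpeg_mem_src(&cinfo, (unsigned char *)jpg, jpg_len);
    if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK)
        return -1;
    cinfo.out_color_space = JCS_RGB;  /* let libjpeg do the YUV->RGB conversion */
    jpeg_start_decompress(&cinfo);
    while (cinfo.output_scanline < cinfo.output_height) {
        JSAMPROW row = rgb + cinfo.output_scanline * cinfo.output_width * 3;
        jpeg_read_scanlines(&cinfo, &row, 1);
    }
    jpeg_finish_decompress(&cinfo);
    jpeg_destroy_decompress(&cinfo);
    return 0;
}
Note that many webcam MJPEG frames omit the Huffman tables (the DHT segment), so you may need to splice in the standard tables before handing a frame to libjpeg.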
Can anyone see where I made a mistake here? I know that the algorithm will properly decrypt the data it encrypts; however, most of the encrypted output is not correct according to the RC6 paper.
// hexlify(string) turns a string into its hex representation: hexlify("AB") -> "4142"
// unhexlify(string) turns a string into its ASCII representation: unhexlify("4142") -> "AB"
// uint128_t is my own version of uint128, and Im pretty sure that the math is correct
// little_end(string, base) flips a string by bytes to get the little endian version of the string
// ROL/ROR(int, rotate x bits, bitsize of input int) does bitwise rotation
class RC6{
private:
unsigned int w, r, b, lgw;
std::vector <uint32_t> S;
uint128_t mod;
std::string mode;
void keygen(std::string KEY){
uint64_t p, q;
rc_pq(w, p, q);
KEY = hexlify(KEY);
unsigned int u = (unsigned int) ceil(w / 8.);
unsigned int c = (unsigned int) ceil(float(b) / u);
while ((KEY.size() >> 1) % u != 0)
KEY += zero;
std::vector <uint32_t> L;
for(unsigned int x = 0; x < c; x++)
L.push_back(toint(little_end(KEY.substr(2 * u * x, 2 * u), 16), 16));
S.push_back(p);
for(unsigned int i = 0; i < 2 * r + 3; i++)
S.push_back((S[i] + q) % mod);
uint32_t A = 0, B = 0, i = 0, j = 0;
uint32_t v = 3 * std::max(c, 2 * r + 4);
for(unsigned int s = 1; s < v + 1; s++){
A = S[i] = ROL((S[i] + A + B) % mod, 3, w);
B = L[j] = ROL((L[j] + A + B) % mod, (A + B) % w, w);
i = (i + 1) % (2 * r + 4);
j = (j + 1) % c;
}
}
public:
RC6(std::string KEY, std::string MODE, unsigned int W = 32, unsigned int R = 20, unsigned int B = 16){
w = W;
r = R;
b = B;
mod = uint128_t(1) << w;
lgw = (unsigned int) log2(w);
mode = MODE;
keygen(KEY);
}
std::string run(std::string DATA){
DATA = hexlify(DATA);
uint32_t A = toint(little_end(DATA.substr(0, 8), 16), 16);
uint32_t B = toint(little_end(DATA.substr(8, 8), 16), 16);
uint32_t C = toint(little_end(DATA.substr(16, 8), 16), 16);
uint32_t D = toint(little_end(DATA.substr(24, 8), 16), 16);
if (mode == "e"){
B += S[0];
D += S[1];
for(unsigned int i = 1; i < r + 1; i++){
uint64_t t = ROL((uint64_t) ((B * (2 * B + 1)) % mod), lgw, w);
uint64_t u = ROL((uint64_t) ((D * (2 * D + 1)) % mod), lgw, w);
A = ROL(A ^ t, u % w, w) + S[2 * i];
C = ROL(C ^ u, t % w, w) + S[2 * i + 1];
uint64_t temp = A; A = B % mod; B = C % mod; C = D % mod; D = temp % mod;
}
A += S[2 * r + 2];
C += S[2 * r + 3];
}
else{
C -= S[2 * r + 3];
A -= S[2 * r + 2];
for(int i = r; i > 0; i--){
uint64_t temp = D; D = C % mod; C = B % mod; B = A % mod; A = temp % mod;
uint64_t u = ROL((uint64_t) ((D * (2 * D + 1)) % mod), lgw, w);
uint64_t t = ROL((uint64_t) ((B * (2 * B + 1)) % mod), lgw, w);
C = ROR((C - S[2 * i + 1]) % mod, t % w, w) ^ u;
A = ROR((A - S[2 * i]) % mod, u % w, w) ^ t;
}
D -= S[1];
B -= S[0];
}
w >>= 2;
return unhexlify(little_end(makehex(A % mod, w)) + little_end(makehex(B % mod, w)) + little_end(makehex(C % mod, w)) + little_end(makehex(D % mod, w)));
}
};
Of these test vectors, only the first two are correct; the rest are not:
data = "00000000000000000000000000000000";
key = "00000000000000000000000000000000";
ciphertext = "8fc3a53656b1f778c129df4e9848a41e";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778";
ciphertext = "524e192f4715c6231f51f6367ea43f18";
data = "00000000000000000000000000000000";
key = "000000000000000000000000000000000000000000000000";
ciphertext = "6cd61bcb190b30384e8a3f168690ae82";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778899aabbccddeeff0";
ciphertext = "688329d019e505041e52e92af95291d4";
data = "00000000000000000000000000000000";
key = "0000000000000000000000000000000000000000000000000000000000000000";
ciphertext = "8f5fbd0510d15fa893fa3fda6e857ec2";
data = "02132435465768798a9bacbdcedfe0f1";
key = "0123456789abcdef0112233445566778899aabbccddeeff01032547698badcfe";
ciphertext = "c8241816f0d7e48920ad16a1674e5d48";
Did I mess up a uint somewhere? A wrong little-endian conversion?
I think I figured it out; can anyone corroborate? I think that because I set b = 16 by default, I'm causing the errors. My hard drive is dead, or I would have tested this already.
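That hypothesis fits the failing vectors: they are exactly the ones with 24- and 32-byte keys, which a hard-coded b = 16 would truncate in the key schedule. A minimal sketch of the fix, assuming KEY holds the raw key bytes, is to derive b from the key itself:
// sketch: drop the B = 16 default and take the length from the key,
// so c = ceil(b / u) covers the whole 24- or 32-byte key
RC6(std::string KEY, std::string MODE, unsigned int W = 32, unsigned int R = 20){
    w = W;
    r = R;
    b = KEY.size();
    mod = uint128_t(1) << w;
    lgw = (unsigned int) log2(w);
    mode = MODE;
    keygen(KEY);
}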
I'm working with a very restrictive embedded processor, which has only 128 bytes of RAM. I'd like to implement SHA-1 on it. RFC 3174 describes, in 'method 2', a way of implementing SHA-1 that doesn't require allocating an array of 80 32-bit words (which, at 320 bytes, is obviously not practical), and it seems like it ought to be usable on my processor. I'm unable to find any implementations of 'method 2', though, and the sample code in the RFC only implements the default method.
Is anyone aware of a memory-efficient implementation of SHA1 in C or C++?
You should be able to quickly adapt the method 1 source to method 2. The function to change is SHA1ProcessMessageBlock() in method 1. Initialize W[0:15] from the message, then loop from 0 to 79, where you only do the W[] manipulation from iteration 16 onward, and the temp calculation depends on t's value (0-19 uses one formula, 20-39 uses another, etc.). The important thing to remember is to use index % 16 or index & 0x0f whenever you are addressing the W[] array.
A quick modification would be something like this (double-check all accesses to W to make sure I haven't missed a t & 0x0f):
void SHA1ProcessMessageBlock(SHA1Context *context)
{
const uint32_t K[] = { /* Constants defined in SHA-1 */
0x5A827999,
0x6ED9EBA1,
0x8F1BBCDC,
0xCA62C1D6
};
int t; /* Loop counter */
uint32_t temp; /* Temporary word value */
uint32_t W[16]; /* Word sequence */
uint32_t A, B, C, D, E; /* Word buffers */
/*
* Initialize the first 16 words in the array W. You can move this to your
* context.
*/
for(t = 0; t < 16; t++)
{
W[t] = context->Message_Block[t * 4] << 24;
W[t] |= context->Message_Block[t * 4 + 1] << 16;
W[t] |= context->Message_Block[t * 4 + 2] << 8;
W[t] |= context->Message_Block[t * 4 + 3];
}
A = context->Intermediate_Hash[0];
B = context->Intermediate_Hash[1];
C = context->Intermediate_Hash[2];
D = context->Intermediate_Hash[3];
E = context->Intermediate_Hash[4];
for(t = 0; t < 80; t++) {
if (t >= 16) {
W[t&0xf] = SHA1CircularShift(1,W[(t-3)&0xf] ^ W[(t-8)&0xf] ^ W[(t-14)&0xf] ^ W[t&0xf]);
}
if (t<20) {
temp = SHA1CircularShift(5,A) +
((B & C) | ((~B) & D)) + E + W[t&0xf] + K[0];
}
else if (t<40) {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[1];
}
else if (t < 60) {
temp = SHA1CircularShift(5,A) +
((B & C) | (B & D) | (C & D)) + E + W[t&0xf] + K[2];
}
else {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[3];
}
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
context->Intermediate_Hash[0] += A;
context->Intermediate_Hash[1] += B;
context->Intermediate_Hash[2] += C;
context->Intermediate_Hash[3] += D;
context->Intermediate_Hash[4] += E;
context->Message_Block_Index = 0;
}
There are still savings to be made: get rid of the W[] array on the stack and put it in the context, pre-initialized with the data you receive.
Also, you need a lot of pre-processing before calling this function. For example, if all your messages are less than 55 bytes, you can put them in the W array, add the padding, and process immediately. If not, you'll have to call the process function twice: first with your partially padded input, and again with the rest of the pad, etc. That sort of thing is very application specific, and I doubt you'll be able to find code that does it for you.
By the way, the code above is a straight adaptation of the method 1 source from your link. You can probably squeeze a bit more out of it if you try to optimize it further.
I couldn't think of a way to get any savings on the intermediate hash, so you will need a total of 108 bytes for this (109 if the counter is also in RAM), 24 of which are local to this function and can be reused elsewhere, so long as they are also temporary. So it is very hard to do what you want to do.
EDIT: If all your messages are less than 55 bytes, you can save another 20 bytes in your context by getting rid of the intermediate_hash[] storage. Simply initialize A-E from the constants, and add the constants at the end. Finally, instead of storing them in a separate variable, overwrite your input when this function ends.
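A sketch of that single-block variant (my reading of the suggestion; valid only when the whole padded message fits in one 64-byte block):
/* no Intermediate_Hash[] in the context: start A..E from the SHA-1
   constants and add the same constants back at the end */
A = 0x67452301; B = 0xEFCDAB89; C = 0x98BADCFE;
D = 0x10325476; E = 0xC3D2E1F0;
/* ... the 80-round loop exactly as above ... */
hash[0] = 0x67452301 + A;
hash[1] = 0xEFCDAB89 + B;
hash[2] = 0x98BADCFE + C;
hash[3] = 0x10325476 + D;
hash[4] = 0xC3D2E1F0 + E;
where hash[] can overwrite the input buffer, as the answer suggests.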
I have implemented SHA-1 for several memory-constrained environments. You can get by with
DWORD W[16] ; // instead of W[80]
DWORD H[5] ; // Intermediate hash value
DWORD BitCount[2] ; // Probably a single DWORD is enough here
plus a few bytes of housekeeping. W is updated on the fly, as a circular buffer, instead of being generated at the start of each round.
A working example:
#include<iostream>
#include<stdio.h>
#include<stdlib.h>
#include<string>
using namespace std;
unsigned CircularShift(int bits, unsigned word)
{
return ((word << bits) & 0xFFFFFFFF) | ((word & 0xFFFFFFFF) >> (32-bits));
}
int main(void)
{
string mess;
cin >> mess; // single-block example: assumes the message fits in one padded block (<= 55 bytes)
unsigned int lm = mess.length();
unsigned int lmb = lm*8;
unsigned char *messc;
messc=(unsigned char*)malloc((sizeof(unsigned char))*64);
for (unsigned short int i =0;i<64;i++)
{
messc[i]=char(0x00);
}
for(int i=0;i<mess.length();i++)
{
messc[i]=mess[i];
}
messc[lm]=(unsigned char)128;
/* bytes 56..59 hold the high word of the 64-bit bit count;
   they are already zero here because the message fits in one block */
messc[60] = (lmb >> 24) & 0xFF;
messc[61] = (lmb >> 16) & 0xFF;
messc[62] = (lmb >> 8) & 0xFF;
messc[63] = (lmb) & 0xFF;
for(int i =0 ;i<64;i++)
{
cout<< hex << (int)messc[i] << " ";
}
unsigned *H;
H=(unsigned*)malloc(5*sizeof(unsigned));
H[0] = 0x67452301;
H[1] = 0xEFCDAB89;
H[2] = 0x98BADCFE;
H[3] = 0x10325476;
H[4] = 0xC3D2E1F0;
const unsigned K[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6};
int t;
unsigned temp;
unsigned *W;
unsigned A, B, C, D, E;
W=(unsigned*)malloc(80*sizeof(unsigned));
for(t = 0; t < 16; t++)
{
W[t] = ((unsigned) messc[t * 4])<< 24; ;
W[t] |= ((unsigned) messc[t * 4 + 1])<< 16;
W[t] |= ((unsigned) messc[t * 4 + 2]) << 8;
W[t] |= ((unsigned) messc[t * 4 + 3]);
}
for(t = 16; t < 80; t++)
{
W[t] = CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
}
A = H[0];
B = H[1];
C = H[2];
D = H[3];
E = H[4];
for(t = 0; t < 20; t++)
{
temp = CircularShift(5,A) + ((B & C) | ((~B) & D)) + E + W[t] + K[0];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 20; t < 40; t++)
{
temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 40; t < 60; t++)
{
temp = CircularShift(5,A) +
((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
for(t = 60; t < 80; t++)
{
temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = CircularShift(30,B);
B = A;
A = temp;
}
H[0] = (H[0] + A) & 0xFFFFFFFF;
H[1] = (H[1] + B) & 0xFFFFFFFF;
H[2] = (H[2] + C) & 0xFFFFFFFF;
H[3] = (H[3] + D) & 0xFFFFFFFF;
H[4] = (H[4] + E) & 0xFFFFFFFF;
cout <<"\nTHIS IS SHHHHHAAAAAAAAAAA\n";
for(int i=0;i<5;i++)
{
cout << hex << H[i] << " ";
}
free(messc);
free(H);
free(W);
return 0;
}
All things considered, looking at your requirements, I think you are going to have to change your specs. Either a bigger chip, or a simpler algorithm. Even implementing SHA-1 (without HMAC) would be a challenge, but it should be doable.