Find the idle GPU in a multi-GPU machine - c++
I have the following code running under CUDA (Windows 10, VS 2015).
//Code for running on one or two GPUs
const unsigned __int64 MemOutputSize = (1i64 << 25) * 64; //2GB
int deviceCount;
cudaGetDeviceCount(&deviceCount);
unsigned long long* dBuffer[2];
for (int dev = 0; dev < deviceCount; dev++) {
    cudaSetDevice(dev);                        //select the device before allocating on it
    cudaMalloc(&dBuffer[dev], MemOutputSize);
    cudaMemset(dBuffer[dev], 0, MemOutputSize);
}
for (int i = 0; i < (1024*1024*1024); i++) {
    int dev = i % deviceCount;
    cudaSetDevice(dev);
    runKernel<<<NUM_BLOCKS, NUM_THREADS>>>(i, dBuffer[dev]);
}
//Copy data from GPU buffers to main mem
//Merge buffers into one.
It does not matter which GPU runs which part of the work.
Normally I would run the code on the fastest GPU, but because there are two GPUs I can run it on both, in principle doubling my speed.
However, when I run the code the slower GPU runs at 100% while the faster one runs at only 67%, which means I only get a speedup of 2 x 67% = 133%. I want both GPUs to stay much closer to 100% utilization all the time.
I get these percentages from Task Manager (Performance tab, compute_0), and no, Firefox or other GPU-using processes are not running.
Is there a way to select the currently idle GPU in the for loop?
That would let me send work to the fast GPU roughly 60% of the time and to the slower one 40% of the time, increasing my speedup to about 158%.
As per the cuda tag, I'm only concerned with NVIDIA cards (a dual GTX 980 setup, in fact).
The full (working) code is here:
// System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <conio.h>
#include <chrono>
#include <ctime>
#include <iostream>
#include <fstream>
#include <windows.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions & utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#define NUM_BLOCKS 2048 //2^11
#define NUM_THREADS 128 //2^7 49-11-7 = 31
const unsigned __int64 MemOutputSize = (1i64 << 25) * 64; // (__int64)(2 * 1024 * 1024 * 1024);
//const unsigned __int64 MemOutputSize = 0_40000000; //(1 * 1024 * 1024 * 1024);
/************************************************************************************************************/
/************************* Build the lookup table *************************************/
/************************* *************************************/
/************************************************************************************************************/
__global__ static void SevenToFive(const unsigned __int64 input, void* doutput) {
const unsigned int NRegs = 16;
unsigned __int64 Y[NRegs];
// A cell looks like this:
// BCD 123
// AxA 405
// BCD 678
// we're using half-adder logic to store the 1, 2 and 4's count in 3 bitplanes.
const unsigned __int64 MaskR2 = 0x0303030303030303; //Keep the rightmost 2 rows.
const unsigned __int64 MaskR1 = 0x0101010101010101; //Keep the rightmost row.
const unsigned __int64 MaskL6 = (0xFFFFFFFFFFFFFFFF & (~MaskR2));
const unsigned __int64 MaskL7 = (0xFFFFFFFFFFFFFFFF & (~MaskR1));
//const unsigned __int64 AllOn = 0xFFFFFFFFFFFFFFFF;
const unsigned __int64 Mask5x5 = 0x007C7C7C7C7C0000; //Both masks use the Q layout, because the input is transformed
const unsigned __int64 Mask3x3 = 0x0000383838000000; //to Q in the Y[0] register.
//const unsigned __int64 Mask6x4 = 0x00007E7E7E7E0000; //Both masks use the Q layout, because the input is transformed
//const unsigned __int64 Mask4x2 = 0x0000003C3C000000; //to Q in the Y[0] register.
Y[14] = threadIdx.x;//*/ Y[14] = 127; /*debug*/ 7 bits
Y[13] = blockIdx.x;//*/ Y[13] = 2047; /*debug*/ 11 bits
//__int64 input2 = -1;
//Put 7x7 input into Y15.
//The top line (least significant) and left most line will be empty (lsb of every line).
//when doing non-overlapping or's always use xor to make any errors stand out.
//3+7+7+7+7 = 31 bits
Y[15] = (input & 7) ^ (((input >> 3) & 127) << 4) ^ (((input >> 10) & 127) << 12) ^ (((input >> 17) & 127) << 20) ^ (((input >> 24) & 127) << 28); //^ (((input >> 31) & 7) << 36);
//Y[15] = (input2 & 7) ^ (((input2 >> 3) & 127) << 4) ^ (((input2 >> 10) & 127) << 12) ^ (((input2 >> 17) & 127) << 20) ^ (((input2 >> 24) & 127) << 28); //^ (((input2 >> 31) & 7) << 36);
Y[15] = Y[15] << (20);
//Y[15] = 0;
//31+7 = 38 bits
Y[15] = Y[15] ^ Y[14]; //threadIdx.x;
//38 + 7 + 4 = 49 bits. This makes a total of 2^49
Y[15] = Y[15] ^ ((Y[13] & 127) << 8) ^ (((Y[13] >> 7) & 15) << 16); //blockIdx.x
//Y[15] = 0x070702000 >> 1; //Test with a glider traveling south
//Y[15] = 0x01c0000; //Test with a blinker
//Y[15] = 0x00c0800; //Test with a preblock
//Y[15] = AllOn;//*input;
Y[1] = 0;
Y[4] = 0;
Y[3] = 0;
///*debug*/Y[15] = 0x7F007F087F007F;//#######;0000000,#######,000#000,#######,0000000,####### : 4 lines with a little ward in the middle.
//Y[04] = (Y[15] << 7) & 0x8080808080808080;
//Y[15] = (Y[15] >> 1) & 0x7F7F7F7F7F7F7F7F;
//Y[3] = (Y[3] >> (64 - 16)); //vpsrldq xmm3,xmm3,16-4 //keep the bottom 2 rows of NW & shift them to the top
Y[6] = (Y[1] >> (64 - 8)); //vpsrldq xmm6, xmm1, 16 - 2 //N5 keep the bottom 1 rows of N & shift them to the top.
Y[1] = (Y[1] >> (64 - 16)); //vpsrldq xmm1,xmm1,16-4 //N3 keep the bottom 2 rows of N & shift them to the top.
Y[2] = ((Y[4] >> 6) & MaskR2); //vpsrlw xmm2,xmm4,14 //W6 keep the 2 rightmost columns of W
//Y[3] = ((Y[3] >> 6) & MaskR2); //vpsrlw xmm3,xmm3,14 //NW1 keep the 2 rightmost columns of NW
Y[5] = (Y[15] << 16); //vpslldq xmm5,xmm15,4 //main3 remove the bottom 2 rows from main
Y[7] = (Y[15] << 8); //vpslldq xmm7,xmm15,2 //main5 remove the bottom 1 row from main
/*D3*/Y[14] = (Y[1] ^ Y[5]); //vpxor xmm14, xmm1, xmm5 //***** ymm14 3 - D 2 rows N +14 rows main
/*A5*/Y[13] = (Y[7] ^ Y[6]); //vpxor xmm13, xmm7, xmm6 //***** ymm13 5 - A' 1 row N +15 rows main
//We are now done with N, ymm1 and ymm6
Y[1] = ((Y[2] >> 1) & MaskR1); //vpsrlw xmm1,xmm2,1 //W7 remove an extra column from W
Y[7] = ((Y[15] << 1) & MaskL7); // //main7 Shift main right
Y[8] = ((Y[13] << 1) & MaskL7); // //main0+N0 Shift main+N1 right
Y[9] = ((Y[14] << 1) & MaskL7); // //main2+N2 Shift mainn+N2 right
/*C7*/Y[12] = (Y[7] ^ Y[1]); // //***** ymm12 7 - C Main7+W7
Y[7] = ((Y[7] << 1) & MaskL7); // //main6 Shift main right
/*B6*/Y[11] = (Y[7] ^ Y[2]); // //***** ymm11 6 - B' Main6+W6
Y[10] = (Y[11] << 8); // //main4+W4 Shift Main6W6 down
Y[7] = (Y[3] >> 8); // //NW4 Shift NW1 up (only one row)
Y[6] = ((Y[6] << 2) & MaskL6); // //N4 Shift N3 right
Y[10] = (Y[10] ^ Y[7]); // //main4+W4+NW4
/*A4*/Y[10] = (Y[10] ^ Y[6]); // //***** ymm10 4 - A
Y[1] = (Y[1] << 8); // //W0 Shift W7 down 1 row
Y[7] = ((Y[7] >> 1) & MaskR1); // //NW0 Shift NW4 left (keep only 1 pixel)
Y[0] = (Y[8] ^ Y[1]); // //main0+N0+W0
/*X0*/Y[0] = (Y[0] ^ Y[7]); // //***** ymm0 0 - x
Y[1] = (Y[2] << 16); //W1 Shift W down 2 rows
Y[8] = ((Y[9] << 1) & MaskL6); //main1+N1 Shift Main2N2 right 1 column
Y[8] = (Y[8] ^ Y[1]); //main1+N1+W1 Combine with W
/*B18*/Y[8] = (Y[8] ^ Y[3]); //**** ymm8 1 - B Combine with the original NW
Y[7] = ((Y[1] >> 1) & MaskR1); //W2 Shift W1 left 1 column
Y[5] = ((Y[3] >> 1) & MaskR1); //NW2 Shift the original NW left 1 column
Y[1] = (Y[7] ^ Y[5]); //W2+NW2 combine w2 & NW2
/*C2*/Y[9] = (Y[1] ^ Y[9]); //**** ymm9 2 - C' main2+N2+W2+NW2
//Count the 1's & 2's
Y[1] = (Y[12] ^ Y[9]); //1's count of c
Y[2] = (Y[12] & Y[9]); //2's count of c
Y[3] = (Y[10] ^ Y[13]); //1's count of a
Y[4] = (Y[10] & Y[13]); //2's count of a
Y[5] = (Y[8] ^ Y[11]); //1's count of b
Y[6] = (Y[8] & Y[11]); //2's count of b
Y[7] = (Y[14] ^ Y[15]); //1's count of d
Y[8] = (Y[14] & Y[15]); //2's count of d
//Add the 1's together
Y[10] = (Y[1] & Y[3]); //2's count of CA
Y[1] = (Y[1] ^ Y[3]); //combined ones of CA
Y[12] = (Y[5] & Y[7]); //2's count of BD
Y[5] = (Y[5] ^ Y[7]); //combined ones of BD
Y[14] = (Y[1] & Y[5]); //2's count of CABD
Y[1] = (Y[1] ^ Y[5]); //final count of the 1's
//now we need to add all the 2's together.
Y[3] = (Y[2] & Y[4]); //4's count of ca
Y[2] = (Y[2] ^ Y[4]); //2's count of ca
Y[5] = (Y[6] & Y[8]); //4's count of bd
Y[6] = (Y[6] ^ Y[8]); //2's count of bd
Y[7] = (Y[10] & Y[12]); //4's count of CABD
Y[8] = (Y[10] ^ Y[12]); //2's count of CABD
Y[9] = (Y[2] & Y[6]); //4's count of cabd
Y[4] = (Y[2] ^ Y[6]); //2's count of cabd
Y[11] = (Y[8] & Y[14]); //4's count of CABD+abcd
Y[12] = (Y[8] ^ Y[14]); //2's count of CABD+abcd
//add all 4's
Y[15] = (Y[3] | Y[5]); //Saturated add of the 4's
Y[13] = (Y[7] | Y[9]);
Y[14] = (Y[11] | Y[15]);
//add the 2's
Y[2] = (Y[12] ^ Y[4]);
//final add
Y[4] = (Y[14] | Y[13]);
//now we have all the counts.
Y[14] = (Y[0] & Y[2]); //All 2's stay the same
Y[3] = (Y[2] & Y[1]); //Y[3] hold's the 3 neighbors; i.e. the new births
Y[14] = (Y[14] | Y[3]); //The same + births = new pattern
Y[15] = (Y[14] & (~Y[4])); //but subtract the 4+ neighbors
//Now extract the 5x5 resulting block as well as the 3x3 input block
Y[6] = (Y[15] & Mask5x5); //get the output 5x5
Y[4] = (Y[0] & Mask3x3); //and the input 3x3
//Translate the 5x5 block into a linear number.
//Mask5x5 = 0x003E3E3E3E3E0000; //Both masks use the Q layout, because the input is transformed
//Mask3x3 = 0x00001C1C1C000000; //to Q in the Y[0] register.
// ----1------ ------2------ -------- 3------ ----- 4------ ------ 6-----
Y[5] = ((Y[6] & 0x7C0000) >> (10 + 8)) | ((Y[6] & 0x7C000000) >> (10 + 16 - 5)) | ((Y[6] & 0x7C00000000) >> (10 + 24 - 10)) | ((Y[6] & 0x7C0000000000) >> (10 + 32 - 15)) | ((Y[6] & 0x7C000000000000) >> (10 + 40 - 20));
Y[3] = ((Y[4] & 0x38000000) >> (11 + 16)) | ((Y[4] & 0x3800000000) >> (11 + 24 - 3)) | ((Y[4] & 0x380000000000) >> (11 + 32 - 6));
// Mask6x4 = 0x00007E7E7E7E0000; //Both masks use the Q layout, because the input is transformed
// Mask4x2 = 0x0000003C3C000000; //to Q in the Y[0] register.
//Y[5] = ((Y[6] & 0x7E0000) >> (9 + 8)) | ((Y[6] & 0x7E000000) >> (9 + 16 - 6)) | ((Y[6] & 0x7E00000000) >> (9 + 24 - 12)) | ((Y[6] & 0x7E0000000000) >> (9 + 32 - 18));
//Y[3] = ((Y[4] & 0x3C000000) >> (18 + 8)) | ((Y[4] & 0x3C00000000) >> (18 + 16 - 4));
//Y[15] is the output block where the data must be stored.
//Every block = 256 / 8 = 64 bytes =
Y[5] = Y[5] * 64;
//Y[3] is the inner 2x8 block, holding 8 bits, a number from 0 - 255. The upper 3 bits denote the dword to store the data in. The lower 5 bits are a shift mask denoting which bit to flip.
//
Y[4] = (1i64 << (Y[3] & 31i64)); //or mask.
Y[3] = (Y[3] >> 5) * 4; //dword offset //8*4 = 32 bits
Y[5] = Y[5] | Y[3];
//** remove this line!
//Y[5] = Y[5] & ((MemOutputSize)-1); //mask at 512MB, because we don't have more memory.
//** remove the above line !
Y[5] = Y[5] + (unsigned __int64)doutput;
/*debug*/atomicOr((unsigned int *)Y[5], (unsigned int)Y[4]);
}
void printDevProp(cudaDeviceProp devProp)
{
printf("%s\n", devProp.name);
printf("Major revision number: %d\n", devProp.major);
printf("Minor revision number: %d\n", devProp.minor);
printf("Total global memory: %zu", devProp.totalGlobalMem);
printf(" bytes\n");
printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
printf("Total amount of shared memory per block: %zu\n", devProp.sharedMemPerBlock);
printf("Total registers per block: %d\n", devProp.regsPerBlock);
printf("Warp size: %d\n", devProp.warpSize);
printf("Maximum memory pitch: %zu\n", devProp.memPitch);
printf("Total amount of constant memory: %zu\n", devProp.totalConstMem);
return;
}
unsigned long long getTotalSystemMemory()
{
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return status.ullTotalPhys;
}
#define filename "lookuptable5to3_doublecheckA.bin"
// Start the main CUDA Sample here
int main(int argc, char **argv)
{
printf("CUDA Lookup table 5x5->3x3 dual GPU version\n");
int deviceCount;
cudaGetDeviceCount(&deviceCount);
printf("Device count is %i, ", deviceCount);
printf("Available RAM = %lliGiB\n", (getTotalSystemMemory() >> 30i64));
if (deviceCount > 2) { deviceCount = 2; }
// This will pick the best possible CUDA capable device
int dev1 = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp dp;
checkCudaErrors(cudaGetDeviceProperties(&dp, dev1));
printDevProp(dp);
//float *dinput = NULL;
unsigned long long int* dGPUoutput[2];
for (int dev = 0; dev < deviceCount; dev++) {
dGPUoutput[dev] = NULL;
}
//clock_t *dtimer = NULL;
//clock_t timer[NUM_BLOCKS * 2];
//float input[NUM_THREADS * 2];
//for (int i = 0; i < NUM_THREADS * 2; i++)
//{
// input[i] = (float)i;
//}
//unsigned __int64 a = 0xFFFFFFFFFFFFFFFF;
//unsigned __int64 b;
//SevenToFive(&a, &b);
//checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
//checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
//timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);
char* dest[2];
char* outputdest;
outputdest = (char*)malloc(sizeof(char) * MemOutputSize);
if (outputdest == NULL) { printf("Out of memory"); getch(); exit(EXIT_FAILURE); }
//test write
//printf("test write of data\n");
//std::ofstream outputFile;
//outputFile.open(filename, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary);
///*debug*/outputFile.write(dest, MemOutputSize);
//outputFile.close();
printf("Start computing\n");
//DebugSevenToFive((unsigned long long int)-1, NULL); //just a place to check if needed.
//getch();
for (int dev = 0; dev < deviceCount; dev++) {
cudaSetDevice(dev);
/*debug*/checkCudaErrors(cudaMalloc((void **)&dGPUoutput[dev], sizeof(char) * MemOutputSize));
/*debug*/checkCudaErrors(cudaMemset(dGPUoutput[dev], 0, sizeof(char) * MemOutputSize));
dest[dev] = (char*)malloc(sizeof(char) * MemOutputSize);
if (dest[dev] == NULL) { printf("Out of memory"); getch(); exit(EXIT_FAILURE); }
}
auto t_start = std::chrono::high_resolution_clock::now();
//getch();
//we need to repeat this 2^31 times. 31=5+6+20
try {
for (int k = 0; k < 64; k++) { //2^6
printf("%i of 64 ", k + 1);
for (int j = 0; j < 32; j++) { //2^5
printf(".");
for (int q = 0; q < 1024; q++) {
printf("%4i\b\b\b\b", 1023 - q);
for (int i = 0; i < (/*1024 **/ 1024); i++) { //2^20
//__global__ static void SevenToFiveCount(const unsigned __int64 input, void* doutput)
///*debug*/SevenToFiveCount << <NUM_BLOCKS, NUM_THREADS >> > (i + (q * 1024) + (j * 1024 * 1024) + (k * 1024 * 1024 * 32), doutput); //256K, need to run this code 2GB times.
int dev = i % deviceCount;
cudaSetDevice(dev);
SevenToFive<<<NUM_BLOCKS, NUM_THREADS>>> (i + (q * 1024) + (j * 1024 * 1024) + (k * 1024 * 1024 * 32), dGPUoutput[dev]); //256K, need to run this code 2GB times.
}
}
}
printf("-");
for (int dev = 0; dev < deviceCount; dev++) {
cudaSetDevice(dev);
checkCudaErrors(cudaDeviceSynchronize());
/*debug*/checkCudaErrors(cudaMemcpy(dest[dev], dGPUoutput[dev], sizeof(char) * MemOutputSize, cudaMemcpyDeviceToHost));
}
for (__int64 i = 0; i < MemOutputSize * sizeof(char); i++) {
outputdest[i] = dest[0][i];
for (int dev = 1; dev < deviceCount; dev++) {
outputdest[i] |= dest[dev][i];
}
}
std::ofstream outputFile;
outputFile.open(filename, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary);
outputFile.write(outputdest, MemOutputSize);
outputFile.close();
printf("W \n");
}
cudaDeviceSynchronize();
printf("\nDone computing\n");
auto t_end = std::chrono::high_resolution_clock::now();
std::cout << "Millisecs used " << std::chrono::duration<double, std::milli>(t_end - t_start).count() << "ms\n";
int check = 0;
for (int i = 0; i < (1024 * 1024 * 1024 / 4); i++) {
check |= outputdest[i];
}
if (check == 0) { printf("Error: nothing happened"); getch(); for (;;) {} }
else { printf("all ok"); getch(); }
//checkCudaErrors(cudaFree(dinput));
for (int dev = 0; dev < deviceCount; dev++) {
cudaFree(dGPUoutput[dev]);
}
free(outputdest);
outputdest = NULL;
for (int dev = 0; dev < deviceCount; dev++) {
free(dest[dev]);
dest[dev] = NULL;
}
getch();
return EXIT_SUCCESS;
}
catch (const std::exception& e) //catch all exceptions
{
printf("Oops, an error happened. Here are the details:\n");
std::cout << e.what() << std::endl;
printf("\nPress any key\n");
getch();
}
return EXIT_FAILURE;
}
(This doesn't fit into a comment, sorry.)
Prepare a solver and feed it each GPU's instantaneous or average performance over the last several seconds as variables. Then minimize the total time per N kernels, or maximize the number of kernels completed per unit of average time.
If all kernels are identical, there should be only one minimum, for example 90 kernels per second to the GTX 1080 and 10 kernels per second to the GTX 1050. If there are many different kernels, they could become additional variables for the solver.
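For the identical-kernel case the static split falls out of the measured per-GPU throughput directly. A tiny illustration (the rates below are made-up numbers, not measurements):
// Hypothetical measured throughputs in kernels per second.
double rate[2] = { 90.0, 10.0 };   // e.g. GTX 1080 vs GTX 1050
double total  = rate[0] + rate[1];
// Fraction of all kernels each GPU should receive so both finish together.
double share0 = rate[0] / total;   // 0.9
double share1 = rate[1] / total;   // 0.1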
The solver involves some math you could do yourself, so I'll only add a solver-less version here:
Give each GPU its own queue of kernels.
Have each GPU take kernels one at a time (when one finishes, it gets a new one).
Check the size of those queues:
an empty queue = 1.0 performance,
a full queue = 0.0 performance.
Keep optimizing by "producing" a kernel into one queue at a time (either at a fixed rate, or quickly until one of the queues is filled), with a probability equal to that queue's performance value, preferably a normalized version so that 0.1/0.1 behaves the same as 0.5/0.5. Set the maximum size of each queue to 50-100 so the performance estimate has reasonable resolution.
In CUDA you can create multiple "streams", so you can run this scheme with multiple queues per GPU, asynchronous to each other, to increase GPU usage further. It's important to check that the sum of the queue performance values per GPU stays in line with that GPU's real performance.
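As a rough illustration of the "give work to whichever device is idle" idea (an editor's sketch, not part of the original answer; runKernel, NUM_BLOCKS, NUM_THREADS, dBuffer and deviceCount are the names from the question), the host can poll each device's stream with cudaStreamQuery and hand the next piece of work to whichever device has drained its queue:
// Sketch: greedy dispatch to whichever device/stream is currently idle.
cudaStream_t stream[2];
for (int dev = 0; dev < deviceCount; dev++) {
    cudaSetDevice(dev);
    cudaStreamCreate(&stream[dev]);
}
long long nextWork = 0;
const long long totalWork = 1LL << 30;
while (nextWork < totalWork) {
    for (int dev = 0; dev < deviceCount && nextWork < totalWork; dev++) {
        cudaSetDevice(dev);
        if (cudaStreamQuery(stream[dev]) == cudaSuccess) {   //this device's stream is empty: it is idle
            runKernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream[dev]>>>(nextWork, dBuffer[dev]);
            nextWork++;
        }
    }
}
for (int dev = 0; dev < deviceCount; dev++) {
    cudaSetDevice(dev);
    cudaStreamSynchronize(stream[dev]);
}
In practice you would keep a few kernels queued per stream (refill whenever the queue drops below some depth) so a device never starves between the query and the next launch.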
Since you run millions of kernels, you can use queues holding thousands of kernels to get a better performance measurement; a maximum queue size of just 5 is not very good for sharing only a few kernels.
You could also make the maximum queue sizes adapt to the performance of their attached GPU, starting at a maximum of 5 and growing to thousands on the fastest GPUs, so that performance can be measured as, say, 0.5444533 instead of just 0.5.
When the last kernel is enqueued, the number of elements remaining in each queue should be equal or close to that GPU's performance value, and all queues should be expected to finish at roughly the same time.
(untested)
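Along the same lines, here is a minimal solver-free sketch of the whole dispatch loop (an editor's addition, not from the answer author): one host thread per GPU pulls the next batch of input values from a shared atomic counter, so the faster card automatically claims more of the work. The kernel, dBuffer and the launch configuration are taken from the question; the batch size is an arbitrary choice.
#include <algorithm>
#include <atomic>
#include <thread>
#include <vector>

std::atomic<long long> nextInput{0};
const long long totalInputs = 1LL << 30;
const long long batch = 4096;                 //amortizes launch/sync overhead

auto worker = [&](int dev) {
    cudaSetDevice(dev);
    for (;;) {
        long long start = nextInput.fetch_add(batch);
        if (start >= totalInputs) break;
        long long end = std::min(start + batch, totalInputs);
        for (long long i = start; i < end; i++)
            runKernel<<<NUM_BLOCKS, NUM_THREADS>>>(i, dBuffer[dev]);
        cudaDeviceSynchronize();              //claim the next batch only once this GPU is idle again
    }
};

std::vector<std::thread> workers;
for (int dev = 0; dev < deviceCount; dev++)
    workers.emplace_back(worker, dev);
for (auto& t : workers)
    t.join();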
Related
Are there any algorithms or bit hacks for counting the occurrence of each nth bit over an array of integers?
I have a lot of 32-bit values and I need to count the occurrence of each nth true bit over the entire length of the data, as fast as possible, because this is the performance bottleneck of the whole simulation. I created a naive C++ approach that does this for 8-bit values to illustrate the question:
#include <iostream>
#include <vector>
#include <cstdint>

std::vector<uint32_t> vertical_popcount(std::vector<uint8_t>& data) {
    std::vector<uint32_t> result({0, 0, 0, 0, 0, 0, 0, 0});
    for (auto i = 0; i < data.size(); i++) {
        result[0] += (data[i] & 0b10000000) > 0;
        result[1] += (data[i] & 0b01000000) > 0;
        result[2] += (data[i] & 0b00100000) > 0;
        result[3] += (data[i] & 0b00010000) > 0;
        result[4] += (data[i] & 0b00001000) > 0;
        result[5] += (data[i] & 0b00000100) > 0;
        result[6] += (data[i] & 0b00000010) > 0;
        result[7] += (data[i] & 0b00000001) > 0;
    }
    return result;
}

int main() {
    std::vector<uint8_t> data({0b00000001, 0b00000100, 0b00000101});
    auto result = vertical_popcount(data);
    std::cout << "occurrence of bits: " << result[0] << ", " << result[1] << ", "
              << result[2] << ", " << result[3] << ", " << result[4] << ", "
              << result[5] << ", " << result[6] << ", " << result[7] << "\n";
    return 0;
}
Is there an algorithm that does the same but (much) faster?
Pepijn Kramer's answer shows how to parallelize the operations to do 8 bytes at once. My answer looks at doing more bits at once. In your code you extract each bit and increment a counter; here you do that SIMD-style manually, on blocks of 8 uint64_t. The idea is to spread the bits out, alternating zero bits and data bits, so that they can be added without overflowing into the next data bit. The first step is to spread them out into 2-bit units, then 4-bit units, then 8-bit units, and then sum the bytes within each uint64_t. If you want to extend this to 32-bit counts you need to add 2 more steps, separating into 16-bit and 32-bit units. The example below works on 8 uint64_t, but if you have larger arrays you can merge more values per step. Just keep track of how many bits you have for each count (2, 4, 8, 16, 32) and don't merge more than 2^n - 1 values.
uint64_t data[8] = { 0x0123456789ABCDEF /* ...plus 7 more input words */ };
static const uint64_t mask0 = 0x5555555555555555;
static const uint64_t mask1 = 0x3333333333333333;
static const uint64_t mask2 = 0x0F0F0F0F0F0F0F0F;

// split even and odd bits and add 2 values together
// 2 bit per count, max value 2
// (note: & binds more weakly than +, so the masked operands need parentheses)
uint64_t t000 = (data[0] & mask0) + (data[1] & mask0);
uint64_t t010 = (data[2] & mask0) + (data[3] & mask0);
uint64_t t020 = (data[4] & mask0) + (data[5] & mask0);
uint64_t t030 = (data[6] & mask0) + (data[7] & mask0);
uint64_t t001 = ((data[0] >> 1) & mask0) + ((data[1] >> 1) & mask0);
uint64_t t011 = ((data[2] >> 1) & mask0) + ((data[3] >> 1) & mask0);
uint64_t t021 = ((data[4] >> 1) & mask0) + ((data[5] >> 1) & mask0);
uint64_t t031 = ((data[6] >> 1) & mask0) + ((data[7] >> 1) & mask0);

// split into nibbles and build sum of 4 values
// 4 bit per count, max value 4
uint64_t t100 = (t000 & mask1) + (t010 & mask1);
uint64_t t101 = (t001 & mask1) + (t011 & mask1);
uint64_t t102 = ((t000 >> 2) & mask1) + ((t010 >> 2) & mask1);
uint64_t t103 = ((t001 >> 2) & mask1) + ((t011 >> 2) & mask1);
uint64_t t110 = (t020 & mask1) + (t030 & mask1);
uint64_t t111 = (t021 & mask1) + (t031 & mask1);
uint64_t t112 = ((t020 >> 2) & mask1) + ((t030 >> 2) & mask1);
uint64_t t113 = ((t021 >> 2) & mask1) + ((t031 >> 2) & mask1);

// split into bytes, and build sum of 8 values
// 8 bit per count, max 8
uint64_t sum[] = {
    (t100 & mask2) + (t110 & mask2),
    (t101 & mask2) + (t111 & mask2),
    (t102 & mask2) + (t112 & mask2),
    (t103 & mask2) + (t113 & mask2),
    ((t100 >> 4) & mask2) + ((t110 >> 4) & mask2),
    ((t101 >> 4) & mask2) + ((t111 >> 4) & mask2),
    ((t102 >> 4) & mask2) + ((t112 >> 4) & mask2),
    ((t103 >> 4) & mask2) + ((t113 >> 4) & mask2)
};

// add the 8 bytes of each sum[i] into a single byte
for (int i = 0; i < 8; ++i) {
    sum[i] = (sum[i] & 0xFFFFFFFF) + (sum[i] >> 32);
    sum[i] = (sum[i] & 0xFFFF) + (sum[i] >> 16);
    sum[i] = (sum[i] & 0xFF) + (sum[i] >> 8);
}
Unless I made a mistake, sum should now hold the bit count for each bit 0-7 over the block of 8 uint64_t. You can improve this for larger blocks: when the bits are split into even and odd, each count has 2 bits, which can hold the sum of 3 uint64_t while I only use 2. Similarly, the split into nibbles has 4 bits per count, so it can hold the sum of 15 uint64_t, and the split into bytes can hold the sum of 255 uint64_t. You can also extend this to SIMD registers and do 128-512 bits at once. And I think there is a SIMD sum-of-bytes vector intrinsic you can use instead of the loop at the end.
Here is a sketch of a possible approach (you can improve on this a lot if you can tweak your input to multiples of 8 bytes, or if you know the size of your vectors up front), but it gives you an idea how to use popcount. (For 32-bit architectures the pattern is the same but you get less performance.)
// #include <intrin.h> // for popcnt which counts the number of set bits in a variable
#include <array>
#include <bit>          // std::popcount (C++20)
#include <iostream>
#include <vector>
#include <cstdint>
#include <memory>

// Work on a copy on purpose so we can pad the vector's memory to 64-bit alignment
// (kind of a quick hack for now; the extra memory (re)allocations might slow you down too much).
auto vertical_popcount(std::vector<std::uint8_t> values) {
    // use the 64-bit architecture to do 8 values per cycle
    static constexpr std::array<std::uint64_t, 8> masks {
        0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
        0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080
    };
    // using an array instead of a vector saves at least one dynamic allocation
    std::array<std::size_t, 8> counts{};

    // align data to a multiple of 8 bytes;
    // add a few extra 0 bytes, they won't impact the counting
    for (std::size_t n = 0; n < values.size() % 8; ++n) values.push_back(0);

    // make a uint64_t pointer into the 8-bit data
    // to pick up 8 bytes at a time for masking
    auto ptr = reinterpret_cast<std::uint64_t*>(values.data());
    for (std::size_t n = 0; n < values.size() / 8; ++n) {
        for (std::size_t m = 0; m < 8; ++m) {
            // mask 8 bytes at a time
            auto masked_value = (*ptr & masks[m]);
            // count the bits of 8 uint8_t's in one go
            auto bitcount = std::popcount(masked_value);
            counts[m] += bitcount;
        }
        ++ptr;
    }
    return counts;
}

int main() {
    std::vector<uint8_t> data({ 0b00001001, 0b00000100, 0b00000101 });
    auto result = vertical_popcount(data);
    std::cout << "occurrence of bits: " << result[0] << ", " << result[1] << ", "
              << result[2] << ", " << result[3] << ", " << result[4] << ", "
              << result[5] << ", " << result[6] << ", " << result[7] << "\n";
    return 0;
}
Reading a Monochrome Bitmap in C++ requires reading every other line?
First off, this is not a duplicate. I have already read Converting 1-bit bmp file to array in C/C++ and my question is about an inconsistency I'm seeing between the formulas provided there and the one that works for me.
The Issue
I am trying to read in a 1-bit bitmap image that was created in MS Paint. I've used the code provided by other answers on this site, but there are a few things I had to change to get it to work, and I want to understand why.
Change 1: lineSize must be doubled
Original: int lineSize = (w / 8 + (w / 8) % 4);
Mine: int lineSize = (w / 8 + (w / 8) % 4) * 2;
Change 2: Endianness must be reversed
Original: for(k = 0 ; k < 8 ; k++) ... (data[fpos] >> k ) & 1;
Mine: for (int k = 7; k >= 0; --k) { ... (data[rawPos] >> k) & 1;
Full Code
NOTE: This code works. There are some changes from the original, but the core read part is the same.
vector<vector<int>> getBlackAndWhiteBmp(string filename) {
    BmpHeader head;
    ifstream f(filename, ios::binary);
    if (!f) {
        throw "Invalid file given";
    }
    int headSize = sizeof(BmpHeader);
    f.read((char*)&head, headSize);
    if (head.bitsPerPixel != 1) {
        f.close();
        throw "Invalid bitmap loaded";
    }
    int height = head.height;
    int width = head.width;
    // Lines are aligned on a 4-byte boundary
    int lineSize = (width / 8 + (width / 8) % 4) * 2;
    int fileSize = lineSize * height;
    vector<unsigned char> rawFile(fileSize);
    vector<vector<int>> img(head.height, vector<int>(width, -1));
    // Skip to where the actual image data is
    f.seekg(head.offset);
    // Read in all of the file
    f.read((char*)&rawFile[0], fileSize);
    // Decode the actual boolean values of the pixels
    int row;
    int reverseRow; // Because bitmaps are stored bottom to top for some reason
    int columnByte;
    int columnBit;
    for (row = 0, reverseRow = height - 1; row < height; ++row, --reverseRow) {
        columnBit = 0;
        for (columnByte = 0; columnByte < ceil((width / 8.0)); ++columnByte) {
            int rawPos = (row * lineSize) + columnByte;
            for (int k = 7; k >= 0 && columnBit < width; --k, ++columnBit) {
                img[reverseRow][columnBit] = (rawFile[rawPos] >> k) & 1;
            }
        }
    }
    f.close();
    return img;
}

#pragma pack(1)
struct BmpHeader {
    char magic[2];          // 0-1
    uint32_t fileSize;      // 2-5
    uint32_t reserved;      // 6-9
    uint32_t offset;        // 10-13
    uint32_t headerSize;    // 14-17
    uint32_t width;         // 18-21
    uint32_t height;        // 22-25
    uint16_t bitsPerPixel;  // 26-27
    uint16_t bitDepth;      // 28-29
};
#pragma pack()
Potentially relevant information:
I'm using Visual Studio 2017
I'm compiling for C++14
I'm on a Windows 10 OS
Thanks.
Both of those line size formulas are incorrect. For example, for w = 1, (w / 8 + (w / 8) % 4) results in zero, and it's still zero if you multiply by two; it's expected to be 4 for width = 1. The correct formula for line size (or bytes per line) is ((w * bpp + 31) / 32) * 4 where bpp is bits per pixel, in this case 1. By coincidence the values are sometimes the same, for some smaller width values. See also the MSDN example:
DWORD dwBmpSize = ((bmpScreen.bmWidth * bi.biBitCount + 31) / 32) * 4 * bmpScreen.bmHeight;
Also, a 1-bit image has 2 palette entries, for a total of 8 bytes. It seems you are ignoring the palette and assuming that 0 is always black and 1 is always white.
The part where you flip the bits is correct; the other code appears to be incorrect. Let's say we have a single byte, 1000 0000. This is meant to be a single row, starting with 7 zeros and ending in 1. Your code is a bit confusing for me (but seems okay when you fix linesize). I wrote my own version:
void test(string filename)
{
    BmpHeader head;
    ifstream f(filename, ios::binary);
    if (!f.good())
        return;
    int headsize = sizeof(BmpHeader);
    f.read((char*)&head, headsize);
    if (head.bitsPerPixel != 1) {
        f.close();
        throw "Invalid bitmap loaded";
    }
    int height = head.height;
    int width = head.width;
    int bpp = 1;
    int linesize = ((width * bpp + 31) / 32) * 4;
    int filesize = linesize * height;
    vector<unsigned char> data(filesize);
    //read color table
    uint32_t colortable[2];
    f.seekg(54);
    f.read((char*)&colortable[0], 4);
    f.read((char*)&colortable[1], 4);
    printf("colortable: 0x%06X 0x%06X\n", colortable[0], colortable[1]);
    f.seekg(head.offset);
    f.read((char*)&data[0], filesize);
    for (int y = height - 1; y >= 0; y--) {
        for (int x = 0; x < width; x++) {
            int pos = y * linesize + x / 8;
            int bit = 1 << (7 - x % 8);
            int v = (data[pos] & bit) > 0;
            printf("%d", v);
        }
        printf("\n");
    }
    f.close();
}
Test image: (33 x 20 monochrome bitmap)
Output:
colortable: 0x000000 0xFFFFFF
000000000000000000000000000000000
000001111111111111111111111111110
000001111111111111111111111111110
000001111111111111111111111111110
000001111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111111110
011111111111111111111111111110010
011111111111111111111111111110010
011111111111111111111111111111110
000000000000000000000000000000000
Notice these lines in the above code:
int pos = y * linesize + x / 8;
int bit = 1 << (7 - x % 8);
int v = (data[pos] & bit) > 0;
printf("%d", v);
First I wrote it as int bit = 1 << (x % 8); but this shows the bits in the wrong order, so I had to change it to 1 << (7 - x % 8), which is basically what you did as well. I don't know why it's designed like that; there must be some historical reason for it! (The above code is for little-endian machines only.)
Convert 0x1234 to 0x11223344
How do I expand the hexadecimal number 0x1234 to 0x11223344 in a high-performance way?
unsigned int c = 0x1234, b;
b = (c & 0xff) << 4 | c & 0xf | (c & 0xff0) << 8 | (c & 0xff00) << 12 | (c & 0xf000) << 16;
printf("%p -> %p\n", c, b);
Output:
0x1234 -> 0x11223344
I need this for color conversion. Users provide their data in the form 0xARGB, and I need to convert it to 0xAARRGGBB. And yes, there could be millions, because each could be a pixel; 1000x1000 pixels equals one million. The actual case is even more complicated, because a single 32-bit value contains both foreground and background colors, so 0xARGBargb becomes [ 0xAARRGGBB, 0xaarrggbb ]. Oh yes, one more thing: in a real application I also negate alpha, because in OpenGL 0xFF is non-transparent and 0x00 is most transparent, which is inconvenient in most cases, because usually you just need the RGB part and transparency is assumed to be absent.
This can be done using SSE2 as follows:
void ExpandSSE2(unsigned __int64 in, unsigned __int64 &outLo, unsigned __int64 &outHi) {
    __m128i const mask = _mm_set1_epi16((short)0xF00F);
    __m128i const mul0 = _mm_set1_epi16(0x0011);
    __m128i const mul1 = _mm_set1_epi16(0x1000);
    __m128i       v;

    v = _mm_cvtsi64_si128(in);     // Move the 64-bit value to a 128-bit register
    v = _mm_unpacklo_epi8(v, v);   // 0x12 -> 0x1212
    v = _mm_and_si128(v, mask);    // 0x1212 -> 0x1002
    v = _mm_mullo_epi16(v, mul0);  // 0x1002 -> 0x1022
    v = _mm_mulhi_epu16(v, mul1);  // 0x1022 -> 0x0102
    v = _mm_mullo_epi16(v, mul0);  // 0x0102 -> 0x1122

    outLo = _mm_extract_epi64(v, 0);
    outHi = _mm_extract_epi64(v, 1);
}
Of course you'd want to put the guts of the function in an inner loop and pull out the constants. You will also want to skip the x64 registers and load values directly into 128-bit SSE registers. For an example of how to do this, refer to the SSE2 implementation in the performance test below. At its core there are five instructions, which perform the operation on four color values at a time. So that is only about 1.25 instructions per color value. It should also be noted that SSE2 is available anywhere x64 is available.
Performance tests for an assortment of the solutions here
A few people have mentioned that the only way to know what's faster is to run the code, and this is unarguably true. So I've compiled a few of the solutions into a performance test so we can compare apples to apples. I chose solutions which I felt were significantly different from the others enough to require testing. All the solutions read from memory, operate on the data, and write back to memory. In practice some of the SSE solutions will require additional care around alignment and handling cases when there isn't another full 16 bytes to process in the input data. The code I tested is x64 compiled under release using Visual Studio 2013 running on a 4+ GHz Core i7. Here are my results:
ExpandOrig:               56.234 seconds  // From asker's original question
ExpandSmallLUT:           30.209 seconds  // From Dmitry's answer
ExpandLookupSmallOneLUT:  33.689 seconds  // From Dmitry's answer
ExpandLookupLarge:        51.312 seconds  // A straightforward lookup table
ExpandAShelly:            43.829 seconds  // From AShelly's answer
ExpandAShellyMulOp:       43.580 seconds  // AShelly's answer with an optimization
ExpandSSE4:               17.854 seconds  // My original SSE4 answer
ExpandSSE4Unroll:         17.405 seconds  // My original SSE4 answer with loop unrolling
ExpandSSE2:               17.281 seconds  // My current SSE2 answer
ExpandSSE2Unroll:         17.152 seconds  // My current SSE2 answer with loop unrolling
In the test results above you'll see I included the asker's code and three lookup table implementations, including the small lookup table implementation proposed in Dmitry's answer. AShelly's solution is included too, as well as a version with an optimization I made (an operation can be eliminated). I included my original SSE4 implementation, as well as a superior SSE2 version I made later (now reflected as the answer), plus unrolled versions of both since they were the fastest here and I wanted to see how much unrolling sped them up. I also included an SSE4 implementation of AShelly's answer. So far I have to declare myself the winner. But the source is below, so anyone can test it out on their platform, and include their own solution into the testing to see if they've made a solution that's even faster.
#define DATA_SIZE_IN ((unsigned)(1024 * 1024 * 128)) #define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN)) #define RERUN_COUNT 500 #include <cstdlib> #include <ctime> #include <iostream> #include <utility> #include <emmintrin.h> // SSE2 #include <tmmintrin.h> // SSSE3 #include <smmintrin.h> // SSE4 void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = (u & 0x00FF) << 4 | (u & 0x000F) | (u & 0x0FF0) << 8 | (u & 0xFF00) << 12 | (u & 0xF000) << 16; v = (v & 0x00FF) << 4 | (v & 0x000F) | (v & 0x0FF0) << 8 | (v & 0xFF00) << 12 | (v & 0xF000) << 16; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } unsigned LutLo[256], LutHi[256]; void MakeLutLo(void) { for (unsigned i = 0, x; i < 256; ++i) { x = i; x = ((x & 0xF0) << 4) | (x & 0x0F); x |= (x << 4); LutLo[i] = x; } } void MakeLutHi(void) { for (unsigned i = 0, x; i < 256; ++i) { x = i; x = ((x & 0xF0) << 20) | ((x & 0x0F) << 16); x |= (x << 4); LutHi[i] = x; } } void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = LutHi[u >> 8] | LutLo[u & 0xFF]; v = LutHi[v >> 8] | LutLo[v & 0xFF]; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = ((LutLo[u >> 8] << 16) | LutLo[u & 0xFF]); v = ((LutLo[v >> 8] << 16) | LutLo[v & 0xFF]); // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } unsigned LutLarge[256 * 256]; void MakeLutLarge(void) { for (unsigned i = 0; i < (256 * 256); ++i) LutLarge[i] = LutHi[i >> 8] | LutLo[i & 0xFF]; } void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = LutLarge[u]; v = LutLarge[v]; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v, w, x; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation w = (((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00); x = (((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00); w += w * 0x10; x += x * 0x10; // Store data *(unsigned*)(out) = w; *(unsigned*)(out + 4) = x; in += 4; out += 8; } while (in != past); } void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = ((((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; v = ((((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = 
_mm_set1_epi16((short)0x8000), mask1 = _mm_set1_epi8(0x0F), mul = _mm_set1_epi16(0x0011); __m128i u, v, w, x; do { // Read input into low 8 bytes of u and v u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi8(u, u); // Expand each single byte to two bytes u = _mm_unpacklo_epi8(u, u); // Do it again for v w = _mm_srli_epi16(u, 4); // Copy the value into w and shift it right half a byte x = _mm_srli_epi16(v, 4); // Do it again for v u = _mm_blendv_epi8(u, w, mask0); // Select odd bytes from w, and even bytes from v, giving the the desired value in the upper nibble of each byte v = _mm_blendv_epi8(v, x, mask0); // Do it again for v u = _mm_and_si128(u, mask1); // Clear the all the upper nibbles v = _mm_and_si128(v, mask1); // Do it again for v u = _mm_mullo_epi16(u, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte v = _mm_mullo_epi16(v, mul); // Do it again for v // Write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = _mm_set1_epi16((short)0x8000), mask1 = _mm_set1_epi8(0x0F), mul = _mm_set1_epi16(0x0011); __m128i u0, v0, w0, x0, u1, v1, w1, x1, u2, v2, w2, x2, u3, v3, w3, x3; do { // Read input into low 8 bytes of u and v u0 = _mm_load_si128((__m128i const*)(in )); u1 = _mm_load_si128((__m128i const*)(in + 16)); u2 = _mm_load_si128((__m128i const*)(in + 32)); u3 = _mm_load_si128((__m128i const*)(in + 48)); v0 = _mm_unpackhi_epi8(u0, u0); // Expand each single byte to two bytes u0 = _mm_unpacklo_epi8(u0, u0); // Do it again for v v1 = _mm_unpackhi_epi8(u1, u1); // Do it again u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1 v2 = _mm_unpackhi_epi8(u2, u2); // Again for v1 u2 = _mm_unpacklo_epi8(u2, u2); // Again for u2 v3 = _mm_unpackhi_epi8(u3, u3); // Again for v2 u3 = _mm_unpacklo_epi8(u3, u3); // Again for u3 w0 = _mm_srli_epi16(u0, 4); // Copy the value into w and shift it right half a byte x0 = _mm_srli_epi16(v0, 4); // Do it again for v w1 = _mm_srli_epi16(u1, 4); // Again for u1 x1 = _mm_srli_epi16(v1, 4); // Again for v1 w2 = _mm_srli_epi16(u2, 4); // Again for u2 x2 = _mm_srli_epi16(v2, 4); // Again for v2 w3 = _mm_srli_epi16(u3, 4); // Again for u3 x3 = _mm_srli_epi16(v3, 4); // Again for v3 u0 = _mm_blendv_epi8(u0, w0, mask0); // Select even bytes from w, and odd bytes from v, giving the the desired value in the upper nibble of each byte v0 = _mm_blendv_epi8(v0, x0, mask0); // Do it again for v u1 = _mm_blendv_epi8(u1, w1, mask0); // Again for u1 v1 = _mm_blendv_epi8(v1, x1, mask0); // Again for v1 u2 = _mm_blendv_epi8(u2, w2, mask0); // Again for u2 v2 = _mm_blendv_epi8(v2, x2, mask0); // Again for v2 u3 = _mm_blendv_epi8(u3, w3, mask0); // Again for u3 v3 = _mm_blendv_epi8(v3, x3, mask0); // Again for v3 u0 = _mm_and_si128(u0, mask1); // Clear the all the upper nibbles v0 = _mm_and_si128(v0, mask1); // Do it again for v u1 = _mm_and_si128(u1, mask1); // Again for u1 v1 = _mm_and_si128(v1, mask1); // Again for v1 u2 = _mm_and_si128(u2, mask1); // Again for u2 v2 = _mm_and_si128(v2, mask1); // Again for v2 u3 = _mm_and_si128(u3, mask1); // Again for u3 v3 = _mm_and_si128(v3, mask1); // Again for v3 u0 = _mm_mullo_epi16(u0, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte v0 = _mm_mullo_epi16(v0, mul); // Do it again for v u1 = _mm_mullo_epi16(u1, mul); // 
Again for u1 v1 = _mm_mullo_epi16(v1, mul); // Again for v1 u2 = _mm_mullo_epi16(u2, mul); // Again for u2 v2 = _mm_mullo_epi16(v2, mul); // Again for v2 u3 = _mm_mullo_epi16(u3, mul); // Again for u3 v3 = _mm_mullo_epi16(v3, mul); // Again for v3 // Write output _mm_store_si128((__m128i*)(out ), u0); _mm_store_si128((__m128i*)(out + 16), v0); _mm_store_si128((__m128i*)(out + 32), u1); _mm_store_si128((__m128i*)(out + 48), v1); _mm_store_si128((__m128i*)(out + 64), u2); _mm_store_si128((__m128i*)(out + 80), v2); _mm_store_si128((__m128i*)(out + 96), u3); _mm_store_si128((__m128i*)(out + 112), v3); in += 64; out += 128; } while (in != past); } void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F), mul0 = _mm_set1_epi16(0x0011), mul1 = _mm_set1_epi16(0x1000); __m128i u, v; do { // Read input into low 8 bytes of u and v u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi8(u, u); // Expand each single byte to two bytes u = _mm_unpacklo_epi8(u, u); // Do it again for v u = _mm_and_si128(u, mask); v = _mm_and_si128(v, mask); u = _mm_mullo_epi16(u, mul0); v = _mm_mullo_epi16(v, mul0); u = _mm_mulhi_epu16(u, mul1); // This can also be done with a right shift of 4 bits, but this seems to mesure faster v = _mm_mulhi_epu16(v, mul1); u = _mm_mullo_epi16(u, mul0); v = _mm_mullo_epi16(v, mul0); // write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F), mul0 = _mm_set1_epi16(0x0011), mul1 = _mm_set1_epi16(0x1000); __m128i u0, v0, u1, v1; do { // Read input into low 8 bytes of u and v u0 = _mm_load_si128((__m128i const*)(in )); u1 = _mm_load_si128((__m128i const*)(in + 16)); v0 = _mm_unpackhi_epi8(u0, u0); // Expand each single byte to two bytes u0 = _mm_unpacklo_epi8(u0, u0); // Do it again for v v1 = _mm_unpackhi_epi8(u1, u1); // Do it again u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1 u0 = _mm_and_si128(u0, mask); v0 = _mm_and_si128(v0, mask); u1 = _mm_and_si128(u1, mask); v1 = _mm_and_si128(v1, mask); u0 = _mm_mullo_epi16(u0, mul0); v0 = _mm_mullo_epi16(v0, mul0); u1 = _mm_mullo_epi16(u1, mul0); v1 = _mm_mullo_epi16(v1, mul0); u0 = _mm_mulhi_epu16(u0, mul1); v0 = _mm_mulhi_epu16(v0, mul1); u1 = _mm_mulhi_epu16(u1, mul1); v1 = _mm_mulhi_epu16(v1, mul1); u0 = _mm_mullo_epi16(u0, mul0); v0 = _mm_mullo_epi16(v0, mul0); u1 = _mm_mullo_epi16(u1, mul0); v1 = _mm_mullo_epi16(v1, mul0); // write output _mm_store_si128((__m128i*)(out ), u0); _mm_store_si128((__m128i*)(out + 16), v0); _mm_store_si128((__m128i*)(out + 32), u1); _mm_store_si128((__m128i*)(out + 48), v1); in += 32; out += 64; } while (in != past); } void ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const zero = _mm_setzero_si128(), v0F0F = _mm_set1_epi32(0x0F0F), vF0F0 = _mm_set1_epi32(0xF0F0), v0101 = _mm_set1_epi32(0x0101), v1010 = _mm_set1_epi32(0x1010), v000F000F = _mm_set1_epi32(0x000F000F), v0F000F00 = _mm_set1_epi32(0x0F000F00), v0011 = _mm_set1_epi32(0x0011); __m128i u, v, w, x; do { // Read in data u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi16(u, zero); u = _mm_unpacklo_epi16(u, zero); // original source: ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; w = _mm_and_si128(u, v0F0F); x = _mm_and_si128(v, v0F0F); u 
= _mm_and_si128(u, vF0F0); v = _mm_and_si128(v, vF0F0); w = _mm_mullo_epi32(w, v0101); // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2 x = _mm_mullo_epi32(x, v0101); u = _mm_mullo_epi32(u, v1010); v = _mm_mullo_epi32(v, v1010); w = _mm_and_si128(w, v000F000F); x = _mm_and_si128(x, v000F000F); u = _mm_and_si128(u, v0F000F00); v = _mm_and_si128(v, v0F000F00); u = _mm_add_epi32(u, w); v = _mm_add_epi32(v, x); u = _mm_mullo_epi32(u, v0011); v = _mm_mullo_epi32(v, v0011); // write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } int main() { unsigned char *const indat = new unsigned char[DATA_SIZE_IN ], *const outdat0 = new unsigned char[DATA_SIZE_OUT], *const outdat1 = new unsigned char[DATA_SIZE_OUT], * curout = outdat0, * lastout = outdat1, * place; unsigned start, stop; place = indat + DATA_SIZE_IN - 1; do { *place = (unsigned char)rand(); } while (place-- != indat); MakeLutLo(); MakeLutHi(); MakeLutLarge(); for (unsigned testcount = 0; testcount < 1000; ++testcount) { // Solution posted by the asker start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandOrig(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandOrig:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); // Dmitry's small lookup table solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupSmall(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSmallLUT:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // Dmitry's small lookup table solution using only one lookup table start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupSmallOneLUT(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandLookupSmallOneLUT:\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // Large lookup table solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupLarge(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandLookupLarge:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShelly(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShelly:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." 
<< ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShellyMulOp(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShellyMulOp:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE4 solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE4(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE4:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE4 solution unrolled start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE4Unroll(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE4Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE2 solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE2(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE2:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE2 solution unrolled start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE2Unroll(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE2Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE2 start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShellySSE4(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShellySSE4:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; } delete[] indat; delete[] outdat0; delete[] outdat1; return 0; } NOTE: I had an SSE4 implementation here initially. I found a way to implement this using SSE2, which is better because it will run on more platforms. The SSE2 implementation is also faster. So, the solution presented at the top is now the SSE2 implementation and not the SSE4 one. The SSE4 implementation can still be seen in the performance tests or in the edit history.
I'm not sure what the most efficient way would be, but this is a little shorter: #include <stdio.h> int main() { unsigned x = 0x1234; x = (x << 8) | x; x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f); x = (x << 4) | x; printf("0x1234 -> 0x%08x\n",x); return 0; } If you need to do this repeatedly and very quickly, as suggested in your edit, you could consider generating a lookup table and using that instead. The following function dynamically allocates and initializes such a table: unsigned *makeLookupTable(void) { unsigned *tbl = malloc(sizeof(unsigned) * 65536); if (!tbl) return NULL; int i; for (i = 0; i < 65536; i++) { unsigned x = i; x |= (x << 8); x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f); x |= (x << 4); /* Uncomment next line to invert the high byte as mentioned in the edit. */ /* x = x ^ 0xff000000; */ tbl[i] = x; } return tbl; } After that each conversion is just something like: result = lookuptable[input]; ..or maybe: result = lookuptable[input & 0xffff]; Or a smaller, more cache-friendly lookup table (or pair) could be used with one lookup each for the high and low bytes (as noted by #LưuVĩnhPhúc in the comments). In that case, table generation code might be: unsigned *makeLookupTableLow(void) { unsigned *tbl = malloc(sizeof(unsigned) * 256); if (!tbl) return NULL; int i; for (i = 0; i < 256; i++) { unsigned x = i; x = ((x & 0xf0) << 4) | (x & 0x0f); x |= (x << 4); tbl[i] = x; } return tbl; } ...and an optional second table: unsigned *makeLookupTableHigh(void) { unsigned *tbl = malloc(sizeof(unsigned) * 256); if (!tbl) return NULL; int i; for (i = 0; i < 256; i++) { unsigned x = i; x = ((x & 0xf0) << 20) | ((x & 0x0f) << 16); x |= (x << 4); /* uncomment next line to invert high byte */ /* x = x ^ 0xff000000; */ tbl[i] = x; } return tbl; } ...and to convert a value with two tables: result = hightable[input >> 8] | lowtable[input & 0xff]; ...or with one (just the low table above): result = (lowtable[input >> 8] << 16) | lowtable[input & 0xff]; result ^= 0xff000000; /* to invert high byte */ If the upper part of the value (alpha?) doesn't change much, even the single large table might perform well since consecutive lookups would be closer together in the table. I took the performance test code #Apriori posted, made some adjustments, and added tests for the other responses that he hadn't included originally... then compiled three versions of it with different settings. One is 64-bit code with SSE4.1 enabled, where the compiler can make use of SSE for optimizations... and then two 32-bit versions, one with SSE and one without. 
Although all three were run on the same fairly recent processor, the results show how the optimal solution can change depending on the processor features:
                           64b SSE4.1   32b SSE4.1   32b no SSE
                           ----------   ----------   ----------
ExpandOrig time:            3.502 s      3.501 s      6.260 s
ExpandLookupSmall time:     3.530 s      3.997 s      3.996 s
ExpandLookupLarge time:     3.434 s      3.419 s      3.427 s
ExpandIsalamon time:        3.654 s      3.673 s      8.870 s
ExpandIsalamonOpt time:     3.784 s      3.720 s      8.719 s
ExpandChronoKitsune time:   3.658 s      3.463 s      6.546 s
ExpandEvgenyKluev time:     6.790 s      7.697 s     13.383 s
ExpandIammilind time:       3.485 s      3.498 s      6.436 s
ExpandDmitri time:          3.457 s      3.477 s      5.461 s
ExpandNitish712 time:       3.574 s      3.800 s      6.789 s
ExpandAdamLiss time:        3.673 s      5.680 s      6.969 s
ExpandAShelly time:         3.524 s      4.295 s      5.867 s
ExpandAShellyMulOp time:    3.527 s      4.295 s      5.852 s
ExpandSSE4 time:            3.428 s
ExpandSSE4Unroll time:      3.333 s
ExpandSSE2 time:            3.392 s
ExpandSSE2Unroll time:      3.318 s
ExpandAShellySSE4 time:     3.392 s
The executables were compiled on 64-bit Linux with gcc 4.8.1, using -m64 -O3 -march=core2 -msse4.1, -m32 -O3 -march=core2 -msse4.1 and -m32 -O3 -march=core2 -mno-sse respectively. @Apriori's SSE tests were omitted for the 32-bit builds (they crashed on 32-bit with SSE enabled, and obviously won't work with SSE disabled). Among the adjustments made was to use actual image data instead of random values (photos of objects with transparent backgrounds), which greatly improved the performance of the large lookup table but made little difference for the others. Essentially, the lookup tables win by a landslide when SSE is unavailable (or unused)... and the manually coded SSE solutions win otherwise. However, it's also noteworthy that when the compiler could use SSE for optimizations, most of the bit manipulation solutions were almost as fast as the manually coded SSE -- still slower, but only marginally.
Here's another attempt, using eight operations:
b = (((c & 0x0F0F) * 0x0101) & 0x00F000F) +
    (((c & 0xF0F0) * 0x1010) & 0xF000F00);
b += b * 0x10;
printf("%x\n", b); // Shows '0x11223344'
*Note: this post originally contained quite different code, based on Interleave bits by Binary Magic Numbers from Sean Anderson's bithacks page, but that wasn't quite what the OP was asking, so it has been removed. The majority of the comments below refer to that missing version.
I wanted to add this link into the answer pool because I think it is extremely important, when talking about optimization, to remember the hardware we are running on and the technologies compiling our code for said platform. The blog post Playing with the CPU pipeline is about optimizing a piece of code for CPU pipelining. It actually shows an example where the author tries to simplify the math down to the fewest actual mathematical operations, yet the result was FAR from the most optimal solution in terms of time. I have seen a couple of answers here speaking to that effect, and they may be correct, or they may not. The only way to know is to actually measure the time from start to finish of your particular snippet of code, in comparison to others. Read this blog; it is EXTREMELY interesting. I think I should mention that in this particular case I am not going to put ANY code up here unless I have truly tried multiple attempts and actually gotten one that is particularly faster through those tries.
I think that the lookup table approach suggested by Dimitri is a good choice, but I suggest going one step further and generating the table at compile time; doing the work at compile time will obviously lessen the execution time.

First, we create a compile-time value, using any of the suggested methods:

constexpr unsigned int transform1(unsigned int x)
{
    return ((x << 8) | x);
}

constexpr unsigned int transform2(unsigned int x)
{
    return (((x & 0x00f000f0) << 4) | (x & 0x000f000f));
}

constexpr unsigned int transform3(unsigned int x)
{
    return ((x << 4) | x);
}

constexpr unsigned int transform(unsigned int x)
{
    return transform3(transform2(transform1(x)));
}

// Dimitri version, using constexprs
template <unsigned int argb> struct aarrggbb_dimitri
{
    static const unsigned int value = transform(argb);
};

// Adam Liss version
template <unsigned int argb> struct aarrggbb_adamLiss
{
    static const unsigned int value =
        (argb & 0xf000) * 0x11000 +
        (argb & 0x0f00) * 0x01100 +
        (argb & 0x00f0) * 0x00110 +
        (argb & 0x000f) * 0x00011;
};

And then we create the compile-time lookup table with whatever method we have available. I would like to use the C++14 integer sequence, but I don't know which compiler the OP will be using, so another possible approach is a pretty ugly macro:

#define EXPAND16(x) \
    aarrggbb<x + 0>::value, \
    aarrggbb<x + 1>::value, \
    aarrggbb<x + 2>::value, \
    aarrggbb<x + 3>::value, \
    aarrggbb<x + 4>::value, \
    aarrggbb<x + 5>::value, \
    aarrggbb<x + 6>::value, \
    ... and so on

#define EXPAND \
    EXPAND16(0), \
    EXPAND16(0x10), \
    EXPAND16(0x20), \
    EXPAND16(0x30), \
    EXPAND16(0x40), \
    ... and so on

... and so on

See demo here.

PS: The Adam Liss approach could be used without C++11.
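For compilers that do have C++14 support, the integer-sequence route mentioned above avoids the macro entirely. A minimal sketch, assuming the same transform as above (the names make_table and table are mine; note that a full 65536-entry constexpr table may strain older compilers):

#include <array>
#include <cstdio>
#include <utility>

constexpr unsigned int transform(unsigned int x)
{
    x = (x << 8) | x;
    x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f);
    return (x << 4) | x;
}

template <std::size_t... Is>
constexpr std::array<unsigned int, sizeof...(Is)>
make_table(std::index_sequence<Is...>)
{
    return {{ transform(static_cast<unsigned int>(Is))... }};
}

// 65536-entry table built entirely at compile time.
constexpr auto table = make_table(std::make_index_sequence<65536>{});

int main()
{
    std::printf("0x1234 -> 0x%08x\n", table[0x1234]);
}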
If multiplication is cheap and 64-bit arithmetic is available, you could use this code:

uint64_t x = 0x1234;
x *= 0x0001000100010001ull;
x &= 0xF0000F0000F0000Full;
x *= 0x0000001001001001ull;
x &= 0xF0F0F0F000000000ull;
x = (x >> 36) * 0x11;
std::cout << std::hex << x << '\n';

In fact, it uses the same idea as the original attempt by AShelly.
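The masks are easier to follow with the intermediate values written out; here is a commented sketch (my annotations, not the original author's) tracing x = 0x1234:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t x = 0x1234;
    x *= 0x0001000100010001ull;  /* 0x1234123412341234: four copies of the input     */
    x &= 0xF0000F0000F0000Full;  /* 0x1000020000300004: one distinct nibble per copy */
    x *= 0x0000001001001001ull;  /* gathers the kept nibbles toward the high bits    */
    x &= 0xF0F0F0F000000000ull;  /* 0x1020304000000000: 0x1020304 packed at the top  */
    x = (x >> 36) * 0x11;        /* 0x01020304 * 0x11 = 0x11223344                   */
    printf("0x%llx\n", (unsigned long long)x);
    return 0;
}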
This works and may be easier to understand, but bit manipulations are so cheap that I wouldn't worry much about efficiency.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned int c = 0x1234, b;
    b = (c & 0xf000) * 0x11000 +
        (c & 0x0f00) * 0x01100 +
        (c & 0x00f0) * 0x00110 +
        (c & 0x000f) * 0x00011;
    printf("%x -> %x\n", c, b);
    return 0;
}
Assuming that you always want to convert 0xWXYZ to 0xWWXXYYZZ, I believe the solution below would be a little faster than the one you suggested:

unsigned int c = 0x1234;
unsigned int b = (c & 0xf) | ((c & 0xf0) << 4) |
                 ((c & 0xf00) << 8) | ((c & 0xf000) << 12);
b |= (b << 4);

Notice that one & (and) operation is saved compared to your solution. :-) Demo.
Another way is:

DWORD OrVal(DWORD & nible_pos, DWORD input_val, DWORD temp_val, int shift)
{
    if (nible_pos == 0)
        nible_pos = 0x0000000F;
    else
        nible_pos = nible_pos << 4;
    DWORD nible = input_val & nible_pos;
    temp_val |= (nible << shift);
    temp_val |= (nible << (shift + 4));
    return temp_val;
}

DWORD Converter2(DWORD input_val)
{
    DWORD nible_pos = 0x00000000;
    DWORD temp_val = 0x00000000;
    temp_val = OrVal(nible_pos, input_val, temp_val, 0);
    temp_val = OrVal(nible_pos, input_val, temp_val, 4);
    temp_val = OrVal(nible_pos, input_val, temp_val, 8);
    temp_val = OrVal(nible_pos, input_val, temp_val, 12);
    return temp_val;
}

DWORD val2 = Converter2(0x1234);

An optimized version (about 3 times faster):

DWORD Converter3(DWORD input_val)
{
    DWORD nible_pos = 0;
    DWORD temp_val = 0;
    int shift = 0;
    DWORD bit_nible[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 }; /* note: declared but not used below */
    for ( ; shift < 16; shift += 4)
    {
        if (nible_pos == 0)
            nible_pos = 0x0000000F;
        else
            nible_pos = nible_pos << 4;
        DWORD nible = input_val & nible_pos;
        temp_val |= (nible << shift);
        temp_val |= (nible << (shift + 4));
    }
    return temp_val;
}
Perhaps this could be simpler and more efficient.

unsigned int g = 0x1234;
unsigned int ans = 0;
ans = ((g & 0xf000) << 16) + ((g & 0xf00) << 12) +
      ((g & 0xf0) << 8) + ((g & 0xf) << 4);
ans = (ans | ans >> 4);
printf("%x -> %x\n", g, ans);
unsigned long transform(unsigned long n)
{
    n = ((n & 0xff00) << 8) | (n & 0x00ff);   /* n: 00AR
                                                     00GB */

    n <<= 4;                                  /* n: 0AR0
                                                     0GB0 */

    n |= (n & 0x0f000f00L) << 4;              /* n: AAR0
                                                     GGB0 */

    n |= (n & 0x00f000f0L) >> 4;              /* n: AARR
                                                     GGBB */
    return n;
}

The alpha and red components are shifted into the higher 2 bytes where they belong, and the result is then shifted left by 4 bits, resulting in every component being exactly where it needs to be. With the value in the form 0AR0 0GB0, a bit mask and left-shift combination is OR'ed with the current value. This copies the A and G components to the position just left of them. The same thing is done for the R and B components, except in the opposite direction.
If you are going to do this for OpenGL, I suggest using a glTexImageXD function with the type parameter set to GL_UNSIGNED_SHORT_4_4_4_4. Your OpenGL driver should do the rest. And for the transparency inversion you can always manipulate blending via the glBlendFunc and glBlendEquation functions.
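For illustration, the upload might look roughly like this (a sketch only; it assumes a current GL context, a bound 2D texture, and that pixels points to width*height packed 4444 texels):

#include <GL/gl.h>

/* Let the driver expand 16-bit RGBA4444 data to whatever internal format it prefers. */
static void upload_rgba4444(GLsizei width, GLsizei height, const void *pixels)
{
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0,
                 GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4, pixels);
}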
While others operate on hard-core optimization...

Take this as your best bet:

#include <string>

std::string toAARRGGBB(const std::string &argb)
{
    std::string ret("0x");
    int start = 2; // "0x####"
                   //   ^^ skipped
    for (int i = start; i < argb.length(); ++i)
    {
        ret += argb[i];
        ret += argb[i];
    }
    return ret;
}

int main()
{
    std::string argb = toAARRGGBB("0xACED"); //!!!
}

Haha
How can I pad my md5 message with c/c++
I'm working on a program in c++ to do md5 checksums. I'm doing this mainly because I think I'll learn a lot of different things about c++, checksums, OOP, and whatever else I run into. I'm having trouble the check sums and I think the problem is in the function padbuff which does the message padding. #include "HashMD5.h" int leftrotate(int x, int y); void padbuff(uchar * buffer); //HashMD5 constructor HashMD5::HashMD5() { Type = "md5"; Hash = ""; } HashMD5::HashMD5(const char * hashfile) { Type = "md5"; std::ifstream filestr; filestr.open(hashfile, std::fstream::in | std::fstream::binary); if(filestr.fail()) { std::cerr << "File " << hashfile << " was not opened.\n"; std::cerr << "Open failed with error "; } } std::string HashMD5::GetType() { return this->Type; } std::string HashMD5::GetHash() { return this->Hash; } bool HashMD5::is_open() { return !((this->filestr).fail()); } void HashMD5::CalcHash(unsigned int * hash) { unsigned int *r, *k; int r2[4] = {0, 4, 9, 15}; int r3[4] = {0, 7, 12, 19}; int r4[4] = {0, 4, 9, 15}; uchar * buffer; int bufLength = (2<<20)*8; int f,g,a,b,c,d, temp; int *head; uint32_t maxint = 1<<31; //Initialized states unsigned int h[4]{ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; r = new unsigned int[64]; k = new unsigned int[64]; buffer = new uchar[bufLength]; if(r==NULL || k==NULL || buffer==NULL) { std::cerr << "One of the dyn alloc failed\n"; } // r specifies the per-round shift amounts for(int i = 0; i<16; i++) r[i] = 7 + (5 * ((i)%4) ); for(int i = 16; i < 32; i++) r[i] = 5 + r2[i%4]; for(int i = 32; i< 48; i++) r[i] = 4 + r3[i%4]; for(int i = 48; i < 63; i++) r[i] = 6 + r4[i%4]; for(int i = 0; i < 63; i++) { k[i] = floor( fabs( sin(i + 1)) * maxint); } while(!(this->filestr).eof()) { //Read in 512 bits (this->filestr).read((char *)buffer, bufLength-512); padbuff(buffer); //The 512 bits are now 16 32-bit ints head = (int *)buffer; for(int i = 0; i < 64; i++) { if(i >=0 && i <=15) { f = (b & c) | (~b & d); g = i; } else if(i >= 16 && i <=31) { f = (d & b) | (~d & b); g = (5*i +1) % 16; } else if(i >=32 && i<=47) { f = b ^ c ^ d; g = (3*i + 5 ) % 16; } else { f = c ^ (b | ~d); g = (7*i) % 16; } temp = d; d = c; c = b; b = b + leftrotate((a + f + k[i] + head[g]), r[i]); a = temp; } h[0] +=a; h[1] +=b; h[2] +=c; h[3] +=d; } delete[] r; delete[] k; hash = h; } int leftrotate(int x, int y) { return(x<<y) | (x >> (32 -y)); } void padbuff(uchar* buffer) { int lack; int length = strlen((char *)buffer); uint64_t mes_size = length % UINT64_MAX; if((lack = (112 - (length % 128) ))>0) { *(buffer + length) = ('\0'+1 ) << 3; memset((buffer + length + 1),0x0,lack); memcpy((void*)(buffer+112),(void *)&mes_size, 64); } } In my test program I run this on the an empty message. Thus length in padbuff is 0. Then when I do *(buffer + length) = ('\0'+1 ) << 3;, I'm trying to pad the message with a 1. In the Netbeans debugger I cast buffer as a uint64_t and it says buffer=8. I was trying to put a 1 bit in the most significant spot of buffer so my cast should have been UINT64_MAX. Its not, so I'm confused about how my padding code works. Can someone tell me what I'm doing and what I'm supposed to do in padbuff? Thanks, and I apologize for the long freaking question. Just to be clear about what the padding is supposed to be doing, here is the padding excerpt from Wikipedia: The message is padded so that its length is divisible by 512. The padding works as follows: first a single bit, 1, is appended to the end of the message. 
This is followed by as many zeros as are required to bring the length of the message up to 64 bits fewer than a multiple of 512. The remaining bits are filled up with 64 bits representing the length of the original message, modulo 2^64. I'm mainly looking for help with padbuff, but since I'm trying to learn, all comments are appreciated.
The first question is what you actually did: length % UINT64_MAX doesn't make sense at all, because length is in bytes and UINT64_MAX is the largest value you can store in a uint64_t. You thought that putting a 1 bit in the most significant position would give the maximum value; in fact, you need to set 1 in all bits to get it. You shift 1 by 3, which is only half the length of a byte. The byte pointed to by buffer is the least significant one on a little-endian machine (I assume you have little endian, since the debugger showed 8).

The second question is how it should work. I don't know exactly what padbuff should do, but if you want to pad and get UINT64_MAX, you need something like this:

int length = strlen((char *)buffer);
int len_of_padding = sizeof(uint64_t) - length % sizeof(uint64_t);
if (len_of_padding > 0)
{
    memset((void*)(buffer + length), 0xFF, len_of_padding);
}

You worked with the length of two uint64 values. Maybe you wanted to zero the next one:

uint64_t *after = (uint64_t*)(buffer + length + len_of_padding);
*after = 0;
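As for what MD5 padding itself is supposed to do (append a single 0x80 byte, zero-fill until the length is 56 mod 64, then append the original length in bits as a little-endian 64-bit integer), a rough sketch might look like this -- my own illustration, not a drop-in replacement for padbuff, and it assumes the buffer has room for the padding:

#include <stdint.h>
#include <string.h>

/* Pads 'buffer' in place and returns the padded length in bytes.
   Assumes 'buffer' can hold msg_len plus up to 72 extra bytes. */
size_t md5_pad(unsigned char *buffer, size_t msg_len)
{
    uint64_t bit_len = (uint64_t)msg_len * 8;
    size_t i = msg_len;

    buffer[i++] = 0x80;              /* a single 1 bit followed by seven 0 bits */
    while (i % 64 != 56)             /* zero-fill up to 56 mod 64               */
        buffer[i++] = 0x00;

    memcpy(buffer + i, &bit_len, 8); /* length in bits; MD5 wants little-endian,
                                        so this memcpy is only correct on a
                                        little-endian host                      */
    return i + 8;
}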
A memory-efficient SHA1 implementation
I'm working with a very restrictive embedded processor, which only has 128 bytes of ram. I'd like to implement SHA1 on it. RFC3174 describes, in 'method 2', a way of implementing SHA1 that doesn't require allocating an array of 80 32-bit words (which, at 320 bytes, is obviously not practical), and seems like it ought to be usable on my processor. I'm unable to find any implementations of 'method 2', though, and the sample code in the RFC only implements the default method. Is anyone aware of a memory-efficient implementation of SHA1 in C or C++?
You should be able to quickly adapt the method 1 source to method 2. The function to change is Sha1ProcessMessageBlock() in method 1. Initialize w[0:15] from message, then do a loop of 0 to 79, where you only do w[] manipulation after iteration 16, and temp calculation depends on ts value (0-19 uses one, 20-39 uses another, etc). The important thing to remember is using index%16 or index & 0x0f whenever you are addressing the w[] array. A quick modification would be something like this (double check all accesses to w to make sure I haven't missed the t & 0x0f): void SHA1ProcessMessageBlock(SHA1Context *context) { const uint32_t K[] = { /* Constants defined in SHA-1 */ 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 }; int t; /* Loop counter */ uint32_t temp; /* Temporary word value */ uint32_t W[16]; /* Word sequence */ uint32_t A, B, C, D, E; /* Word buffers */ /* * Initialize the first 16 words in the array W. You can move this to your * context. */ for(t = 0; t < 16; t++) { W[t] = context->Message_Block[t * 4] << 24; W[t] |= context->Message_Block[t * 4 + 1] << 16; W[t] |= context->Message_Block[t * 4 + 2] << 8; W[t] |= context->Message_Block[t * 4 + 3]; } A = context->Intermediate_Hash[0]; B = context->Intermediate_Hash[1]; C = context->Intermediate_Hash[2]; D = context->Intermediate_Hash[3]; E = context->Intermediate_Hash[4]; for(t = 0; t < 80; t++) { if (t >= 16) { W[t&0xf] = SHA1CircularShift(1,W[(t-3)&0xf] ^ W[(t-8)&0xf] ^ W[(t-14)&0xf] ^ W[t&0xf]); } if (t<20) { temp = SHA1CircularShift(5,A) + ((B & C) | ((~B) & D)) + E + W[t&0xf] + K[0]; } else if (t<40) { temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[1]; } else if (t < 60) { temp = SHA1CircularShift(5,A) + ((B & C) | (B & D) | (C & D)) + E + W[t&0xf] + K[2]; } else { temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t&0xf] + K[3]; } E = D; D = C; C = SHA1CircularShift(30,B); B = A; A = temp; } context->Intermediate_Hash[0] += A; context->Intermediate_Hash[1] += B; context->Intermediate_Hash[2] += C; context->Intermediate_Hash[3] += D; context->Intermediate_Hash[4] += E; context->Message_Block_Index = 0; } There are still savings to be made: get rid of W[] array on stack and put it in context pre-initialized with the data you get. Also, you need a lot of pre-processing before calling this function. For example, if all your messages are less than 55 bytes, you can put it in W array, add padding, and process immediately. If not, you'll have to call process twice: first with your partially padded input, and again with the rest of the pad, etc. That sort of thing would be very application specific, and I doubt you'll be able to find the code to do it for you. By the way, the code above is a straight adaptation from the type 1 source from your link. You can probably squeeze a bit more out of it if you try to optimize it further. I couldn't think of a way to get any savings on the intermediate hash, so you will need a total of 108 bytes for this (109 if counter is also in RAM), and 24 of which is local to this function, and can be reused in other places - so long as they are also temporary. So it is very hard to do what you want to do. EDIT: If all your messages are less than 55 bytes, you can save another 20 bytes in your context by getting rid of the intermediate_hash[] storage. Simply initialize A-E from the constants, and add the constants at the end. Finally, instead of storing them in a separate variable, overwrite your input when this function ends.
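As a rough illustration of the "pad a short message straight into W" idea mentioned above (my own sketch, not from the RFC; it only handles messages of at most 55 bytes, so everything fits in a single 512-bit block, and it spends a 64-byte temporary for clarity -- a real 128-byte-RAM port would fold the padding into W directly):

#include <stdint.h>
#include <string.h>

/* Packs a message of at most 55 bytes, plus SHA-1 padding, into the
   sixteen 32-bit words of W (big-endian word order), ready for one call
   to the block function above. */
static void pack_short_message(uint32_t W[16], const uint8_t *msg, size_t len)
{
    uint8_t block[64] = {0};

    memcpy(block, msg, len);
    block[len] = 0x80;                        /* the single appended 1 bit        */
    uint64_t bit_len = (uint64_t)len * 8;
    for (int i = 0; i < 8; i++)               /* 64-bit length, big-endian, last 8 bytes */
        block[56 + i] = (uint8_t)(bit_len >> (56 - 8 * i));

    for (int t = 0; t < 16; t++)
        W[t] = ((uint32_t)block[t*4]   << 24) | ((uint32_t)block[t*4+1] << 16) |
               ((uint32_t)block[t*4+2] << 8)  |  (uint32_t)block[t*4+3];
}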
I have implemented SHA-1 for several memory-constrained environments. You can get by with

DWORD W[16];        // instead of the usual W[80]
DWORD H[5];         // intermediate hash value
DWORD BitCount[2];  // probably a single DWORD is enough here

plus a few bytes of housekeeping. W is updated on the fly, as a circular buffer, instead of being generated in full at the start of each block.
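In other words, the whole working state fits in something like the following (a sketch; the field names are mine, and the byte counts assume 32-bit words):

#include <stdint.h>

typedef struct {
    uint32_t W[16];        /* circular message schedule: 64 bytes */
    uint32_t H[5];         /* intermediate hash:         20 bytes */
    uint32_t BitCount[2];  /* message length in bits:     8 bytes */
    uint8_t  BlockIndex;   /* position within the block:  1 byte  */
} Sha1Ctx;                 /* ~93 bytes plus any padding/alignment */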
working example: #include<iostream> #include<stdio.h> #include<stdlib.h> #include<string> using namespace std; unsigned CircularShift(int bits, unsigned word) { return ((word << bits) & 0xFFFFFFFF) | ((word & 0xFFFFFFFF) >> (32-bits)); } int main(void) { string mess; cin >> mess; unsigned int lm = mess.length(); unsigned int lmb = lm*8; unsigned char *messc; messc=(unsigned char*)malloc((sizeof(unsigned char))*64); for (unsigned short int i =0;i<64;i++) { messc[i]=char(0x00); } for(int i=0;i<mess.length();i++) { messc[i]=mess[i]; } messc[lm]=(unsigned char)128; messc[56] = (lmb >> 24) & 0xFF; messc[57] = (lmb >> 16) & 0xFF; messc[58] = (lmb >> 8) & 0xFF; // messc[59] = (lmb) & 0xFF; messc[60] = (lmb >> 24) & 0xFF; messc[61] = (lmb >> 16) & 0xFF; messc[62] = (lmb >> 8) & 0xFF; messc[63] = (lmb) & 0xFF; for(int i =0 ;i<64;i++) { cout<< hex << (int)messc[i] << " "; } unsigned *H; H=(unsigned*)malloc(5*sizeof(unsigned)); H[0] = 0x67452301; H[1] = 0xEFCDAB89; H[2] = 0x98BADCFE; H[3] = 0x10325476; H[4] = 0xC3D2E1F0; const unsigned K[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6}; int t; unsigned temp; unsigned *W; unsigned A, B, C, D, E; W=(unsigned*)malloc(80*sizeof(unsigned)); unsigned char *messh; messh=(unsigned char*)malloc(64*sizeof(unsigned char)); int k; for(t = 0; t < 16; t++) { W[t] = ((unsigned) messc[t * 4])<< 24; ; W[t] |= ((unsigned) messc[t * 4 + 1])<< 16; W[t] |= ((unsigned) messc[t * 4 + 2]) << 8; W[t] |= ((unsigned) messc[t * 4 + 3]); } for(t = 16; t < 80; t++) { W[t] = CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); } A = H[0]; B = H[1]; C = H[2]; D = H[3]; E = H[4]; for(t = 0; t < 20; t++) { temp = CircularShift(5,A) + ((B & C) | ((~B) & D)) + E + W[t] + K[0]; temp &= 0xFFFFFFFF; E = D; D = C; C = CircularShift(30,B); B = A; A = temp; } for(t = 20; t < 40; t++) { temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; temp &= 0xFFFFFFFF; E = D; D = C; C = CircularShift(30,B); B = A; A = temp; } for(t = 40; t < 60; t++) { temp = CircularShift(5,A) + ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; temp &= 0xFFFFFFFF; E = D; D = C; C = CircularShift(30,B); B = A; A = temp; } for(t = 60; t < 80; t++) { temp = CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; temp &= 0xFFFFFFFF; E = D; D = C; C = CircularShift(30,B); B = A; A = temp; } H[0] = (H[0] + A) & 0xFFFFFFFF; H[1] = (H[1] + B) & 0xFFFFFFFF; H[2] = (H[2] + C) & 0xFFFFFFFF; H[3] = (H[3] + D) & 0xFFFFFFFF; H[4] = (H[4] + E) & 0xFFFFFFFF; cout <<"\nTHIS IS SHHHHHAAAAAAAAAAA\n"; for(int i=0;i<5;i++) { cout << hex << H[i] << " "; } //Message_Block_Index = 0; }
All things considered, looking at your requirements, I think you are going to have to change your specs. Either a bigger chip, or a simpler algorithm. Even implementing SHA-1 (without HMAC) would be a challenge, but it should be doable.