Use load/store correctly

Use load/store correctly - c++

How to use load/store to do aligned int16_t byte swapping correctly?
// Swaps the two bytes of every 16-bit element of src and stores the result in
// dest, processing eight elements (one 128-bit vector) per iteration.
// NOTE(review): the loop stops only when dest == end exactly, so count has to be
// a multiple of 8, and the aligned load/store intrinsics used here require src
// and dest to be 16-byte aligned.
void byte_swapping(uint16_t* dest, const uint16_t* src,
size_t count) {
__m128i _s, _d;
for (uint16_t const * end(dest + count); dest != end; dest += 8, src += 8)
{
// Aligned 128-bit load of eight 16-bit values.
_s = _mm_load_si128((__m128i*)src);
// (x << 8) | (x >> 8) in every 16-bit lane exchanges its two bytes.
_d = _mm_or_si128(_mm_slli_epi16(_s, 8), _mm_srli_epi16(_s, 8));
// Aligned 128-bit store of the swapped values.
_mm_store_si128((__m128i*) dest, _d);
}
}

Your code will fail when count is not a multiple of 8, or when either src or dest is not 16 byte aligned.
Here is a fixed (and tested) version of your code:
// Stores the byte-swapped value of each of the `count` 16-bit words of `src`
// into `dest`. Whole groups of eight words go through SSE2 using unaligned
// loads/stores; whatever is left over is swapped one word at a time.
void byte_swapping(uint16_t* dest, const uint16_t* src, size_t count)
{
    size_t done = 0;

    // Vector part: one 128-bit register holds eight 16-bit words.
    while (done + 8 <= count)
    {
        const __m128i words = _mm_loadu_si128((__m128i*)(src + done));
        const __m128i low_to_high = _mm_slli_epi16(words, 8);
        const __m128i high_to_low = _mm_srli_epi16(words, 8);
        _mm_storeu_si128((__m128i*)(dest + done), _mm_or_si128(low_to_high, high_to_low));
        done += 8;
    }

    // Scalar part: fewer than eight words remain.
    while (done < count)
    {
        const uint16_t word = src[done];
        dest[done] = (word >> 8) | (word << 8);
        ++done;
    }
}

Related

Fast pyrDown image with AVX instructions

I have two pyrDown implementations, one using the SSE2 instruction set and one using AVX. They behave differently: the AVX implementation produces a wrong image, and it is also slower than the SSE2 implementation, which is strange. What is wrong with the AVX implementation, and how can I make it faster?
// SSE2 implementation
// Returns the per-byte rounded average of the 16 bytes at `src` and the 16 bytes
// at `src + srcStep`. Both addresses are read with aligned loads.
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    const __m128i* upperRow = (const __m128i *)src;
    const __m128i* lowerRow = (const __m128i *)(src + srcStep);
    return _mm_avg_epu8(_mm_load_si128(upperRow), _mm_load_si128(lowerRow));
}
// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
// Halves one pair of image rows horizontally and vertically (SSSE3).
//   src     - first of the two source rows; the second row starts at src + srcStep
//   dst     - receives size/2 output bytes, written in groups of 16
//   srcStep - distance in bytes between the two source rows
//   size    - number of source bytes per row
// Every output byte is the vertical rounded average (from average2RowsSingle) of
// two neighbouring columns, summed and shifted right by one.
// Only complete groups of 16 output bytes are produced; a remainder of
// (size/2) % 16 bytes is left untouched.
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    const size_t dstsize = size/2;
    // `i + 16 <= dstsize` is the same bound as `i < dstsize - 15` for
    // dstsize >= 15, but does not wrap around in size_t for smaller rows.
    for (size_t i = 0; i + 16 <= dstsize; i += 16)
    {
        const size_t ii = i*2;
        // based on https://stackoverflow.com/a/45564565/820795
        __m128i left = average2RowsSingle(src+ii, srcStep);
        __m128i right = average2RowsSingle(src+ii+16, srcStep);
        __m128i w0 = _mm_maddubs_epi16(left, vk1); // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1); // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1); // pack
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
auto v0 = _mm256_load_si256((const __m256i*)src);
auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
return _mm256_avg_epu8(v0, v1);
}
// AVX2 attempt from the question: averages two source rows (through
// average2RowsSingle) and then neighbouring columns, writing 32 output bytes
// per iteration. This is the version reported to produce a wrong image.
void average2Rows(const uint8_t* __restrict__ src,
uint8_t*__restrict__ dst,
size_t srcStep,
size_t size) {
// NOTE(review): vk1 is never used in this function.
const __m128i vk1 = _mm_set1_epi8(1);
size_t dstsize = size/2;
const signed char o = -1; // make shuffle zero
// Shuffle masks: vec_r_i16 keeps the even source bytes and vec_l_i16 moves the
// odd source bytes into the low byte of each 16-bit lane; the -1 entries zero
// the high byte, so both results are zero-extended 16-bit values.
const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
// NOTE(review): dstsize - 31 is evaluated in size_t and wraps around when
// dstsize < 31.
for (size_t i = 0; i < dstsize - 31; i += 32)
{
const size_t ii = i * 2;
auto left = average2RowsSingle(src + ii, srcStep);
auto right = average2RowsSingle(src + ii + 32, srcStep);
// Sum each even/odd byte pair as 16-bit values and halve the sum.
auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
w0 = _mm256_shuffle_epi8(right, vec_r_i16);
w1 = _mm256_shuffle_epi8(right, vec_l_i16);
right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
// NOTE(review): _mm256_packus_epi16 packs within each 128-bit lane, so the
// 8-byte groups presumably come out as left.lo, right.lo, left.hi, right.hi
// instead of in linear order - likely the cause of the wrong image; confirm
// against the Intel Intrinsics Guide.
left = _mm256_packus_epi16(left, right);
_mm256_storeu_si256((__m256i *) &dst[i], left);
}
}
Wrong result after AVX implementation:

With the help of @chtz I came up with this code:
inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
auto v0 = _mm256_loadu_si256((const __m256i *)src);
auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
return _mm256_avg_epu8(v0, v1);
}
// Halves one pair of image rows horizontally and vertically (AVX2).
//   src     - first of the two source rows; the second row starts at src + srcStep
//   dst     - receives size/2 output bytes, written in groups of 32
//   srcStep - distance in bytes between the two source rows
//   size    - number of source bytes per row
// Only complete groups of 32 output bytes are produced; a remainder of
// (size/2) % 32 bytes is left untouched.
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size) {
    const auto vk1 = _mm256_set1_epi8(1);
    const size_t dstSize = size/2;
    // `i + 32 <= dstSize` is the same bound as `i < dstSize - 31` for
    // dstSize >= 31, but does not wrap around in size_t for smaller rows.
    for (size_t i = 0; i + 32 <= dstSize; i += 32)
    {
        const size_t ii = i * 2;
        // based on https://stackoverflow.com/a/45564565/820795
        auto left = average2RowsSingle(src + ii, srcStep);
        auto right = average2RowsSingle(src + ii + 32, srcStep);
        auto w0 = _mm256_maddubs_epi16(left, vk1); // unpack and horizontal add
        auto w1 = _mm256_maddubs_epi16(right, vk1);
        w0 = _mm256_srli_epi16(w0, 1); // divide by 2
        w1 = _mm256_srli_epi16(w1, 1);
        w0 = _mm256_packus_epi16(w0, w1); // pack (works per 128-bit lane)
        w0 = _mm256_permute4x64_epi64(w0, 0xd8); // shuffle to get correct order
        _mm256_storeu_si256((__m256i *)&dst[i], w0);
    }
}
Result image:

Converting a RGBA image to RGB image

I am trying to convert an RGBA image to an RGB image (8-bit unsigned integer per channel). At first I used OpenCV and the following function for that:
// Point the existing matrix at the raw source buffer.
m_bufferMat.data = (uchar*) (ptr1);
// NOTE(review): cv::Mat::convertTo presumably changes only the element depth,
// not the number of channels; dropping the alpha channel is normally done with
// cv::cvtColor - confirm against the OpenCV documentation.
m_bufferMat.convertTo(m_bufferMat, CV_8UC3);
But the other parts of the application do not need OpenCV, so I tried to convert the image myself in order to avoid linking and including the OpenCV library. The fastest method I could think of is to iterate through the buffer and copy just the first 3 bytes of every pixel to another buffer, like this:
// For every pixel, copy the first three source bytes to the destination and
// skip the fourth source byte.
for(int i = 0; i < width * height; i++) {
*(ptr2++) = *(ptr1++);
*(ptr2++) = *(ptr1++);
*(ptr2++) = *(ptr1++);
// Step over the fourth byte of the source pixel without copying it.
ptr1++;
}
But this requires copying, which may not be very fast. The OpenCV function is 1.5 times faster than my own function. Does anybody have an idea why? Can I implement a function that does not need to copy?

There are many optimizations that could be done. Here is a test bench program to try them and a few example optimizations:
#include <iostream>
#include <string>
#include <vector>
#include <intrin.h>
#include <functional>
// Image dimensions used by every variant below.
// NOTE(review): presumably volatile so the compiler cannot fold them into
// compile-time constants inside the benchmarked loops - confirm intent.
volatile int width = 1920;
volatile int height = 1080;
// Source buffer with 4 bytes per pixel.
unsigned char* src = new unsigned char[width * height * 4];
// Destination buffer with 3 bytes per pixel, rewritten by each variant.
unsigned char* dst = new unsigned char[width * height * 3];
// Copy of the baseline output; each variant's result is compared against it.
unsigned char* refDst = new unsigned char[width * height * 3];
// Baseline conversion: for each pixel, copies the first three bytes from src to
// dst and skips the fourth. The loop bound re-evaluates width * height on every
// iteration.
void DefaultFunc() {
    const unsigned char* in = src;
    unsigned char* out = dst;
    for (int px = 0; px < width * height; ++px) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        out += 3;
        in += 4; // three copied bytes plus the skipped fourth
    }
}
// Same byte-by-byte conversion as the baseline, but the pixel count is computed
// once before the loop instead of in the loop condition.
void NPreCalculatedFunc() {
    const unsigned char* in = src;
    unsigned char* out = dst;
    const int pixelCount = width * height;
    for (int remaining = pixelCount; remaining > 0; --remaining) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        out += 3;
        in += 4; // three copied bytes plus the skipped fourth
    }
}
// Reads each source pixel as one 32-bit word and writes its three lowest bytes
// to dst one at a time.
// NOTE(review): matching the byte-wise baseline presumably relies on a
// little-endian target - confirm.
void ReadFullPixelFunc() {
    const unsigned int* pixels = (unsigned int*)src;
    unsigned char* out = dst;
    const int pixelCount = width * height;
    for (int px = 0; px < pixelCount; ++px, out += 3) {
        const unsigned int word = pixels[px];
        out[0] = word & 0xff;
        out[1] = (word >> 8) & 0xff;
        out[2] = (word >> 16) & 0xff;
    }
}
// Converts four source pixels (four 32-bit reads) into three 32-bit writes per
// iteration, then finishes any remaining 1-3 pixels byte by byte.
// NOTE(review): the shift/mask layout presumably assumes a little-endian
// target - confirm.
void ReadAndWriteFullPixelFunc() {
    unsigned int* ptr1 = (unsigned int*)src;
    unsigned int* ptr2 = (unsigned int*)dst;
    const int pixelCount = width * height;
    unsigned int writeBuf = 0;
    for (int i = pixelCount / 4; i; i--) {
        // by reading 4 pixels, we get to store 3 unsigned ints
        auto srcPix = *(ptr1++);
        writeBuf = srcPix & 0x00ffffff;     // pixel 0: 3 bytes
        srcPix = *(ptr1++);
        writeBuf |= srcPix << 24;           // pixel 1: first byte
        *(ptr2++) = writeBuf;
        writeBuf = (srcPix >> 8) & 0xffff;  // pixel 1: remaining 2 bytes
        srcPix = *(ptr1++);
        writeBuf |= (srcPix << 16);         // pixel 2: first 2 bytes
        *(ptr2++) = writeBuf;
        writeBuf = (srcPix >> 16) & 0xff;   // pixel 2: last byte
        srcPix = *(ptr1++);
        writeBuf |= (srcPix << 8);          // pixel 3: 3 bytes (4th shifted out)
        *(ptr2++) = writeBuf;
    }
    // Pixel counts that are not a multiple of 4 leave up to 3 pixels; copy them
    // with the plain byte-wise loop.
    const unsigned char* tailSrc = (const unsigned char*)ptr1;
    unsigned char* tailDst = (unsigned char*)ptr2;
    for (int i = pixelCount % 4; i; i--) {
        *(tailDst++) = *(tailSrc++);
        *(tailDst++) = *(tailSrc++);
        *(tailDst++) = *(tailSrc++);
        tailSrc++;
    }
}
// Converts four source pixels per iteration with a byte shuffle: one 16-byte
// load, one shuffle that drops every fourth byte, then an 8-byte and a 4-byte
// store. Any remaining 1-3 pixels are finished byte by byte.
void ReadAndWriteFullPixelXmmFunc() {
    unsigned int* ptr1 = (unsigned int*)src;
    unsigned int* ptr2 = (unsigned int*)dst;
    const int pixelCount = width * height;
    // Shuffle control: output bytes 0..11 take source bytes 0,1,2, 4,5,6,
    // 8,9,10, 12,13,14; the 0x80 entries zero output bytes 12..15.
    __m128i reorder = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    for (int i = pixelCount / 4; i; i--) {
        auto srcPix4_ro = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)ptr1), reorder); // read 4 source pixels, remove alpha bytes, pack to low 12 bytes of srcPix4
        ptr1 += 4;
        _mm_storel_epi64((__m128i*)ptr2, srcPix4_ro); // store the first 8 packed bytes
        ptr2 += 2;
        auto shifted = _mm_bsrli_si128(srcPix4_ro, 8);
        _mm_storeu_si32(ptr2, shifted); // store the remaining 4 packed bytes
        ptr2 += 1;
    }
    // Pixel counts that are not a multiple of 4 leave up to 3 pixels; copy them
    // with the plain byte-wise loop.
    const unsigned char* tailSrc = (const unsigned char*)ptr1;
    unsigned char* tailDst = (unsigned char*)ptr2;
    for (int i = pixelCount % 4; i; i--) {
        *(tailDst++) = *(tailSrc++);
        *(tailDst++) = *(tailSrc++);
        *(tailDst++) = *(tailSrc++);
        tailSrc++;
    }
}
// Clears dst, runs `f` 500 times and keeps the smallest __rdtsc() difference
// observed. Afterwards dst is compared with refDst and a failure line is printed
// on mismatch. Prints the best tick count together with its ratio to `refTime`
// (1.0 when refTime is 0) and returns the best tick count.
unsigned long long PrintShortestTime(std::function<void()> f, const char *label, unsigned long long refTime) {
    memset(dst, 0, width * height * 3);

    unsigned long long best = ~0ull;
    for (int run = 0; run < 500; ++run) {
        const auto before = __rdtsc();
        f();
        const auto after = __rdtsc();
        const auto elapsed = after - before;
        if (elapsed < best) {
            best = elapsed;
        }
    }

    const bool differs = memcmp(refDst, dst, width * height * 3) != 0; // test that we got the right answer
    if (differs) {
        printf("Fail - result does not equal refrence!\n");
    }

    double ratio = 1.0;
    if (refTime) {
        ratio = (double)best/(double)refTime;
    }
    printf("%s : %llu clock cycles - %0.3lf x base implementation time\n", label, best, ratio);
    return best;
}
// Benchmark driver: fills src with pseudo-random bytes, records the baseline
// output in refDst, then times every variant against the baseline.
int main() {
for (int i = 0; i < width * height * 4; i++) {
src[i] = rand() & 0xff;
}
// Produce the reference result once before any timing starts.
DefaultFunc();
memcpy(refDst, dst, width * height * 3);
// refTime of 0 makes the baseline print a ratio of 1.0.
auto refTime = PrintShortestTime(DefaultFunc, "default, unoptimized", 0);
PrintShortestTime(NPreCalculatedFunc, "n precalculated", refTime);
PrintShortestTime(ReadFullPixelFunc, "n precalculated, reading 1 pixel at a time", refTime);
PrintShortestTime(ReadAndWriteFullPixelFunc, "reading and writing ints at a time", refTime);
PrintShortestTime(ReadAndWriteFullPixelXmmFunc, "with xmm intrinsincs", refTime);
}
For me, on visual studio & x64 or x86, the last version takes about 0.4x as long time as the basic version:
default, unoptimized : 7511848 clock cycles - 1.000 x base implementation time
n precalculated : 7383696 clock cycles - 0.983 x base implementation time
n precalculated, reading 1 pixel at a time : 7354644 clock cycles - 0.979 x base implementation time
reading and writing ints at a time : 4613816 clock cycles - 0.614 x base implementation time
with xmm intrinsincs : 3036824 clock cycles - 0.404 x base implementation time
It would probably be possible to optimize further by unrolling the loop, writing memory in larger chunks.

Altivec: analogue of _mm_sad_epu8()

I am trying to port an SSE function that computes the sum of absolute differences of two arrays of 8-bit unsigned integers.
It looks like:
// Returns the sum of |a[k] - b[k]| over `size` bytes, 16 bytes per step.
// `size` is asserted to be a multiple of 16. Each _mm_sad_epu8 yields two
// partial sums, which are accumulated in the two 64-bit lanes of `acc` and
// added together at the end.
uint64_t AbsDiffSum(const uint8_t * a, const uint8_t * b, size_t size)
{
    assert(size%16 == 0);
    __m128i acc = _mm_setzero_si128();
    size_t offset = 0;
    while (offset < size)
    {
        const __m128i va = _mm_loadu_si128((__m128i*)(a + offset));
        const __m128i vb = _mm_loadu_si128((__m128i*)(b + offset));
        const __m128i partial = _mm_sad_epu8(va, vb);
        acc = _mm_add_epi64(acc, partial);
        offset += 16;
    }
    // Fold the upper 64-bit lane onto the lower one and extract it.
    const __m128i upperLane = _mm_srli_si128(acc, 8);
    const __m128i total = _mm_add_epi64(acc, upperLane);
    return _mm_cvtsi128_si64(total);
}
Main work is performed by intrinsic function _mm_sad_epu8().
Is there an analogue for Altivec?

Unfortunately, there is no direct analogue of intrinsic function _mm_sad_epu8 for Altivec.
But there is a possibility to emulate it:
typedef __vector uint8_t uint8x16_t;
typedef __vector uint32_t uint32x4_t;
// Vector of sixteen 1s: multiplying by it inside vec_msum turns the
// multiply-sum into a plain sum of each group of four bytes.
const uint8x16_t K8_01 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
// Altivec emulation of the SSE version: returns the sum of |a[k] - b[k]| over
// `size` bytes, 16 bytes per step.
// The loads are aligned loads, so `a` and `b` are expected to be 16-byte
// aligned. Partial sums are kept in four 32-bit lanes and only widened to
// 64 bits in the final addition.
uint64_t AbsDiffSum(const uint8_t * a, const uint8_t * b, size_t size)
{
    uint32x4_t _sum = {0, 0, 0, 0};
    for(size_t i = 0; i < size; i += 16)
    {
        // Aligned loading of 128-bit vector (vec_ld takes a byte offset first)
        uint8x16_t _a = vec_ld(0, a + i);
        // Aligned loading of 128-bit vector
        uint8x16_t _b = vec_ld(0, b + i);
        // Find absolute difference of two 8-bit unsigned: max - min never wraps
        uint8x16_t absDifference = vec_sub(vec_max(_a, _b), vec_min(_a, _b));
        // Sum result with using of vec_msum
        _sum = vec_msum(absDifference, K8_01, _sum);
    }
    // Widen before adding so the final total cannot overflow 32 bits.
    return (uint64_t)vec_extract(_sum, 0) + (uint64_t)vec_extract(_sum, 1) +
           (uint64_t)vec_extract(_sum, 2) + (uint64_t)vec_extract(_sum, 3);
}

Comparing two vector<bool> with SSE

I have two vector<bool> A and B.
I want to compare them and count the number of elements that are equal:
For example:
A = {0,1,0,1}
B = {0,0,1,1}
Result will be equal to 2.
I can use _mm_cmpeq_epi8, but it compares only 16 elements at a time (i.e. I would have to convert each 0 and 1 to a char and then do the comparison).
Is it possible to compare 128 elements each time with SSE (or SIMD instructions)?

If you can either assume that vector<bool> is using contiguous byte-sized elements for storage, or if you can consider using something like vector<uint8_t> instead, then this example should give you a good starting point:
// Counts the positions at which vec1 and vec2 hold the same byte.
// Sixteen bytes are compared per step; every match adds 1 to the corresponding
// 8-bit lane counter. A block is limited to 255 vectors so that no lane counter
// can overflow, after which the lanes are summed into `matches`. Bytes that do
// not fill a whole vector are compared one by one.
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size()); // vectors must be same size

    const size_t total = vec1.size();
    const size_t block_bytes = 255 * 16; // 255 vectors of 16 bytes
    const __m128i zero = _mm_setzero_si128();
    size_t matches = 0;
    size_t pos = 0;

    while (pos + 16 <= total) // one block per pass
    {
        const size_t block_end = std::min(total, pos + block_bytes);
        __m128i lane_counts = zero;
        while (pos + 16 <= block_end) // one vector per pass
        {
            const __m128i lhs = _mm_loadu_si128((__m128i *)&vec1[pos]);
            const __m128i rhs = _mm_loadu_si128((__m128i *)&vec2[pos]);
            // Equal bytes compare to 0xFF (-1); subtracting adds 1 per match.
            lane_counts = _mm_sub_epi8(lane_counts, _mm_cmpeq_epi8(lhs, rhs));
            pos += 16;
        }
        // Horizontal sum of the sixteen lane counters (two 8-byte halves).
        const __m128i sums = _mm_sad_epu8(lane_counts, zero);
        matches += _mm_extract_epi16(sums, 0) + _mm_extract_epi16(sums, 4);
    }

    while (pos < total) // remaining partial vector
    {
        matches += (vec1[pos] == vec2[pos]);
        ++pos;
    }
    return matches;
}
Note that this is using vector<uint8_t>. If you really have to use vector<bool> and can guarantee that the elements will always be contiguous and byte-sized then you'll just need to coerce the vector<bool> into a const uint8_t * or similar somehow.
Test harness:
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <emmintrin.h> // SSE2
using std::vector;
// Scalar reference: counts the positions at which vec1 and vec2 hold the same
// byte. Used to validate the SIMD implementation.
static size_t count_equal_ref(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size());

    size_t matches = 0;
    size_t pos = 0;
    while (pos < vec1.size())
    {
        if (vec1[pos] == vec2[pos])
        {
            ++matches;
        }
        ++pos;
    }
    return matches;
}
// Counts the positions at which vec1 and vec2 hold the same byte.
// Sixteen bytes are compared per step; every match adds 1 to the corresponding
// 8-bit lane counter. A block is limited to 255 vectors so that no lane counter
// can overflow, after which the lanes are summed into `matches`. Bytes that do
// not fill a whole vector are compared one by one.
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size()); // vectors must be same size

    const size_t total = vec1.size();
    const size_t block_bytes = 255 * 16; // 255 vectors of 16 bytes
    const __m128i zero = _mm_setzero_si128();
    size_t matches = 0;
    size_t pos = 0;

    while (pos + 16 <= total) // one block per pass
    {
        const size_t block_end = std::min(total, pos + block_bytes);
        __m128i lane_counts = zero;
        while (pos + 16 <= block_end) // one vector per pass
        {
            const __m128i lhs = _mm_loadu_si128((__m128i *)&vec1[pos]);
            const __m128i rhs = _mm_loadu_si128((__m128i *)&vec2[pos]);
            // Equal bytes compare to 0xFF (-1); subtracting adds 1 per match.
            lane_counts = _mm_sub_epi8(lane_counts, _mm_cmpeq_epi8(lhs, rhs));
            pos += 16;
        }
        // Horizontal sum of the sixteen lane counters (two 8-byte halves).
        const __m128i sums = _mm_sad_epu8(lane_counts, zero);
        matches += _mm_extract_epi16(sums, 0) + _mm_extract_epi16(sums, 4);
    }

    while (pos < total) // remaining partial vector
    {
        matches += (vec1[pos] == vec2[pos]);
        ++pos;
    }
    return matches;
}
// Test driver: builds two vectors of n random 0/1 bytes (n defaults to 100 and
// can be overridden by the first command-line argument), then checks that the
// SIMD count matches the scalar reference count.
int main(int argc, char * argv[])
{
size_t n = 100;
if (argc > 1)
{
// NOTE(review): atoi's int result is assigned to a size_t, so a negative
// argument would become a very large element count.
n = atoi(argv[1]);
}
vector<uint8_t> vec1(n);
vector<uint8_t> vec2(n);
// Seed from the clock so every run uses different data.
srand((unsigned int)time(NULL));
for (size_t i = 0; i < n; ++i)
{
vec1[i] = rand() & 1;
vec2[i] = rand() & 1;
}
size_t n_ref = count_equal_ref(vec1, vec2);
size_t n_test = count_equal(vec1, vec2);
if (n_ref == n_test)
{
std::cout << "PASS" << std::endl;
}
else
{
std::cout << "FAIL: n_ref = " << n_ref << ", n_test = " << n_test << std::endl;
}
return 0;
}
Compile and run:
$ g++ -Wall -msse3 -O3 test.cpp && ./a.out
PASS

std::vector<bool> is a specialization of std::vector for the type bool. Although not specified by the C++ standard, in most implementations std::vector<bool> is made space efficient such that each of its element is a single bit instead of a bool.
The behaviour of std::vector<bool> is similar to its primarily template counterpart, except that:
std::vector<bool> does not necessarily store its element contiguously .
In order to expose its elements (i.e., the individual bits) std::vector<bool> uses a proxy class (i.e., std::vector<bool>::reference). Objects of class std::vector<bool>::reference are returned by std::vector<bool> subscript operator (i.e., operator[]) by value.
Accordingly, I don't think it's portable to use _mm_cmpeq_epi8 like functions since storage of a std::vector<bool> is implementation defined (i.e., not guaranteed contiguous).
An alternative but portable way is to use regular STL facilities like the example below:
// Two equally sized inputs to compare element by element.
std::vector<bool> A = {0,1,0,1};
std::vector<bool> B = {0,0,1,1};
// C[i] becomes true where A[i] == B[i].
std::vector<bool> C(A.size());
std::transform(A.begin(), A.end(), B.begin(), C.begin(), [](bool const &a, bool const &b) { return a == b;});
// Number of equal positions; prints 2 for the inputs above.
std::cout << std::count(C.begin(), C.end(), true) << std::endl;
Live Demo

Writing BMP image in pure c/c++ without other libraries

In my algorithm, I need to create an information output. I need to write a boolean matrix into a bmp file.
It must be a monochrome image, where a pixel is white if the corresponding matrix element is true.
Main problem is the bmp header and how to write this.

See if this works for you...
In this code, I had 3 2-dimensional arrays, called red,green and blue. Each one was of size [width][height], and each element corresponded to a pixel - I hope this makes sense!
FILE *f;
unsigned char *img = NULL;
int filesize = 54 + 3*w*h; //w is your image width, h is image height, both int
img = (unsigned char *)malloc(3*w*h);
memset(img,0,3*w*h);
for(int i=0; i<w; i++)
{
for(int j=0; j<h; j++)
{
x=i; y=(h-1)-j;
r = red[i][j]*255;
g = green[i][j]*255;
b = blue[i][j]*255;
if (r > 255) r=255;
if (g > 255) g=255;
if (b > 255) b=255;
img[(x+y*w)*3+2] = (unsigned char)(r);
img[(x+y*w)*3+1] = (unsigned char)(g);
img[(x+y*w)*3+0] = (unsigned char)(b);
}
}
unsigned char bmpfileheader[14] = {'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0};
unsigned char bmpinfoheader[40] = {40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0};
unsigned char bmppad[3] = {0,0,0};
bmpfileheader[ 2] = (unsigned char)(filesize );
bmpfileheader[ 3] = (unsigned char)(filesize>> 8);
bmpfileheader[ 4] = (unsigned char)(filesize>>16);
bmpfileheader[ 5] = (unsigned char)(filesize>>24);
bmpinfoheader[ 4] = (unsigned char)( w );
bmpinfoheader[ 5] = (unsigned char)( w>> 8);
bmpinfoheader[ 6] = (unsigned char)( w>>16);
bmpinfoheader[ 7] = (unsigned char)( w>>24);
bmpinfoheader[ 8] = (unsigned char)( h );
bmpinfoheader[ 9] = (unsigned char)( h>> 8);
bmpinfoheader[10] = (unsigned char)( h>>16);
bmpinfoheader[11] = (unsigned char)( h>>24);
f = fopen("img.bmp","wb");
fwrite(bmpfileheader,1,14,f);
fwrite(bmpinfoheader,1,40,f);
for(int i=0; i<h; i++)
{
fwrite(img+(w*(h-i-1)*3),3,w,f);
fwrite(bmppad,1,(4-(w*3)%4)%4,f);
}
free(img);
fclose(f);

Clean C Code for Bitmap (BMP) Image Generation
This code does not use any library other than stdio.h. So, it can be easily incorporated in other languages of C-Family, like- C++, C#, Java.
#include <stdio.h>
const int BYTES_PER_PIXEL = 3; /// red, green, & blue
/// Size in bytes of the bitmap file header written at the start of the file.
const int FILE_HEADER_SIZE = 14;
/// Size in bytes of the bitmap info header that follows the file header.
const int INFO_HEADER_SIZE = 40;
/// Writes `image` (height rows of width pixels) to the file `imageFileName`.
void generateBitmapImage(unsigned char* image, int height, int width, char* imageFileName);
/// Builds the file header for an image of `height` rows of `stride` bytes each.
unsigned char* createBitmapFileHeader(int height, int stride);
/// Builds the info header for an image of the given dimensions.
unsigned char* createBitmapInfoHeader(int height, int width);
/// Demo: fills a 361 x 867 image with colour gradients and writes it to
/// "bitmapImage.bmp".
int main ()
{
int height = 361;
int width = 867;
/* NOTE(review): variable-length array on the stack, height*width*3 bytes. */
unsigned char image[height][width][BYTES_PER_PIXEL];
char* imageFileName = (char*) "bitmapImage.bmp";
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
/* Channel index 2 holds red, 1 green and 0 blue. */
image[i][j][2] = (unsigned char) ( i * 255 / height ); ///red
image[i][j][1] = (unsigned char) ( j * 255 / width ); ///green
image[i][j][0] = (unsigned char) ( (i+j) * 255 / (height+width) ); ///blue
}
}
generateBitmapImage((unsigned char*) image, height, width, imageFileName);
printf("Image generated!!");
}
/// Writes a tightly packed pixel buffer to `imageFileName` as an uncompressed
/// bitmap.
///   image         - height * width pixels of BYTES_PER_PIXEL bytes each, rows
///                   stored back to back
///   height, width - image dimensions in pixels
///   imageFileName - path of the file to create
/// Each row is followed by 0-3 zero bytes so that its length in the file is a
/// multiple of 4. Nothing is written when the file cannot be opened.
void generateBitmapImage (unsigned char* image, int height, int width, char* imageFileName)
{
    int widthInBytes = width * BYTES_PER_PIXEL;

    unsigned char padding[3] = {0, 0, 0};
    int paddingSize = (4 - (widthInBytes) % 4) % 4;

    int stride = (widthInBytes) + paddingSize;

    FILE* imageFile = fopen(imageFileName, "wb");
    if (imageFile == NULL) {
        /* fopen failed (bad path, permissions, ...): there is nothing to write to. */
        return;
    }

    unsigned char* fileHeader = createBitmapFileHeader(height, stride);
    fwrite(fileHeader, 1, FILE_HEADER_SIZE, imageFile);

    unsigned char* infoHeader = createBitmapInfoHeader(height, width);
    fwrite(infoHeader, 1, INFO_HEADER_SIZE, imageFile);

    int i;
    for (i = 0; i < height; i++) {
        fwrite(image + (i*widthInBytes), BYTES_PER_PIXEL, width, imageFile);
        fwrite(padding, 1, paddingSize, imageFile);
    }

    fclose(imageFile);
}
/// Fills and returns the 14-byte bitmap file header for an image of `height`
/// rows of `stride` bytes each. The returned pointer refers to a static buffer
/// that is reused by every call.
unsigned char* createBitmapFileHeader (int height, int stride)
{
    /* Static storage starts out zeroed; the reserved bytes stay zero. */
    static unsigned char fileHeader[14];

    const int pixelDataOffset = FILE_HEADER_SIZE + INFO_HEADER_SIZE;
    const int fileSize = pixelDataOffset + (stride * height);
    int byteIndex;

    /* signature */
    fileHeader[0] = (unsigned char)('B');
    fileHeader[1] = (unsigned char)('M');

    /* image file size in bytes, least significant byte first */
    for (byteIndex = 0; byteIndex < 4; byteIndex++) {
        fileHeader[2 + byteIndex] = (unsigned char)(fileSize >> (8 * byteIndex));
    }

    /* start of pixel array (only the lowest byte is needed) */
    fileHeader[10] = (unsigned char)(pixelDataOffset);

    return fileHeader;
}
/// Fills and returns the 40-byte bitmap info header for an image of the given
/// dimensions. The returned pointer refers to a static buffer that is reused by
/// every call. Fields that are not set here (compression, image size,
/// resolutions, colour counts) stay zero.
unsigned char* createBitmapInfoHeader (int height, int width)
{
    /* Static storage starts out zeroed. */
    static unsigned char infoHeader[40];
    int byteIndex;

    /* header size */
    infoHeader[0] = (unsigned char)(INFO_HEADER_SIZE);

    /* image width at offset 4 and height at offset 8, least significant byte first */
    for (byteIndex = 0; byteIndex < 4; byteIndex++) {
        infoHeader[4 + byteIndex] = (unsigned char)(width >> (8 * byteIndex));
        infoHeader[8 + byteIndex] = (unsigned char)(height >> (8 * byteIndex));
    }

    /* number of color planes */
    infoHeader[12] = (unsigned char)(1);
    /* bits per pixel */
    infoHeader[14] = (unsigned char)(BYTES_PER_PIXEL*8);

    return infoHeader;
}

Without the use of any other library you can look at the BMP file format. I've implemented it in the past and it can be done without too much work.
Bitmap-File Structures
Each bitmap file contains a
bitmap-file header, a
bitmap-information header, a color
table, and an array of bytes that
defines the bitmap bits. The file has
the following form:
BITMAPFILEHEADER bmfh;
BITMAPINFOHEADER bmih;
RGBQUAD aColors[];
BYTE aBitmapBits[];
... see the file format for more details

This is example code copied from
https://en.wikipedia.org/wiki/User:Evercat/Buddhabrot.c
// Writes the accumulated colour counts as a 24-bit uncompressed BMP file.
//   filename - path of the file to create
// Uses WIDTH, HEIGHT, the redcount/greencount/bluecount arrays, reduce(),
// COLOUR_OFFSET and the *_multiplier values, all defined outside this function.
// Header values are written one byte at a time, least significant byte first.
// Nothing is written when the file cannot be opened.
void drawbmp (char * filename) {

    unsigned int headers[13];
    FILE * outfile;
    int extrabytes;
    int paddedsize;
    int x; int y; int n;
    int red, green, blue;

    extrabytes = 4 - ((WIDTH * 3) % 4);                 // How many bytes of padding to add to each
                                                        // horizontal line - the size of which must
                                                        // be a multiple of 4 bytes.
    if (extrabytes == 4)
        extrabytes = 0;

    paddedsize = ((WIDTH * 3) + extrabytes) * HEIGHT;

    // Headers...
    // Note that the "BM" identifier in bytes 0 and 1 is NOT included in these "headers".

    headers[0]  = paddedsize + 54;      // bfSize (whole file size)
    headers[1]  = 0;                    // bfReserved (both)
    headers[2]  = 54;                   // bfOffbits
    headers[3]  = 40;                   // biSize
    headers[4]  = WIDTH;                // biWidth
    headers[5]  = HEIGHT;               // biHeight

    // Would have biPlanes and biBitCount in position 6, but they're shorts.
    // It's easier to write them out separately (see below) than pretend
    // they're a single int, especially with endian issues...
    // (headers[6] is therefore never read.)

    headers[7]  = 0;                    // biCompression
    headers[8]  = paddedsize;           // biSizeImage
    headers[9]  = 0;                    // biXPelsPerMeter
    headers[10] = 0;                    // biYPelsPerMeter
    headers[11] = 0;                    // biClrUsed
    headers[12] = 0;                    // biClrImportant

    outfile = fopen(filename, "wb");
    if (outfile == NULL)                // could not create the file: nothing to write to
        return;

    //
    // Headers begin...
    // When printing ints and shorts, we write out 1 character at a time to avoid endian issues.
    //

    fprintf(outfile, "BM");

    for (n = 0; n <= 5; n++)
    {
        fprintf(outfile, "%c", headers[n] & 0x000000FF);
        fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
        fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
        fprintf(outfile, "%c", (headers[n] & (unsigned int) 0xFF000000) >> 24);
    }

    // These next 4 characters are for the biPlanes and biBitCount fields.

    fprintf(outfile, "%c", 1);
    fprintf(outfile, "%c", 0);
    fprintf(outfile, "%c", 24);
    fprintf(outfile, "%c", 0);

    for (n = 7; n <= 12; n++)
    {
        fprintf(outfile, "%c", headers[n] & 0x000000FF);
        fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
        fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
        fprintf(outfile, "%c", (headers[n] & (unsigned int) 0xFF000000) >> 24);
    }

    //
    // Headers done, now write the data...
    //

    for (y = HEIGHT - 1; y >= 0; y--)     // BMP image format is written from bottom to top...
    {
        for (x = 0; x <= WIDTH - 1; x++)
        {

            red = reduce(redcount[x][y] + COLOUR_OFFSET) * red_multiplier;
            green = reduce(greencount[x][y] + COLOUR_OFFSET) * green_multiplier;
            blue = reduce(bluecount[x][y] + COLOUR_OFFSET) * blue_multiplier;

            // Clamp every channel to the 0..255 range of one byte.
            if (red > 255) red = 255; if (red < 0) red = 0;
            if (green > 255) green = 255; if (green < 0) green = 0;
            if (blue > 255) blue = 255; if (blue < 0) blue = 0;

            // Also, it's written in (b,g,r) format...

            fprintf(outfile, "%c", blue);
            fprintf(outfile, "%c", green);
            fprintf(outfile, "%c", red);
        }
        if (extrabytes)      // See above - BMP lines must be of lengths divisible by 4.
        {
            for (n = 1; n <= extrabytes; n++)
            {
                fprintf(outfile, "%c", 0);
            }
        }
    }

    fclose(outfile);
    return;
}
drawbmp(filename);

Here is a C++ variant of the code that works for me. Note I had to change the size computation to account for the line padding.
// mimeType = "image/bmp";
// 14-byte bitmap file header; the size field is filled in below.
unsigned char file[14] = {
'B','M', // magic
0,0,0,0, // size in bytes
0,0, // app data
0,0, // app data
40+14,0,0,0 // start of data offset
};
// 40-byte bitmap info header; width, height and image size are filled in below.
unsigned char info[40] = {
40,0,0,0, // info header size
0,0,0,0, // width
0,0,0,0, // height
1,0, // number of color planes
24,0, // bits per pixel
0,0,0,0, // compression is none
0,0,0,0, // image bits size
0x13,0x0B,0,0, // horizontal resolution in pixels / m
0x13,0x0B,0,0, // vertical resolution in pixels / m (0x0B13 = 2835, about 72 dpi)
0,0,0,0, // #colors in palette
0,0,0,0, // #important colors
};
int w=waterfallWidth;
int h=waterfallHeight;
// Each row is padded with 0-3 bytes to a multiple of 4 bytes; the padding is
// included in both size fields.
int padSize = (4-(w*3)%4)%4;
int sizeData = w*h*3 + h*padSize;
int sizeAll = sizeData + sizeof(file) + sizeof(info);
// Multi-byte header fields are stored least significant byte first.
file[ 2] = (unsigned char)( sizeAll );
file[ 3] = (unsigned char)( sizeAll>> 8);
file[ 4] = (unsigned char)( sizeAll>>16);
file[ 5] = (unsigned char)( sizeAll>>24);
info[ 4] = (unsigned char)( w );
info[ 5] = (unsigned char)( w>> 8);
info[ 6] = (unsigned char)( w>>16);
info[ 7] = (unsigned char)( w>>24);
info[ 8] = (unsigned char)( h );
info[ 9] = (unsigned char)( h>> 8);
info[10] = (unsigned char)( h>>16);
info[11] = (unsigned char)( h>>24);
info[20] = (unsigned char)( sizeData );
info[21] = (unsigned char)( sizeData>> 8);
info[22] = (unsigned char)( sizeData>>16);
info[23] = (unsigned char)( sizeData>>24);
stream.write( (char*)file, sizeof(file) );
stream.write( (char*)info, sizeof(info) );
unsigned char pad[3] = {0,0,0};
for ( int y=0; y<h; y++ )
{
for ( int x=0; x<w; x++ )
{
// Scale the sample by 255, round and clamp to 0..255; the same value is
// used for all three channels, giving a grey pixel.
long red = lround( 255.0 * waterfall[x][y] );
if ( red < 0 ) red=0;
if ( red > 255 ) red=255;
long green = red;
long blue = red;
// Pixels are stored in blue, green, red order.
unsigned char pixel[3];
pixel[0] = blue;
pixel[1] = green;
pixel[2] = red;
stream.write( (char*)pixel, 3 );
}
stream.write( (char*)pad, padSize );
}

Note that the lines are saved from down to up and not the other way around.
Additionally, the scanlines must have a byte-length of multiples of four, you should insert fill bytes at the end of the lines to ensure this.

I just wanted to share an improved version of Minhas Kamal's code because although it worked well enough for most applications, I had a few issues with it still. Two highly important things to remember:
The code (at the time of writing) calls free() on two static arrays. This will cause your program to crash. So I commented out those lines.
NEVER assume that your pixel data's pitch is always (Width*BytesPerPixel). It's best to let the user specify the pitch value. Example: when manipulating resources in Direct3D, the RowPitch is never guaranteed to be an even multiple of the byte depth being used. This can cause errors in your generated bitmaps (especially at odd resolutions such as 1366x768).
Below, you can see my revisions to his code:
/// Bytes per pixel: red, green and blue plus a fourth byte.
/// NOTE(review): the fourth byte is presumably alpha or padding - confirm.
const int bytesPerPixel = 4;
/// Size in bytes of the bitmap file header written at the start of the file.
const int fileHeaderSize = 14;
/// Size in bytes of the bitmap info header that follows the file header.
const int infoHeaderSize = 40;
/// Writes `image` to `imageFileName`; `pitch` is the distance in bytes between
/// the starts of two consecutive source rows.
void generateBitmapImage(unsigned char *image, int height, int width, int pitch, const char* imageFileName);
/// Builds the file header; the file size is derived from pitch, paddingSize and height.
unsigned char* createBitmapFileHeader(int height, int width, int pitch, int paddingSize);
/// Builds the info header for an image of the given dimensions.
unsigned char* createBitmapInfoHeader(int height, int width);
/// Writes a pixel buffer with an arbitrary row pitch to `imageFileName` as an
/// uncompressed bitmap.
///   image         - first byte of the first row
///   height, width - image dimensions in pixels
///   pitch         - distance in bytes between the starts of consecutive rows
///                   in `image`; may be larger than width * bytesPerPixel
///   imageFileName - path of the file to create
/// Only width * bytesPerPixel bytes of every source row are written, so the row
/// padding and the file size are derived from that length rather than from
/// `pitch`. Nothing is written when the file cannot be opened.
void generateBitmapImage(unsigned char *image, int height, int width, int pitch, const char* imageFileName) {

    unsigned char padding[3] = { 0, 0, 0 };
    /* Bytes of pixel data per row as they appear in the file. */
    const int rowBytes = width * bytesPerPixel;
    const int paddingSize = (4 - rowBytes % 4) % 4;

    /* The header computes its size as (pitch argument + paddingSize) * height,
       so it is given the length of a row in the file, not the source pitch. */
    unsigned char* fileHeader = createBitmapFileHeader(height, width, rowBytes, paddingSize);
    unsigned char* infoHeader = createBitmapInfoHeader(height, width);

    FILE* imageFile = fopen(imageFileName, "wb");
    if (imageFile == NULL) {
        /* fopen failed (bad path, permissions, ...): there is nothing to write to. */
        return;
    }

    fwrite(fileHeader, 1, fileHeaderSize, imageFile);
    fwrite(infoHeader, 1, infoHeaderSize, imageFile);

    int i;
    for (i = 0; i < height; i++) {
        /* Source rows are `pitch` bytes apart. */
        fwrite(image + (i*pitch), bytesPerPixel, width, imageFile);
        fwrite(padding, 1, paddingSize, imageFile);
    }

    fclose(imageFile);
    /* fileHeader and infoHeader point to static buffers and must not be freed. */
}
unsigned char* createBitmapFileHeader(int height, int width, int pitch, int paddingSize) {
int fileSize = fileHeaderSize + infoHeaderSize + (/*bytesPerPixel*width*/pitch + paddingSize) * height;
static unsigned char fileHeader[] = {
0,0, /// signature
0,0,0,0, /// image file size in bytes
0,0,0,0, /// reserved
0,0,0,0, /// start of pixel array
};
fileHeader[0] = (unsigned char)('B');
fileHeader[1] = (unsigned char)('M');
fileHeader[2] = (unsigned char)(fileSize);
fileHeader[3] = (unsigned char)(fileSize >> 8);
fileHeader[4] = (unsigned char)(fileSize >> 16);
fileHeader[5] = (unsigned char)(fileSize >> 24);
fileHeader[10] = (unsigned char)(fileHeaderSize + infoHeaderSize);
return fileHeader;
}
/// Fills and returns the 40-byte bitmap info header for an image of the given
/// dimensions. The returned pointer refers to a static buffer that is reused by
/// every call. Fields that are not set here (compression, image size,
/// resolutions, colour counts) stay zero.
unsigned char* createBitmapInfoHeader(int height, int width) {
    /* Static storage starts out zeroed. */
    static unsigned char infoHeader[40];
    int byteIndex;

    /* header size */
    infoHeader[0] = (unsigned char)(infoHeaderSize);

    /* image width at offset 4 and height at offset 8, least significant byte first */
    for (byteIndex = 0; byteIndex < 4; byteIndex++) {
        infoHeader[4 + byteIndex] = (unsigned char)(width >> (8 * byteIndex));
        infoHeader[8 + byteIndex] = (unsigned char)(height >> (8 * byteIndex));
    }

    /* number of color planes */
    infoHeader[12] = (unsigned char)(1);
    /* bits per pixel */
    infoHeader[14] = (unsigned char)(bytesPerPixel * 8);

    return infoHeader;
}

I edited ralf's code so that it would compile (with gcc on Ubuntu 16.04 LTS). It was just a matter of declaring and initializing the variables.
// Writes a 24-bit uncompressed BMP named "img.bmp" from three channel arrays
// red/green/blue indexed as [x][y]. Channel values are scaled by 255 and
// clamped at 255 before being stored in B,G,R order.
int w = 100; /* Put here what ever width you want */
int h = 100; /* Put here what ever height you want */
int red[w][h];
int green[w][h];
int blue[w][h];
/* Start from all-zero channels so the loop below never reads indeterminate
   values; fill the arrays with the real image data after this point. */
memset(red, 0, sizeof(red));
memset(green, 0, sizeof(green));
memset(blue, 0, sizeof(blue));
FILE *f;
unsigned char *img = NULL;
/* Every row in the file is padded to a multiple of 4 bytes, so the padding
   has to be part of the file size stored in the header. */
int rowpad = (4-(w*3)%4)%4;
int filesize = 54 + (3*w + rowpad)*h;  //w is your image width, h is image height, both int
img = (unsigned char *)malloc(3*w*h);
/* Clear the whole pixel buffer; sizeof(img) would only be the pointer size. */
memset(img,0,3*w*h);
int x;
int y;
int r;
int g;
int b;
for(int i=0; i<w; i++)
{
    for(int j=0; j<h; j++)
    {
        x=i; y=(h-1)-j;
        r = red[i][j]*255;
        g = green[i][j]*255;
        b = blue[i][j]*255;
        if (r > 255) r=255;
        if (g > 255) g=255;
        if (b > 255) b=255;
        img[(x+y*w)*3+2] = (unsigned char)(r);
        img[(x+y*w)*3+1] = (unsigned char)(g);
        img[(x+y*w)*3+0] = (unsigned char)(b);
    }
}
/* 14-byte file header: "BM", file size (filled in below), reserved, offset of
   the pixel data (54). */
unsigned char bmpfileheader[14] = {'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0};
/* 40-byte info header: header size 40, width and height (filled in below),
   1 plane, 24 bits per pixel; all remaining fields are zero. */
unsigned char bmpinfoheader[40] = {40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0};
unsigned char bmppad[3] = {0,0,0};
/* Multi-byte header fields are stored least significant byte first. */
bmpfileheader[ 2] = (unsigned char)(filesize    );
bmpfileheader[ 3] = (unsigned char)(filesize>> 8);
bmpfileheader[ 4] = (unsigned char)(filesize>>16);
bmpfileheader[ 5] = (unsigned char)(filesize>>24);
bmpinfoheader[ 4] = (unsigned char)(       w    );
bmpinfoheader[ 5] = (unsigned char)(       w>> 8);
bmpinfoheader[ 6] = (unsigned char)(       w>>16);
bmpinfoheader[ 7] = (unsigned char)(       w>>24);
bmpinfoheader[ 8] = (unsigned char)(       h    );
bmpinfoheader[ 9] = (unsigned char)(       h>> 8);
bmpinfoheader[10] = (unsigned char)(       h>>16);
bmpinfoheader[11] = (unsigned char)(       h>>24);
f = fopen("img.bmp","wb");
if (f != NULL)  /* skip writing when the file could not be opened */
{
    fwrite(bmpfileheader,1,14,f);
    fwrite(bmpinfoheader,1,40,f);
    for(int i=0; i<h; i++)
    {
        fwrite(img+(w*(h-i-1)*3),3,w,f);
        fwrite(bmppad,1,rowpad,f);
    }
    fclose(f);
}
free(img);

The best bitmap encoder is the one you do not write yourself. The file format is a lot more involved than one might expect. This is evidenced by the fact that none of the proposed answers creates a monochrome (1bpp) bitmap; they write out 24bpp files that happen to use only 2 colors.
The following is a Windows-only solution, using the Windows Imaging Component. It doesn't rely on any external/3rd party libraries, other than what ships with Windows.
Like every C++ program, we need to include several header files. And link to Windowscodecs.lib while we're at it:
#include <Windows.h>
#include <comdef.h>
#include <comip.h>
#include <comutil.h>
#include <wincodec.h>
#include <vector>
#pragma comment(lib, "Windowscodecs.lib")
Next up, we declare our container (a vector, of vectors! Of bool!), and a few smart pointers for convenience:
// Bring _com_util::CheckError into scope; the code below passes every HRESULT through it.
using _com_util::CheckError;
// Pixel matrix type handed to write_bitmap(); it is indexed there as data[x][y].
using container = std::vector<std::vector<bool>>;
// Each invocation below supplies the `<Interface>Ptr` smart-pointer type
// (e.g. IWICImagingFactoryPtr) that write_bitmap() declares its locals with.
_COM_SMARTPTR_TYPEDEF(IWICImagingFactory, __uuidof(IWICImagingFactory));
_COM_SMARTPTR_TYPEDEF(IWICBitmapEncoder, __uuidof(IWICBitmapEncoder));
_COM_SMARTPTR_TYPEDEF(IWICBitmapFrameEncode, __uuidof(IWICBitmapFrameEncode));
_COM_SMARTPTR_TYPEDEF(IWICStream, __uuidof(IWICStream));
_COM_SMARTPTR_TYPEDEF(IWICPalette, __uuidof(IWICPalette));
With that all settled, we can jump right into the implementation. There's a bit of setup required to get a factory, an encoder, a frame, and get everything prepared:
// Encodes `data` as a 1-bit-per-pixel indexed BMP file at `pathname` using the
// Windows Imaging Component.
//
// `data` is indexed as data[x][y]: data.size() is used as the image width and
// data[0].size() as the image height.
// Every HRESULT is passed through CheckError; presumably a failure surfaces as
// a _com_error exception (the test program catches that type) - confirm.
// NOTE(review): data[0] is read without checking that `data` is non-empty, and
// rows are assumed to have equal length - confirm callers guarantee both.
void write_bitmap(wchar_t const* pathname, container const& data)
{
// Create factory
IWICImagingFactoryPtr sp_factory { nullptr };
CheckError(sp_factory.CreateInstance(CLSID_WICImagingFactory, nullptr,
CLSCTX_INPROC_SERVER));
// Create encoder
IWICBitmapEncoderPtr sp_encoder { nullptr };
CheckError(sp_factory->CreateEncoder(GUID_ContainerFormatBmp, nullptr, &sp_encoder));
// Create stream
IWICStreamPtr sp_stream { nullptr };
CheckError(sp_factory->CreateStream(&sp_stream));
CheckError(sp_stream->InitializeFromFilename(pathname, GENERIC_WRITE));
// Initialize encoder with stream
CheckError(sp_encoder->Initialize(sp_stream, WICBitmapEncoderNoCache));
// Create new frame
IWICBitmapFrameEncodePtr sp_frame { nullptr };
IPropertyBag2Ptr sp_properties { nullptr };
CheckError(sp_encoder->CreateNewFrame(&sp_frame, &sp_properties));
// Initialize frame with default properties
CheckError(sp_frame->Initialize(sp_properties));
// Set pixel format
// SetPixelFormat() requires a pointer to non-const
auto pf { GUID_WICPixelFormat1bppIndexed };
CheckError(sp_frame->SetPixelFormat(&pf));
// `pf` is compared against the requested format afterwards, so a changed value
// is treated as "format not supported".
if (!::IsEqualGUID(pf, GUID_WICPixelFormat1bppIndexed))
{
// Report unsupported pixel format
CheckError(WINCODEC_ERR_UNSUPPORTEDPIXELFORMAT);
}
// Set size derived from data argument
// The outer vector supplies the width, the first inner vector the height.
auto const width { static_cast<UINT>(data.size()) };
auto const height { static_cast<UINT>(data[0].size()) };
CheckError(sp_frame->SetSize(width, height));
// Set palette on frame. This is required since we use an indexed pixel format.
// Only GIF files support global palettes, so make sure to set it on the frame
// rather than the encoder.
IWICPalettePtr sp_palette { nullptr };
CheckError(sp_factory->CreatePalette(&sp_palette));
CheckError(sp_palette->InitializePredefined(WICBitmapPaletteTypeFixedBW, FALSE));
CheckError(sp_frame->SetPalette(sp_palette));
At that point everything is set up, and we have a frame to dump our data into. For 1bpp files, every byte stores the information of 8 pixels. The left-most pixel is stored in the MSB, with pixels following all the way down to the right-most pixel stored in the LSB.
The code isn't entirely important; you'll be replacing that with whatever suits your needs, when you replace the data layout of your input anyway:
// Write data to frame
// Bytes per row: one bit per pixel, rounded up to a whole byte.
auto const stride { (width * 1 + 7) / 8 };
auto const size { height * stride };
// Every byte starts as 127; the loop below rewrites one bit per pixel, so only
// unused trailing bits of a row's last byte keep that initial pattern.
std::vector<unsigned char> buffer(size, 127u);
// Convert data to match required layout. Each byte stores 8 pixels, with the
// MSB being the leftmost, the LSB the right-most.
for (size_t x { 0 }; x < data.size(); ++x)
{
for (size_t y { 0 }; y < data[x].size(); ++y)
{
// Bit position inside the byte: x % 8 == 0 selects the most significant bit.
auto shift { x % 8 };
auto mask { 0x80 >> shift };
// `mask` when the pixel is true, 0 otherwise.
auto bit { mask * data[x][y] };
// Byte holding pixel (x, y): row y, column byte x / 8.
auto& value { buffer[y * stride + x / 8] };
// Clear the pixel's bit, then set it according to the input.
value &= ~mask;
value |= bit;
}
}
// Hand all `height` rows to the frame in a single call.
CheckError(sp_frame->WritePixels(height, stride,
static_cast<UINT>(buffer.size()), buffer.data()));
What's left is to commit the changes to the frame and the encoder, which will ultimately write the image file to disk:
// Commit frame
CheckError(sp_frame->Commit());
// Commit image
CheckError(sp_encoder->Commit());
}
This is a test program, writing out an image to a file passed as the first command-line argument:
#include <iostream>
// Test driver: writes a 64x64 one-bit image (an arrow pointing to the upper
// left) to the file named by the single command-line argument.
//
// Returns 0 on success, -1 when the argument count is wrong and 1 when a COM
// call fails (the failure message is printed to std::wcout).
int wmain(int argc, wchar_t* argv[])
try
{
    if (argc != 2)
    {
        return -1;
    }
    CheckError(::CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED));
    // Balance the successful CoInitializeEx on every exit path, including the
    // one taken when write_bitmap() throws.
    struct com_apartment_guard
    {
        ~com_apartment_guard() { ::CoUninitialize(); }
    } apartment_guard;
    // Create 64x64 matrix
    container data(64, std::vector<bool>(64, false));
    // Fill with arrow pointing towards the upper left
    for (size_t i { 0 }; i < data.size(); ++i)
    {
        data[0][i] = true;
        data[i][0] = true;
        data[i][i] = true;
    }
    ::write_bitmap(argv[1], data);
    return 0;
}
catch (_com_error const& e)
{
    std::wcout << L"Error!\n" << L" Message: " << e.ErrorMessage() << std::endl;
    // Report the failure to the caller instead of falling off the end, which
    // would yield exit code 0.
    return 1;
}
It produces the following 64x64 image (true 1bpp, 4096 pixels, 574 bytes in size):

If you get strange color switches in the middle of your image when using the above C++ function, be sure to open the output stream in binary mode:
imgFile.open(filename, std::ios_base::out | std::ios_base::binary);
Otherwise Windows inserts unwanted characters in the middle of your file! (I had been banging my head on this issue for hours.)
See related question here: Why does ofstream insert a 0x0D byte before 0x0A?

Here's a simple c++ bmp image file class.
/// In-memory 24-bit image that can be saved as an uncompressed BMP file.
///
/// Pixels are addressed as (x, y) with row 0 at the top; rows are written to
/// the file bottom-up, each padded to a multiple of four bytes.
class bmp_img {
public:
    constexpr static int header_size = 14;
    constexpr static int info_header_size = 40;
    constexpr static size_t bytes_per_pixel = 3;

    /// Creates a width x height image with every channel set to zero and
    /// pre-computes both file headers.
    bmp_img(size_t width, size_t height)
        : image_px_width{ width },
          image_px_height{ height },
          row_width{ image_px_width * bytes_per_pixel },
          row_padding{ (4 - row_width % 4) % 4 },
          row_stride{ row_width + row_padding },
          file_size{ header_size + info_header_size + (image_px_height * row_stride) },
          image(image_px_height, std::vector<unsigned char>(row_width))
    {
        // File header: signature, total file size, offset of the pixel data.
        file_header[0] = 'B';
        file_header[1] = 'M';
        store_le32(file_header + 2, file_size);
        file_header[10] = header_size + info_header_size;

        // Info header: own size, dimensions, plane count, bits per pixel.
        info_header[0] = info_header_size;
        store_le32(info_header + 4, image_px_width);
        store_le32(info_header + 8, image_px_height);
        info_header[12] = 1;
        info_header[14] = 8 * bytes_per_pixel;
    }

    /// Image width in pixels.
    size_t width() const {
        return image_px_width;
    }

    /// Image height in pixels.
    size_t height() const {
        return image_px_height;
    }

    /// Sets pixel (x, y); the channels are stored in blue, green, red order.
    /// No bounds checking is performed.
    void set_pixel(size_t x, size_t y, int r, int g, int b) {
        unsigned char* const px = &image[y][x * bytes_per_pixel];
        px[2] = static_cast<unsigned char>(r);
        px[1] = static_cast<unsigned char>(g);
        px[0] = static_cast<unsigned char>(b);
    }

    /// Paints every pixel with the given colour.
    void fill(int r, int g, int b) {
        const unsigned char red = static_cast<unsigned char>(r);
        const unsigned char green = static_cast<unsigned char>(g);
        const unsigned char blue = static_cast<unsigned char>(b);
        for (auto& row : image) {
            for (size_t x = 0; x < image_px_width; ++x) {
                unsigned char* const px = &row[x * bytes_per_pixel];
                px[2] = red;
                px[1] = green;
                px[0] = blue;
            }
        }
    }

    /// Writes both headers followed by the pixel rows, last row first.
    void write_to_file(const char* file_name) const {
        std::ofstream img_file(file_name, std::ios_base::binary | std::ios_base::out);
        img_file.write(reinterpret_cast<const char*>(file_header), header_size);
        img_file.write(reinterpret_cast<const char*>(info_header), info_header_size);

        // Zero bytes appended after each row to reach the padded stride.
        const std::vector<char> zero_fill(row_padding);
        for (auto row = image.crbegin(); row != image.crend(); ++row) {
            img_file.write(reinterpret_cast<const char*>(row->data()), row_width);
            img_file.write(zero_fill.data(), row_padding);
        }
        img_file.close();
    }

private:
    /// Stores the low 32 bits of `value` at `dest`, least significant byte first.
    static void store_le32(unsigned char* dest, size_t value) {
        for (int i = 0; i < 4; ++i) {
            dest[i] = static_cast<unsigned char>(value >> (8 * i));
        }
    }

    size_t image_px_width;
    size_t image_px_height;
    size_t row_width;
    size_t row_padding;
    size_t row_stride;
    size_t file_size;
    unsigned char file_header[header_size] = { 0 };
    unsigned char info_header[info_header_size] = { 0 };
    std::vector<std::vector<unsigned char>> image;
};

C++ answer, flexible API, assumes little-endian system to code-golf it a bit. Note this uses the bmp native y-axis (0 at the bottom).
#include <vector>
#include <fstream>
/// Minimal 24-bit image buffer. Pixels are stored row by row as blue, green,
/// red bytes; row 0 is the bottom row, matching the BMP file layout.
struct image
{
    /// Allocates a width x height image with all channels zero.
    image(int width, int height)
    : w(width), h(height), rgb(w * h * 3)
    {}
    /// Mutable access to the red / green / blue channel of pixel (x, y).
    /// No bounds checking is performed.
    uint8_t & r(int x, int y) { return rgb[(x + y*w)*3 + 2]; }
    uint8_t & g(int x, int y) { return rgb[(x + y*w)*3 + 1]; }
    uint8_t & b(int x, int y) { return rgb[(x + y*w)*3 + 0]; }
    int w, h;
    std::vector<uint8_t> rgb;
};

/// Writes `img` to `out` as an uncompressed 24-bit BMP and returns `out`.
///
/// The 54-byte header is serialised byte by byte, so the output no longer
/// depends on the byte order of the host. Row padding is written as zero
/// bytes. `out` only needs a `write(const char*, count)` member.
template<class Stream>
Stream & operator<<(Stream & out, image const& img)
{
    const uint32_t w = img.w, h = img.h;
    const uint32_t row_bytes = 3 * w;
    // Each row is padded to a multiple of four bytes.
    const uint32_t pad = (4 - row_bytes % 4) % 4;
    const uint32_t total = 54 + (row_bytes + pad) * h;
    // File size, reserved, pixel-data offset, info-header size, width, height,
    // then planes (1) and bits per pixel (24) packed into one word; the
    // remaining six fields stay zero.
    const uint32_t fields[13] = {total, 0, 54, 40, w, h, (24u << 16) | 1u};
    char header[54] = {'B', 'M'};
    for (int field = 0; field < 13; ++field)
    {
        for (int byte = 0; byte < 4; ++byte)
        {
            header[2 + 4 * field + byte] =
                static_cast<char>((fields[field] >> (8 * byte)) & 0xFFu);
        }
    }
    out.write(header, sizeof header);
    const char zeros[3] = {0, 0, 0};
    const char* row = reinterpret_cast<const char*>(img.rgb.data());
    for (uint32_t i = 0; i < h; ++i, row += row_bytes)
    {
        out.write(row, row_bytes);
        // Previously the bytes of the `pad` variable itself were written here,
        // which put non-zero bytes into the padding.
        out.write(zeros, pad);
    }
    return out;
}
/// Demo: renders a 100x100 colour gradient and saves it as /tmp/out.bmp.
/// Returns 1 when the output file cannot be opened.
int main()
{
    image img(100, 100);
    for(int x=0 ; x<100 ; x++)
    {   for(int y=0 ; y<100 ; y++)
        {   img.r(x,y) = x;
            img.g(x,y) = y;
            img.b(x,y) = 100-x;
        }
    }
    // Binary mode keeps the platform from translating 0x0A bytes inside the
    // pixel data (text mode on Windows inserts 0x0D before them).
    std::ofstream file("/tmp/out.bmp", std::ios::binary);
    if (!file)
    {
        return 1;
    }
    file << img;
}

This code uses some newer C++ features. I've used it to create 8bit and 24bit bmp files. It only writes bmp files, one day we may read them too!
I didn't like all the shifting, and how error-prone it is, just to be endian safe.
It could use lots more comments but the code is pretty straight forward. The supposedly run-time detection of endianness results in code being optimized away on all the compilers I tested (a while ago).
endian_type.h >> Endian safe POD type.
#ifndef ENDIAN_TYPE_H
#define ENDIAN_TYPE_H
#include <algorithm>
#include <type_traits>
namespace endian_type {

/// Holds a value of fundamental type `T` as raw bytes in a fixed byte order
/// (big-endian when `store_as_big_endian` is true, little-endian otherwise),
/// independent of the byte order of the host.
///
/// The object consists of exactly sizeof(T) unsigned chars, so a struct built
/// from these members can be written to a file as-is. Reads and writes convert
/// to and from the host representation.
template <typename T, bool store_as_big_endian>
struct EndianType {
    using value_type = T;
    static_assert(std::is_fundamental_v<value_type>,
        "EndianType works for fundamental data types");

    /// Leaves the bytes uninitialised unless the object is value-initialised
    /// (e.g. `EndianType<T, b> x{};`), in which case they are zero.
    EndianType() = default;

    /// Implicit on purpose, so plain values can be assigned to fields.
    EndianType(const value_type& value)
        : bytes_(convert_to(value)) {}

    /// Byte image of a `value_type` in the storage byte order.
    struct TypeAsBytes {
        unsigned char value[sizeof(value_type)];
    };

    /// True when the host stores the most significant byte first.
    ///
    /// Inspects the first byte of an integer 1 through an unsigned char
    /// pointer. The previous version was declared constexpr but declared an
    /// uninitialised local and read an inactive union member; neither is
    /// permitted in a C++17 constant expression.
    static bool is_big_endian() noexcept {
        const unsigned int probe = 1;
        return *reinterpret_cast<const unsigned char*>(&probe) == 0;
    }

    /// Converts a host value into its stored byte image.
    static TypeAsBytes convert_to(const value_type& ivalue) {
        TypeAsBytes ovalue;
        copy_bytes(reinterpret_cast<const unsigned char*>(&ivalue), ovalue.value);
        return ovalue;
    }

    /// Converts a stored byte image back into a host value.
    static value_type convert_from(const TypeAsBytes& ivalue) {
        value_type ovalue;
        copy_bytes(ivalue.value, reinterpret_cast<unsigned char*>(&ovalue));
        return ovalue;
    }

    /// Returns the stored value in host representation.
    value_type get() const {
        return convert_from(bytes_);
    }

    /// Replaces the stored value; returns *this for chaining.
    EndianType& set(const value_type& ivalue) {
        bytes_ = convert_to(ivalue);
        return *this;
    }

    operator value_type() const {
        return get();
    }

    EndianType& operator=(const value_type& ivalue) {
        set(ivalue);
        return *this;
    }

private:
    /// Copies sizeof(value_type) bytes from `src` to `dst`, reversing them
    /// when the storage order differs from the host order. The same operation
    /// serves both conversion directions.
    static void copy_bytes(const unsigned char* src, unsigned char* dst) {
        if (store_as_big_endian != is_big_endian()) {
            std::reverse_copy(src, src + sizeof(value_type), dst);
        } else {
            std::copy(src, src + sizeof(value_type), dst);
        }
    }

    TypeAsBytes bytes_;
};

/// Value stored most significant byte first.
template <typename T>
using BigEndian = EndianType<T, true>;

/// Value stored least significant byte first.
template <typename T>
using LittleEndian = EndianType<T, false>;

} // namespace endian_type
#endif // ENDIAN_TYPE_H
The following contains the write_bmp functions.
bmp_writer.h >> the BMP writer header
#ifndef BMP_WRITER
#define BMP_WRITER
#include "endian_type.h"
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
namespace bmp_writer {
template <typename T>
using LittleEndian = endian_type::LittleEndian<T>;
struct Header {
char magic[2]{ 'B', 'M' };
LittleEndian<std::uint32_t> size;
LittleEndian<std::uint16_t> app_data1;
LittleEndian<std::uint16_t> app_data2;
LittleEndian<std::uint32_t> offset;
};
struct Info {
LittleEndian<std::uint32_t> info_size{ 40 };
LittleEndian<std::uint32_t> width;
LittleEndian<std::uint32_t> height;
LittleEndian<std::uint16_t> count_colour_planes{ 1 };
LittleEndian<std::uint16_t> bits_per_pixel;
LittleEndian<std::uint32_t> compression{};
LittleEndian<std::uint32_t> image_bytes_size;
LittleEndian<std::uint32_t> resolution_horizontal{ 2835 };
LittleEndian<std::uint32_t> resolution_vertical{ 2835 };
LittleEndian<std::uint32_t> count_pallete_entries{ 0 };
LittleEndian<std::uint32_t> important_colours{ 0 };
};
template <std::size_t count>
class Palette {
public:
static constexpr std::uint32_t NUM_CHANNELS = 4;
using Entry = std::uint8_t[NUM_CHANNELS];
private:
Palette() {
for (auto i = 0; i < count; ++i) {
auto& entry = table[i];
for (auto j = 0; j < NUM_CHANNELS - 1; ++j) {
entry[j] = i;
}
}
}
Palette(const Palette&) = delete;
Palette(const Palette&&) = delete;
Palette& operator=(const Palette&) = delete;
Palette& operator=(const Palette&&) = delete;
public:
static const Palette& get() {
static const Palette palette;
return palette;
}
Entry table[count];
};
static_assert(sizeof(Info) == 40, "");
template <typename T>
void write_bmp(
std::ofstream& out,
std::uint32_t width,
std::uint32_t height,
std::uint16_t count_colour_planes,
const T* data,
std::uint32_t data_size
) {
auto& palette = Palette<256>::get();
Header header;
Info info;
info.width = width;
info.height = height;
//info.count_colour_planes = count_colour_planes;
const std::uint32_t t_per_pixel = data_size / (width * height);
info.bits_per_pixel = std::uint16_t(sizeof(T) * 8 * t_per_pixel);
const std::uint32_t row_len = width * sizeof(T) * t_per_pixel;
// Round row up to next multiple of 4.
const std::uint32_t padded_row_len = (row_len + 3) & ~3u;
const std::uint32_t data_size_bytes = padded_row_len * height;
info.image_bytes_size = data_size_bytes;
if (count_colour_planes == 1) {
header.offset = sizeof(Info) + sizeof(Header) + sizeof(palette);
} else {
header.offset = sizeof(Info) + sizeof(Header);
}
header.size = header.offset + height * padded_row_len;
out.write(reinterpret_cast<const char*>(&header), sizeof(header));
out.write(reinterpret_cast<const char*>(&info), sizeof(info));
if (count_colour_planes == 1) {
out.write(reinterpret_cast<const char*>(&palette), sizeof(palette));
}
const char padding[3] = {};
for (int i = height; i > 0;) {
--i;
const char* p_row =
reinterpret_cast<const char*>(data + i * width);
out.write(p_row, row_len);
if (padded_row_len != row_len) {
out.write(padding, padded_row_len - row_len);
}
}
};
template <typename T>
void write_bmp(
std::ofstream& out,
std::uint32_t width,
std::uint32_t height,
std::uint16_t count_colour_planes,
const std::vector<T>& data
) {
write_bmp(out, width, height, count_colour_planes,
&*data.cbegin(), data.size());
}
template <typename T>
void write_bmp(
const std::string& outfilename,
std::uint32_t width,
std::uint32_t height,
std::uint16_t count_colour_planes,
const std::vector<T>& data
) {
std::ofstream out{ outfilename, std::ios_base::binary };
if (!out) {
throw std::runtime_error("Failed to open: " + outfilename);
}
write_bmp(out, width, height, count_colour_planes,
&*data.begin(), static_cast<std::uint32_t>(data.size()));
out.close();
}
} // namespace
#endif // BMP_WRITER
And an example of use:
#include "bmp_writer.h"
/// One 24-bit pixel, stored in the byte order the BMP file expects:
/// c[0] = blue, c[1] = green, c[2] = red.
struct PixelType {
    /// Builds a pixel from separate channel values.
    PixelType(std::uint8_t r, std::uint8_t g, std::uint8_t b)
        : c{ b, g, r } {}
    /// Builds a pixel from a packed 0xRRGGBB value.
    /// The channels are stored blue-first like the three-argument constructor,
    /// so PixelType(0xff0000) equals PixelType(0xff, 0, 0). The explicit casts
    /// avoid a narrowing conversion inside the braces.
    PixelType(std::uint32_t rgb)
        : c{ static_cast<std::uint8_t>(rgb & 0xffu),
             static_cast<std::uint8_t>((rgb >> 8) & 0xffu),
             static_cast<std::uint8_t>((rgb >> 16) & 0xffu) } {}
    /// Black pixel.
    PixelType() = default;
    std::uint8_t c[3] = {};
};
/// Example: fills a 20x10 pixel vector, marks two pixels and saves it through
/// the file-name overload of bmp_writer::write_bmp.
void bmp_writer_test1() {
    constexpr int width = 20;
    constexpr int height = 10;

    std::vector<PixelType> pixels(width * height);

    // Write some pixels.
    pixels[2] = PixelType(0xff0000); // red
    pixels[10] = PixelType(0x00ff00); // green

    const auto channels = std::uint16_t(sizeof(PixelType));
    bmp_writer::write_bmp(
        "test_bmp_writer1.bmp",
        std::uint32_t(width),
        std::uint32_t(height),
        channels,
        pixels
    );
}
/// Example: same image size as bmp_writer_test1, but the pixels live in a
/// plain array and are written through the stream/pointer overload.
/// @throws std::runtime_error if the output file cannot be opened
void bmp_writer_test2() {
    const int size_x = 20;
    const int size_y = 10;
    PixelType data[size_x * size_y];
    // Write some pixels.
    data[15] = PixelType(0xff, 0, 0); // red
    data[17] = PixelType(0, 0xff, 0); // green
    std::ofstream out{ "test_bmp_writer2.bmp", std::ios_base::binary };
    if (!out) {
        throw std::runtime_error("Failed to open: " "test_bmp_writer2.bmp");
    }
    bmp_writer::write_bmp(
        out,
        std::uint32_t(size_x),
        std::uint32_t(size_y),
        std::uint16_t(sizeof(PixelType)),
        data,
        // sizeof applied to a type name requires parentheses.
        static_cast<std::uint32_t>(sizeof(data) / sizeof(PixelType))
    );
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Use load/store correctly - c++

Related

Fast pyrDown image with AVX instructions

Converting a RGBA image to RGB image

Altivec: analogue of _mm_sad_epu8()

Comparing two vector<bool> with SSE

Writing BMP image in pure c/c++ without other libraries

Categories

Resources