Fail to Read Through Shared Memory - c++

I am trying to publish some random values over shared memory, but for some weird reason the reader doesn't pick up what the sender has written.
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/types.h>
#include <cstdio>
class SHM {
volatile char* _ptr;
SHM() {
const auto handle = shm_open("myTest", O_RDWR|O_CREAT, 0666);
const auto size = 4 * 1024 * 1024;
if (-1 == ftruncate(handle, size)) {
_ptr = (volatile char*)mmap(0,size , PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
if(_ptr == MAP_FAILED){
int rc = fchmod(handle, 0666);
if (rc == -1) {
bool read(uint64_t& magic, uint64_t& time) {
const uint64_t newVal = *(uint64_t*)_ptr;
if (newVal != magic) {
magic = newVal;
printf("value changed!!!\n");
time = *(uint64_t*)(_ptr + sizeof(magic));
return true;
//printf("old value: %lu\n", newVal);
return false;
void publish(const uint64_t time) {
__sync_fetch_and_add((uint64_t*)_ptr, time);
*(uint64_t*)(_ptr + sizeof(uint64_t)) = time;
Here is the sender:
#include <ctime>
#include <unistd.h>
#include <cstdlib>
#include <cstdint>
#include "shm.h"
int main() {
SHM shm;
timespec t;
for (auto i = 0; i < 10000; i++) {
if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
const uint64_t v = t.tv_sec * 1000 * 1000 * 1000 + t.tv_nsec;
printf("published %lu\n", v);
Here is the reader:
#include <iostream>
#include "shm.h"
int main() {
SHM shm;
uint64_t magic = 0;
uint64_t t = 0;
while (true) {
if (, t)) {
printf("%lu, %lu\n", magic, t);
If I restart the reader, it is indeed able to read the last value that the sender has written.
However, if I start the reader first and then the sender, none of the values the sender writes are picked up by the reader.
To make this even weirder, if I uncomment the printf statement in SHM::read(), the reader does sometimes pick up the new values.
Any idea?
GCC version:
g++ (GCC) 7.2.1 20170829 (Red Hat 7.2.1-1)

I spotted a couple of issues; however, I am unsure whether they would fix your problem.
The name passed to shm_open should start with / for portable use.
In read and publish, the casts must not discard volatile. E.g.: const uint64_t newVal = *(uint64_t volatile*)_ptr;. Even better, drop volatile and use std::atomic.
Although different processes are involved, this is still a case of the same objects being accessed by more than one thread of execution, with at least one of those threads modifying the shared objects.
I made the above changes. Using std::atomic fixed it:
class SHM {
void* _ptr;
SHM() {
const auto handle = shm_open("/myTest", O_RDWR|O_CREAT, 0666);
const auto size = 4 * 1024 * 1024;
if (-1 == ftruncate(handle, size))
_ptr = mmap(0,size , PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
if(_ptr == MAP_FAILED)
bool read(uint64_t& magic, uint64_t& time) {
auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
const uint64_t newVal = p[0];
if (newVal != magic) {
magic = newVal;
printf("value changed!!!\n");
time = p[1];
return true;
return false;
void publish(const uint64_t time) {
auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
p[0] += time;
p[1] = time;
void sender() {
SHM shm;
timespec t;
for (auto i = 0; i < 10000; i++) {
if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
const uint64_t v = t.tv_sec * 1000 * 1000 * 1000 + t.tv_nsec;
printf("published %lu\n", v);
void reader() {
SHM shm;
uint64_t magic = 0;
uint64_t t = 0;
while (true) {
if (, t)) {
printf("%lu, %lu\n", magic, t);
int main(int ac, char**) {
if(ac > 1)
With std::atomic you can have more control. E.g.:
struct Data {
std::atomic<uint64_t> time;
std::atomic<uint64_t> generation;
// ...
bool read(uint64_t& generation, uint64_t& time) {
auto data = static_cast<Data*>(_ptr);
auto new_generation = data->generation.load(std::memory_order_acquire); // 1. Syncronizes with (2).
if(generation == new_generation)
return false;
generation = new_generation;
time = data->time.load(std::memory_order_relaxed);
printf("value changed!!!\n");
return true;
void publish(const uint64_t time) {
auto data = static_cast<Data*>(_ptr);
data->, std::memory_order_relaxed);
data->generation.fetch_add(time, std::memory_order_release); // 2. (1) Synchronises with this store.


C++ USB communication delay

I use ftd3xx.dll to communicate with the device.
The data-reading part and the data-writing part each run in their own thread.
#include <thread>
#include <queue>
#include <array>
#include <windows.h>
using namespace std;
bool dataRead = false;
queue< vector<unsigned short>> BufferQueue;
unsigned WINAPI Write(void* arg) {
int Width = 1000;
vector<unsigned short> data;
while (Opened)
while (dataRead)
if (BufferQueue.size() > 0) {
data = BufferQueue.front();
//wrtie something
if (!dataRead)
return 0;
unsigned WINAPI Read(void* arg) {
int Width = 1000;
vector<unsigned short> data(Width);
BYTE* acReadBuf = new BYTE[Width];
ULONG ulBytesRead = 0;
int idx = 0;
while (dataRead)
ftStatus = FT_ReadPipe(ftHandle, CstReadPipeNo, acReadBuf, Width, &ulBytesRead, NULL);
if (FT_SUCCESS(ftStatus))
idx = 0;
for (int i = 0; i < Width; i++) {
data[i] = ((unsigned short)((unsigned short)acReadBuf[idx] | ((unsigned short)acReadBuf[idx + 1] << 8)));
idx += 2;
if (BufferQueue.size() > 10000) {
queue< vector<unsigned short>> empty;
swap(BufferQueue, empty);
return 0;
void main() {
dataRead = true;
HANDLE r_hThread = NULL;
unsigned r_threadID;
r_hThread = (HANDLE)_beginthreadex(NULL, 0, Read, NULL, 0, &r_threadID);
HANDLE w_hThread = NULL;
unsigned w_threadID;
w_hThread = (HANDLE)_beginthreadex(NULL, 0, Write, NULL, 0, &w_threadID);
dataRead = false;;
WaitForSingleObject(r_hThread, INFINITE);
WaitForSingleObject(w_hThread, INFINITE);
I would like to queue the array directly, but for now I am using a vector.
Importantly, data loss occurs whenever other programs are started — even something as small as the calculator.
The same happens whether the device delivers the data slowly or quickly.
I would be grateful if someone could help me.

C/C++ threads magic difference in condition

I wanted to write a simple multithreaded app in C/C++. The function funProducent produces 100 values, and if a randomly generated value is in a given range, a char is added to the buffer. The function funKonzument consumes values from the buffer. Here is my code:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#define BUFFER_LIMIT 20
struct struktura{
pthread_mutex_t mutex;
pthread_cond_t bufferNotFull;
pthread_cond_t bufferNotEmpty;
int bufferIndex;
char * buffer;
int junk;
void * funProducent(void *arg){
struktura * data = (struktura *) arg;
int i = 0;
while (i < 100) {
if(data->bufferIndex == BUFFER_LIMIT - 1){
pthread_cond_wait(&data->bufferNotFull, &data->mutex);
int randomValue = (rand() % 20) + 1;
if( randomValue < 13 ){
data->buffer[++data->bufferIndex] = 'a';
printf("%2d : Producent at index %d added %c\n", i, data->bufferIndex, data->buffer[data->bufferIndex]);
} else {
printf("producent is done\n");
void * funKonzument(void *arg){
struktura * data = (struktura *) arg;
int i = 0;
while (i + data->junk < 100) {
printf("%d\n", i + data->junk);
if(data->bufferIndex < 0){
pthread_cond_wait(&data->bufferNotEmpty, &data->mutex);
printf("%2d : Konzument at index %d consumed %c\n", i, data->bufferIndex, data->buffer[data->bufferIndex]);
printf("konzument is done\n");
int main(int argc, char** argv) {
pthread_t threadProducent, threadKonzument;
struktura threadData;
threadData.buffer = (char *) malloc(sizeof(char) * BUFFER_LIMIT);
threadData.bufferIndex = -1;
threadData.bufferNotFull = PTHREAD_COND_INITIALIZER;
threadData.bufferNotEmpty = PTHREAD_COND_INITIALIZER;
threadData.junk = 0;
pthread_create(&threadProducent, NULL, funProducent, &threadData);
pthread_create(&threadKonzument, NULL, funKonzument, &threadData);
pthread_join(threadProducent, NULL);
pthread_join(threadKonzument, NULL);
return 0;
When I try to run this code, it sometimes gets stuck in funKonzument at this line:
pthread_cond_wait(&data->bufferNotEmpty, &data->mutex);
But when I change the condition in the funProducent function from the first of the following two lines to the second:
if( randomValue < 13 )
if( randomValue > 8 )
everything works fine. Can anyone explain what the magic difference between these two conditions is?
You are probably suffering from spurious wakeups and some problem with the junk counter. I removed that counter and added a condition-wait loop function (and a little lock context manager), and the hangs seem to have stopped.
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <stdexcept>
#include <functional>
#define BUFFER_LIMIT 20
struct struktura{
pthread_mutex_t mutex;
pthread_cond_t bufferNotFull;
pthread_cond_t bufferNotEmpty;
int bufferIndex;
char * buffer;
// a lock context manager
class mlock {
pthread_mutex_t* mtx;
mlock(pthread_mutex_t& Mtx) :
int rv=pthread_mutex_lock(mtx);
if(rv) throw std::runtime_error(std::to_string(rv));
mlock(const mlock&) = delete;
mlock(mlock&&) = delete;
mlock& operator=(const mlock&) = delete;
mlock& operator=(mlock&&) = delete;
~mlock() {
// silly loop to take care of spurious wakes
void cwait(pthread_cond_t& c, pthread_mutex_t& m, std::function<bool()> f) {
while(f()) pthread_cond_wait(&c, &m);
void* funProducent(void *arg){
struktura* data = static_cast<struktura*>(arg);
int i = 0;
while(i < 100) {
mlock dummy(data->mutex);
cwait(data->bufferNotFull, data->mutex, [&](){return data->bufferIndex == BUFFER_LIMIT - 1;});
int randomValue = (rand() % 20) + 1;
if( randomValue < 13 ){
data->buffer[++data->bufferIndex] = 'a';
printf("%2d : Producent at index %d added %c\n", i, data->bufferIndex, data->buffer[data->bufferIndex]);
printf("producent is done\n");
return nullptr;
void* funKonzument(void *arg){
struktura* data = static_cast<struktura*>(arg);
int i = 0;
while(i < 100) {
mlock dummy(data->mutex);
cwait(data->bufferNotEmpty, data->mutex, [&](){return data->bufferIndex<0;});
printf("\t\t\t%2d : Konzument at index %d consumed %c\n", i, data->bufferIndex, data->buffer[data->bufferIndex]);
printf("\t\t\tkonzument is done\n");
return nullptr;
int main() {
pthread_t threadProducent, threadKonzument;
struktura threadData;
threadData.buffer = (char *) malloc(sizeof(char) * BUFFER_LIMIT);
threadData.bufferIndex = -1;
threadData.bufferNotFull = PTHREAD_COND_INITIALIZER;
threadData.bufferNotEmpty = PTHREAD_COND_INITIALIZER;
pthread_create(&threadProducent, NULL, funProducent, &threadData);
pthread_create(&threadKonzument, NULL, funKonzument, &threadData);
pthread_join(threadProducent, NULL);
pthread_join(threadKonzument, NULL);
return 0;

Why SDL_RWops performs so poorly when writing to file compared to cstdio and std::fstream?

I'm currently in process of migrating my hobby project from std::fstream to SDL_RWops (because SDL_RWops is my only simple choice for loading assets on Android).
Reading from a file works perfectly, but writing to a file is incredibly slow.
Consider following testcases:
C standard IO - 0.217193 secs
std::FILE *io = std::fopen("o.txt", "w");
for (int i = 0; i < 1024*1024*4; i++)
std::putc('0', io);
C++ streams - 0.278278 secs
std::ofstream io("o.txt");
for (int i = 0; i < 1024*1024*4; i++)
io << '0';
SDL_RWops: - 17.9893 secs
SDL_RWops *io = SDL_RWFromFile("o.txt", "w");
for (int i = 0; i < 1024*1024*4; i++)
io->write(io, "0", 1, 1);
All testcases were compiled with g++ 5.3.0 (mingw-w64) x86 with -O3. I've used SDL 2.0.4.
I've also tried -O0 with similar results (0.02 to 0.25 seconds slower).
After looking at these results I have two obvious questions:
Why is the write performance of SDL_RWops so poor?
What can I do to make it perform better?
Edit: Here is the code of windows_file_write() (from SDL), which is what io->write should point to. It should do buffered output, but I'm not sure how it works.
static size_t SDLCALL
windows_file_write(SDL_RWops * context, const void *ptr, size_t size, size_t num)
size_t total_bytes;
DWORD byte_written;
size_t nwritten;
total_bytes = size * num;
if (!context || context->hidden.windowsio.h == INVALID_HANDLE_VALUE || total_bytes <= 0 || !size)
return 0;
if (context->hidden.windowsio.buffer.left) {
-(LONG)context->hidden.windowsio.buffer.left, NULL,
context->hidden.windowsio.buffer.left = 0;
/* if in append mode, we must go to the EOF before write */
if (context->hidden.windowsio.append) {
if (SetFilePointer(context->hidden.windowsio.h, 0L, NULL, FILE_END) ==
return 0;
if (!WriteFile
(context->hidden.windowsio.h, ptr, (DWORD)total_bytes, &byte_written, NULL)) {
return 0;
nwritten = byte_written / size;
return nwritten;
In short: I've managed to improve it. Now I'm getting 0.316382 secs, which is only a bit slower than other solutions.
But it's one of the dirtiest hacks I've ever done in my life. I'd appreciate any better solutions.
How it was done: I've rolled custom replacement for SDL_RWFromFile(): I've copy-pasted the implementation from SDL_rwops.c and removed all preprocessor branches as if only HAVE_STDIO_H was defined. The function contained a call to SDL_RWFromFP(), thus I've copy-pasted SDL_RWFromFP() too and applied same modifications to it. In turn, SDL_RWFromFP() relied on stdio_size(),stdio_read(),stdio_write(),stdio_seek() and stdio_close() (these are a part of SDL_rwops.c too), thus I've copy-pasted them too. In turn, these relied (again!) on some fields of "hidden" union inside of struct SDL_RWops, which are disabled on windows using preprocessor. Instead of changing the header, I've changed the copy-pasted code to use different members of "hidden" union, which do exist on windows. (It's safe, because nothing except my own and copy-pasted code touches the struct.) Some other tweaks were made to make the code work as C++ instead of C.
This is what I got:
#if OnWindows
#define hidden_stdio_fp ((FILE * &)context->hidden.windowsio.h)
#define hidden_stdio_autoclose ((SDL_bool &)context->hidden.windowsio.append)
// ** Begin copied code **
static auto stdio_size = [](SDL_RWops * context) -> int64_t
int64_t pos, size;
pos = SDL_RWseek(context, 0, RW_SEEK_CUR);
if (pos < 0) {
return -1;
size = SDL_RWseek(context, 0, RW_SEEK_END);
SDL_RWseek(context, pos, RW_SEEK_SET);
return size;
static auto stdio_seek = [](SDL_RWops * context, int64_t offset, int whence) -> int64_t
#ifdef HAVE_FSEEKO64
if (std::fseeko64(hidden_stdio_fp, (off64_t)offset, whence) == 0) {
return std::ftello64(hidden_stdio_fp);
#elif defined(HAVE_FSEEKO)
if (std::fseeko(hidden_stdio_fp, (off_t)offset, whence) == 0) {
return std::ftello(hidden_stdio_fp);
#elif defined(HAVE__FSEEKI64)
if (std::_fseeki64(hidden_stdio_fp, offset, whence) == 0) {
return std::_ftelli64(hidden_stdio_fp);
if (std::fseek(hidden_stdio_fp, offset, whence) == 0) {
return std::ftell(hidden_stdio_fp);
return SDL_Error(SDL_EFSEEK);
static auto stdio_read = [](SDL_RWops * context, void *ptr, std::size_t size, std::size_t maxnum) -> std::size_t
std::size_t nread;
nread = std::fread(ptr, size, maxnum, hidden_stdio_fp);
if (nread == 0 && std::ferror(hidden_stdio_fp)) {
return nread;
static auto stdio_write = [](SDL_RWops * context, const void *ptr, std::size_t size, std::size_t num) -> std::size_t
std::size_t nwrote;
nwrote = std::fwrite(ptr, size, num, hidden_stdio_fp);
if (nwrote == 0 && std::ferror(hidden_stdio_fp)) {
return nwrote;
static auto stdio_close = [](SDL_RWops * context) -> int
int status = 0;
if (context) {
if (hidden_stdio_autoclose) {
/* WARNING: Check the return value here! */
if (std::fclose(hidden_stdio_fp) != 0) {
status = SDL_Error(SDL_EFWRITE);
return status;
static auto RWFromFP = [](FILE * fp, SDL_bool autoclose) -> SDL_RWops *
SDL_RWops *context = 0;
context = SDL_AllocRW();
if (context != 0) {
context->size = stdio_size;
context->seek = stdio_seek;
context->read = stdio_read;
context->write = stdio_write;
context->close = stdio_close;
hidden_stdio_fp = fp;
hidden_stdio_autoclose = autoclose;
context->type = SDL_RWOPS_STDFILE;
return context;
static auto SDL_RWFromFile = [](const char *file, const char *mode) -> SDL_RWops *
SDL_RWops *context = 0;
if (!file || !*file || !mode || !*mode) {
SDL_SetError("SDL_RWFromFile(): No file or no mode specified");
return 0;
FILE *fp = std::fopen(file, mode);
if (fp == 0) {
SDL_SetError("Couldn't open %s", file);
} else {
context = RWFromFP(fp, (SDL_bool)1);
return context;
// ** End copied code **
#undef hidden_stdio_fp
#undef hidden_stdio_autoclose

How to asynchronously read/write in C++?

How do you copy one stream to another using dedicated read/write threads in C++?
Let's say I have these methods (not real, but to illustrate the point) to read/write data from. These read/write functions could represent anything (network/file/USB/serial/etc).
// returns the number of bytes read
void read(char* buffer, int bufferSize, int* bytesRead);
// returns the number of bytes written
void write(char* buffer, int bufferSize, int* bytesWritten);
The solution should also be portable.
NOTE: I am aware that Windows has a FILE_FLAG_OVERLAPPED feature, but this assumes that the read/write is file IO. Remember, these read/write methods could represent anything.
Here is the solution I came up with.
#pragma once
#include <stdlib.h>
#include <queue>
#include <mutex>
#include <thread>
#include <chrono>
#include <list>
#include <thread>
struct BufferBlock;
struct ReadStream
// read a stream to a buffer.
// return non-zero if error occured
virtual int read(char* buffer, int bufferSize, int* bytesRead) = 0;
struct WriteStream
// write a buffer to a stream.
// return non-zero if error occured
virtual int write(char* buffer, int bufferSize, int* bytesWritten) = 0;
class BufferBlockManager
BufferBlockManager(int numberOfBlocks, int bufferSize);
void enqueueBlockForRead(BufferBlock* block);
void dequeueBlockForRead(BufferBlock** block);
void enqueueBlockForWrite(BufferBlock* block);
void dequeueBlockForWrite(BufferBlock** block);
void resetState();
std::list<BufferBlock*> blocks;
std::queue<BufferBlock*> blocksPendingRead;
std::queue<BufferBlock*> blocksPendingWrite;
std::mutex queueLock;
std::chrono::milliseconds dequeueSleepTime;
void AsyncCopyStream(BufferBlockManager* bufferBlockManager, ReadStream* readStream, WriteStream* writeStream, int* readResult, int* writeResult);
#include "AsyncReadWrite.h"
struct BufferBlock
BufferBlock(int bufferSize) : buffer(NULL)
this->bufferSize = bufferSize;
this->buffer = new char[bufferSize];
this->actualSize = 0;
this->isLastBlock = false;
this->bufferSize = 0;
this->buffer = NULL;
this->actualSize = 0;
char* buffer;
int bufferSize;
int actualSize;
bool isLastBlock;
BufferBlockManager::BufferBlockManager(int numberOfBlocks, int bufferSize)
dequeueSleepTime = std::chrono::milliseconds(100);
for (int x = 0; x < numberOfBlocks; x++)
BufferBlock* block = new BufferBlock(bufferSize);
for (std::list<BufferBlock*>::const_iterator iterator = blocks.begin(), end = blocks.end(); iterator != end; ++iterator) {
delete (*iterator);
void BufferBlockManager::enqueueBlockForRead(BufferBlock* block)
block->actualSize = 0;
block->isLastBlock = false;
void BufferBlockManager::dequeueBlockForRead(BufferBlock** block)
while (blocksPendingRead.size() == 0)
if (blocksPendingRead.size() == 0)
*block = blocksPendingRead.front();
void BufferBlockManager::enqueueBlockForWrite(BufferBlock* block)
void BufferBlockManager::dequeueBlockForWrite(BufferBlock** block)
while (blocksPendingWrite.size() == 0)
if (blocksPendingWrite.size() == 0)
*block = blocksPendingWrite.front();
void BufferBlockManager::resetState()
blocksPendingRead = std::queue<BufferBlock*>();
blocksPendingWrite = std::queue<BufferBlock*>();
for (std::list<BufferBlock*>::const_iterator iterator = blocks.begin(), end = blocks.end(); iterator != end; ++iterator) {
(*iterator)->actualSize = 0;
struct AsyncCopyContext
AsyncCopyContext(BufferBlockManager* bufferBlockManager, ReadStream* readStream, WriteStream* writeStream)
this->bufferBlockManager = bufferBlockManager;
this->readStream = readStream;
this->writeStream = writeStream;
BufferBlockManager* bufferBlockManager;
ReadStream* readStream;
WriteStream* writeStream;
int readResult;
int writeResult;
void ReadStreamThread(AsyncCopyContext* asyncContext)
int bytesRead = 0;
BufferBlock* readBuffer = NULL;
while (
// as long there hasn't been any write errors
asyncContext->writeResult == ASYNC_COPY_READ_WRITE_SUCCESS
// and we haven't had an error reading yet
// let's deque a block to read to!
readResult = asyncContext->readStream->read(readBuffer->buffer, readBuffer->bufferSize, &bytesRead);
readBuffer->actualSize = bytesRead;
readBuffer->isLastBlock = bytesRead == 0;
// this was a valid read, go ahead and queue it for writing
// an error occured reading
asyncContext->readResult = readResult;
// since an error occured, lets queue an block to write indicatiting we are done and there are no more bytes to read
readBuffer->isLastBlock = true;
readBuffer->actualSize = 0;
if (readBuffer->isLastBlock) return;
void WriteStreamThread(AsyncCopyContext* asyncContext)
int bytesWritten = 0;
BufferBlock* writeBuffer = NULL;
bool isLastWriteBlock = false;
while (
// as long as there are no errors during reading
asyncContext->readResult == ASYNC_COPY_READ_WRITE_SUCCESS
// and we haven't had an error writing yet
// lets dequeue a block for writing!
isLastWriteBlock = writeBuffer->isLastBlock;
if (writeBuffer->actualSize > 0)
writeResult = asyncContext->writeStream->write(writeBuffer->buffer, writeBuffer->actualSize, &bytesWritten);
if (isLastWriteBlock) return;
asyncContext->writeResult = writeResult;
void AsyncCopyStream(BufferBlockManager* bufferBlockManager, ReadStream* readStream, WriteStream* writeStream, int* readResult, int* writeResult)
AsyncCopyContext asyncContext(bufferBlockManager, readStream, writeStream);
std::thread readThread(ReadStreamThread, &asyncContext);
std::thread writeThread(WriteStreamThread, &asyncContext);
*readResult = asyncContext.readResult;
*writeResult = asyncContext.writeResult;
#include <stdio.h>
#include <tchar.h>
#include "AsyncReadWrite.h"
struct ReadTestStream : ReadStream
int readCount = 0;
// Test implementation of ReadStream::read: writes a short text line into
// `buffer` and reports its length through `bytesRead`; once readCount equals 10
// it reports zero bytes read instead. Always returns 0.
int read(char* buffer, int bufferSize, int* bytesRead)
printf("Starting read...\n");
// NOTE(review): memset takes (dest, value, count). As written this fills zero
// bytes with the value bufferSize, so the buffer is not cleared;
// memset(buffer, 0, bufferSize) looks like the intent — confirm.
memset(buffer, bufferSize, 0);
if (readCount == 10)
*bytesRead = 0;
return 0;
// pretend this function takes a while!
char buff[100];
sprintf_s(buff, "This is read number %d\n", readCount);
// NOTE(review): the destination size given to strcpy_s is sizeof(buff) (100),
// not bufferSize — confirm the caller's buffer is always at least 100 bytes.
strcpy_s(buffer, sizeof(buff), buff);
*bytesRead = strlen(buffer);
printf("Finished read...\n");
return 0;
struct WriteTestStream : WriteStream
int write(char* buffer, int bufferSize, int* bytesWritten)
printf("Starting write...\n");
// pretend this function takes a while!
printf("Finished write...\n");
return 0;
int _tmain(int argc, _TCHAR* argv[])
BufferBlockManager bufferBlockManager(5, 4096);
ReadTestStream readStream;
WriteTestStream writeStream;
int readResult = 0;
int writeResult = 0;
printf("Starting copy...\n");
AsyncCopyStream(&bufferBlockManager, &readStream, &writeStream, &readResult, &writeResult);
printf("Finished copy... readResult=%d writeResult=%d \n", readResult, writeResult);
return 0;
EDIT: I put my solution into a GitHub repository here. If you wish to use this code, refer to the repository since it may be more updated than this answer.
Typically, you would just have one thread for each direction that alternates between reads and writes.

How to use LZMA SDK in C++?

I am having difficulties using the LZMA SDK in my application.
I would like to create a kind of single-file compression tool. I don't need any directory support; I only need the LZMA2 stream. But I have no idea how the LZMA SDK is meant to be used for this.
Can anyone please give me a small example of how the LZMA SDK can be used from C++?
I think this is a suitably small example of how to use the LZMA SDK.
/* LzmaUtil.c -- Test application for LZMA compression
Igor Pavlov
public domain */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../LzmaDec.h"
#include "../LzmaEnc.h"
#include "../Alloc.h"
const char *kCantReadMessage = "Can not read input file";
const char *kCantWriteMessage = "Can not write output file";
const char *kCantAllocateMessage = "Can not allocate memory";
const char *kDataErrorMessage = "Data error";
/* Allocation callback stored in g_Alloc: forwards the request to MyAlloc. */
static void *SzAlloc(void *p, size_t size)
{
  (void)p; /* the allocator context pointer is not used */
  return MyAlloc(size);
}
/* Deallocation callback stored in g_Alloc: forwards the address to MyFree. */
static void SzFree(void *p, void *address)
{
  (void)p; /* the allocator context pointer is not used */
  MyFree(address);
}
static ISzAlloc g_Alloc = { SzAlloc, SzFree };
#define kInBufferSize (1 << 15)
#define kOutBufferSize (1 << 15)
unsigned char g_InBuffer[kInBufferSize];
unsigned char g_OutBuffer[kOutBufferSize];
/* Reads up to `size` bytes from `file` into `data` and returns the number of
   bytes fread actually delivered (which may be fewer than requested). */
size_t MyReadFile(FILE *file, void *data, size_t size)
{
  const size_t bytesRead = fread(data, 1, size, file);
  return bytesRead;
}
/* Reads `size` bytes via MyReadFile and returns non-zero only when the full
   amount was read; a short read yields 0. */
int MyReadFileAndCheck(FILE *file, void *data, size_t size)
{
  const size_t got = MyReadFile(file, data, size);
  return got == size;
}
/* Writes `size` bytes from `data` to `file` and returns the number of bytes
   fwrite reports as written. A zero-length request returns 0 without touching
   the stream. */
size_t MyWriteFile(FILE *file, const void *data, size_t size)
{
  if (size == 0)
    return 0;
  return fwrite(data, 1, size, file);
}
/* Writes `size` bytes via MyWriteFile and returns non-zero only when the full
   amount was written. */
int MyWriteFileAndCheck(FILE *file, const void *data, size_t size)
{
  const size_t written = MyWriteFile(file, data, size);
  return written == size;
}
/* Returns the length of `file` in bytes, measured by seeking to its end.
   On success the stream is left positioned at the start of the file.
   Returns -1 when the stream cannot be seeked or its position cannot be read. */
long MyGetFileLength(FILE *file)
{
  long length;
  if (fseek(file, 0, SEEK_END) != 0)
    return -1;
  length = ftell(file); /* -1L on failure; passed through to the caller */
  if (fseek(file, 0, SEEK_SET) != 0)
    return -1;
  return length;
}
/* Appends the usage banner to `buffer`. `buffer` must already hold a
   NUL-terminated string and have room for the appended text; no bounds
   checking is performed. */
void PrintHelp(char *buffer)
{
  strcat(buffer, "\nLZMA Utility 4.58 Copyright (c) 1999-2008 Igor Pavlov 2008-04-11\n"
      "\nUsage: lzma <e|d> inputFile outputFile\n"
      " e: encode file\n"
      " d: decode file\n");
}
/* Appends "\nError: <message>\n" to `buffer` and returns 1 so callers can
   `return PrintError(...)` as their failure code. `buffer` must already hold a
   NUL-terminated string with room for the appended text; no bounds checking is
   performed. */
int PrintError(char *buffer, const char *message)
{
  strcat(buffer, "\nError: ");
  strcat(buffer, message);
  strcat(buffer, "\n");
  return 1;
}
int PrintErrorNumber(char *buffer, SRes val)
sprintf(buffer + strlen(buffer), "\nError code: %x\n", (unsigned)val);
return 1;
int PrintUserError(char *buffer)
return PrintError(buffer, "Incorrect command");
#define IN_BUF_SIZE (1 << 16)
#define OUT_BUF_SIZE (1 << 16)
static int Decode(FILE *inFile, FILE *outFile, char *rs)
UInt64 unpackSize;
int thereIsSize; /* = 1, if there is uncompressed size in headers */
int i;
int res = 0;
CLzmaDec state;
/* header: 5 bytes of LZMA properties and 8 bytes of uncompressed size */
unsigned char header[LZMA_PROPS_SIZE + 8];
/* Read and parse header */
if (!MyReadFileAndCheck(inFile, header, sizeof(header)))
return PrintError(rs, kCantReadMessage);
unpackSize = 0;
thereIsSize = 0;
for (i = 0; i < 8; i++)
unsigned char b = header[LZMA_PROPS_SIZE + i];
if (b != 0xFF)
thereIsSize = 1;
unpackSize += (UInt64)b << (i * 8);
res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc);
if (res != SZ_OK)
return res;
Byte inBuf[IN_BUF_SIZE];
Byte outBuf[OUT_BUF_SIZE];
size_t inPos = 0, inSize = 0, outPos = 0;
for (;;)
if (inPos == inSize)
inSize = MyReadFile(inFile, inBuf, IN_BUF_SIZE);
inPos = 0;
SizeT inProcessed = inSize - inPos;
SizeT outProcessed = OUT_BUF_SIZE - outPos;
ELzmaFinishMode finishMode = LZMA_FINISH_ANY;
ELzmaStatus status;
if (thereIsSize && outProcessed > unpackSize)
outProcessed = (SizeT)unpackSize;
finishMode = LZMA_FINISH_END;
res = LzmaDec_DecodeToBuf(&state, outBuf + outPos, &outProcessed,
inBuf + inPos, &inProcessed, finishMode, &status);
inPos += (UInt32)inProcessed;
outPos += outProcessed;
unpackSize -= outProcessed;
if (outFile != 0)
MyWriteFile(outFile, outBuf, outPos);
outPos = 0;
if (res != SZ_OK || thereIsSize && unpackSize == 0)
if (inProcessed == 0 && outProcessed == 0)
if (thereIsSize || status != LZMA_STATUS_FINISHED_WITH_MARK)
LzmaDec_Free(&state, &g_Alloc);
return res;
typedef struct _CFileSeqInStream
ISeqInStream funcTable;
FILE *file;
} CFileSeqInStream;
/* Read callback installed in CFileSeqInStream::funcTable.
   `p` is cast to CFileSeqInStream* to reach the wrapped FILE. On entry *size is
   the number of bytes requested; on return it holds the number of bytes
   actually read into `buf`. Always returns SZ_OK. */
static SRes MyRead(void *p, void *buf, size_t *size)
{
  if (*size == 0)
    return SZ_OK;
  /* A zero-byte result for a non-zero request is how end of input is
     signalled to the encoder, so it must not be reported as a failure. */
  *size = MyReadFile(((CFileSeqInStream*)p)->file, buf, *size);
  return SZ_OK;
}
typedef struct _CFileSeqOutStream
ISeqOutStream funcTable;
FILE *file;
} CFileSeqOutStream;
static size_t MyWrite(void *pp, const void *buf, size_t size)
return MyWriteFile(((CFileSeqOutStream *)pp)->file, buf, size);
static SRes Encode(FILE *inFile, FILE *outFile, char *rs)
CLzmaEncHandle enc;
SRes res;
CFileSeqInStream inStream;
CFileSeqOutStream outStream;
CLzmaEncProps props;
enc = LzmaEnc_Create(&g_Alloc);
if (enc == 0)
return SZ_ERROR_MEM;
inStream.funcTable.Read = MyRead;
inStream.file = inFile;
outStream.funcTable.Write = MyWrite;
outStream.file = outFile;
res = LzmaEnc_SetProps(enc, &props);
if (res == SZ_OK)
Byte header[LZMA_PROPS_SIZE + 8];
size_t headerSize = LZMA_PROPS_SIZE;
UInt64 fileSize;
int i;
res = LzmaEnc_WriteProperties(enc, header, &headerSize);
fileSize = MyGetFileLength(inFile);
for (i = 0; i < 8; i++)
header[headerSize++] = (Byte)(fileSize >> (8 * i));
if (!MyWriteFileAndCheck(outFile, header, headerSize))
return PrintError(rs, "writing error");
if (res == SZ_OK)
res = LzmaEnc_Encode(enc, &outStream.funcTable, &inStream.funcTable,
NULL, &g_Alloc, &g_Alloc);
LzmaEnc_Destroy(enc, &g_Alloc, &g_Alloc);
return res;
int main2(int numArgs, const char *args[], char *rs)
FILE *inFile = 0;
FILE *outFile = 0;
char c;
int res;
int encodeMode;
if (numArgs == 1)
return 0;
if (numArgs < 3 || numArgs > 4 || strlen(args[1]) != 1)
return PrintUserError(rs);
c = args[1][0];
encodeMode = (c == 'e' || c == 'E');
if (!encodeMode && c != 'd' && c != 'D')
return PrintUserError(rs);
size_t t4 = sizeof(UInt32);
size_t t8 = sizeof(UInt64);
if (t4 != 4 || t8 != 8)
return PrintError(rs, "LZMA UTil needs correct UInt32 and UInt64");
inFile = fopen(args[2], "rb");
if (inFile == 0)
return PrintError(rs, "Can not open input file");
if (numArgs > 3)
outFile = fopen(args[3], "wb+");
if (outFile == 0)
return PrintError(rs, "Can not open output file");
else if (encodeMode)
if (encodeMode)
res = Encode(inFile, outFile, rs);
res = Decode(inFile, outFile, rs);
if (outFile != 0)
if (res != SZ_OK)
if (res == SZ_ERROR_MEM)
return PrintError(rs, kCantAllocateMessage);
else if (res == SZ_ERROR_DATA)
return PrintError(rs, kDataErrorMessage);
return PrintErrorNumber(rs, res);
return 0;
int MY_CDECL main(int numArgs, const char *args[])
char rs[800] = { 0 };
int res = main2(numArgs, args, rs);
return res;
Also you can see it at:
I recently found a nice example, written in C++. Credit goes to GH user Treeki who published the original gist:
// note: -D_7ZIP_ST is required when compiling on non-Windows platforms
// g++ -o lzma_sample -std=c++14 -D_7ZIP_ST lzma_sample.cpp LzmaDec.c LzmaEnc.c LzFind.c
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <memory>
#include "LzmaEnc.h"
#include "LzmaDec.h"
static void *_lzmaAlloc(ISzAllocPtr, size_t size) {
return new uint8_t[size];
/// Deallocation hook placed in _allocFuncs: releases a buffer that was
/// allocated with new uint8_t[] by _lzmaAlloc.
static void _lzmaFree(ISzAllocPtr, void *addr) {
    // delete[] on a null pointer is a no-op, so no null guard is needed.
    delete[] reinterpret_cast<uint8_t *>(addr);
}
static ISzAlloc _allocFuncs = {
_lzmaAlloc, _lzmaFree
std::unique_ptr<uint8_t[]> lzmaCompress(const uint8_t *input, uint32_t inputSize, uint32_t *outputSize) {
std::unique_ptr<uint8_t[]> result;
// set up properties
CLzmaEncProps props;
if (inputSize >= (1 << 20))
props.dictSize = 1 << 20; // 1mb dictionary
props.dictSize = inputSize; // smaller dictionary = faster!
props.fb = 40;
// prepare space for the encoded properties
SizeT propsSize = 5;
uint8_t propsEncoded[5];
// allocate some space for the compression output
// this is way more than necessary in most cases...
// but better safe than sorry
// (a smarter implementation would use a growing buffer,
// but this requires a bunch of fuckery that is out of
/// scope for this simple example)
SizeT outputSize64 = inputSize * 1.5;
if (outputSize64 < 1024)
outputSize64 = 1024;
auto output = std::make_unique<uint8_t[]>(outputSize64);
int lzmaStatus = LzmaEncode(
output.get(), &outputSize64, input, inputSize,
&props, propsEncoded, &propsSize, 0,
&_allocFuncs, &_allocFuncs);
*outputSize = outputSize64 + 13;
if (lzmaStatus == SZ_OK) {
// tricky: we have to generate the LZMA header
// 5 bytes properties + 8 byte uncompressed size
result = std::make_unique<uint8_t[]>(outputSize64 + 13);
uint8_t *resultData = result.get();
memcpy(resultData, propsEncoded, 5);
for (int i = 0; i < 8; i++)
resultData[5 + i] = (inputSize >> (i * 8)) & 0xFF;
memcpy(resultData + 13, output.get(), outputSize64);
return result;
std::unique_ptr<uint8_t[]> lzmaDecompress(const uint8_t *input, uint32_t inputSize, uint32_t *outputSize) {
if (inputSize < 13)
return NULL; // invalid header!
// extract the size from the header
UInt64 size = 0;
for (int i = 0; i < 8; i++)
size |= (input[5 + i] << (i * 8));
if (size <= (256 * 1024 * 1024)) {
auto blob = std::make_unique<uint8_t[]>(size);
ELzmaStatus lzmaStatus;
SizeT procOutSize = size, procInSize = inputSize - 13;
int status = LzmaDecode(blob.get(), &procOutSize, &input[13], &procInSize, input, 5, LZMA_FINISH_END, &lzmaStatus, &_allocFuncs);
if (status == SZ_OK && procOutSize == size) {
*outputSize = size;
return blob;
return NULL;
// Writes `size` bytes of `buf` to stdout as a hex dump, 16 bytes per row.
// Each row shows the byte offset, the bytes in hex, and an ASCII column in
// which non-printable bytes are shown as '.'.
void hexdump(const uint8_t *buf, int size) {
    const int kBytesPerRow = 16;
    const int rows = (size + kBytesPerRow - 1) / kBytesPerRow;

    for (int row = 0; row < rows; row++) {
        const int rowBegin = row * kBytesPerRow;
        const int rowEnd = rowBegin + kBytesPerRow;
        const int dataEnd = (rowEnd > size) ? size : rowEnd;

        printf("%08x | ", rowBegin);

        for (int j = rowBegin; j < dataEnd; j++)
            printf("%02x ", buf[j]);
        // pad a short final row: every byte occupies three columns ("xx ")
        for (int j = dataEnd; j < rowEnd; j++)
            printf("   ");

        printf("| ");

        for (int j = rowBegin; j < dataEnd; j++) {
            // 127 (DEL) is a control character, so the printable range ends at 126
            const bool printable = buf[j] >= 32 && buf[j] < 127;
            printf("%c", printable ? buf[j] : '.');
        }

        // terminate the row so consecutive rows do not run together
        printf("\n");
    }
}
// Round-trips `size` bytes of `input` through lzmaCompress and
// lzmaDecompress, hex-dumping the input and the result of each stage.
// A failure at either stage is reported on stdout.
void testIt(const uint8_t *input, int size) {
    printf("Test Input:\n");
    hexdump(input, size);

    uint32_t compressedSize = 0;
    auto compressedBlob = lzmaCompress(input, size, &compressedSize);
    if (!compressedBlob) {
        printf("Nope, we screwed it\n");
        // there is nothing to decompress; continuing would hand a null
        // buffer to lzmaDecompress
        return;
    }
    hexdump(compressedBlob.get(), compressedSize);

    // let's try decompressing it now
    uint32_t decompressedSize = 0;
    auto decompressedBlob = lzmaDecompress(compressedBlob.get(), compressedSize, &decompressedSize);
    if (decompressedBlob) {
        hexdump(decompressedBlob.get(), decompressedSize);
    } else {
        printf("Nope, we screwed it (part 2)\n");
    }
}
// Convenience overload: runs the byte-buffer test on the characters of a
// NUL-terminated string (the terminator itself is not included).
void testIt(const char *string) {
    const uint8_t *bytes = reinterpret_cast<const uint8_t *>(string);
    testIt(bytes, strlen(string));
}
// Entry point: runs the round-trip test on a short string and on a longer,
// highly repetitive one. Command-line arguments are not used.
int main(int argc, char **argv) {
    const char *const samples[] = {
        "here is a cool string",
        "here's something that should compress pretty well: abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef",
    };
    for (const char *sample : samples)
        testIt(sample);
    return 0;
}
You can refer to the following file for an example of how to use LZMA2.
// Tencent is pleased to support the open source community by making libpag available.
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// unless required by applicable law or agreed to in writing, software distributed under the
// license is distributed on an "as is" basis, without warranties or conditions of any kind,
// either express or implied. see the license for the specific language governing permissions
// and limitations under the license.
#include "LzmaUtil.h"
#include "test/framework/lzma/Lzma2DecMt.h"
#include "test/framework/lzma/Lzma2Enc.h"
namespace pag {
// Allocation callback stored in gAllocFuncs for the LZMA SDK.
// The allocator-handle argument is ignored; the block is obtained with
// new[] as a byte array, so it has to be released with delete[].
static void* LzmaAlloc(ISzAllocPtr, size_t size) {
  uint8_t* block = new uint8_t[size];
  return block;
}
// Deallocation callback stored in gAllocFuncs for the LZMA SDK.
// Releases a block that was obtained from LzmaAlloc (new uint8_t[]).
// The allocator-handle argument is ignored.
static void LzmaFree(ISzAllocPtr, void* address) {
  // delete[] on a null pointer is a no-op, so no guard is required.
  // (The earlier `if (!address)` guard only ever "freed" null pointers and
  // leaked every real allocation.)
  delete[] static_cast<uint8_t*>(address);
}
// Allocator table passed to the LZMA2 encoder/decoder factories:
// first the allocation callback, then the matching release callback.
static ISzAlloc gAllocFuncs = {&LzmaAlloc, &LzmaFree};
// Abstract byte sink. Implementations consume `size` bytes starting at `data`
// and return true on success, false on failure.
class SequentialOutStream {
 public:
  // Virtual so that implementations can be destroyed through a base pointer.
  virtual ~SequentialOutStream() = default;

  // Consumes `size` bytes from `data`; returns false if the write failed.
  virtual bool write(const void* data, size_t size) = 0;
};
// Abstract byte source. Implementations copy up to `size` bytes into `data`,
// report the number actually copied through `processedSize`, and return true
// on success, false on failure.
class SequentialInStream {
 public:
  // Virtual so that implementations can be destroyed through a base pointer.
  virtual ~SequentialInStream() = default;

  // Reads up to `size` bytes into `data`; returns false if the read failed.
  virtual bool read(void* data, size_t size, size_t* processedSize) = 0;
};
// Pairs the SDK's input-stream interface struct with the C++ stream that
// serves it. MyRead recovers this wrapper from a pointer to `vt`.
struct CSeqInStreamWrap {
  // Interface struct handed to the SDK; its Read callback is set by the user.
  ISeqInStream vt;
  // Stream that actually supplies the bytes; owned by this wrapper.
  std::unique_ptr<SequentialInStream> inStream;
};
// Pairs the SDK's output-stream interface struct with the C++ stream that
// receives the data. MyWrite recovers this wrapper from a pointer to `vt`.
struct CSeqOutStreamWrap {
  // Interface struct handed to the SDK; its Write callback is set by the user.
  ISeqOutStream vt;
  // Stream that actually consumes the bytes; owned by this wrapper.
  std::unique_ptr<SequentialOutStream> outStream;
};
class BuffPtrInStream : public SequentialInStream {
explicit BuffPtrInStream(const uint8_t* buffer, size_t bufferSize)
: buffer(buffer), bufferSize(bufferSize) {
bool read(void* data, size_t size, size_t* processedSize) override {
if (processedSize) {
*processedSize = 0;
if (size == 0 || position >= bufferSize) {
return true;
auto remain = bufferSize - position;
if (remain > size) {
remain = size;
memcpy(data, static_cast<const uint8_t*>(buffer) + position, remain);
position += remain;
if (processedSize) {
*processedSize = remain;
return true;
const uint8_t* buffer = nullptr;
size_t bufferSize = 0;
size_t position = 0;
class VectorOutStream : public SequentialOutStream {
explicit VectorOutStream(std::vector<uint8_t>* buffer) : buffer(buffer) {
bool write(const void* data, size_t size) override {
auto oldSize = buffer->size();
buffer->resize(oldSize + size);
memcpy(&(*buffer)[oldSize], data, size);
return true;
std::vector<uint8_t>* buffer;
class BuffPtrSeqOutStream : public SequentialOutStream {
BuffPtrSeqOutStream(uint8_t* buffer, size_t size) : buffer(buffer), bufferSize(size) {
bool write(const void* data, size_t size) override {
auto remain = bufferSize - position;
if (remain > size) {
remain = size;
if (remain != 0) {
memcpy(buffer + position, data, remain);
position += remain;
return remain != 0 || size == 0;
uint8_t* buffer = nullptr;
size_t bufferSize = 0;
size_t position = 0;
// Upper bound on the number of bytes requested from the input stream in one
// MyRead call (2 GiB). The shift is done in size_t: `1 << 31` is evaluated
// in int, yields a negative value, and converts to a huge size_t on 64-bit
// targets.
static const size_t kStreamStepSize = static_cast<size_t>(1) << 31;
// ISeqInStream::Read callback. Recovers the CSeqInStreamWrap that contains
// `p` and forwards the request to its C++ stream.
// On entry *size is the number of bytes wanted; the request is capped at
// kStreamStepSize. On success *size is set to the number of bytes delivered
// and SZ_OK is returned. If the underlying stream reports a failure,
// SZ_ERROR_READ is returned so the SDK stops instead of treating the buffer
// contents as valid data.
static SRes MyRead(const ISeqInStream* p, void* data, size_t* size) {
  CSeqInStreamWrap* wrap = CONTAINER_FROM_VTBL(p, CSeqInStreamWrap, vt);
  auto curSize = (*size < kStreamStepSize) ? *size : kStreamStepSize;
  if (!wrap->inStream->read(data, curSize, &curSize)) {
    return SZ_ERROR_READ;
  }
  *size = curSize;
  return SZ_OK;
}
// ISeqOutStream::Write callback. Recovers the CSeqOutStreamWrap that
// contains `p` and forwards the bytes to its C++ stream.
// Returns `size` when the stream accepted the write and 0 otherwise.
static size_t MyWrite(const ISeqOutStream* p, const void* buf, size_t size) {
  auto* wrap = CONTAINER_FROM_VTBL(p, CSeqOutStreamWrap, vt);
  const bool accepted = wrap->outStream->write(buf, size);
  return accepted ? size : 0;
}
class Lzma2Encoder {
Lzma2Encoder() {
encoder = Lzma2Enc_Create(&gAllocFuncs, &gAllocFuncs);
~Lzma2Encoder() {
std::shared_ptr<Data> code(const std::shared_ptr<Data>& inputData) {
if (encoder == nullptr || inputData == nullptr || inputData->size() == 0) {
return nullptr;
auto inputSize = inputData->size();
CLzma2EncProps lzma2Props;
lzma2Props.lzmaProps.dictSize = inputSize;
lzma2Props.lzmaProps.level = 9;
lzma2Props.numTotalThreads = 4;
Lzma2Enc_SetProps(encoder, &lzma2Props);
std::vector<uint8_t> outBuf;
outBuf.resize(1 + 8);
outBuf[0] = Lzma2Enc_WriteProperties(encoder);
for (int i = 0; i < 8; i++) {
outBuf[1 + i] = static_cast<uint8_t>(inputSize >> (8 * i));
CSeqInStreamWrap inWrap = {};
inWrap.vt.Read = MyRead;
inWrap.inStream = std::make_unique<BuffPtrInStream>(
static_cast<const uint8_t*>(inputData->data()), inputSize);
CSeqOutStreamWrap outStream = {};
outStream.vt.Write = MyWrite;
outStream.outStream = std::make_unique<VectorOutStream>(&outBuf);
auto status =
Lzma2Enc_Encode2(encoder, &outStream.vt, nullptr, nullptr, &inWrap.vt, nullptr, 0, nullptr);
if (status != SZ_OK) {
return nullptr;
return Data::MakeWithCopy(&outBuf[0], outBuf.size());
CLzma2EncHandle encoder = nullptr;
// Compresses `pixelData` with a freshly created Lzma2Encoder and returns
// whatever the encoder's code() call yields.
std::shared_ptr<Data> LzmaUtil::Compress(const std::shared_ptr<Data>& pixelData) {
  return Lzma2Encoder().code(pixelData);
}
class Lzma2Decoder {
Lzma2Decoder() {
decoder = Lzma2DecMt_Create(&gAllocFuncs, &gAllocFuncs);
~Lzma2Decoder() {
if (decoder) {
std::shared_ptr<Data> code(const std::shared_ptr<Data>& inputData) {
if (decoder == nullptr || inputData == nullptr || inputData->size() == 0) {
return nullptr;
auto input = static_cast<const uint8_t*>(inputData->data());
auto inputSize = inputData->size() - 9;
Byte prop = static_cast<const Byte*>(input)[0];
CLzma2DecMtProps props;
props.inBufSize_ST = inputSize;
props.numThreads = 1;
UInt64 outBufferSize = 0;
for (int i = 0; i < 8; i++) {
outBufferSize |= (input[1 + i] << (i * 8));
auto outBuffer = new uint8_t[outBufferSize];
CSeqInStreamWrap inWrap = {};
inWrap.vt.Read = MyRead;
inWrap.inStream = std::make_unique<BuffPtrInStream>(input + 9, inputSize);
CSeqOutStreamWrap outWrap = {};
outWrap.vt.Write = MyWrite;
outWrap.outStream = std::make_unique<BuffPtrSeqOutStream>(outBuffer, outBufferSize);
UInt64 inProcessed = 0;
int isMT = false;
auto res = Lzma2DecMt_Decode(decoder, prop, &props, &outWrap.vt, &outBufferSize, 1, &inWrap.vt,
&inProcessed, &isMT, nullptr);
if (res == SZ_OK && inputSize == inProcessed) {
return Data::MakeAdopted(outBuffer, outBufferSize, Data::DeleteProc);
delete[] outBuffer;
return nullptr;
CLzma2DecMtHandle decoder = nullptr;
// Decompresses `data` with a freshly created Lzma2Decoder and returns
// whatever the decoder's code() call yields.
std::shared_ptr<Data> LzmaUtil::Decompress(const std::shared_ptr<Data>& data) {
  return Lzma2Decoder().code(data);
}
} // namespace pag