Uncompressing files with Qt qUnCompress function - c++

I have read the documentation and posts about uncompressing ZIP files, but I have additional questions.
I need to uncompress a compressed file in Qt. It is an XML file compressed with gzip.
I know that qUncompress can uncompress data prepared with ZLIB, and that ZLIB uses a different header than GZIP.
As i read in documentation:
Note: If you want to use this function to uncompress external data that was compressed using zlib, you first need to prepend a four byte header to the byte array containing the data. The header must contain the expected length (in bytes) of the uncompressed data, expressed as an unsigned, big-endian, 32-bit integer.
Does that mean that I have to put only the length (big-endian) at the beginning, followed by the compressed data?
I did it but I have an error from qUncompress function:
qUncompress: Z_DATA_ERROR: Input data is corrupted

You need to write your own gUncompress() function using either zlib, or some other library, that implements the DEFLATE algorithm. I personally prefer miniz:
http://code.google.com/p/miniz/
Here's some code for you:
#include <stdexcept>
#include <QtCore>
#ifndef TINFL_HEADER_FILE_ONLY
# define TINFL_HEADER_FILE_ONLY
#endif // TINFL_HEADER_FILE_ONLY
extern "C" {
# include "tinfl.h"
}
#include "guncompress.hpp"
// Decompressor state and output buffer shared by every gUncompress() call.
// NOTE(review): because these are file-level statics, gUncompress() is not
// reentrant, and the QByteArray it returns aliases `result` (see below).
static tinfl_decompressor inflator;
static QByteArray result(TINFL_LZ_DICT_SIZE, 0);
//////////////////////////////////////////////////////////////////////////////
/// Inflates the DEFLATE payload of a gzip member held in memory.
///
/// The first 10 bytes of `data` (the fixed part of a gzip header) are skipped
/// unconditionally and everything after them is handed to tinfl.
/// NOTE(review): a gzip header carrying optional fields (FEXTRA, FNAME,
/// FCOMMENT, FHCRC - see RFC 1952) is longer than 10 bytes and is not handled
/// here; the trailer following the DEFLATE data is not inspected either.
///
/// @param data  complete gzip member
/// @return a QByteArray created with QByteArray::fromRawData() over the static
///         `result` buffer; its contents are overwritten by the next call
/// @throws std::runtime_error if the input is shorter than the fixed header
///         or tinfl reports anything other than "done" / "has more output"
QByteArray gUncompress(QByteArray const& data)
{
  // Size of the fixed gzip header that is skipped before the DEFLATE data.
  static int const gzipHeaderSize = 10;

  // Without this guard the pointer below would start past the end of the
  // input and the size computation would underflow.
  if (data.size() < gzipHeaderSize)
  {
    throw std::runtime_error("error decompressing gzipped content");
  }

  mz_uint8 const* inPtr(
    reinterpret_cast<mz_uint8 const*>(data.data()) + gzipHeaderSize);

  tinfl_init(&inflator);

  // Only the bytes that follow the skipped header are available to tinfl;
  // advertising data.size() here would let it read past the end of `data`.
  size_t inAvail(static_cast<size_t>(data.size() - gzipHeaderSize));
  size_t outTotal(0);

  tinfl_status ret;

  do
  {
    // In: bytes offered / out: bytes actually consumed.
    size_t inSize(inAvail);
    // In: free space after what was already produced / out: bytes written.
    size_t outSize(result.size() - outTotal);

    ret = tinfl_decompress(&inflator,
      inPtr,
      &inSize,
      reinterpret_cast<mz_uint8*>(result.data()),
      reinterpret_cast<mz_uint8*>(result.data()) + outTotal,
      &outSize,
      0
    );

    switch (ret)
    {
      case TINFL_STATUS_HAS_MORE_OUTPUT:
        // Output buffer is full: advance the input window and double the
        // buffer (the size stays a power of two) before calling again.
        inAvail -= inSize;
        inPtr += inSize;

        result.resize(2 * result.size());
        // fall through: the bytes just produced still have to be counted

      case TINFL_STATUS_DONE:
        outTotal += outSize;
        break;

      default:
        throw std::runtime_error("error decompressing gzipped content");
    }
  }
  while (TINFL_STATUS_DONE != ret);

  // No copy is made: the returned array refers to the static buffer.
  return QByteArray::fromRawData(result.data(), outTotal);
}
Also note that zip files and gzip files do not share the same format. Zip files need to be handled differently, as they contain a directory of files they contain.

Look for qzip.cpp, qzipreader_p.h, qzipwriter_p.h in the source for Qt. It can be used for reading and writing zip files.

Related

Why does my use of zlib decompress incorrectly?

Please explain if this is a Zlib bug or I misunderstand the use of Zlib.
I am trying to do the following:
-I have two strings - data from which I need to compress: string_data_1 and string_data_2 and which I compress with Zlib as raw data.
-Next, I create a third string and copy both pieces of already compressed data into this single string.
-Now I'm decompressing this combined compressed data and there is a problem.
Zlib decompressed only the "first" part of the compressed data, did not decompress the second part. Is that how it should be?
For an example in the facebook/zstd:Zstandard library - exactly the same action - leads to unpacking - all compressed data and the first and second parts.
Here is a simple code:
#include <iostream>
#include <string>
#include <zlib.h>
int my_Zlib__compress__RAW(std::string& string_data_to_be_compressed, std::string& string_compressed_result, int level_compressed)
{
//-------------------------------------------------------------------------
uLong zlib_uLong = compressBound(string_data_to_be_compressed.size());
string_compressed_result.resize(zlib_uLong);
//-------------------------------------------------------------------------
//this is the standard Zlib compress2 function - with one exception: the deflateInit2 function is used instead of the deflateInit function and the windowBits parameter is set to "-15" so that Zlib compresses the data as raw data:
int status = my_compress2((Bytef*)&string_compressed_result[0], &zlib_uLong, (const Bytef*)&string_data_to_be_compressed[0], string_data_to_be_compressed.size(), level_compressed);
if (status == Z_OK)
{
string_compressed_result.resize(zlib_uLong);
return 0;
}
else
{
return 1;
}
}
int my_Zlib__uncompress__RAW(std::string& string_data_to_be_uncompressed, std::string& string_compressed_data, size_t size_uncompressed_data)
{
//-------------------------------------------------------------------------
string_data_to_be_uncompressed.resize(size_uncompressed_data);
//-------------------------------------------------------------------------
//this is the standard Zlib uncompress function - with one exception: the inflateInit2 function is used instead of the inflateInit function and the windowBits parameter is set to "-15" so that Zlib uncompresses the data as raw data:
int status = my_uncompress((Bytef*)&string_data_to_be_uncompressed[0], (uLongf*)&size_uncompressed_data, (const Bytef*)&string_compressed_data[0], string_compressed_data.size());
if (status == Z_OK)
{
return 0;
}
}
// Demo: compresses two strings separately, concatenates the two compressed
// results and decompresses the concatenation with a single call.
// Returns 0 on success and 1 if any compression/decompression step fails.
int main()
{
    int level_compressed = 9;

    //------------------------------------------Compress_1-------------------------------------------
    std::string string_data_1 = "Hello12_Hello12_Hello125"; //The data to be compressed.
    std::string string_compressed_result_RAW_1;              //Compressed data will be written here
    int status = my_Zlib__compress__RAW(string_data_1, string_compressed_result_RAW_1, level_compressed);
    if (status != 0)
    {
        std::cerr << "compression of string_data_1 failed" << std::endl;
        return 1;
    }
    //----------------------------------------------------------------------------------------------

    //--------------------------------------Compress_2----------------------------------------------
    std::string string_data_2 = "BUY22_BUY22_BUY223"; //The data to be compressed.
    std::string string_compressed_result_RAW_2;       //Compressed data will be written here
    status = my_Zlib__compress__RAW(string_data_2, string_compressed_result_RAW_2, level_compressed);
    if (status != 0)
    {
        std::cerr << "compression of string_data_2 failed" << std::endl;
        return 1;
    }
    //----------------------------------------------------------------------------------------------

    std::string Total_compressed_data = string_compressed_result_RAW_1 + string_compressed_result_RAW_2; //Combine two compressed data into one string

    //Now I want to uncompress the data in a string - "Total_compressed_data"
    //--------------------------------------Uncompress--------------------------------
    std::string string_uncompressed_result_RAW; //Uncompressed data will be written here
    std::size_t size_that_should_be_when_unpacking = string_data_1.size() + string_data_2.size();

    // my_Zlib__uncompress__RAW takes three arguments (output, input, expected
    // size); the compression level is not a decompression parameter.
    status = my_Zlib__uncompress__RAW(string_uncompressed_result_RAW, Total_compressed_data, size_that_should_be_when_unpacking);
    if (status != 0)
    {
        std::cerr << "decompression failed" << std::endl;
        return 1;
    }
    //--------------------------------------------------------------------------------

    std::cout << string_uncompressed_result_RAW << std::endl; //Hello12_Hello12_Hello125
    return 0;
}
Zlib decompressed only the "first" part of the compressed data, did not decompress the "second" part.
Is that how it should be?
As noted in the comments, a concatenation of zlib streams is not a zlib stream. You need to uncompress again for the second zlib stream. Or compress the whole thing as one zlib stream in the first place.
You would need to use a variant of uncompress2(), not uncompress(), since the former will return the size of the first decompressed zlib stream in the last parameter, so that you know where to start decompressing the second one.
Better yet, you should use the inflate() functions instead for your application. The retention of the uncompressed size for use in decompression means that you'd need that on the other end. How do you get that? Are you transmitting it separately? You do not need that. You should use inflate() to decompress a chunk at a time, and then you don't need to know the uncompressed size ahead of time.
You should also use the deflate() functions for compression. Then you can keep the stream open, and keep compressing until you're done. Then you will have a single zlib stream.

adapt zlib zpipe for char arrays

I have a char array (char* dataToInflate) obtained from a .gz file I would like to inflate into another char array.
I don't know the original decompressed size, so I believe this means I can't use the uncompress function that is within the zlib library, since per the manual:
The size of the uncompressed data must have been saved previously by the compressor and transmitted to the decompressor by some mechanism outside the scope of this compression library.
I have looked at the zpipe.C example (https://zlib.net/zpipe.c), and the inf function here looks suitable but I'm not sure how to adapt it from FILEs to char arrays.
Does anyone know how or have any other ideas for inflating a char array into another char array?
Update:
I read here: Uncompress() of 'zlib' returns Z_DATA_ERROR
that for arrays obtained through gzip files, uncompress isn't suitable.
I found that I could decompress the file in full using gzopen, gzread and gzclose like so:
// Reads a gzip file through zlib's gz* interface and collects the
// decompressed bytes in unzipped_data.
// NOTE(review): gzopen() takes a path name, so gz_char_array is presumably
// the path of the .gz file rather than its contents - confirm.
gzFile in_file_gz = gzopen(gz_char_array, "rb");
char unzip_buffer[8192];
int unzipped_bytes;
std::vector<char> unzipped_data;
while (true) {
    // Ask for at most one buffer's worth; the size is derived from the
    // buffer itself so the two cannot drift apart.
    unzipped_bytes = gzread(in_file_gz, unzip_buffer, sizeof(unzip_buffer));
    if (unzipped_bytes > 0) {
        unzipped_data.insert(unzipped_data.end(), unzip_buffer, unzip_buffer + unzipped_bytes);
    } else {
        // A result of zero or less ends the loop.
        break;
    }
}
gzclose(in_file_gz);
but I would also like to be able to decompress the char array. I tried with the following method:
/* Inflates gzip data held in memory into a caller-supplied buffer.
 *
 * compr / comprLen    compressed input; may consist of several gzip members
 *                     stored back to back
 * uncompr             destination buffer
 * uncomprLen          in:  capacity of uncompr in bytes
 *                     out: number of bytes written to uncompr
 *
 * inflate() reports Z_STREAM_END at the end of every gzip member, so when
 * input is left over the stream is reset and decoding continues with the
 * next member. Decoding stops when the input is exhausted or the destination
 * is full. Errors are reported through CHECK_ERR.
 */
void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong *uncomprLen) {
    /* avail_in / avail_out are uInt, which may be narrower than uLong */
    const uInt maxChunk = (uInt)-1;
    Byte *const comprEnd = compr + comprLen;
    Byte *const uncomprEnd = uncompr + *uncomprLen;
    int err;
    z_stream d_stream; /* decompression stream */

    d_stream.zalloc = NULL;
    d_stream.zfree = NULL;
    d_stream.opaque = NULL;

    d_stream.next_in = compr;
    d_stream.avail_in = 0;
    d_stream.next_out = uncompr;
    d_stream.avail_out = 0;

    /* MAX_WBITS + 16 selects gzip decoding */
    err = inflateInit2(&d_stream, MAX_WBITS + 16);
    CHECK_ERR(err, "inflateInit");

    while (d_stream.next_in < comprEnd && d_stream.next_out < uncomprEnd) {
        const uLong inLeft = (uLong)(comprEnd - d_stream.next_in);
        const uLong outLeft = (uLong)(uncomprEnd - d_stream.next_out);

        /* offer everything that is left, clamped to what uInt can hold */
        d_stream.avail_in = inLeft > maxChunk ? maxChunk : (uInt)inLeft;
        d_stream.avail_out = outLeft > maxChunk ? maxChunk : (uInt)outLeft;

        err = inflate(&d_stream, Z_NO_FLUSH);
        if (err == Z_STREAM_END) {
            if (d_stream.next_in >= comprEnd)
                break; /* last member finished, nothing left to decode */

            /* more input follows: start decoding the next gzip member */
            err = inflateReset(&d_stream);
            CHECK_ERR(err, "inflateReset");
            continue;
        }
        CHECK_ERR(err, "inflate");
    }

    /* inflateReset() zeroes total_out, so derive the size from the pointer */
    *uncomprLen = (uLong)(d_stream.next_out - uncompr);

    err = inflateEnd(&d_stream);
    CHECK_ERR(err, "inflateEnd");
}
but in the while loop, the inflate method returns Z_STREAM_END before the file has decompressed in full.
The method returns successfully, but only a partial buffer has been written.
I put a minimum working example here:
https://github.com/alanjtaylor/zlibExample
if anyone has time to look.
Thanks a lot!
The example you have on github, "zippedFile.gz" is a concatenation of seven independent gzip members. This is permitted by the gzip standard (RFC 1952), and the zlib gz* file functions automatically process all of the members.
pigz will show all of the members:
% pigz -lvt zippedFile.gz
method check timestamp compressed original reduced name
gzip 8 e323586d ------ ----- 616431 1543643 60.1% zippedFile
gzip 8 7efd928a ------ ----- 369231 921600 59.9% <...>
gzip 8 7ebd8b2a ------ ----- 919565 2319970 60.4% <...>
gzip 8 3dd6e2ba ------ ----- 619670 1549236 60.0% <...>
gzip 8 c1cb922e ------ ----- 600367 1533151 60.8% <...>
gzip 8 a9fef06c ------ ----- 620250 1541785 59.8% <...>
gzip 8 43b57506 ------ ----- 623081 1555203 59.9% <...>
The inflate* functions will only process one member at a time, in order to let you know with Z_STREAM_END that the member decompressed successfully and that the CRC checked out ok.
All you need to do is put your inflator in a loop and run it until the input is exhausted, or you run into an error. (This is noted in the documentation for inflateInit2 in zlib.h.)
There are a few issues with your inflator, but I understand that it is just an initial attempt to get things working, so I won't comment.
uncompress is indeed designed for where you have all that information ready. It's a utility function.
It probably wraps inflate, which is what you want to use. You have to run it in a loop and manage the "stream" parameters yourself by repeatedly pointing to the next chunk of buffered data until it's all been eaten.
There's an annotated example in the documentation.

Zlib decompression function copies instead of decompresses

I'm trying to implement archive handling in my application, using zlib on Linux. The app is written in C++ with Qt5.
This is my example function:
int Foo::decompress(const QString &file)
{
char buffer[128];
int num_read=0;
gzFile fi = gzopen(file.toUtf8().constData(),"rb");
FILE *outfile = fopen("/Data/test.unz", "wb");
if (!fi || !outfile) return -1;
while ((num_read = gzread(fi, buffer, sizeof(buffer))) > 0) {
fwrite(buffer, 1, num_read, outfile);
}
gzclose(fi);
fclose(outfile);
}
This code simply copies whatever file I feed it, instead of decompressing it. I've read a few other zlib decompression topics, but all they do is add to my confusion.
Any help?
Then the input is not a gzip file. gzread() serves as a drop-in replacement for fread(), so that when the input is not a gzip file, it works like fread() and reads the file with no translation.
Since your input is apparently not a gzip file, and since you say "archive handling", it sounds like you are confused about the file formats. If you are trying to read a .zip file, that is an entirely different thing from a gzip file.

How to distributed load GZIP File?

I have a gzip file "dat.gz", the origin file contains only ascii text line by line. The .gz file is generated by 'pigz -i'
I want to load "dat.gz" into several process to do parallel data processing. The program language must be C or C++. Under Linux
For example, the original file contains "1\n2\n3", and I load the .gz file into 3 processes (p0, p1, p2), so that p0 gets "1", p1 gets "2" and p2 gets "3".
I read the file format of gz here: http://tools.ietf.org/pdf/rfc1952.pdf , and I found that each block of one .gz file starts with "\x1f\x8b". So I cut the .gz file by "\x1f\x8b" into blocks. But when I use the decompress lib of boost to process the block, something goes wrong.
Maybe my method was wrong at root.
My test .gz file can be downloaded here: https://drive.google.com/file/d/0B9DaAjBTb3bbcEM1N1c4OEg0SWc/view?usp=sharing
My C++ test code is following. Running by "g++ -std=c++11 test.cpp -lboost_iostreams && ./a.out". It throws out an exception.
terminate called after throwing an instance of
boost::exception_detail::clone_impl >'
what(): gzip error
Aborted
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/copy.hpp>
#include <sstream>
//define buffer size of fread: 128KB
#define BUFSIZE 128*1024
void get_first_block(char *fn) {
FILE* fin = fopen(fn, "rb");
char buf[BUFSIZE] = {0};
int pos = 0;
//skip first 2 byte
fread(buf, sizeof(char), 2, fin);
int i;
while (1) {
int sz = fread(buf, sizeof(char), BUFSIZE, fin);
if (sz <= 1) {
break;
}
for (i=0; i<sz-1; ++i) {
if (buf[i] == (char)0x1f && buf[i+1] == (char)0x8b) {
break;
}
}
pos += sz;
}
//first block start: 0
//first block end: pos + i -1
int len = pos+i;
fseek(fin, 0, SEEK_SET);
char *blk = (char*)malloc(len);
fread(blk, 1, len, fin);
using namespace boost::iostreams;
filtering_streambuf<input> in;
in.push( gzip_decompressor() );
in.push( boost::iostreams::array_source(blk , len) );
std::stringstream _sstream;
boost::iostreams::copy(in, _sstream);
std::cout << _sstream.rdbuf() ;
}
// Runs get_first_block() on the test file "0000.gz".
int main() {
    // A writable array is used because a string literal must not be passed
    // where a pointer to non-const char is expected.
    char file_name[] = "0000.gz";
    get_first_block(file_name);
    return 0;
}
It's unlikely that there is more than one of those blocks in a .gz file, also see the Wikipedia article about gzip:
Although its file format also allows for multiple such streams to be
concatenated (zipped files are simply decompressed concatenated as if
they were originally one file), gzip is normally used to compress
just single files.
This is especially true for your test file, because if you additionally look at the "compression method" flag, you can expand the search string to 0x1F, 0x8B, 0x08 which only appears once at the very beginning of your test file.
When trying to split a .gz file into blocks, you've got to do some more parsing instead of just looking for 0x1F, 0x8B, because this can also appear inside compressed data blocks or other parts of the member.
You have to parse the members and the compressed data. Unfortunately, a gzip member only records the uncompressed length of the data (the ISIZE field in its trailer), not the compressed length, so you can't just skip the compressed data without parsing it.
The compressed data will be deflate data (there are other, but unused compression types), see RFC 1951. For non-compressed deflate blocks (chapter 3.2.4), there's a LEN field in the header so you can skip those easily. But unfortunately, there's no length field in the header of compressed blocks, so you'll have to completely parse those.
pigz -i compresses each block independently, which permits random access at each block boundary. Between each block is an empty stored block, which ends with the sequence of bytes 00 00 ff ff. You can search for that sequence, and attempt to decompress after that. There are 39 such markers in your example file.
There is nothing that prevents 00 00 ff ff from appearing in the middle of a compressed block, not marking a block boundary. So you should expect that occasionally you will get a false indication of such a boundary, indicated by a failure to decompress. In that case, simply move on to the next such marker.

Libexif , appending new exif data

I have a task to edit exif tags and add to them application specific values.
if the exif tags exist libexif is more than happy to edit them .
but if the exif tags don't exist, i will have to create them and append them to file.
libexif uses the C fopen so i don't think there is going to be an easy way without some IO manipulation.
I am thinking to read the raw image data put them in memory , fopen(newfile, 'w')
add the exif data
and then append the image data.
Unless someone knows an easier way (I am restricted to libexif; libexiv2 might create a licence conflict).
for the common good i am going to answer my own question, exif application has a modified libjpeg that enable the manipulation of the jpeg raw data.
it has functions like
jpeg_data_load_data (JPEGData *data, const unsigned char *d,unsigned int size);
and
jpeg_data_set_exif_data(myJPEGImage,exif); jpeg_data_save_file(myJPEGImage,"gangrene1.jpg");
These can be used. Freely available programs such as ImageMagick also have their own libjpeg/libexif implementations for manipulating EXIF and JPEG data.
Hope this helps.
I have just gone down the same road as you with choosing between libexif and libexiv2. I went with libexif due to the licensing.
Back to the question at hand,
libexif doesn't support directly loading JPG's in. You'll need another package to read in the JPG and extract the EXIF header (or you could write something yourself).
There is an excellent Github project called exifyay that uses libexif and has two extra libs that handle reading in JPGS. It is a python project but the sources for the libraries are C. You can find exifyay here (note I am not involved in any way with exifyay or libexif)
I have just recently compiled libexif and merged sources from exifyay into a VS2010 project here. There is an example in the folder 'contrib\examples\LibexifExample'. If you don't like downloading random links here is a sample of the code I got working:
/*
* write-exif.c
*
* Placed into the public domain by Daniel Fandrich
*
* Create a new EXIF data block and write it into a JPEG image file.
*
* The JPEG image data used in this example is fixed and is guaranteed not
* to contain an EXIF tag block already, so it is easy to precompute where
* in the file the EXIF data should be. In real life, a library like
* libjpeg (included with the exif command-line tool source code) would
* be used to write to an existing JPEG file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <libexif/exif-data.h>
#include <libjpeg/jpeg-data.h>
#include <JpegEncoderEXIF/JpegEncoderEXIF.h>
/* byte order to use in the EXIF block */
#define FILE_BYTE_ORDER EXIF_BYTE_ORDER_INTEL
/* comment to write into the EXIF block */
#define FILE_COMMENT "libexif demonstration image"
/* special header required for EXIF_TAG_USER_COMMENT */
#define ASCII_COMMENT "ASCII\0\0\0"
/* Build a new entry for `tag` with a freshly allocated `len`-byte data
 * buffer and attach it to IFD `ifd` of `exif`.
 *
 * The entry is created with format EXIF_FORMAT_UNDEFINED and `len`
 * components; callers needing another format adjust the returned entry.
 * Returns the new entry. The local references taken here are dropped before
 * returning.
 */
static ExifEntry *create_tag(ExifData *exif, ExifIfd ifd, ExifTag tag, size_t len)
{
    /* Allocator that manages both the entry and its data buffer */
    ExifMem *allocator = exif_mem_new_default();
    ExifEntry *new_entry;
    void *payload;

    assert(allocator != NULL); /* out of memory */

    /* The entry itself comes from our allocator */
    new_entry = exif_entry_new_mem (allocator);
    assert(new_entry != NULL);

    /* Storage for the tag's value */
    payload = exif_mem_alloc(allocator, len);
    assert(payload != NULL);

    /* Describe the value: `len` components of undefined format */
    new_entry->tag = tag;
    new_entry->format = EXIF_FORMAT_UNDEFINED;
    new_entry->components = len;
    new_entry->size = len;
    new_entry->data = (unsigned char*)payload;

    /* Hand the entry over to the requested IFD */
    exif_content_add_entry (exif->ifd[ifd], new_entry);

    /* Release our own references to the allocator and the entry */
    exif_mem_unref(allocator);
    exif_entry_unref(new_entry);

    return new_entry;
}
/* Load "example.jpg", add a user comment tag and a subject area tag to its
 * EXIF data (creating an EXIF block first when the image has none) and save
 * the result as "test.jpg".
 *
 * Returns 0 on success and 1 if the JPEG or EXIF containers cannot be
 * created.
 */
int main(int argc, char **argv)
{
    ExifEntry *entry;
    //Input JPG
    char mInputFilename[]="example.jpg";
    JPEGData * mJpegData;
    ExifData * mExifData;
    /* set when the image carried no EXIF block and a new one was created */
    int mCreatedExif = 0;

    //Load JPG
    mJpegData = jpeg_data_new_from_file(mInputFilename);
    if (mJpegData == NULL) {
        fprintf(stderr, "Could not load %s\n", mInputFilename);
        return 1;
    }

    //Load Exif data from JPG
    mExifData = jpeg_data_get_exif_data(mJpegData);
    if (mExifData == NULL) {
        /* The image has no EXIF block yet: start from an empty one */
        mExifData = exif_data_new();
        if (mExifData == NULL) {
            fprintf(stderr, "Out of memory\n");
            jpeg_data_unref(mJpegData);
            return 1;
        }
        mCreatedExif = 1;
    }

    //Set some Exif options
    exif_data_set_option(mExifData, EXIF_DATA_OPTION_FOLLOW_SPECIFICATION);
    exif_data_set_data_type(mExifData, EXIF_DATA_TYPE_COMPRESSED);
    exif_data_set_byte_order(mExifData, FILE_BYTE_ORDER);

    if (mCreatedExif) {
        /* Let libexif fill in the mandatory fields of the new block */
        exif_data_fix(mExifData);
    }

    entry = create_tag(mExifData, EXIF_IFD_EXIF, EXIF_TAG_USER_COMMENT,
            sizeof(ASCII_COMMENT) + sizeof(FILE_COMMENT) - 2);
    /* Write the special header needed for a comment tag */
    memcpy(entry->data, ASCII_COMMENT, sizeof(ASCII_COMMENT)-1);
    /* Write the actual comment text, without the trailing NUL character */
    memcpy(entry->data+8, FILE_COMMENT, sizeof(FILE_COMMENT)-1);
    /* create_tag() happens to set the format and components correctly for
     * EXIF_TAG_USER_COMMENT, so there is nothing more to do. */

    /* Create a EXIF_TAG_SUBJECT_AREA tag */
    /* NOTE(review): the four SHORT values of this tag are never written
     * here - confirm that leaving the buffer as allocated is intended. */
    entry = create_tag(mExifData, EXIF_IFD_EXIF, EXIF_TAG_SUBJECT_AREA,
            4 * exif_format_get_size(EXIF_FORMAT_SHORT));
    entry->format = EXIF_FORMAT_SHORT;
    entry->components = 4;

    //Write back exif data
    jpeg_data_set_exif_data(mJpegData,mExifData);
    //Save to JPG
    jpeg_data_save_file(mJpegData,"test.jpg");

    /* Drop the references held by this function */
    exif_data_unref(mExifData);
    jpeg_data_unref(mJpegData);
    return 0;
}