I have this procedure I'm working on, to simply do the following:
Open a gzipped file
Uncompress it to a std::string
Compress it from a std::string
Save as gzipped file
However, I cannot seem to write the gzipped data at the end. The error I'm getting is :No Matching Function Call to 'fwrite'.
Am I feeding it the wrong parameters? Or an incorrect type?
Here's the full procedure:
#include <iostream>
#include "zlib-1.2.8/zlib.h"
#include <stdexcept>
#include <iomanip>
#include <sstream>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <string>
#include <cstring>
#include <cstdlib>
#include "zlib.h"
#include "zconf.h"
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
# include <fcntl.h>
# include <io.h>
# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
# define SET_BINARY_MODE(file)
#endif
#define CHUNK 16384
using namespace std;
bool gzipInflate( const std::string& compressedBytes, std::string& uncompressedBytes ) {
if ( compressedBytes.size() == 0 ) {
uncompressedBytes = compressedBytes ;
return true ;
}
uncompressedBytes.clear() ;
unsigned full_length = compressedBytes.size() ;
unsigned half_length = compressedBytes.size() / 2;
unsigned uncompLength = full_length ;
char* uncomp = (char*) calloc( sizeof(char), uncompLength );
z_stream strm;
strm.next_in = (Bytef *) compressedBytes.c_str();
strm.avail_in = compressedBytes.size() ;
strm.total_out = 0;
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
bool done = false ;
if (inflateInit2(&strm, (16+MAX_WBITS)) != Z_OK) {
free( uncomp );
return false;
}
while (!done) {
// If our output buffer is too small
if (strm.total_out >= uncompLength ) {
// Increase size of output buffer
char* uncomp2 = (char*) calloc( sizeof(char), uncompLength + half_length );
memcpy( uncomp2, uncomp, uncompLength );
uncompLength += half_length ;
free( uncomp );
uncomp = uncomp2 ;
}
strm.next_out = (Bytef *) (uncomp + strm.total_out);
strm.avail_out = uncompLength - strm.total_out;
// Inflate another chunk.
int err = inflate (&strm, Z_SYNC_FLUSH);
if (err == Z_STREAM_END) done = true;
else if (err != Z_OK) {
break;
}
}
if (inflateEnd (&strm) != Z_OK) {
free( uncomp );
return false;
}
for ( size_t i=0; i<strm.total_out; ++i ) {
uncompressedBytes += uncomp[ i ];
}
free( uncomp );
return true ;
}
/** Compress a STL string using zlib with given compression level and return
* the binary data. */
std::string compress_string(const std::string& str, int compressionlevel = Z_BEST_COMPRESSION)
{
z_stream zs; // z_stream is zlib's control structure
memset(&zs, 0, sizeof(zs));
if (deflateInit(&zs, compressionlevel) != Z_OK)
throw(std::runtime_error("deflateInit failed while compressing."));
zs.next_in = (Bytef*)str.data();
zs.avail_in = str.size(); // set the z_stream's input
int ret;
char outbuffer[32768];
std::string outstring;
// retrieve the compressed bytes blockwise
do {
zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
zs.avail_out = sizeof(outbuffer);
ret = deflate(&zs, Z_FINISH);
if (outstring.size() < zs.total_out) {
// append the block to the output string
outstring.append(outbuffer,zs.total_out - outstring.size());
}
}
while (ret == Z_OK);
deflateEnd(&zs);
if (ret != Z_STREAM_END) { // an error occurred that was not EOF
std::ostringstream oss;
oss << "Exception during zlib compression: (" << ret << ") " << zs.msg;
throw(std::runtime_error(oss.str()));
}
return outstring;
}
/* Reads a file into memory. */
bool loadBinaryFile( const std::string& filename, std::string& contents ) {
// Open the gzip file in binary mode
FILE* f = fopen( filename.c_str(), "rb" );
if ( f == NULL )
return false ;
// Clear existing bytes in output vector
contents.clear();
// Read all the bytes in the file
int c = fgetc( f );
while ( c != EOF ) {
contents += (char) c ;
c = fgetc( f );
}
fclose (f);
return true ;
}
int main(int argc, char *argv[]) {
// Read the gzip file data into memory
std::string fileData ;
if ( !loadBinaryFile( "myfilein.gz", fileData ) ) {
printf( "Error loading input file." );
return 0 ;
}
// Uncompress the file data
std::string data ;
if ( !gzipInflate( fileData, data ) ) {
printf( "Error decompressing file." );
return 0 ;
}
// Print the data
printf( "Data: \"" );
for ( size_t i=0; i<data.size(); ++i ) {
printf( "%c", data[i] );
}
printf ( "\"\n" );
std::string outy;
// Compress the file data
outy = compress_string(data, 0);
//Write the gzipped data to a file.
FILE *handleWrite=fopen("myfileout.gz","wb");
fwrite(outy,sizeof(outy),1,handleWrite);
fclose(handleWrite);
return 0;
}
std::string cannot be cast to void* (what fwrite expects).
Try:
fwrite(outy.data(),outy.size(),1,handleWrite);
Although I'm not sure if storing binary data inside std::string is a good idea.
Related
On Python, there is this option errors='ignore' for the open Python function:
open( '/filepath.txt', 'r', encoding='UTF-8', errors='ignore' )
With this, reading a file with invalid UTF8 characters will replace them with nothing, i.e., they are ignored. For example, a file with the characthers Føö»BÃ¥r is going to be read as FøöBår.
If a line as Føö»BÃ¥r is read with getline() from stdio.h, it will be read as Føö�Bår:
FILE* cfilestream = fopen( "/filepath.txt", "r" );
int linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
while( true )
{
if( getline( &readline, &linebuffersize, cfilestream ) != -1 ) {
std::cerr << "readline=" readline << std::endl;
}
else {
break;
}
}
How can I make stdio.h getline() read it as FøöBår instead of Føö�Bår, i..e, ignoring invalid UTF8 characters?
One overwhelming solution I can think of it do iterate throughout all characters on each line read and build a new readline without any of these characters. For example:
FILE* cfilestream = fopen( "/filepath.txt", "r" );
int linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
int index;
int charsread;
int invalidcharsoffset;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
if( readline[index] != '�' ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
Related questions:
Fixing invalid UTF8 characters
Replacing non UTF8 characters
python replace unicode characters
Python unicode: how to replace character that cannot be decoded using utf8 with whitespace?
You are confusing what you see with what is really going on. The getline function does not do any replacement of characters. [Note 1]
You are seeing a replacement character (U+FFFD) because your console outputs that character when it is asked to render an invalid UTF-8 code. Most consoles will do that if they are in UTF-8 mode; that is, the current locale is UTF-8.
Also, saying that a file contains the "characters Føö»BÃ¥r" is at best imprecise. A file does not really contain characters. It contains byte sequences which may be interpreted as characters -- for example, by a console or other user presentation software which renders them into glyphs -- according to some encoding. Different encodings produce different results; in this particular case, you have a file which was created by software using the Windows-1252 encoding (or, roughly equivalently, ISO 8859-15), and you are rendering it on a console using UTF-8.
What that means is that the data read by getline contains an invalid UTF-8 sequence, but it (probably) does not contain the replacement character code. Based on the character string you present, it contains the hex character \xbb, which is a guillemot (») in Windows code page 1252.
Finding all the invalid UTF-8 sequences in a string read by getline (or any other C library function which reads files) requires scanning the string, but not for a particular code sequence. Rather, you need to decode UTF-8 sequences one at a time, looking for the ones which are not valid. That's not a simple task, but the mbtowc function can help (if you have enabled a UTF-8 locale). As you'll see in the linked manpage, mbtowc returns the number of bytes contained in a valid "multibyte sequence" (which is UTF-8 in a UTF-8 locale), or -1 to indicate an invalid or incomplete sequence. In the scan, you should pass through the bytes in a valid sequence, or remove/ignore the single byte starting an invalid sequence, and then continue the scan until you reach the end of the string.
Here's some lightly-tested example code (in C):
#include <stdlib.h>
#include <string.h>
/* Removes in place any invalid UTF-8 sequences from at most 'len' characters of the
* string pointed to by 's'. (If a NUL byte is encountered, conversion stops.)
* If the length of the converted string is less than 'len', a NUL byte is
* inserted.
* Returns the length of the possibly modified string (with a maximum of 'len'),
* not including the NUL terminator (if any).
* Requires that a UTF-8 locale be active; since there is no way to test for
* this condition, no attempt is made to do so. If the current locale is not UTF-8,
* behaviour is undefined.
*/
size_t remove_bad_utf8(char* s, size_t len) {
char* in = s;
/* Skip over the initial correct sequence. Avoid relying on mbtowc returning
* zero if n is 0, since Posix is not clear whether mbtowc returns 0 or -1.
*/
int seqlen;
while (len && (seqlen = mbtowc(NULL, in, len)) > 0) { len -= seqlen; in += seqlen; }
char* out = in;
if (len && seqlen < 0) {
++in;
--len;
/* If we find an invalid sequence, we need to start shifting correct sequences. */
for (; len; in += seqlen, len -= seqlen) {
seqlen = mbtowc(NULL, in, len);
if (seqlen > 0) {
/* Shift the valid sequence (if one was found) */
memmove(out, in, seqlen);
out += seqlen;
}
else if (seqlen < 0) seqlen = 1;
else /* (seqlen == 0) */ break;
}
*out++ = 0;
}
return out - s;
}
Notes
Aside from the possible line-end transformation of the underlying I/O library, which will replace CR-LF with a single \n on systems like Windows where the two character CR-LF sequence is used as a line-end indication.
As #rici well explains in his answer, there can be several invalid UTF-8 sequences in a byte sequence.
Possibly iconv(3) could be worth a look, e.g. see https://linux.die.net/man/3/iconv_open.
When the string "//IGNORE" is appended to tocode, characters that cannot be represented in the target character set will be silently discarded.
Example
This byte sequence, if interpreted as UTF-8, contains some invalid UTF-8:
"some invalid\xFE\xFE\xFF\xFF stuff"
If you display this you would see something like
some invalid���� stuff
When this string passes through the remove_invalid_utf8 function in the following C program, the invalid UTF-8 bytes are removed using the iconv function mentioned above.
So the result is then:
some invalid stuff
C Program
#include <stdio.h>
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
char *remove_invalid_utf8(char *utf8, size_t len) {
size_t inbytes_len = len;
char *inbuf = utf8;
size_t outbytes_len = len;
char *result = calloc(outbytes_len + 1, sizeof(char));
char *outbuf = result;
iconv_t cd = iconv_open("UTF-8//IGNORE", "UTF-8");
if(cd == (iconv_t)-1) {
perror("iconv_open");
}
if(iconv(cd, &inbuf, &inbytes_len, &outbuf, &outbytes_len)) {
perror("iconv");
}
iconv_close(cd);
return result;
}
int main() {
char *utf8 = "some invalid\xFE\xFE\xFF\xFF stuff";
char *converted = remove_invalid_utf8(utf8, strlen(utf8));
printf("converted: %s to %s\n", utf8, converted);
free(converted);
return 0;
}
I also managed to fix it by trailing/cutting down all Non-ASCII characters.
This one takes about 2.6 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
int index;
int charsread;
int invalidcharsoffset;
unsigned int fixedchar;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "index " << std::setw(3) << index
// << " readline " << std::setw(10) << fixedchar
// << " -> '" << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
fixedreadline[index-invalidcharsoffset] = '\0';
// std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Alternative and slower version using memcpy
Using menmove does not improve much speed, so you could either one.
This one takes about 3.1 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
char* finalresult;
int index;
int lastcopy;
int charsread;
int charstocopy;
int invalidcharsoffset;
bool hasignoredbytes;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
hasignoredbytes = false;
source = readline;
destination = fixedreadline;
lastcopy = 0;
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "fixedchar " << std::setw(10)
// << fixedchar << " -> '"
// << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
if( hasignoredbytes ) {
charstocopy = index - lastcopy - invalidcharsoffset;
memcpy( destination, source, charstocopy );
source += index - lastcopy;
lastcopy = index;
destination += charstocopy;
invalidcharsoffset = 0;
hasignoredbytes = false;
}
}
else {
++invalidcharsoffset;
hasignoredbytes = true;
}
}
if( destination != fixedreadline ) {
charstocopy = charsread - static_cast<int>( source - readline )
- invalidcharsoffset;
memcpy( destination, source, charstocopy );
destination += charstocopy - 1;
if( *destination == '\n' ) {
*destination = '\0';
}
else {
*++destination = '\0';
}
finalresult = fixedreadline;
}
else {
finalresult = readline;
}
// std::cerr << "finalresult=" << finalresult << std::endl;
}
else {
break;
}
}
std::cerr << "finalresult=" << finalresult << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Optimized solution using iconv
This takes about 4.6 seconds to parse 319MB of text.
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
// Compile it with:
// g++ -o main test.cpp -O3 -liconv
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
int charsread;
size_t inchars;
size_t outchars;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
iconv_t conversiondescriptor = iconv_open("UTF-8//IGNORE", "UTF-8");
if( conversiondescriptor == (iconv_t)-1 ) {
perror( "iconv_open conversiondescriptor" );
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
source = readline;
inchars = charsread;
destination = fixedreadline;
outchars = charsread;
if( iconv( conversiondescriptor, &source, &inchars, &destination, &outchars ) )
{
perror( "iconv" );
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
free( readline );
free( fixedreadline );
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
if( iconv_close( conversiondescriptor ) ) {
perror( "iconv_close conversiondescriptor" );
}
return 0;
}
Slowest solution ever using mbtowc
This takes about 24.2 seconds to parse 319MB of text.
If you comment out the line fixedchar = mbtowc(NULL, source, charsread); and uncomment the line charsread -= fixedchar; (breaking the invalid characters removal) this will take 1.9 seconds instead of 24.2 seconds (also compiled with -O3 optimization level).
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; )
{
// fixedchar = 1;
fixedchar = mbtowc(NULL, source, charsread);
charsread -= fixedchar;
// std::ostringstream contents;
// for( int index = 0; index < fixedchar; ++index )
// contents << source[index];
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '"
// << contents.str().c_str() << "'" << std::endl;
if( fixedchar > 0 ) {
memmove( destination, source, fixedchar );
source += fixedchar;
destination += fixedchar;
}
else if( fixedchar < 0 ) {
source += 1;
// std::cerr << "errno=" << strerror( errno ) << std::endl;
}
else {
break;
}
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Fastest version from all my others above using memmove
You cannot use memcpy here because the memory regions overlap!
This takes about 2.4 seconds to parse 319MB.
If you comment out the lines *destination = *source and memmove( destination, source, 1 ) (breaking the invalid characters removal) the performance still almost the same as when memmove is being called. Here in, calling memmove( destination, source, 1 ) is a little slower than directly doing *destination = *source;
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Bonus
You can also use Python C Extensions (API).
It takes about 2.3 seconds to parse 319MB without converting them to cached version UTF-8 char*
And takes about 3.2 seconds to parse 319MB converting them to UTF-8 char*.
And also takes about 3.2 seconds to parse 319MB converting them to cached ASCII char*.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
typedef struct
{
PyObject_HEAD
}
PyFastFile;
static PyModuleDef fastfilepackagemodule =
{
// https://docs.python.org/3/c-api/module.html#c.PyModuleDef
PyModuleDef_HEAD_INIT,
"fastfilepackage", /* name of module */
"Example module that wrapped a C++ object", /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module, or
-1 if the module keeps state in global variables. */
NULL, /* PyMethodDef* m_methods */
NULL, /* inquiry m_reload */
NULL, /* traverseproc m_traverse */
NULL, /* inquiry m_clear */
NULL, /* freefunc m_free */
};
// initialize PyFastFile Object
static int PyFastFile_init(PyFastFile* self, PyObject* args, PyObject* kwargs) {
char* filepath;
if( !PyArg_ParseTuple( args, "s", &filepath ) ) {
return -1;
}
int linecount = 0;
PyObject* iomodule;
PyObject* openfile;
PyObject* fileiterator;
iomodule = PyImport_ImportModule( "builtins" );
if( iomodule == NULL ) {
std::cerr << "ERROR: FastFile failed to import the io module '"
"(and open the file " << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openfunction = PyObject_GetAttrString( iomodule, "open" );
if( openfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module open "
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
openfile = PyObject_CallFunction(
openfunction, "ssiss", filepath, "r", -1, "ASCII", "ignore" );
if( openfile == NULL ) {
std::cerr << "ERROR: FastFile failed to open the file'"
<< filepath << "'!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* iterfunction = PyObject_GetAttrString( openfile, "__iter__" );
Py_DECREF( openfunction );
if( iterfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator"
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openiteratorobject = PyObject_CallObject( iterfunction, NULL );
Py_DECREF( iterfunction );
if( openiteratorobject == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator object"
<< " (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
fileiterator = PyObject_GetAttrString( openfile, "__next__" );
Py_DECREF( openiteratorobject );
if( fileiterator == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator "
<< "object (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* readline;
while( ( readline = PyObject_CallObject( fileiterator, NULL ) ) != NULL ) {
linecount += 1;
PyUnicode_AsUTF8( readline );
Py_DECREF( readline );
// std::cerr << "linecount " << linecount << " readline '" << readline
// << "' '" << PyUnicode_AsUTF8( readline ) << "'" << std::endl;
}
std::cerr << "linecount " << linecount << std::endl;
// PyErr_PrintEx(100);
PyErr_Clear();
PyObject* closefunction = PyObject_GetAttrString( openfile, "close" );
if( closefunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the close file function for '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* closefileresult = PyObject_CallObject( closefunction, NULL );
Py_DECREF( closefunction );
if( closefileresult == NULL ) {
std::cerr << "ERROR: FastFile failed close open file '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
Py_DECREF( closefileresult );
Py_XDECREF( iomodule );
Py_XDECREF( openfile );
Py_XDECREF( fileiterator );
return 0;
}
// destruct the object
static void PyFastFile_dealloc(PyFastFile* self) {
Py_TYPE(self)->tp_free( (PyObject*) self );
}
static PyTypeObject PyFastFileType =
{
PyVarObject_HEAD_INIT( NULL, 0 )
"fastfilepackage.FastFile" /* tp_name */
};
// create the module
PyMODINIT_FUNC PyInit_fastfilepackage(void)
{
PyObject* thismodule;
// https://docs.python.org/3/c-api/typeobj.html
PyFastFileType.tp_new = PyType_GenericNew;
PyFastFileType.tp_basicsize = sizeof(PyFastFile);
PyFastFileType.tp_dealloc = (destructor) PyFastFile_dealloc;
PyFastFileType.tp_flags = Py_TPFLAGS_DEFAULT;
PyFastFileType.tp_doc = "FastFile objects";
PyFastFileType.tp_init = (initproc) PyFastFile_init;
if( PyType_Ready( &PyFastFileType) < 0 ) {
return NULL;
}
thismodule = PyModule_Create(&fastfilepackagemodule);
if( thismodule == NULL ) {
return NULL;
}
// Add FastFile class to thismodule allowing the use to create objects
Py_INCREF( &PyFastFileType );
PyModule_AddObject( thismodule, "FastFile", (PyObject*) &PyFastFileType );
return thismodule;
}
To built it, create the file source/fastfilewrappar.cpp with the contents of the above file and the setup.py with the following contents:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup, Extension
myextension = Extension(
language = "c++",
extra_link_args = ["-std=c++11"],
extra_compile_args = ["-std=c++11"],
name = 'fastfilepackage',
sources = [
'source/fastfilewrapper.cpp'
],
include_dirs = [ 'source' ],
)
setup(
name = 'fastfilepackage',
ext_modules= [ myextension ],
)
To run example, use following Python script:
import time
import datetime
import fastfilepackage
testfile = './test.txt'
timenow = time.time()
iterable = fastfilepackage.FastFile( testfile )
fastfile_time = time.time() - timenow
timedifference = datetime.timedelta( seconds=fastfile_time )
print( 'FastFile timedifference', timedifference, flush=True )
Example:
user#user-pc$ /usr/bin/pip3.6 install .
Processing /fastfilepackage
Building wheels for collected packages: fastfilepackage
Building wheel for fastfilepackage (setup.py) ... done
Stored in directory: /pip-ephem-wheel-cache-j313cpzc/wheels/e5/5f/bc/52c820
Successfully built fastfilepackage
Installing collected packages: fastfilepackage
Found existing installation: fastfilepackage 0.0.0
Uninstalling fastfilepackage-0.0.0:
Successfully uninstalled fastfilepackage-0.0.0
Successfully installed fastfilepackage-0.0.0
user#user-pc$ /usr/bin/python3.6 fastfileperformance.py
linecount 820800
FastFile timedifference 0:00:03.204614
Using std::getline
This takes about 4.7 seconds to parse 319MB.
If you remove the UTF-8 removal algorithm borrowed from the fastest benchmark using stdlib.h getline(), it takes 1.7 seconds to run.
#include <stdlib.h>
#include <iostream>
#include <locale>
#include <fstream>
#include <iomanip>
int main(int argc, char const *argv[])
{
unsigned int fixedchar;
int linecount = -1;
char* source;
char* lineend;
char* destination;
if( ( source = setlocale( LC_ALL, "en_US.ascii" ) ) == NULL ) {
perror( "setlocale" );
return -1;
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
std::ifstream fileifstream{ "./test.txt" };
if( fileifstream.fail() ) {
std::cerr << "ERROR: FastFile failed to open the file!" << std::endl;
return -1;
}
size_t linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
while( true )
{
if( !fileifstream.eof() )
{
linecount += 1;
fileifstream.getline( readline, linebuffersize );
lineend = readline + fileifstream.gcount();
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "linecount='" << linecount << "'" << std::endl;
if( fileifstream.is_open() ) {
fileifstream.close();
}
free( readline );
return 0;
}
Resume
2.6 seconds trimming UTF-8 using two buffers with indexing
3.1 seconds trimming UTF-8 using two buffers with memcpy
4.6 seconds removing invalid UTF-8 with iconv
24.2 seconds removing invalid UTF-8 with mbtowc
2.4 seconds trimming UTF-8 using one buffer with pointer direct assigning
Bonus
2.3 seconds removing invalid UTF-8 without converting them to a cached UTF-8 char*
3.2 seconds removing invalid UTF-8 converting them to a cached UTF-8 char*
3.2 seconds trimming UTF-8 and caching as ASCII char*
4.7 seconds trimming UTF-8 with std::getline() using one buffer with pointer direct assigning
The used file ./text.txt had 820.800 lines where each line was equal to:
id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char\r\n
And all versions where compiled with
g++ (GCC) 7.4.0
iconv (GNU libiconv 1.14)
g++ -o main test.cpp -O3 -liconv && time ./main
I'm trying to use miniunzip to extract some files. It works on Linux. On Windows, it throws no errors, but if the file is executable, the resulting binary doesn't work. I get a message window with a message about incompatibility with 64-bit Windows. If I use another utility, such as 7-zip, to unpack it, everything works fine, so the problem is here in my code. Here is the class method that does all the work.
bool FileHandler::unzip( string inputFile, string outputDirectory )
{
if (!fileExists(inputFile)) {
this->errorMessage = "Can't find file at " + inputFile;
return false;
}
unzFile zipFile = unzOpen(inputFile.c_str());
if( zipFile == nullptr ){
this->errorMessage = "FileHandler::unzip failed to open input file";
return false;
}
vector<string> files;
vector<string> folders;
unz_global_info globalInfo;
int err = unzGetGlobalInfo( zipFile, &globalInfo );
if (unzGoToFirstFile(zipFile) != UNZ_OK) {
this->errorMessage = "FileHandler::unzip failed calling unzGoToFirstFile";
return false;
}
for ( unsigned long i=0; i < globalInfo.number_entry && err == UNZ_OK; i++ ){
char filename[FILENAME_MAX];
unz_file_info subFileInfo;
err = unzGetCurrentFileInfo( zipFile, &subFileInfo, filename,
sizeof(filename), NULL, 0, NULL, 0);
if ( err == UNZ_OK )
{
char nLast = filename[subFileInfo.size_filename-1];
if ( nLast =='/' || nLast == '\\' )
{
folders.push_back(filename);
}
else
{
files.push_back(filename);
}
err = unzGoToNextFile(zipFile);
}
}
for ( string & folder : folders ){
string strippedFolder = folder.substr(0, folder.length()-1);
string dirPath = normalizePath(outputDirectory+"/"+strippedFolder);
if( ! makeDirectory( dirPath ) ){
this->errorMessage = "FileHandler::unzip Failed to create directory "+dirPath;
return false;
}
}
for ( auto it = files.begin(); it != files.end(); it++ ){
if( zipFile == 0 ){
this->errorMessage = "FileHandler::unzip invalid unzFile object at position 1";
return false;
}
string filename = *it;
//string filepath = outputDirectory + "/" + *it;
string filepath = normalizePath( outputDirectory + "/" + *it );
const char * cFile = filename.c_str();
const char * cPath = filepath.c_str();
int err = unzLocateFile( zipFile, cFile, 0 );
if ( err != UNZ_OK ){
this->errorMessage = "FileHandler::unzip error locating sub-file.";
return false;
}
err = unzOpenCurrentFile( zipFile );
if( err != UNZ_OK ){
this->errorMessage = "FileHandler::unzip error opening current file";
return false;
}
ofstream fileStream{ cPath };
// Need an ostream object here.
if( fileStream.fail() ){
this->errorMessage = "FileHandler::unzip error opening file stream at "+string(cPath);
return false;
}
unz_file_info curFileInfo;
err = unzGetCurrentFileInfo( zipFile, &curFileInfo, 0, 0, 0, 0, 0, 0);
if ( err != UNZ_OK )
{
this->errorMessage = "FileHandler::unzip failed to read size of file";
return false;
}
unsigned int size = (unsigned int)curFileInfo.uncompressed_size;
char * buf = new char[size];
size = unzReadCurrentFile( zipFile, buf, size );
if ( size < 0 ){
this->errorMessage = "FileHandler::unzip unzReadCurrentFile returned an error. ";
return false;
}
fileStream.write( buf, size );
fileStream.flush();
delete [] buf;
fileStream.close();
#ifndef _WIN32
vector<string> parts = splitString(filename, ".");
if( parts.size() == 1 ){ // In linux, assume file without extension is executable
mode_t old_mask = umask( 000 );
chmod( cPath, S_IRWXU|S_IRWXG|S_IROTH|S_IXOTH );
umask( old_mask );
}
#endif
unzCloseCurrentFile( zipFile );
}
unzClose(zipFile);
return true;
}
std::ostream opens files in text mode by default, you need to make it use binary mode instead.
On Linux there doesn't seem to be any difference between text and binary modes. But on Windows, attempting to write \n into a text file produces \r\n, currupting your data.
You need to change this line
ofstream fileStream{ cPath };
to
ofstream fileStream{ cPath, ostream::out | ostream::binary };
For example the file I am parsing contains unicode char u201d ie. ” (accented quote)
How do I replace it with " (Straight quote)?
using c++ and STL i would use a code like this, you still need to save to output buffer to file.. tested on linux.
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
// load file data
char* load_file(const char *filename)
{
FILE *fp;
fp = fopen(filename, "r");
if (!fp)
return NULL;
size_t size;
if ((0 != fseek(fp, 0, SEEK_END)) || (-1 == (size = ftell(fp))))
size = 0;
// set fp at file start
fseek(fp, 0, 0);
char *buffer;
buffer = (char*) malloc(size);
if(!buffer)
{
fclose (fp);
return NULL;
}
if(size != fread(buffer, 1, size, fp))
{
free (buffer);
buffer = NULL;
}
fclose (fp);
return buffer;
}
// replace string
std::string replace(const std::string& str, const std::string& from, const std::string& to)
{
if(str.size() < 1)
return str;
std::string temp_str(str);
size_t start_pos = 0;
while((start_pos = temp_str.find(from, start_pos)) != std::string::npos)
{
temp_str.replace(start_pos, from.length(), to);
start_pos += to.length();
}
return temp_str.c_str();
}
int main(int argc, char** argv)
{
const char* file_name = "test.txt";
char* file_bytes = load_file(file_name);
if(file_bytes == nullptr)
return EXIT_FAILURE;
std::cout << replace(file_bytes, "”", "\"") << std::endl;
return EXIT_SUCCESS;
}
I am writing a simple app that outputs all files in some directory to console. To achieve this I dynamically allocate memory in function PathCreator() and return a pointer to this memory. I don't know how to correctly free this memory segment in GetAllFiles(). When I use the code below I get a stack overflow exception. How can I fix this? Please don't offer me to use something that doesn't need dynamically allocated memory, I just want to fix my code.
#include "stdafx.h"
#include <windows.h>
#include <iostream>
wchar_t *PathCreator(wchar_t *dir, wchar_t *fileName);
int is_directory(wchar_t *p)
{
wchar_t *t = PathCreator(p,L"\\");
WIN32_FIND_DATA file;
HANDLE search_hendle = FindFirstFile(t, &file);
long error = GetLastError();
if(error == 267)
{
return 0;
}
else
{
return 1;
}
}
wchar_t *PathCreator(wchar_t *dir, wchar_t *fileName)
{
wchar_t* path = 0;
int size = 0;
wchar_t *d = dir;
wchar_t *f = fileName;
while(*d != '\0')
{
d++;
size++;
}
while(*f != '\0')
{
f++;
size++;
}
path = new wchar_t[(size+=3) * sizeof(wchar_t)];
int j = 0;
while(j < size)
{
path[j] = '\0';
j++;
}
int i;
i = 0;
while(*dir != '\0')
{
path[i] = *dir;
i++;
dir++;
}
path[i++] = '\\';
wchar_t *t = fileName;
while(*t != '\0')
{
path[i] = *t;
i++;
t++;
}
path[i] = '\0';
return path;
}
void GetAllFiles(wchar_t* dir)
{
wchar_t *p = 0;
int i = 0;
WIN32_FIND_DATA file;
wchar_t *t = PathCreator(dir, L"*");
HANDLE search_hendle = FindFirstFile(t, &file);
if(search_hendle)
{
do
{
p = PathCreator(dir,file.cFileName);
if(!is_directory(p))
{
std::wcout << p << std::endl;
}
else
{
GetAllFiles(p);
}
delete [] p;
}
while(FindNextFile(search_hendle, &file));
}
delete [] t;
FindClose(search_hendle);
}
int _tmain(int argc, _TCHAR* argv[])
{
GetAllFiles(L"C:\\Users");
}
So, you have "." and ".." in your directory search.
The first entry is ".", so:
p = PathCreator(dir, file.cFilename)
yields:
"C:\Users\."
Then the next line:
if (!is_directory(p))
Is ALWAYS false, so it just keeps recursing into:
GetAllFiles(p)
forever ... or until your stack blows up, whichever comes first ;-)
I would recommend explicitly checking for "." and ".." and skipping those entries (also MFC and Qt, etc. have nice directory handling classes, but I think you want to do it this way).
My modification:
do
{
// I added this - guess I can't embolden code text
if (wcscmp(file.cFileName,L".") == 0 || wcscmp(file.cFileName,L"..")==0)
continue;
p = PathCreator(dir,file.cFileName);
if(!is_directory(p))
{
std::wcout << p << std::endl;
}
else
{
GetAllFiles(p);
}
delete [] p;
}
while(FindNextFile(search_hendle, &file));
Again you try to use C in place of C++ and you still using wcout?! no problem you are a programmer and I'm sure you have a reason for this! but memory management in C is much much harder than C++ and you should have some skills to use it. Here is a fully working code but as you see it is really harder to manage, use and understand than its C++ version using standard containers and string, so if you are allowed to use C++(as you use wcout) then use its C++ version for ease:
#include <Windows.h>
/*! \brief Merge \a folder and \a filename into a newly allocate memory and
* return it to the caller. Use free to free returned memory!
*/
wchar_t* PathCreator( wchar_t const* folder, wchar_t const* filename )
{
wchar_t* res;
size_t i, len, folderLen = wcslen( folder ), filenameLen = wcslen( filename );
len = folderLen + filenameLen;
if( folder[folderLen - 1] != '\\' ) ++len;
++len; // for \0
res = (wchar_t*) malloc( sizeof(wchar_t) * len );
if( !res ) return NULL;
wcscpy_s( res, len, folder );
/* Remove possible wide card at end of folder */
for( i = folderLen; i--; ) {
if( res[i] == '*' || res[i] == '?' ) {
res[i] = 0;
--folderLen;
} else {
break;
}
}
if( res[folderLen - 1] != '\\' ) wcscat_s( res, len, L"\\" );
wcscat_s( res, len, filename );
return res;
}
/*! \brief Free memory that returned by \ref GetAllFiles
*/
void FreeAllFilesMemory( wchar_t** p )
{
wchar_t** tmp = p;
if( !p ) return ;
while( *tmp ) free( *tmp++ );
free( p );
}
wchar_t** AddToArray( wchar_t** p, size_t* pAllocated, size_t* pUsed, wchar_t* s )
{
if( *pUsed >= *pAllocated ) {
size_t newAlloc = *pAllocated * 3 / 2; // Grow by 1.5
if( newAlloc < 16 ) newAlloc = 16;
p = (wchar_t**) realloc( p, newAlloc * sizeof(wchar_t*) );
if( !p ) return NULL;
*pAllocated = newAlloc;
}
p[*pUsed] = s;
++*pUsed;
return p;
}
wchar_t** GetAllFilesImpl( wchar_t const* folder, wchar_t** res, size_t* pAllocated, size_t* pUsed )
{
HANDLE hSearch;
WIN32_FIND_DATAW fileinfo;
size_t allocatedMemory = 0;
hSearch = FindFirstFileW( folder, &fileinfo );
if( hSearch != INVALID_HANDLE_VALUE ) {
do {
wchar_t* sFileName, ** tmp, sTmp[ 1024 ];
/* ignore ., .. */
if( !wcscmp(fileinfo.cFileName, L".") ||
!wcscmp(fileinfo.cFileName, L"..") )
continue;
sFileName = PathCreator( folder, fileinfo.cFileName );
wprintf( L"%s\n", sFileName ); /* Print result */
tmp = AddToArray( res, pAllocated, pUsed, sFileName );
if( !tmp ) return FreeAllFilesMemory(res), NULL;
res = tmp;
if( fileinfo.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY ) {
wcscpy_s( sTmp, sFileName );
wcscat_s( sTmp, L"\\*" );
tmp = GetAllFilesImpl( sTmp, res, pAllocated, pUsed );
if( !tmp ) return NULL;
res = tmp;
}
} while( FindNextFileW(hSearch, &fileinfo) );
FindClose( hSearch );
}
return res;
}
/*! \brief List all files that match a pattern and return it as an array of
* wide strings, free result using \ref FreeAllFilesMemory
*/
wchar_t** GetAllFiles( wchar_t const* folder )
{
size_t nAllocated = 0, nUsed = 0;
wchar_t** res = GetAllFilesImpl( folder, NULL, &nAllocated, &nUsed );
if( res ) {
/* to indicate end of result add a NULL string */
wchar_t** tmp = AddToArray( res, &nAllocated, &nUsed, NULL );
if( !tmp ) return FreeAllFilesMemory(res), NULL;
res = tmp;
}
return res;
}
I have 3 terabyte .gz file and want to read its uncompressed content line-by-line in a C++ program. As the file is quite huge, I want to avoid loading it completely in memory.
Can anyone post a simple example of doing it?
You most probably will have to use ZLib's deflate, example is available from their site
Alternatively you may have a look at BOOST C++ wrapper
The example from BOOST page (decompresses data from a file and writes it to standard output)
#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/zlib.hpp>
int main()
{
using namespace std;
ifstream file("hello.z", ios_base::in | ios_base::binary);
filtering_streambuf<input> in;
in.push(zlib_decompressor());
in.push(file);
boost::iostreams::copy(in, cout);
}
For something that is going to be used regularly, you probably want to use one of the previous suggestions. Alternatively, you can do
gzcat file.gz | yourprogram
and have yourprogram read from cin. This will decompress parts of the file in memory as it is needed, and send the uncompressed output to yourprogram.
Using zlib, I'm doing something along these lines:
// return a line in a std::vector< char >
std::vector< char > readline( gzFile f ) {
std::vector< char > v( 256 );
unsigned pos = 0;
for ( ;; ) {
if ( gzgets( f, &v[ pos ], v.size() - pos ) == 0 ) {
// end-of-file or error
int err;
const char *msg = gzerror( f, &err );
if ( err != Z_OK ) {
// handle error
}
break;
}
unsigned read = strlen( &v[ pos ] );
if ( v[ pos + read - 1 ] == '\n' ) {
if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
pos = pos + read - 2;
} else {
pos = pos + read - 1;
}
break;
}
if ( read == 0 || pos + read < v.size() - 1 ) {
pos = read + pos;
break;
}
pos = v.size() - 1;
v.resize( v.size() * 2 );
}
v.resize( pos );
return v;
}
EDIT: Removed two mis-copied * in the example above.
EDIT: Corrected out of bounds read on v[pos + read - 2]
The zlib library supports decompressing files in memory in blocks, so you don't have to decompress the entire file in order to process it.
Here is some code with which you can read normal and zipped files line by line:
char line[0x10000];
FILE *infile=open_file(file);
bool gzipped=endsWith(file, ".gz");
if(gzipped)
init_gzip_stream(infile,&line[0]);
while (readLine(infile,line,gzipped)) {
if(line[0]==0)continue;// skip gzip new_block
printf(line);
}
#include <zlib.h>
#define CHUNK 0x100
#define OUT_CHUNK CHUNK*100
unsigned char gzip_in[CHUNK];
unsigned char gzip_out[OUT_CHUNK];
///* These are parameters to inflateInit2. See http://zlib.net/manual.html for the exact meanings. */
#define windowBits 15
#define ENABLE_ZLIB_GZIP 32
z_stream strm = {0};
z_stream init_gzip_stream(FILE* file,char* out){// unsigned
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
strm.next_in = gzip_in;
strm.avail_in = 0;
strm.next_out = gzip_out;
inflateInit2 (& strm, windowBits | ENABLE_ZLIB_GZIP);
return strm;
}
bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
strm.avail_in = (int)bytes_read;
do {
strm.avail_out = OUT_CHUNK;
inflate (& strm, Z_NO_FLUSH);
// printf ("%s",gzip_out);
}while (strm.avail_out == 0);
if (feof (file)) {
inflateEnd (& strm);
return false;
}
return true;// all OK
}
char* first_line=(char*)&gzip_out[0];
char* current_line=first_line;
char* next_line=first_line;
char hangover[1000];
bool readLine(FILE* infile,char* line,bool gzipped){
if(!gzipped)
return fgets(line, sizeof(line), infile) != NULL;
else{
bool ok=true;
current_line=next_line;
if(!current_line || strlen(current_line)==0 || next_line-current_line>OUT_CHUNK){
current_line=first_line;
size_t bytes_read = fread (gzip_in, sizeof (char), CHUNK, infile);
ok=inflate_gzip(infile,strm,bytes_read);
strcpy(line,hangover);
}
if(ok){
next_line=strstr(current_line,"\n");
if(next_line){
next_line[0]=0;
next_line++;
strcpy(line+strlen(hangover),current_line);
hangover[0]=0;
}else{
strcpy(hangover,current_line);
line[0]=0;// skip that one!!
}
}
return ok;
}
}
You can't do that, because *.gz doesn't have "lines".
If compressed data has newlines, you'll have to decompress it. You don't have to decompress all data at once, you know, you can do it in chunks, and send strings back to main program when you encounter newline characters. *.gz can be decompressed using zlib.
Chilkat (http://www.chilkatsoft.com/) has libraries to read compressed files from a C++, .Net, VB, ... application.