Related
On Python, there is this option errors='ignore' for the open Python function:
open( '/filepath.txt', 'r', encoding='UTF-8', errors='ignore' )
With this, reading a file with invalid UTF8 characters will replace them with nothing, i.e., they are ignored. For example, a file with the characters Føö»BÃ¥r is going to be read as FøöBår.
If a line as Føö»BÃ¥r is read with getline() from stdio.h, it will be read as Føö�Bår:
// Read the file line by line and echo each line to stderr.
FILE* cfilestream = fopen( "/filepath.txt", "r" );
// getline() takes the buffer capacity as a size_t* (it may grow the buffer).
size_t linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
while( true )
{
    if( getline( &readline, &linebuffersize, cfilestream ) != -1 ) {
        std::cerr << "readline=" << readline << std::endl;
    }
    else {
        break;
    }
}
How can I make stdio.h getline() read it as FøöBår instead of Føö�Bår, i.e., ignoring invalid UTF8 characters?
One heavy-handed solution I can think of is to iterate over all characters of each line read and build a new readline without any of these characters. For example:
// Attempted workaround: copy every line into a second buffer, leaving out each
// char that compares equal to the replacement-character literal.
FILE* cfilestream = fopen( "/filepath.txt", "r" );
// NOTE(review): getline() expects a size_t* capacity; an int is declared here.
int linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
int index;
int charsread;
// Number of chars skipped so far on the current line; used to compact the copy.
int invalidcharsoffset;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
// NOTE(review): in a UTF-8 encoded source file the literal below spans several
// bytes, so it is a multi-character constant compared against one char;
// presumably this never filters anything -- confirm.
if( readline[index] != '�' ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
// NOTE(review): no terminating NUL is written to fixedreadline before it is printed.
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
Related questions:
Fixing invalid UTF8 characters
Replacing non UTF8 characters
python replace unicode characters
Python unicode: how to replace character that cannot be decoded using utf8 with whitespace?
You are confusing what you see with what is really going on. The getline function does not do any replacement of characters. [Note 1]
You are seeing a replacement character (U+FFFD) because your console outputs that character when it is asked to render an invalid UTF-8 code. Most consoles will do that if they are in UTF-8 mode; that is, the current locale is UTF-8.
Also, saying that a file contains the "characters Føö»BÃ¥r" is at best imprecise. A file does not really contain characters. It contains byte sequences which may be interpreted as characters -- for example, by a console or other user presentation software which renders them into glyphs -- according to some encoding. Different encodings produce different results; in this particular case, you have a file which was created by software using the Windows-1252 encoding (or, roughly equivalently, ISO 8859-15), and you are rendering it on a console using UTF-8.
What that means is that the data read by getline contains an invalid UTF-8 sequence, but it (probably) does not contain the replacement character code. Based on the character string you present, it contains the hex character \xbb, which is a guillemet (») in Windows code page 1252.
Finding all the invalid UTF-8 sequences in a string read by getline (or any other C library function which reads files) requires scanning the string, but not for a particular code sequence. Rather, you need to decode UTF-8 sequences one at a time, looking for the ones which are not valid. That's not a simple task, but the mbtowc function can help (if you have enabled a UTF-8 locale). As you'll see in the linked manpage, mbtowc returns the number of bytes contained in a valid "multibyte sequence" (which is UTF-8 in a UTF-8 locale), or -1 to indicate an invalid or incomplete sequence. In the scan, you should pass through the bytes in a valid sequence, or remove/ignore the single byte starting an invalid sequence, and then continue the scan until you reach the end of the string.
Here's some lightly-tested example code (in C):
#include <stdlib.h>
#include <string.h>
/* Removes in place any invalid UTF-8 sequences from at most 'len' bytes of the
 * string pointed to by 's'. (If a NUL byte is encountered, conversion stops.)
 * If at least one byte was removed, a NUL byte is written directly after the
 * last byte that was kept.
 * Returns the length of the possibly modified string (with a maximum of 'len'),
 * not including the NUL terminator (if any).
 * Requires that a UTF-8 locale be active; since there is no way to test for
 * this condition, no attempt is made to do so. If the current locale is not UTF-8,
 * behaviour is undefined.
 * Uses mbtowc(), which keeps internal conversion state, so this function is
 * not safe to call from several threads at once.
 */
size_t remove_bad_utf8(char* s, size_t len) {
  char* in = s;
  int seqlen = 0;
  /* Start from the initial conversion state so that a truncated sequence seen
   * by an earlier call cannot be completed by the first bytes of this string.
   */
  (void) mbtowc(NULL, NULL, 0);
  /* Skip over the initial correct sequence. Avoid relying on mbtowc returning
   * zero if n is 0, since Posix is not clear whether mbtowc returns 0 or -1.
   */
  while (len && (seqlen = mbtowc(NULL, in, len)) > 0) { len -= seqlen; in += seqlen; }
  char* out = in;
  if (len && seqlen < 0) {
    /* Drop the single byte that starts the invalid sequence. */
    (void) mbtowc(NULL, NULL, 0);
    ++in;
    --len;
    /* From here on every valid sequence has to be shifted down over the gap. */
    while (len) {
      seqlen = mbtowc(NULL, in, len);
      if (seqlen > 0) {
        /* Regions may overlap, hence memmove rather than memcpy. */
        memmove(out, in, seqlen);
        out += seqlen;
      }
      else if (seqlen < 0) {
        /* Invalid or incomplete: reset the state and skip exactly one byte. */
        (void) mbtowc(NULL, NULL, 0);
        seqlen = 1;
      }
      else /* (seqlen == 0) */ break;
      in += seqlen;
      len -= seqlen;
    }
    /* Terminate without advancing 'out', so the NUL is not counted below. */
    *out = 0;
  }
  return out - s;
}
Notes
Aside from the possible line-end transformation of the underlying I/O library, which will replace CR-LF with a single \n on systems like Windows where the two character CR-LF sequence is used as a line-end indication.
As #rici well explains in his answer, there can be several invalid UTF-8 sequences in a byte sequence.
Possibly iconv(3) could be worth a look, e.g. see https://linux.die.net/man/3/iconv_open.
When the string "//IGNORE" is appended to tocode, characters that cannot be represented in the target character set will be silently discarded.
Example
This byte sequence, if interpreted as UTF-8, contains some invalid UTF-8:
"some invalid\xFE\xFE\xFF\xFF stuff"
If you display this you would see something like
some invalid���� stuff
When this string passes through the remove_invalid_utf8 function in the following C program, the invalid UTF-8 bytes are removed using the iconv function mentioned above.
So the result is then:
some invalid stuff
C Program
#include <stdio.h>
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
/* Returns a newly allocated, NUL-terminated copy of the first 'len' bytes of
 * 'utf8' with every invalid UTF-8 sequence dropped by iconv ("UTF-8//IGNORE").
 * The caller owns the result and must free() it.
 * Returns NULL if the buffer cannot be allocated or the conversion descriptor
 * cannot be opened.
 */
char *remove_invalid_utf8(char *utf8, size_t len) {
    size_t inbytes_len = len;
    char *inbuf = utf8;
    /* Removing bytes can only shrink the text, so 'len' output bytes suffice;
     * the extra zeroed byte from calloc is the terminator. */
    size_t outbytes_len = len;
    char *result = (char *) calloc(outbytes_len + 1, sizeof(char));
    if(result == NULL) {
        perror("calloc");
        return NULL;
    }
    char *outbuf = result;

    iconv_t cd = iconv_open("UTF-8//IGNORE", "UTF-8");
    if(cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv(). */
        perror("iconv_open");
        free(result);
        return NULL;
    }

    /* iconv() reports failure with (size_t)-1; other values are a count.
     * EILSEQ/EINVAL only say that invalid input was met, which is the case
     * this function exists to handle, so they are not reported. */
    if(iconv(cd, &inbuf, &inbytes_len, &outbuf, &outbytes_len) == (size_t)-1
            && errno != EILSEQ && errno != EINVAL) {
        perror("iconv");
    }

    iconv_close(cd);
    return result;
}
/* Demonstrates remove_invalid_utf8() on a string holding four invalid bytes. */
int main() {
    /* A writable array, because remove_invalid_utf8() takes a non-const char*. */
    char utf8[] = "some invalid\xFE\xFE\xFF\xFF stuff";

    char *converted = remove_invalid_utf8(utf8, strlen(utf8));
    if(converted == NULL) {
        /* Do not pass a NULL pointer to printf's %s. */
        fprintf(stderr, "conversion failed\n");
        return EXIT_FAILURE;
    }
    printf("converted: %s to %s\n", utf8, converted);
    free(converted);
    return 0;
}
I also managed to fix it by stripping out all non-ASCII characters.
This one takes about 2.6 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
// Reads ./test.txt line by line and copies each line into a second buffer,
// keeping only the bytes 32..127. Control bytes (including CR and LF) and all
// bytes >= 128 are dropped. The last filtered line is printed at the end.
int main(int argc, char const *argv[])
{
    FILE* cfilestream = fopen( "./test.txt", "r" );
    size_t linebuffersize = 131072;

    if( cfilestream == NULL ) {
        perror( "fopen cfilestream" );
        return -1;
    }

    char* readline = (char*) malloc( linebuffersize );
    // Tracked separately: getline() may grow readline, and the copy must keep up.
    size_t fixedbuffersize = linebuffersize;
    char* fixedreadline = (char*) malloc( fixedbuffersize );

    if( readline == NULL ) {
        perror( "malloc readline" );
        free( fixedreadline );
        fclose( cfilestream );
        return -1;
    }
    if( fixedreadline == NULL ) {
        perror( "malloc fixedreadline" );
        free( readline );
        fclose( cfilestream );
        return -1;
    }
    // Keeps the final print well defined when the file has no lines.
    fixedreadline[0] = '\0';

    char* source;
    if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
        perror( "setlocale" );
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    ssize_t index;
    ssize_t charsread;
    ssize_t invalidcharsoffset;
    unsigned int fixedchar;

    while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
    {
        // getline() reallocates readline for long lines; grow the copy too so
        // the writes below stay inside the allocation.
        if( linebuffersize > fixedbuffersize ) {
            char* grown = (char*) realloc( fixedreadline, linebuffersize );
            if( grown == NULL ) {
                perror( "realloc fixedreadline" );
                free( readline );
                free( fixedreadline );
                fclose( cfilestream );
                return -1;
            }
            fixedreadline = grown;
            fixedbuffersize = linebuffersize;
        }

        invalidcharsoffset = 0;
        for( index = 0; index < charsread; ++index )
        {
            // Go through unsigned char so the test does not depend on whether
            // plain char is signed.
            fixedchar = static_cast<unsigned char>( readline[index] );

            if( 31 < fixedchar && fixedchar < 128 ) {
                fixedreadline[index-invalidcharsoffset] = readline[index];
            }
            else {
                ++invalidcharsoffset;
            }
        }
        fixedreadline[index-invalidcharsoffset] = '\0';
        // std::cerr << "fixedreadline=" << fixedreadline << std::endl;
    }

    std::cerr << "fixedreadline=" << fixedreadline << std::endl;
    free( readline );
    free( fixedreadline );
    fclose( cfilestream );
    return 0;
}
Alternative and slower version using memcpy
Using memmove does not improve the speed much, so you could use either one.
This one takes about 3.1 seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
// Reads ./test.txt line by line and builds a filtered copy of each line that
// keeps only the bytes 32..127, copying every run of kept bytes with a single
// memcpy. The result always lives in fixedreadline, so lines that only start
// or end with rejected bytes are filtered like any other line.
int main(int argc, char const *argv[])
{
    FILE* cfilestream = fopen( "./test.txt", "r" );
    size_t linebuffersize = 131072;

    if( cfilestream == NULL ) {
        perror( "fopen cfilestream" );
        return -1;
    }

    char* readline = (char*) malloc( linebuffersize );
    // Tracked separately: getline() may grow readline, and the copy must keep up.
    size_t fixedbuffersize = linebuffersize;
    char* fixedreadline = (char*) malloc( fixedbuffersize );

    if( readline == NULL ) {
        perror( "malloc readline" );
        free( fixedreadline );
        fclose( cfilestream );
        return -1;
    }
    if( fixedreadline == NULL ) {
        perror( "malloc fixedreadline" );
        free( readline );
        fclose( cfilestream );
        return -1;
    }
    // Keeps the final print well defined when the file has no lines.
    fixedreadline[0] = '\0';

    char* source;
    char* lineend;
    char* runstart;
    char* destination;
    char* finalresult = fixedreadline;
    ssize_t charsread;
    size_t charstocopy;
    unsigned int fixedchar;

    if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
        perror( "setlocale" );
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
    {
        if( linebuffersize > fixedbuffersize ) {
            char* grown = (char*) realloc( fixedreadline, linebuffersize );
            if( grown == NULL ) {
                perror( "realloc fixedreadline" );
                free( readline );
                free( fixedreadline );
                fclose( cfilestream );
                return -1;
            }
            fixedreadline = grown;
            fixedbuffersize = linebuffersize;
        }

        lineend = readline + charsread;
        runstart = readline;
        destination = fixedreadline;

        for( source = readline; source != lineend; ++source )
        {
            fixedchar = static_cast<unsigned char>( *source );
            if( 31 < fixedchar && fixedchar < 128 ) {
                continue;
            }
            // A rejected byte ends the current run: flush the run, then restart
            // just after the rejected byte.
            charstocopy = static_cast<size_t>( source - runstart );
            memcpy( destination, runstart, charstocopy );
            destination += charstocopy;
            runstart = source + 1;
        }

        // Flush whatever run reaches the end of the line.
        charstocopy = static_cast<size_t>( lineend - runstart );
        memcpy( destination, runstart, charstocopy );
        destination += charstocopy;
        *destination = '\0';

        finalresult = fixedreadline;
        // std::cerr << "finalresult=" << finalresult << std::endl;
    }

    std::cerr << "finalresult=" << finalresult << std::endl;
    free( readline );
    free( fixedreadline );
    fclose( cfilestream );
    return 0;
}
Optimized solution using iconv
This takes about 4.6 seconds to parse 319MB of text.
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
// Compile it with:
// g++ -o main test.cpp -O3 -liconv
// Reads ./test.txt line by line and passes every line through an
// iconv "UTF-8" -> "UTF-8//IGNORE" conversion, then strips the trailing line
// terminator (LF or CR LF). The last converted line is printed at the end.
int main(int argc, char const *argv[])
{
    FILE* cfilestream = fopen( "./test.txt", "r" );
    size_t linebuffersize = 131072;

    if( cfilestream == NULL ) {
        perror( "fopen cfilestream" );
        return -1;
    }

    char* readline = (char*) malloc( linebuffersize );
    // Tracked separately: getline() may grow readline, and the output must keep up.
    size_t fixedbuffersize = linebuffersize;
    char* fixedreadline = (char*) malloc( fixedbuffersize );

    if( readline == NULL ) {
        perror( "malloc readline" );
        free( fixedreadline );
        fclose( cfilestream );
        return -1;
    }
    if( fixedreadline == NULL ) {
        perror( "malloc fixedreadline" );
        free( readline );
        fclose( cfilestream );
        return -1;
    }
    // Keeps the final print well defined when the file has no lines.
    fixedreadline[0] = '\0';

    char* source;
    char* destination;
    ssize_t charsread;
    size_t inchars;
    size_t outchars;

    if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
        perror( "setlocale" );
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    iconv_t conversiondescriptor = iconv_open("UTF-8//IGNORE", "UTF-8");
    if( conversiondescriptor == (iconv_t)-1 ) {
        // Never hand an invalid descriptor to iconv().
        perror( "iconv_open conversiondescriptor" );
        free( readline );
        free( fixedreadline );
        fclose( cfilestream );
        return -1;
    }

    while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
    {
        if( linebuffersize > fixedbuffersize ) {
            char* grown = (char*) realloc( fixedreadline, linebuffersize );
            if( grown == NULL ) {
                perror( "realloc fixedreadline" );
                break;
            }
            fixedreadline = grown;
            fixedbuffersize = linebuffersize;
        }

        source = readline;
        inchars = charsread;
        destination = fixedreadline;
        outchars = charsread;

        // iconv() reports failure with (size_t)-1; other values are a count.
        if( iconv( conversiondescriptor, &source, &inchars, &destination, &outchars ) == (size_t)-1 )
        {
            perror( "iconv" );
            // Return the descriptor to its initial state before the next line.
            iconv( conversiondescriptor, NULL, NULL, NULL, NULL );
        }

        // Trim the line terminator without touching real characters and without
        // stepping in front of the buffer when nothing was written.
        while( destination != fixedreadline
                && ( destination[-1] == '\n' || destination[-1] == '\r' ) ) {
            --destination;
        }
        *destination = '\0';
        // std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
    }

    std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
    free( readline );
    free( fixedreadline );

    if( fclose( cfilestream ) ) {
        perror( "fclose cfilestream" );
    }
    if( iconv_close( conversiondescriptor ) ) {
        perror( "iconv_close conversiondescriptor" );
    }
    return 0;
}
Slowest solution ever using mbtowc
This takes about 24.2 seconds to parse 319MB of text.
If you comment out the line fixedchar = mbtowc(NULL, source, charsread); and uncomment the line fixedchar = 1; (breaking the invalid characters removal) this will take 1.9 seconds instead of 24.2 seconds (also compiled with -O3 optimization level).
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iomanip>
// Reads ./test.txt line by line and removes, in place, every byte that
// mbtowc() rejects in the active locale, then strips the trailing line
// terminator (LF or CR LF). The last processed line is printed at the end.
int main(int argc, char const *argv[])
{
    FILE* cfilestream = fopen( "./test.txt", "r" );
    size_t linebuffersize = 131072;

    if( cfilestream == NULL ) {
        perror( "fopen cfilestream" );
        return -1;
    }

    char* readline = (char*) malloc( linebuffersize );
    if( readline == NULL ) {
        perror( "malloc readline" );
        fclose( cfilestream );
        return -1;
    }
    // Keeps the final print well defined when the file has no lines.
    readline[0] = '\0';

    char* source;
    char* lineend;
    char* destination;
    ssize_t charsread;
    int fixedchar;

    if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
        perror( "setlocale" );
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
    {
        lineend = readline + charsread;
        destination = readline;

        // Begin every line in the initial conversion state.
        mbtowc( NULL, NULL, 0 );

        // From here on charsread counts the bytes still left on the line.
        for( source = readline; source != lineend; )
        {
            // fixedchar = 1;
            fixedchar = mbtowc(NULL, source, charsread);

            if( fixedchar > 0 ) {
                // Regions may overlap once a byte was dropped, hence memmove.
                memmove( destination, source, fixedchar );
                source += fixedchar;
                destination += fixedchar;
                charsread -= fixedchar;
            }
            else if( fixedchar < 0 ) {
                // Invalid or incomplete sequence: drop one byte and reset the state.
                source += 1;
                charsread -= 1;
                mbtowc( NULL, NULL, 0 );
                // std::cerr << "errno=" << strerror( errno ) << std::endl;
            }
            else {
                // Embedded NUL byte: stop processing this line.
                break;
            }
        }

        // Trim the line terminator without touching real characters and without
        // stepping in front of the buffer when nothing was kept.
        while( destination != readline
                && ( destination[-1] == '\n' || destination[-1] == '\r' ) ) {
            --destination;
        }
        *destination = '\0';
        // std::cerr << "readline='" << readline << "'" << std::endl;
    }

    std::cerr << "readline='" << readline << "'" << std::endl;
    if( fclose( cfilestream ) ) {
        perror( "fclose cfilestream" );
    }
    free( readline );
    return 0;
}
Fastest version from all my others above using memmove
You cannot use memcpy here because the memory regions overlap!
This takes about 2.4 seconds to parse 319MB.
If you comment out the lines *destination = *source and memmove( destination, source, 1 ) (breaking the invalid characters removal) the performance is still almost the same as when memmove is being called. Here, calling memmove( destination, source, 1 ) is a little slower than directly doing *destination = *source;
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
// Reads ./test.txt line by line and compacts each line in place so that only
// the bytes 32..127 remain. CR and LF are below 32, so the line terminator is
// removed by the same filter. The last processed line is printed at the end.
int main(int argc, char const *argv[])
{
    FILE* cfilestream = fopen( "./test.txt", "r" );
    size_t linebuffersize = 131072;

    if( cfilestream == NULL ) {
        perror( "fopen cfilestream" );
        return -1;
    }

    char* readline = (char*) malloc( linebuffersize );
    if( readline == NULL ) {
        perror( "malloc readline" );
        fclose( cfilestream );
        return -1;
    }
    // Keeps the final print well defined when the file has no lines.
    readline[0] = '\0';

    char* source;
    char* lineend;
    char* destination;
    ssize_t charsread;
    unsigned int fixedchar;

    if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
        perror( "setlocale" );
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
    {
        lineend = readline + charsread;
        destination = readline;

        for( source = readline; source != lineend; ++source )
        {
            // Go through unsigned char so the test does not depend on whether
            // plain char is signed.
            fixedchar = static_cast<unsigned char>( *source );

            if( 31 < fixedchar && fixedchar < 128 ) {
                *destination = *source;
                ++destination;
            }
        }

        // No separate newline trimming: after the loop source sits on the
        // terminating NUL, and any '\n' or '\r' was already filtered out above.
        *destination = '\0';
        // std::cerr << "readline='" << readline << "'" << std::endl;
    }

    std::cerr << "readline='" << readline << "'" << std::endl;
    if( fclose( cfilestream ) ) {
        perror( "fclose cfilestream" );
    }
    free( readline );
    return 0;
}
Bonus
You can also use Python C Extensions (API).
It takes about 2.3 seconds to parse 319MB without converting them to cached version UTF-8 char*
And takes about 3.2 seconds to parse 319MB converting them to UTF-8 char*.
And also takes about 3.2 seconds to parse 319MB converting them to cached ASCII char*.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
// Instance layout of the FastFile Python type. It carries nothing beyond the
// standard object header.
typedef struct
{
PyObject_HEAD
}
PyFastFile;
// Module definition handed to PyModule_Create() in PyInit_fastfilepackage().
static PyModuleDef fastfilepackagemodule =
{
// https://docs.python.org/3/c-api/module.html#c.PyModuleDef
PyModuleDef_HEAD_INIT,
"fastfilepackage", /* name of module */
"Example module that wrapped a C++ object", /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module, or
-1 if the module keeps state in global variables. */
NULL, /* PyMethodDef* m_methods */
NULL, /* PyModuleDef_Slot* m_slots (this slot was m_reload before Python 3.5) */
NULL, /* traverseproc m_traverse */
NULL, /* inquiry m_clear */
NULL, /* freefunc m_free */
};
// initialize PyFastFile Object
static int PyFastFile_init(PyFastFile* self, PyObject* args, PyObject* kwargs) {
char* filepath;
if( !PyArg_ParseTuple( args, "s", &filepath ) ) {
return -1;
}
int linecount = 0;
PyObject* iomodule;
PyObject* openfile;
PyObject* fileiterator;
iomodule = PyImport_ImportModule( "builtins" );
if( iomodule == NULL ) {
std::cerr << "ERROR: FastFile failed to import the io module '"
"(and open the file " << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openfunction = PyObject_GetAttrString( iomodule, "open" );
if( openfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module open "
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
openfile = PyObject_CallFunction(
openfunction, "ssiss", filepath, "r", -1, "ASCII", "ignore" );
if( openfile == NULL ) {
std::cerr << "ERROR: FastFile failed to open the file'"
<< filepath << "'!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* iterfunction = PyObject_GetAttrString( openfile, "__iter__" );
Py_DECREF( openfunction );
if( iterfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator"
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openiteratorobject = PyObject_CallObject( iterfunction, NULL );
Py_DECREF( iterfunction );
if( openiteratorobject == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator object"
<< " (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
fileiterator = PyObject_GetAttrString( openfile, "__next__" );
Py_DECREF( openiteratorobject );
if( fileiterator == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator "
<< "object (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* readline;
while( ( readline = PyObject_CallObject( fileiterator, NULL ) ) != NULL ) {
linecount += 1;
PyUnicode_AsUTF8( readline );
Py_DECREF( readline );
// std::cerr << "linecount " << linecount << " readline '" << readline
// << "' '" << PyUnicode_AsUTF8( readline ) << "'" << std::endl;
}
std::cerr << "linecount " << linecount << std::endl;
// PyErr_PrintEx(100);
PyErr_Clear();
PyObject* closefunction = PyObject_GetAttrString( openfile, "close" );
if( closefunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the close file function for '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* closefileresult = PyObject_CallObject( closefunction, NULL );
Py_DECREF( closefunction );
if( closefileresult == NULL ) {
std::cerr << "ERROR: FastFile failed close open file '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
Py_DECREF( closefileresult );
Py_XDECREF( iomodule );
Py_XDECREF( openfile );
Py_XDECREF( fileiterator );
return 0;
}
// destruct the object
// Hands the instance memory back through the tp_free slot of its type; the
// struct has no members of its own to release first.
static void PyFastFile_dealloc(PyFastFile* self) {
Py_TYPE(self)->tp_free( (PyObject*) self );
}
// Type object for fastfilepackage.FastFile. Only the header and tp_name are
// given here; the other slots used are assigned in PyInit_fastfilepackage()
// before PyType_Ready() is called.
static PyTypeObject PyFastFileType =
{
PyVarObject_HEAD_INIT( NULL, 0 )
"fastfilepackage.FastFile" /* tp_name */
};
// create the module
//
// Fills in the FastFile type slots, readies the type, creates the module and
// registers the type in it as "FastFile".
// Returns the new module, or NULL (with the Python exception set) on failure.
PyMODINIT_FUNC PyInit_fastfilepackage(void)
{
    // https://docs.python.org/3/c-api/typeobj.html
    PyFastFileType.tp_new = PyType_GenericNew;
    PyFastFileType.tp_basicsize = sizeof(PyFastFile);
    PyFastFileType.tp_dealloc = (destructor) PyFastFile_dealloc;
    PyFastFileType.tp_flags = Py_TPFLAGS_DEFAULT;
    PyFastFileType.tp_doc = "FastFile objects";
    PyFastFileType.tp_init = (initproc) PyFastFile_init;

    if( PyType_Ready( &PyFastFileType ) < 0 ) {
        return NULL;
    }

    PyObject* thismodule = PyModule_Create( &fastfilepackagemodule );
    if( thismodule == NULL ) {
        return NULL;
    }

    // Add FastFile class to thismodule allowing the user to create objects.
    // PyModule_AddObject() takes over the extra reference only on success, so
    // on failure both the type reference and the module must be released here.
    Py_INCREF( &PyFastFileType );
    if( PyModule_AddObject( thismodule, "FastFile", (PyObject*) &PyFastFileType ) < 0 ) {
        Py_DECREF( &PyFastFileType );
        Py_DECREF( thismodule );
        return NULL;
    }
    return thismodule;
}
To build it, create the file source/fastfilewrapper.cpp with the contents of the above file and the setup.py with the following contents:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Build script for the fastfilepackage C++ extension module.
from setuptools import setup, Extension
# One extension built from a single C++ source file, compiled and linked with
# -std=c++11.
myextension = Extension(
language = "c++",
extra_link_args = ["-std=c++11"],
extra_compile_args = ["-std=c++11"],
name = 'fastfilepackage',
# Path is relative to this setup.py.
sources = [
'source/fastfilewrapper.cpp'
],
include_dirs = [ 'source' ],
)
# The distribution name matches the extension name given above.
setup(
name = 'fastfilepackage',
ext_modules= [ myextension ],
)
To run the example, use the following Python script:
# Times how long constructing fastfilepackage.FastFile takes for ./test.txt.
import time
import datetime
import fastfilepackage
testfile = './test.txt'
# Wall-clock timestamp taken right before the object is constructed.
timenow = time.time()
iterable = fastfilepackage.FastFile( testfile )
fastfile_time = time.time() - timenow
# Convert the elapsed seconds to a timedelta for a readable H:MM:SS printout.
timedifference = datetime.timedelta( seconds=fastfile_time )
print( 'FastFile timedifference', timedifference, flush=True )
Example:
user#user-pc$ /usr/bin/pip3.6 install .
Processing /fastfilepackage
Building wheels for collected packages: fastfilepackage
Building wheel for fastfilepackage (setup.py) ... done
Stored in directory: /pip-ephem-wheel-cache-j313cpzc/wheels/e5/5f/bc/52c820
Successfully built fastfilepackage
Installing collected packages: fastfilepackage
Found existing installation: fastfilepackage 0.0.0
Uninstalling fastfilepackage-0.0.0:
Successfully uninstalled fastfilepackage-0.0.0
Successfully installed fastfilepackage-0.0.0
user#user-pc$ /usr/bin/python3.6 fastfileperformance.py
linecount 820800
FastFile timedifference 0:00:03.204614
Using std::getline
This takes about 4.7 seconds to parse 319MB.
If you remove the UTF-8 removal algorithm borrowed from the fastest benchmark using stdlib.h getline(), it takes 1.7 seconds to run.
#include <stdlib.h>
#include <iostream>
#include <locale>
#include <fstream>
#include <iomanip>
// Reads ./test.txt with std::ifstream::getline into a fixed buffer and
// compacts every line in place so that only the bytes 32..127 remain.
// Prints the number of lines read at the end.
int main(int argc, char const *argv[])
{
    unsigned int fixedchar;
    int linecount = 0;

    char* source;
    char* lineend;
    char* destination;

    if( ( source = setlocale( LC_ALL, "en_US.ascii" ) ) == NULL ) {
        perror( "setlocale" );
        return -1;
    }
    else {
        std::cerr << "locale='" << source << "'" << std::endl;
    }

    std::ifstream fileifstream{ "./test.txt" };

    if( fileifstream.fail() ) {
        std::cerr << "ERROR: FastFile failed to open the file!" << std::endl;
        return -1;
    }

    size_t linebuffersize = 131072;
    char* readline = (char*) malloc( linebuffersize );

    if( readline == NULL ) {
        perror( "malloc readline" );
        return -1;
    }

    // Loop on the result of getline() rather than on eof(): a line that does
    // not fit the buffer sets failbit without eofbit, and an eof() test would
    // then spin forever.
    while( fileifstream.getline( readline, linebuffersize ) )
    {
        linecount += 1;

        // gcount() also counts the extracted delimiter, so this range may
        // include the terminating NUL; the filter below drops it.
        lineend = readline + fileifstream.gcount();
        destination = readline;

        for( source = readline; source != lineend; ++source )
        {
            // Go through unsigned char so the test does not depend on whether
            // plain char is signed.
            fixedchar = static_cast<unsigned char>( *source );

            if( 31 < fixedchar && fixedchar < 128 ) {
                *destination = *source;
                ++destination;
            }
        }

        // getline() already discarded the '\n' and any '\r' was filtered out,
        // so terminating the compacted text is all that is left to do.
        *destination = '\0';
        // std::cerr << "readline='" << readline << "'" << std::endl;
    }

    if( !fileifstream.eof() ) {
        std::cerr << "ERROR: FastFile stopped before the end of the file!" << std::endl;
    }

    std::cerr << "linecount='" << linecount << "'" << std::endl;

    if( fileifstream.is_open() ) {
        fileifstream.close();
    }

    free( readline );
    return 0;
}
Summary
2.6 seconds trimming UTF-8 using two buffers with indexing
3.1 seconds trimming UTF-8 using two buffers with memcpy
4.6 seconds removing invalid UTF-8 with iconv
24.2 seconds removing invalid UTF-8 with mbtowc
2.4 seconds trimming UTF-8 using one buffer with pointer direct assigning
Bonus
2.3 seconds removing invalid UTF-8 without converting them to a cached UTF-8 char*
3.2 seconds removing invalid UTF-8 converting them to a cached UTF-8 char*
3.2 seconds trimming UTF-8 and caching as ASCII char*
4.7 seconds trimming UTF-8 with std::getline() using one buffer with pointer direct assigning
The file ./test.txt used had 820,800 lines, where each line was equal to:
id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char\r\n
And all versions were compiled with
g++ (GCC) 7.4.0
iconv (GNU libiconv 1.14)
g++ -o main test.cpp -O3 -liconv && time ./main
I have to create some directories and when I try to search one I have to know if it was already created.
The problem is that after creating a directory with CreateDirectory() and trying to check if it was created I get an error which says that it wasn't created.
If I close and restart the program, without creating the directory but just checking if it was created, everything works.
// Returns true only when 'absolutePath' names an existing directory.
// A missing path, a NULL pointer, or a path naming something that is not a
// directory all yield false.
bool DirectoryExists( const char* absolutePath ){
    if( !absolutePath ){
        return false;
    }
    struct stat status;
    // stat() fails for a path that does not exist, so no separate existence
    // probe is needed. Its result must be checked: on failure 'status' is not
    // filled in and must not be read.
    if( stat( absolutePath, &status ) != 0 ){
        return false;
    }
    return (status.st_mode & S_IFDIR) != 0;
}
// Prefix the name with the parent directory, so the path is database\<marca>.
marca = "database\\"+marca;
// NOTE(review): the result of CreateDirectory is not checked here, so a failed
// creation only shows up through the existence check below.
CreateDirectory (marca.c_str(), NULL);
// useless operation
if(! DirectoryExists(marca.c_str() ) )
{
cout<<" Error !";
return -1;
}
If marca was "database" it would work. But if marca is "database/foo" you cannot create both of these at the same time.
Here's a version of your code where I separate these operations.
#include <windows.h>
#include <io.h>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
// Returns true only when 'absolutePath' names an existing directory.
// A missing path, a NULL pointer, or a path naming something that is not a
// directory all yield false.
bool DirectoryExists( const char* absolutePath )
{
    if( !absolutePath ){
        return false;
    }

    struct stat status;

    // stat() fails for a path that does not exist, so no separate existence
    // probe is needed. Its result must be checked: on failure 'status' is not
    // filled in and must not be read.
    if( stat( absolutePath, &status ) != 0 ){
        return false;
    }
    return (status.st_mode & S_IFDIR) != 0;
}
bool MakeDirectory(const string& marca)
{
if(! CreateDirectory(marca.c_str(), NULL))
{
DWORD error = GetLastError();
TCHAR buf[256];
FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
buf, (sizeof(buf) / sizeof(TCHAR)), NULL);
cout << "Failed to create directory: " << buf << '\n';
return false;
}
if(! DirectoryExists(marca.c_str() ) )
{
cout << "Directory does not exist\n";
return false;
}
return true;
}
// Creates database/ and then database/foo, one level at a time.
// Returns 0 on success and 1 as soon as one of the two steps fails.
int main()
{
    // name of subdirectory
    string marca = "foo";

    // first create top directory
    string d = "database";
    if(! MakeDirectory(d))
    {
        return 1;
    }

    // then subdirectory; it can only be created once its parent exists
    d += "/" + marca;
    if(! MakeDirectory(d))
    {
        return 1;
    }
    return 0;
}
I am writing a program that uses MySQLe as embedded backend. The database library is owned by an object called "Domain". This Domain object runs within the main thread.
The program launches another thread running a XML-RPC server (boost::thread and xmlrpc_c::serverAbyss). It is linked to the Domain object.
When the XML-RPC server makes the Domain object execute an SQL query the program crashes:
Program received signal: “EXC_BAD_ACCESS”.
[Switching to process 73191]
[Switching to process 73191]
Xcode could not locate source file: regex.cpp (line: 74)
When the master thread calls Domain object's method that executes SQL queries the program still runs.
/*
* Ports listening
*
* - create a Rpc_Server object
* - create a dedicated thread
*/
// The server receives the addresses of domain, conf_params and router.
// NOTE(review): presumably all three must outlive server_thread -- confirm.
Rpc_Server server(&domain, &conf_params, &router);
boost::thread server_thread(boost::bind(&Rpc_Server::run, &server)); // This thread makes the server crash
/*
* Domain routine
*
* - Check for ready jobs every minute
*/
// Endless polling loop on the calling thread; the result in 'jobs' is not used here.
while (1) {
v_jobs jobs = domain.get_ready_jobs(conf_params.get_param("node_name")); // This method does NOT make the server crash
sleep(60);
}
Both the Domain object's methods and the Database object's methods lock a mutex to avoid multi access.
bool Mysql::execute(const std::string* query) {
MYSQL_RES* res;
MYSQL_ROW row;
if ( query == NULL )
return false;
this->updates_mutex.lock();
std::cout << query->c_str() << std::endl;
if ( mysql_query(this->mysql, query->c_str()) != 0 ) {
std::cerr << query << std::endl << mysql_error(this->mysql);
UNLOCK_MUTEX;
return false;
}
res = mysql_store_result(this->mysql);
if (res)
while ( ( row = mysql_fetch_row(res) ) )
for ( uint i=0 ; i < mysql_num_fields(res) ; i++ )
std::cout << row[i] << std::endl;
else
if ( mysql_field_count(this->mysql) != 0 ) {
std::cerr << "Erreur : " << mysql_error(this->mysql) << std::endl;
mysql_free_result(res);
this->updates_mutex.unlock();
return false;
}
mysql_free_result(res);
this->updates_mutex.unlock();
return true;
}
// Inserts or replaces the row (node_name, node_weight) = (*n, *w) in table
// 'node' inside a transaction, while holding updates_mutex.
// 'running_node' is not used by this method.
// Returns true when the transaction was committed, false otherwise.
bool Domain::add_node(const std::string* running_node, const std::string* n, const int* w) {
    std::string query;

    if ( n == NULL || w == NULL )
        return false;

    this->updates_mutex.lock();

    query = "START TRANSACTION;";
    if ( this->database.execute(&query) == false ) {
        this->updates_mutex.unlock();
        return false;
    }

    // SECURITY NOTE(review): *n is concatenated into the statement unescaped.
    // If node names can come from an untrusted source (e.g. an RPC client),
    // escape the value or use a prepared statement.
    query = "REPLACE INTO node (node_name,node_weight) VALUES ('";
    query += n->c_str();
    query += "','";
    query += boost::lexical_cast<std::string>(*w);
    query += "');";

    if ( this->database.execute(&query) == false ) {
        query = "ROLLBACK;";
        this->database.execute(&query);
        this->updates_mutex.unlock();
        return false;
    }

    query = "COMMIT;";
    if ( this->database.execute(&query) == false ) {
        // Do not leave the transaction open when the commit itself fails.
        query = "ROLLBACK;";
        this->database.execute(&query);
        this->updates_mutex.unlock();
        return false;
    }

    this->updates_mutex.unlock();
    return true;
}
The MySQLe is created there:
// Starts the embedded MySQL library, opens the connection kept in
// this->mysql, creates and selects the database derived from 'node_name',
// and finally loads the schema skeleton from the file 'db_skeleton'.
// Returns false as soon as one of these steps fails; otherwise returns the
// result of load_file().
bool Mysql::prepare(const std::string* node_name, const std::string* db_skeleton) {
    // mysql_library_init() takes char**, so the arguments live in writable
    // arrays instead of pointing at string literals.
    static char arg_program[] = "this_program";
    static char arg_datadir[] = "--datadir=.";
    static char group_embedded[] = "embedded";
    static char group_server[] = "server";
    static char group_program[] = "this_program_SERVER";
    static char* server_args[] = {arg_program, arg_datadir};
    static char* server_groups[] = {group_embedded, group_server, group_program, (char *)NULL};
    std::string query("CREATE DATABASE IF NOT EXISTS ");

    if ( node_name == NULL || db_skeleton == NULL )
        return false;

    // DB init
    if ( mysql_library_init(sizeof(server_args) / sizeof(char *), server_args, server_groups) ) {
        std::cerr << "could not initialize MySQL library" << std::endl;
        return false;
    }

    std::cout << "mysql init..." << std::endl;
    if ( (this->mysql = mysql_init(NULL)) == NULL ) {
        std::cerr << mysql_error(this->mysql) << std::endl;
        return false;
    }

    if ( ! mysql_thread_safe() ) {
        std::cerr << "MySQL is NOT theadsafe !" << std::endl;
        return false;
    }

    mysql_options(this->mysql, MYSQL_READ_DEFAULT_GROUP, "embedded");
    mysql_options(this->mysql, MYSQL_OPT_USE_EMBEDDED_CONNECTION, NULL);
    if ( mysql_real_connect(this->mysql, NULL, NULL, NULL, NULL, 0, NULL, 0) == NULL ) {
        // Without a connection every statement below would fail anyway.
        std::cerr << mysql_error(this->mysql) << std::endl;
        return false;
    }

    // Creates the database
    query += this->translate_into_db(node_name);
    query += ";";
    if ( this->execute(&query) == false )
        return false;

    // Creates the schema
    query = "CREATE SCHEMA IF NOT EXISTS ";
    query += this->translate_into_db(node_name);
    query += " DEFAULT CHARACTER SET latin1;";
    if ( this->execute(&query) == false )
        return false;

    // Uses it
    query = "USE " + this->translate_into_db(node_name) + ";";
    if ( this->execute(&query) == false )
        return false;

    // Loads the skeleton from file
    return this->load_file(db_skeleton->c_str());
}
Am I wrong somewhere?
Do you have an example to show me?
I found the solution to my problem. Each thread needs to initialize the MySQL environment. That is to say execute some mysql_* functions.
Here are the modified / new methods :
/**
 * Sends a single query on this->mysql while holding updates_mutex and prints
 * every column of every returned row on std::cout.
 *
 * @param query SQL text; NULL, empty or whitespace-only queries are rejected
 * @return true when the query ran (with or without a result set), false on a
 *         rejected query or on any MySQL error (the error is written to std::cerr)
 */
bool Mysql::atomic_execute(const std::string* query) {
    if ( query == NULL )
        return false;

    // Covers both the empty string and a string made only of whitespace
    if ( query->find_first_not_of(" \t\n\r\f\v") == std::string::npos ) {
        std::cerr << "Error : query is empty !" << std::endl;
        return false;
    }

    bool success = true;

    this->updates_mutex.lock();
    if ( mysql_query(this->mysql, query->c_str()) != 0 ) {
        // Dereference the pointer: the SQL text is wanted, not its address
        std::cerr << *query << std::endl << mysql_error(this->mysql) << std::endl;
        success = false;
    } else {
        MYSQL_RES* res = mysql_store_result(this->mysql);

        if ( res != NULL ) {
            const unsigned int field_count = mysql_num_fields(res);
            MYSQL_ROW row;

            while ( ( row = mysql_fetch_row(res) ) != NULL )
                for ( unsigned int i = 0 ; i < field_count ; i++ )
                    // SQL NULL comes back as a null pointer, which must not be streamed
                    std::cout << ( row[i] != NULL ? row[i] : "NULL" ) << std::endl;
            mysql_free_result(res);
        } else if ( mysql_field_count(this->mysql) != 0 ) {
            // The statement should have produced a result set but none was stored
            std::cerr << "Erreur : " << mysql_error(this->mysql) << std::endl;
            success = false;
        }
    }
    this->updates_mutex.unlock();

    return success;
}
bool Mysql::standalone_execute(const v_queries* queries) {
MYSQL* local_mysql = this->init();
std::string query = "START TRANSACTION;";
if ( this->atomic_execute(&query) == false ) {
mysql_close(local_mysql);
return false;
}
BOOST_FOREACH(std::string q, *queries) {
std::cout << q.c_str() << std::endl;
if ( this->atomic_execute(&q) == false ) {
query = "ROLLBACK";
this->atomic_execute(&query);
mysql_close(local_mysql);
return false;
}
}
query = "COMMIT";
if ( this->atomic_execute(&query) == false ) {
mysql_close(local_mysql);
return false;
}
mysql_close(local_mysql);
return true;
}
/**
 * Initializes the MySQL handle for the calling thread and connects it to the
 * embedded server.
 *
 * mysql_init() is given this->mysql: when that pointer is non-NULL the same
 * object is (re)initialized and returned, otherwise a new handle is allocated.
 * The options are therefore applied to the returned handle, which is valid in
 * both cases.
 *
 * @return the connected handle, or NULL when no handle could be initialized.
 *         On a connection failure the error is logged and the (unconnected)
 *         handle is still returned so the caller can mysql_close() it.
 */
MYSQL* Mysql::init() {
    MYSQL* local_mysql = mysql_init(this->mysql);

    if ( local_mysql == NULL ) {
        std::cerr << "could not allocate the MySQL connection handle" << std::endl;
        return NULL;
    }

    mysql_options(local_mysql, MYSQL_READ_DEFAULT_GROUP, "embedded");
    mysql_options(local_mysql, MYSQL_OPT_USE_EMBEDDED_CONNECTION, NULL);
    if ( mysql_real_connect(local_mysql, NULL, NULL, NULL, NULL, 0, NULL, 0) == NULL )
        std::cerr << mysql_error(local_mysql) << std::endl;

    return local_mysql;
}
The atomic_execute method is used to send single queries to the server.
The standalone_execute method initializes a connection and a transaction, then sends all of the queries to the server one by one using atomic_execute.
I do not know if a ROLLBACK is useful in case of COMMIT's failure...
The code might need some improvements but it works.
I am currently writing a simple program that I want to distribute to my friends. When the program starts, I want to download some external binary files from the internet into a buffer. To do this, I am using Windows Internet (WinINet). Currently, I am using InternetReadFile to read the file into a buffer which I use later in the program. However, the file is not read completely: the resulting size is much smaller than the size of the file on the server, when it should be the same.
I would like to do this, without using any external libraries.
Any idea of what could solve my problem?
Thanks,
Andrew
The documentation makes the following remarks:
InternetReadFile operates much like the base ReadFile function, with a few exceptions. Typically, InternetReadFile retrieves data from an HINTERNET handle as a sequential stream of bytes. The amount of data to be read for each call to InternetReadFile is specified by the dwNumberOfBytesToRead parameter and the data is returned in the lpBuffer parameter. A normal read retrieves the specified dwNumberOfBytesToRead for each call to InternetReadFile until the end of the file is reached. To ensure all data is retrieved, an application must continue to call the InternetReadFile function until the function returns TRUE and the lpdwNumberOfBytesRead parameter equals zero.
Basically, there is no guarantee that the function will read exactly dwNumberOfBytesToRead bytes. Check how many bytes were actually read using the lpdwNumberOfBytesRead parameter.
Moreover, as soon as the total file size is larger than dwNumberOfBytesToRead, you will need to invoke the call multiple times, because it cannot read more than dwNumberOfBytesToRead at once.
If you have the total file size in advance, the loop takes the following form:
// Reads a resource whose total size (SIZE) is known in advance into `data`,
// calling InternetReadFile repeatedly until the buffer is full, a read
// returns zero bytes, or an error occurs.
::DWORD error = ERROR_SUCCESS;
::BYTE data[SIZE]; // destination buffer; SIZE is the expected total file size.
::DWORD size = 0; // number of bytes accumulated in `data` so far.
::DWORD read = 0; // number of bytes returned by the latest call.
do {
// Append at offset `size` and never ask for more than the remaining space.
::BOOL result = ::InternetReadFile(stream, data+size, SIZE-size, &read);
if ( result == FALSE ) {
error = ::GetLastError();
}
}
// The conditions are evaluated left to right: `size` is only advanced by
// `read` when no error occurred and the last call returned data.
while ((error == ERROR_SUCCESS) && (read > 0) && ((size+=read) < SIZE));
// check that `SIZE` was correct: fewer bytes than expected means the transfer
// stopped early (error, or the resource is shorter than SIZE).
if (size != SIZE) {
}
If not, then you need to write the data in the buffer to another file instead of accumulating it.
EDIT (SAMPLE TEST PROGRAM):
Here's a complete program that fetches StackOverflow's front page. This downloads about 200K of HTML code in 1K chunks and the full page is retrieved. Can you run this and see if it works?
#include <Windows.h>
#include <Wininet.h>
#include <iostream>
#include <fstream>
namespace {
// Opens a WinINet session using a direct connection (no proxy).
// Returns the session handle, or 0 after reporting the error code on std::cerr.
::HINTERNET netstart ()
{
    const ::HINTERNET session =
        ::InternetOpenW(0, INTERNET_OPEN_TYPE_DIRECT, 0, 0, 0);
    if ( session != 0 )
    {
        return (session);
    }

    const ::DWORD code = ::GetLastError();
    std::cerr << "InternetOpen(): " << code << "." << std::endl;
    return (session);
}
// Closes a WinINet handle and reports the error code on std::cerr when the
// close fails.
void netclose ( ::HINTERNET object )
{
    if ( ::InternetCloseHandle(object) != FALSE )
    {
        return;
    }

    const ::DWORD code = ::GetLastError();
    std::cerr << "InternetClose(): " << code << "." << std::endl;
}
// Opens `url` within `session`.
// Returns the resource handle, or 0 after reporting the error code on std::cerr.
::HINTERNET netopen ( ::HINTERNET session, ::LPCWSTR url )
{
    const ::HINTERNET resource =
        ::InternetOpenUrlW(session, url, 0, 0, 0, 0);
    if ( resource != 0 )
    {
        return (resource);
    }

    const ::DWORD code = ::GetLastError();
    std::cerr << "InternetOpenUrl(): " << code << "." << std::endl;
    return (resource);
}
// Copies everything readable from `istream` into `ostream`, one fixed-size
// chunk at a time. Stops after the first read that fails (the error code is
// reported on std::cerr) or that returns zero bytes.
void netfetch ( ::HINTERNET istream, std::ostream& ostream )
{
    static const ::DWORD CHUNK = 1024;
    ::BYTE chunk[CHUNK];
    ::DWORD status = ERROR_SUCCESS;
    ::DWORD received = 0;

    for (;;)
    {
        if ( ::InternetReadFile(istream, chunk, CHUNK, &received) == FALSE )
        {
            status = ::GetLastError();
            std::cerr << "InternetReadFile(): " << status << "." << std::endl;
        }

        // Written unconditionally, using whatever byte count the call reported.
        ostream.write((const char*)chunk, received);

        if ( (status != ERROR_SUCCESS) || (received == 0) )
        {
            break;
        }
    }
}
}
int main ( int, char ** )
{
const ::WCHAR URL[] = L"http://stackoverflow.com/";
const ::HINTERNET session = ::netstart();
if ( session != 0 )
{
const ::HINTERNET istream = ::netopen(session, URL);
if ( istream != 0 )
{
std::ofstream ostream("output.txt", std::ios::binary);
if ( ostream.is_open() ) {
::netfetch(istream, ostream);
}
else {
std::cerr << "Could not open 'output.txt'." << std::endl;
}
::netclose(istream);
}
::netclose(session);
}
}
#pragma comment ( lib, "Wininet.lib" )
Given the key for some registry value (e.g. HKEY_LOCAL_MACHINE\blah\blah\blah\foo) how can I:
Safely determine that such a key exists.
Programmatically (i.e. with code) get its value.
I have absolutely no intention of writing anything back to the registry (for the duration of my career if I can help it). So we can skip the lecture about every molecule in my body exploding at the speed of light if I write to the registry incorrectly.
Prefer answers in C++, but mostly just need to know what the special Windows API incantation to get at the value is.
Here is some pseudo-code to retrieve the following:
If a registry key exists
What the default value is for that registry key
What a string value is
What a DWORD value is
Example code:
Include the library dependency: Advapi32.lib
// Usage example: opens HKEY_LOCAL_MACHINE\SOFTWARE\Perl read-only and reads
// two string values through GetStringRegKey.
HKEY hKey;
LONG lRes = RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"SOFTWARE\\Perl", 0, KEY_READ, &hKey);
// True when the key was opened successfully.
bool bExistsAndSuccess (lRes == ERROR_SUCCESS);
// True when the open failed specifically because the key was not found.
bool bDoesNotExistsSpecifically (lRes == ERROR_FILE_NOT_FOUND);
std::wstring strValueOfBinDir;
std::wstring strKeyDefaultValue;
// NOTE(review): both reads run even when the open failed, and hKey is not
// closed in this fragment - presumably RegCloseKey(hKey) is expected once the
// caller is done; confirm in the surrounding code.
// Named value "BinDir"; L"bad" is the fallback stored on failure.
GetStringRegKey(hKey, L"BinDir", strValueOfBinDir, L"bad");
// An empty value name requests the key's default (unnamed) value.
GetStringRegKey(hKey, L"", strKeyDefaultValue, L"bad");
// Reads the value strValueName of an open key into nValue.
// nValue receives nDefaultValue whenever the query does not succeed.
// Returns the status code of RegQueryValueExW.
LONG GetDWORDRegKey(HKEY hKey, const std::wstring &strValueName, DWORD &nValue, DWORD nDefaultValue)
{
    DWORD cbData = sizeof(DWORD);
    DWORD dwData = 0;
    const LONG lStatus = ::RegQueryValueExW(hKey,
                                            strValueName.c_str(),
                                            0,
                                            NULL,
                                            reinterpret_cast<LPBYTE>(&dwData),
                                            &cbData);

    nValue = (lStatus == ERROR_SUCCESS) ? dwData : nDefaultValue;
    return lStatus;
}
// Reads the value strValueName of an open key as a boolean (any non-zero
// DWORD is true).
// bValue receives bDefaultValue whenever the query does not succeed, matching
// the behaviour of GetDWORDRegKey and GetStringRegKey.
// Returns the status code reported by GetDWORDRegKey.
LONG GetBoolRegKey(HKEY hKey, const std::wstring &strValueName, bool &bValue, bool bDefaultValue)
{
    // Assign the fallback first so the output is never left untouched on failure.
    bValue = bDefaultValue;

    const DWORD nDefValue = bDefaultValue ? 1 : 0;
    DWORD nResult = nDefValue;
    // The name is forwarded as-is; going through c_str() would only build a
    // temporary copy of the string.
    const LONG nError = GetDWORDRegKey(hKey, strValueName, nResult, nDefValue);
    if (ERROR_SUCCESS == nError)
    {
        bValue = (nResult != 0);
    }
    return nError;
}
// Reads the value strValueName of an open key as a wide string.
// strValue receives strDefaultValue whenever the query does not succeed.
// Values that need more than 512 wide characters (terminator included) are
// not read: the call then reports ERROR_MORE_DATA and the default is kept.
// Returns the status code of RegQueryValueExW.
LONG GetStringRegKey(HKEY hKey, const std::wstring &strValueName, std::wstring &strValue, const std::wstring &strDefaultValue)
{
    strValue = strDefaultValue;

    // One extra element beyond what the API may fill: registry strings are not
    // guaranteed to be stored with a terminating null, so one is always added.
    const DWORD nMaxChars = 512;
    WCHAR szBuffer[nMaxChars + 1];
    DWORD dwBufferSize = nMaxChars * sizeof(WCHAR);

    const LONG nError = ::RegQueryValueExW(hKey, strValueName.c_str(), 0, NULL,
                                           reinterpret_cast<LPBYTE>(szBuffer), &dwBufferSize);
    if (ERROR_SUCCESS == nError)
    {
        // dwBufferSize now holds the number of bytes actually copied.
        szBuffer[dwBufferSize / sizeof(WCHAR)] = L'\0';
        strValue = szBuffer;
    }
    return nError;
}
// Usage example with the CRegKey wrapper: opens a key under
// HKEY_LOCAL_MACHINE and reads one DWORD value from it.
// Registry path of the key (relative to HKEY_LOCAL_MACHINE).
const CString REG_SW_GROUP_I_WANT = _T("SOFTWARE\\My Corporation\\My Package\\Group I want");
// Name of the value to read inside that key.
const CString REG_KEY_I_WANT= _T("Key Name");
CRegKey regKey;
DWORD dwValue = 0;
// On any failure the error is logged and control jumps to the Function_Exit
// label, which lives in the enclosing function (not shown in this fragment).
if(ERROR_SUCCESS != regKey.Open(HKEY_LOCAL_MACHINE, REG_SW_GROUP_I_WANT))
{
m_pobLogger->LogError(_T("CRegKey::Open failed in Method"));
regKey.Close();
goto Function_Exit;
}
// NOTE(review): this QueryValue(DWORD&, name) overload appears to be
// superseded by QueryDWORDValue(name, DWORD&) in newer ATL - confirm.
if( ERROR_SUCCESS != regKey.QueryValue( dwValue, REG_KEY_I_WANT))
{
m_pobLogger->LogError(_T("CRegKey::QueryValue Failed in Method"));
regKey.Close();
goto Function_Exit;
}
// dwValue has the stuff now - use for further processing
Since Windows >=Vista/Server 2008, RegGetValue is available, which is a safer function than RegQueryValueEx. No need for RegOpenKeyEx, RegCloseKey or NUL termination checks of string values (REG_SZ, REG_MULTI_SZ, REG_EXPAND_SZ).
#include <iostream>
#include <string>
#include <exception>
#include <windows.h>
/*! \brief Returns a REG_SZ value from HKEY_LOCAL_MACHINE as a string.

    The value is read with RegGetValueW into a buffer that is grown until the
    call stops reporting ERROR_MORE_DATA.

    \param regSubKey Path of the key, relative to HKEY_LOCAL_MACHINE.
    \param regValue  Name of the value to read.
    \return The value's text without its terminating null character.
    \exception std::runtime_error Carries the Windows error code of the failed
               call. Replace with your error handling.
*/
std::wstring GetStringValueFromHKLM(const std::wstring& regSubKey, const std::wstring& regValue)
{
    size_t bufferSize = 0xFFF; // In wchar_t units. If too small, will be resized down below.
    std::wstring valueBuf; // Contiguous buffer since C++11.
    valueBuf.resize(bufferSize);
    auto cbData = static_cast<DWORD>(bufferSize * sizeof(wchar_t));
    auto rc = RegGetValueW(
        HKEY_LOCAL_MACHINE,
        regSubKey.c_str(),
        regValue.c_str(),
        RRF_RT_REG_SZ,
        nullptr,
        static_cast<void*>(valueBuf.data()),
        &cbData
    );
    while (rc == ERROR_MORE_DATA)
    {
        // cbData now holds the required size in bytes; round up to whole characters.
        const size_t requiredChars = (static_cast<size_t>(cbData) + sizeof(wchar_t) - 1) / sizeof(wchar_t);
        // Use the reported size when it is larger, otherwise just double
        // (the value may have changed between the two calls).
        bufferSize = (requiredChars > bufferSize) ? requiredChars : bufferSize * 2;
        valueBuf.resize(bufferSize);
        // The size handed to the API is always expressed in bytes.
        cbData = static_cast<DWORD>(bufferSize * sizeof(wchar_t));
        rc = RegGetValueW(
            HKEY_LOCAL_MACHINE,
            regSubKey.c_str(),
            regValue.c_str(),
            RRF_RT_REG_SZ,
            nullptr,
            static_cast<void*>(valueBuf.data()),
            &cbData
        );
    }
    if (rc != ERROR_SUCCESS)
    {
        throw std::runtime_error("Windows system error code: " + std::to_string(rc));
    }

    // The reported byte count includes the terminating null character; drop it,
    // guarding against a zero count so the subtraction cannot wrap around.
    const size_t storedChars = static_cast<size_t>(cbData) / sizeof(wchar_t);
    valueBuf.resize(storedChars > 0 ? storedChars - 1 : 0);
    return valueBuf;
}
// Reads "MyValue" from the application's key under HKEY_LOCAL_MACHINE and
// prints it; a failure is reported on std::cerr and an empty string is printed.
int main()
{
#ifdef _WIN64 // Manually switching between 32bit/64bit for the example. Use dwFlags instead.
    const std::wstring subKey(L"SOFTWARE\\WOW6432Node\\Company Name\\Application Name\\");
#else
    const std::wstring subKey(L"SOFTWARE\\Company Name\\Application Name\\");
#endif
    const std::wstring valueName = L"MyValue";

    std::wstring result;
    try
    {
        result = GetStringValueFromHKLM(subKey, valueName);
    }
    catch (std::exception& failure)
    {
        std::cerr << failure.what();
    }

    std::wcout << result;
    return 0;
}
Its parameter dwFlags supports flags for type restriction, filling the value buffer with zeros on failure (RRF_ZEROONFAILURE) and 32/64bit registry access (RRF_SUBKEY_WOW6464KEY, RRF_SUBKEY_WOW6432KEY) for 64bit programs.
The pair RegOpenKeyEx and RegQueryValueEx will do the trick.
If you use MFC, the CRegKey class is an even easier solution.
RegQueryValueEx
This gives the value if it exists, and returns an error code ERROR_FILE_NOT_FOUND if the key doesn't exist.
(I can't tell if my link is working or not, but if you just google for "RegQueryValueEx" the first hit is the msdn documentation.)
Typically the registry key and value name are constants in the program. If so, here is an example of how to read the DWORD registry value Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled:
#include <windows.h>
// Reads the DWORD value LongPathsEnabled directly from a predefined root key
// with RegGetValueA and prints it.
DWORD val;
DWORD dataSize = sizeof(val);
if (ERROR_SUCCESS == RegGetValueA(HKEY_LOCAL_MACHINE, "SYSTEM\\CurrentControlSet\\Control\\FileSystem", "LongPathsEnabled", RRF_RT_DWORD, nullptr /*type not required*/, &val, &dataSize)) {
    // DWORD is an unsigned long, so it is printed with %lu.
    printf("Value is %lu\n", val);
    // no CloseKey needed because it is a predefined registry key
}
else {
    printf("Error reading.\n");
}
To adapt for other value types, see https://learn.microsoft.com/en-us/windows/win32/api/winreg/nf-winreg-reggetvaluea for complete spec.
This console app will list all the values and their data from a registry key for most of the potential registry values. There's some weird ones not often used. If you need to support all of them, expand from this example while referencing this Registry Value Type documentation.
Let this be the registry key content you can import from a .reg file format:
Windows Registry Editor Version 5.00
[HKEY_CURRENT_USER\added\subkey]
"String_Value"="hello, world!"
"Binary_Value"=hex:01,01,01,01
"Dword value"=dword:00001224
"QWord val"=hex(b):24,22,12,00,00,00,00,00
"multi-line val"=hex(7):4c,00,69,00,6e,00,65,00,20,00,30,00,00,00,4c,00,69,00,\
6e,00,65,00,20,00,31,00,00,00,4c,00,69,00,6e,00,65,00,20,00,32,00,00,00,00,\
00
"expanded_val"=hex(2):25,00,55,00,53,00,45,00,52,00,50,00,52,00,4f,00,46,00,49,\
00,4c,00,45,00,25,00,5c,00,6e,00,65,00,77,00,5f,00,73,00,74,00,75,00,66,00,\
66,00,00,00
The console app itself:
#include <Windows.h>
#include <iostream>
#include <string>
#include <locale>
#include <vector>
#include <iomanip>
int wmain()
{
const auto hKey = HKEY_CURRENT_USER;
constexpr auto lpSubKey = TEXT("added\\subkey");
auto openedKey = HKEY();
auto status = RegOpenKeyEx(hKey, lpSubKey, 0, KEY_READ, &openedKey);
if (status == ERROR_SUCCESS) {
auto valueCount = static_cast<DWORD>(0);
auto maxNameLength = static_cast<DWORD>(0);
auto maxValueLength = static_cast<DWORD>(0);
status = RegQueryInfoKey(openedKey, NULL, NULL, NULL, NULL, NULL, NULL,
&valueCount, &maxNameLength, &maxValueLength, NULL, NULL);
if (status == ERROR_SUCCESS) {
DWORD type = 0;
DWORD index = 0;
std::vector<wchar_t> valueName = std::vector<wchar_t>(maxNameLength + 1);
std::vector<BYTE> dataBuffer = std::vector<BYTE>(maxValueLength);
for (DWORD index = 0; index < valueCount; index++) {
DWORD charCountValueName = static_cast<DWORD>(valueName.size());
DWORD charBytesData = static_cast<DWORD>(dataBuffer.size());
status = RegEnumValue(openedKey, index, valueName.data(), &charCountValueName,
NULL, &type, dataBuffer.data(), &charBytesData);
if (type == REG_SZ) {
const auto reg_string = reinterpret_cast<wchar_t*>(dataBuffer.data());
std::wcout << L"Type: REG_SZ" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData : " << reg_string << std::endl;
}
else if (type == REG_EXPAND_SZ) {
const auto casted = reinterpret_cast<wchar_t*>(dataBuffer.data());
TCHAR buffer[32000];
ExpandEnvironmentStrings(casted, buffer, 32000);
std::wcout << L"Type: REG_EXPAND_SZ" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData: " << buffer << std::endl;
}
else if (type == REG_MULTI_SZ) {
std::vector<std::wstring> lines;
const auto str = reinterpret_cast<wchar_t*>(dataBuffer.data());
auto line = str;
lines.emplace_back(line);
for (auto i = 0; i < charBytesData / sizeof(wchar_t) - 1; i++) {
const auto c = str[i];
if (c == 0) {
line = str + i + 1;
const auto new_line = reinterpret_cast<wchar_t*>(line);
if (wcsnlen_s(new_line, 1024) > 0)
lines.emplace_back(new_line);
}
}
std::wcout << L"Type: REG_MULTI_SZ" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData: " << std::endl;
for (size_t i = 0; i < lines.size(); i++) {
std::wcout << L"\t\tLine[" << i + 1 << L"]: " << lines[i] << std::endl;
}
}
if (type == REG_DWORD) {
const auto dword_value = reinterpret_cast<unsigned long*>(dataBuffer.data());
std::wcout << L"Type: REG_DWORD" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData : " << std::to_wstring(*dword_value) << std::endl;
}
else if (type == REG_QWORD) {
const auto qword_value = reinterpret_cast<unsigned long long*>(dataBuffer.data());
std::wcout << L"Type: REG_DWORD" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData : " << std::to_wstring(*qword_value) << std::endl;
}
else if (type == REG_BINARY) {
std::vector<uint16_t> bins;
for (auto i = 0; i < charBytesData; i++) {
bins.push_back(static_cast<uint16_t>(dataBuffer[i]));
}
std::wcout << L"Type: REG_BINARY" << std::endl;
std::wcout << L"\tName: " << valueName.data() << std::endl;
std::wcout << L"\tData:";
for (size_t i = 0; i < bins.size(); i++) {
std::wcout << L" " << std::uppercase << std::hex << \
std::setw(2) << std::setfill(L'0') << std::to_wstring(bins[i]);
}
std::wcout << std::endl;
}
}
}
}
RegCloseKey(openedKey);
return 0;
}
Expected console output:
Type: REG_SZ
Name: String_Value
Data : hello, world!
Type: REG_BINARY
Name: Binary_Value
Data: 01 01 01 01
Type: REG_DWORD
Name: Dword value
Data : 4644
Type: REG_DWORD
Name: QWord val
Data : 1188388
Type: REG_MULTI_SZ
Name: multi-line val
Data:
Line[1]: Line 0
Line[2]: Line 1
Line[3]: Line 2
Type: REG_EXPAND_SZ
Name: expanded_val
Data: C:\Users\user name\new_stuff
#include <windows.h>
#include <map>
#include <string>
#include <stdio.h>
#include <string.h>
#include <tr1/stdint.h>
using namespace std;
// Prints the system message text for the Windows error code dwerror.
// The text goes to the file `fout` when `isOut` is set, to stdout otherwise.
// When no message can be retrieved for the code, the raw code is printed
// instead.
void printerr(DWORD dwerror) {
    // The ANSI variant is requested explicitly because the text is printed
    // with the narrow "%s" conversion below.
    LPSTR lpMsgBuf = NULL;
    const DWORD length = FormatMessageA(
        FORMAT_MESSAGE_ALLOCATE_BUFFER |
        FORMAT_MESSAGE_FROM_SYSTEM |
        FORMAT_MESSAGE_IGNORE_INSERTS,
        NULL,
        dwerror,
        MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language
        (LPSTR) &lpMsgBuf, // with ALLOCATE_BUFFER this receives the allocated pointer
        0,
        NULL
    );

    if (length == 0 || lpMsgBuf == NULL) {
        // No message available: the buffer was not allocated and must not be read.
        if (isOut) {
            fprintf(fout, "Unknown error 0x%lx\n", dwerror);
        } else {
            printf("Unknown error 0x%lx\n", dwerror);
        }
        return;
    }

    // Display the string.
    if (isOut) {
        fprintf(fout, "%s\n", lpMsgBuf);
    } else {
        printf("%s\n", lpMsgBuf);
    }

    // Free the buffer.
    LocalFree(lpMsgBuf);
}
// Reads a string value from the registry.
//
//   hkey         name of the root key ("HKEY_CLASSES_ROOT", "HKEY_CURRENT_CONFIG",
//                "HKEY_CURRENT_USER", "HKEY_LOCAL_MACHINE" or "HKEY_USERS")
//   subkey       path of the key below that root
//   value        name of the value to read
//   returnvalue  receives the data up to its first null character
//   regValueType "REG_SZ", "REG_EXPAND_SZ" or "REG_MULTI_SZ"; the registry API
//                overwrites the type variable with the stored type, so this
//                argument does not restrict what is read
//
// Returns true on success. On failure a diagnostic is printed (to `fout` when
// `isOut` is set, to stdout otherwise), returnvalue is left untouched and
// false is returned. The opened key is closed on every path.
bool regreadSZ(string& hkey, string& subkey, string& value, string& returnvalue, string& regValueType) {
    char s[128000];
    map<string,HKEY> keys;
    keys["HKEY_CLASSES_ROOT"]=HKEY_CLASSES_ROOT;
    keys["HKEY_CURRENT_CONFIG"]=HKEY_CURRENT_CONFIG; //DID NOT SURVIVE?
    keys["HKEY_CURRENT_USER"]=HKEY_CURRENT_USER;
    keys["HKEY_LOCAL_MACHINE"]=HKEY_LOCAL_MACHINE;
    keys["HKEY_USERS"]=HKEY_USERS;
    HKEY mykey;
    map<string,DWORD> valuetypes;
    valuetypes["REG_SZ"]=REG_SZ;
    valuetypes["REG_EXPAND_SZ"]=REG_EXPAND_SZ;
    valuetypes["REG_MULTI_SZ"]=REG_MULTI_SZ; //probably can't use this.

    LONG retval=RegOpenKeyEx(
        keys[hkey],        // handle to open key
        subkey.c_str(),    // subkey name
        0,                 // reserved
        KEY_READ,          // security access mask
        &mykey             // handle to open key
    );
    if (ERROR_SUCCESS != retval) {printerr(retval); return false;}

    // One byte is held back so the data can always be null-terminated below,
    // even when the value was stored without a terminator.
    DWORD slen=sizeof(s) - 1;
    DWORD valuetype = valuetypes[regValueType];
    retval=RegQueryValueEx(
        mykey,                  // handle to key
        value.c_str(),          // value name
        NULL,                   // reserved
        (LPDWORD) &valuetype,   // type buffer
        (LPBYTE)s,              // data buffer
        (LPDWORD) &slen         // size of data buffer
    );

    // The key is no longer needed whatever the outcome; closing it here keeps
    // the error returns below from leaking the handle.
    const LONG closeval=RegCloseKey(mykey);

    switch(retval) {
    case ERROR_SUCCESS:
        break;
    case ERROR_MORE_DATA:
        // The value does not fit in the fixed-size buffer.
        if (isOut) {
            fprintf(fout,"RegQueryValueEx():ERROR_MORE_DATA: need bigger buffer.\n");
        } else {
            printf("RegQueryValueEx():ERROR_MORE_DATA: need bigger buffer.\n");
        }
        return false;
    case ERROR_FILE_NOT_FOUND:
        if (isOut) {
            fprintf(fout,"RegQueryValueEx():ERROR_FILE_NOT_FOUND: registry value does not exist.\n");
        } else {
            printf("RegQueryValueEx():ERROR_FILE_NOT_FOUND: registry value does not exist.\n");
        }
        return false;
    default:
        if (isOut) {
            fprintf(fout,"RegQueryValueEx():unknown error type 0x%lx.\n", retval);
        } else {
            printf("RegQueryValueEx():unknown error type 0x%lx.\n", retval);
        }
        return false;
    }

    if (ERROR_SUCCESS != closeval) {printerr(closeval); return false;}

    // slen now holds the number of bytes copied into s.
    s[slen] = '\0';
    returnvalue = s;
    return true;
}