Why is MD5Sum so fast - c++

I've been studying hashing in C/C++ and tried to replicate the md5sum command in Linux. After analysing the source code, it seems that md5sum relies on the md5 library's md5_stream. I've approximated the md5_stream function from the md5.h library into the code below, and it runs in ~13-14 seconds. I've tried to call the md5_stream function directly and got ~13-14 seconds. The md5sum runs in 4 seconds. What have the GNU people done to get the speed out of the code?
The md5.h/md5.c code is available in the CoreUtils source code.
#include <QtCore/QCoreApplication>
#include <QtCore/QDebug>
#include <iostream>
#include <iomanip>
#include <fstream>
#include "md5.h"
#define BLOCKSIZE 32784
int main()
{
FILE *fpinput, *fpoutput;
if ((fpinput = fopen("/dev/sdb", "rb")) == 0) {
throw std::runtime_error("input file doesn't exist");
}
struct md5_ctx ctx;
size_t sum;
char *buffer = (char*)malloc (BLOCKSIZE + 72);
unsigned char *resblock = (unsigned char*)malloc (16);
if (!buffer)
return 1;
md5_init_ctx (&ctx);
size_t n;
sum = 0;
while (!ferror(fpinput) && !feof(fpinput)) {
n = fread (buffer + sum, 1, BLOCKSIZE - sum, fpinput);
if (n == 0){
break;
}
sum += n;
if (sum == BLOCKSIZE) {
md5_process_block (buffer, BLOCKSIZE, &ctx);
sum = 0;
}
}
if (n == 0 && ferror (fpinput)) {
free (buffer);
return 1;
}
/* Process any remaining bytes. */
if (sum > 0){
md5_process_bytes (buffer, sum, &ctx);
}
/* Construct result in desired memory. */
md5_finish_ctx (&ctx, resblock);
free (buffer);
for (int x = 0; x < 16; ++x){
std::cout << std::setfill('0') << std::setw(2) << std::hex << static_cast<uint16_t>(resblock[x]);
std::cout << " ";
}
std::cout << std::endl;
free(resblock);
return 0;
}
EDIT: Was a default mkspec problem in Fedora 19 64-bit.

fread() is convenient, but don't use fread() if you care about performance. fread() will copy from the OS to a libc buffer, then to your buffer. This extra copying cost CPU cycles and cache.
For better performance use open() then read() to avoid the extra copy. Make sure your read() calls are multiples of the block size, but lower than your CPU cache size.
For best performance use mmap() map the disk directly to RAM.
If you try something like the below code, it should go faster.
// compile gcc mmap_md5.c -lgcrypt
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <gcrypt.h>
#include <linux/fs.h> // ioctl
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
int main(int argc, char *argv[])
{
char *addr;
int fd;
struct stat sb;
off_t offset, pa_offset;
size_t length;
ssize_t s;
unsigned char digest[16];
char digest_ascii[32+1] = {0,};
int digest_length = gcry_md_get_algo_dlen (GCRY_MD_MD5);
int i;
if (argc < 3 || argc > 4) {
fprintf(stderr, "%s file offset [length]\n", argv[0]);
exit(EXIT_FAILURE);
}
fd = open(argv[1], O_RDONLY);
if (fd == -1)
handle_error("open");
if (fstat(fd, &sb) == -1) /* To obtain file size */
handle_error("fstat");
offset = atoi(argv[2]);
pa_offset = offset & ~(sysconf(_SC_PAGE_SIZE) - 1);
if (sb.st_mode | S_IFBLK ) {
// block device. use ioctl to find length
ioctl(fd, BLKGETSIZE64, &length);
} else {
/* offset for mmap() must be page aligned */
if (offset >= sb.st_size) {
fprintf(stderr, "offset is past end of file size=%zd, offset=%d\n", sb.st_size, (int) offset);
exit(EXIT_FAILURE);
}
if (argc == 4) {
length = atoi(argv[3]);
if (offset + length > sb.st_size)
length = sb.st_size - offset;
/* Canaqt display bytes past end of file */
} else { /* No length arg ==> display to end of file */
length = sb.st_size - offset;
}
}
printf("length= %zd\n", length);
addr = mmap(NULL, length + offset - pa_offset, PROT_READ,
MAP_PRIVATE, fd, pa_offset);
if (addr == MAP_FAILED)
handle_error("mmap");
gcry_md_hash_buffer(GCRY_MD_MD5, digest, addr + offset - pa_offset, length);
for (i=0; i < digest_length; i++) {
sprintf(digest_ascii+(i*2), "%02x", digest[i]);
}
printf("hash=%s\n", digest_ascii);
exit(EXIT_SUCCESS);
}

It turned out to be an error in the Qt mkspecs regarding an optimization flag not being set properly.

Related

Linux memory mapped file consuming more disk than expected

Context: I'm using memory mapped file in my code created using ACE_Mem_Map. It is observed that the memory mapped file is consuming more disk space than expected.
Scenario:
I have a structure containing a char array of 15KB. I have created a memory map file for array of this struct with file size ~2GB.
If I try to access few bytes of the char array(say 256), then, file size consumed is shown as 521 MB but actual disk usage shown by filesystem(using df -h) is more than 3GB.
If I access all bytes of the memory, then both file size and disk usage is shown as 2 GB.
Environment:
OS: Oracle Linux 7.3
Kernel version: 3.10.0/4.1.12
Code:
#include<ace/Mem_Map.h>
#include <stdio.h>
#define TEST_BUFF_SIZE 15*1024
typedef struct _test_struct_ {
char test[TEST_BUFF_SIZE];
_test_struct_() {
reset();
}
void reset() {
/* Issue replicating */
memset(test, '\0', 256);
/* Issue not replicating */
memset(test, '\0', TEST_BUFF_SIZE);
}
}TestStruct_t;
int main(int argc, char *argv[]) {
if(3 != argc) {
printf("Usage: %s <num of blocks> <filename>\n",
argv[0]);
return -1;
}
ACE_Mem_Map map_buf_;
size_t num_of_blocks = strtoull(argv[1], NULL, 10);
size_t MAX_SIZE = num_of_blocks*sizeof(TestStruct_t);
char* mmap_file_name = argv[2];
printf("num_of_blocks[%llu], sizeof(TestStruct_t)[%llu], MAX_SIZE[%llu], mmap_file_name[%s]\n",
num_of_blocks,
sizeof(TestStruct_t),
MAX_SIZE,
mmap_file_name);
TestStruct_t *base_addr_;
ACE_HANDLE fp_ = ACE_OS::open(mmap_file_name,O_RDWR|O_CREAT,
ACE_DEFAULT_OPEN_PERMS,0);
if (fp_ == ACE_INVALID_HANDLE)
{
printf("Error opening file\n");
return -1;
}
map_buf_.map(fp_,MAX_SIZE,PROT_WRITE,MAP_SHARED);
base_addr_ = (TestStruct_t*)map_buf_.addr();
if (base_addr_ == MAP_FAILED)
{
printf("Map init failure\n");
ACE_OS::close(fp_);
return -1;
}
printf("map_buf_ size[%llu]\n",
map_buf_.size());
for(size_t i = 0; i < num_of_blocks; i++) {
base_addr_[i].reset();
}
return 0;
}
Can anyone explain why is scenario 1 happening??
Note: In scenario 1, if I make a copy of generated mmap file and then delete that copy, then the additional 2.5GB disk space gets freed. Don't know the reason
I 'upgraded' your program to nearly C and minus whatever ACE is and got this:
$ ./a.out 32 fred
num_of_blocks[32], sizeof(TestStruct_t)[15360], MAX_SIZE[491520], mmap_file_name[fred]
Bus error: 10
Which is pretty much expected. Mmap does not extend the size of the mapped file, so it generates an address error when you try to reference an unfilled part.
So, the answer is that whatever ACE.map does, it likely invokes something like ftruncate(2) to extend the file to the size you give as a parameter. #John Bollinger hints at this by asking how are you measuring that: ls or du. You should use the latter.
Anyway, almost C version:
#include <sys/mman.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#define TEST_BUFF_SIZE 15*1024
typedef struct _test_struct_ {
char test[TEST_BUFF_SIZE];
_test_struct_() {
reset();
}
void reset() {
/* Issue replicating */
memset(test, '\0', 256);
/* Issue not replicating */
memset(test, '\0', TEST_BUFF_SIZE);
}
}TestStruct_t;
int main(int argc, char *argv[]) {
if(argc < 3) {
printf("Usage: %s <num of blocks> <filename>\n",
argv[0]);
return 1;
}
void *buf;
size_t num_of_blocks = strtoull(argv[1], NULL, 10);
size_t MAX_SIZE = num_of_blocks*sizeof(TestStruct_t);
char* mmap_file_name = argv[2];
printf("num_of_blocks[%zu], sizeof(TestStruct_t)[%zu], MAX_SIZE[%zu], mmap_file_name[%s]\n",
num_of_blocks,
sizeof(TestStruct_t),
MAX_SIZE,
mmap_file_name);
int fp = open(mmap_file_name,O_RDWR|O_CREAT,0666);
if (fp == -1)
{
perror("Error opening file");
return 1;
}
/*SOMETHING CLEVER*/
switch (argc) {
case 3:
break;
case 4:
if (ftruncate(fp, MAX_SIZE) != 0) {
perror("ftruncate");
return 1;
}
break;
case 5:
if (lseek(fp, MAX_SIZE-1, SEEK_SET) != MAX_SIZE-1 ||
write(fp, "", 1) != 1) {
perror("seek,write");
return 1;
}
}
void *b = mmap(0, MAX_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fp, 0);
if (b == MAP_FAILED)
{
perror("Map init failure");
return 1;
}
TestStruct_t *base_addr = (TestStruct_t *)b;
for(size_t i = 0; i < num_of_blocks; i++) {
base_addr[i].reset();
}
return 0;
}
The SOMETHING CLEVER bit allows you to either work with an empty file (argc == 3), grow it with ftruncate (argc == 4), or grow it with lseek && write (argc == 5).
On UNIX-y systems, ftruncate may or may not reserve space for your file; a lengthened file without reserved space is called sparce. Almost universally, the lseek && write will create a sparse file, unless your system doesn't support that.
The sparce file will allocate actual disk blocks as you write to it, however, if it fails, it will deliver a signal whereas the pre-allocated one will not.
Your loop at the bottom walks the whole extent, so the file will always be grown; reduce that loop and you can see if the options make a difference on your system.

direct access to HDD

I want to print out boot sector using code below, but there is mistake.
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <math.h>
using namespace std;
short ReadSect
(const char * _dsk, // disk to access
char *&_buff, // buffer where sector will be stored
unsigned int _nsect // sector number, starting with 0
)
{
DWORD dwRead;
HANDLE
hDisk=CreateFile(_dsk,GENERIC_READ,FILE_SHARE_READ,0,OPEN_EXISTING,0,0);
if(hDisk==INVALID_HANDLE_VALUE) // this may happen if another program is
already reading from disk
{
CloseHandle(hDisk);
return 1;
}
SetFilePointer(hDisk,_nsect*512,0,FILE_BEGIN); // which sector to read
ReadFile(hDisk,_buff,512,&dwRead,0); // read sector
CloseHandle(hDisk);
return 0;
}
int main()
{
char * drv="\\\\.\\C:";
char *dsk=" \\\\.\\PhysicalDrive0";
int sector=0;
int b = 1;
char *buff=new char[512];
ReadSect(dsk,buff,sector);
if((unsigned char)buff[510]==0x55 && (unsigned char)buff[511]==0xaa) cout
<<"Disk is bootable!"<<endl;
else printf("%02hhX\n",(unsigned int)(unsigned char)buff[511]);
printf("\n");
while (b<513)
{
if (b%16==0)
printf(" %02hhX\n",(unsigned int)(unsigned char)buff[b-1]);
else
printf (" %02hhX ",(unsigned int)(unsigned char)buff[b-1]);
b++;
}
getchar();
}
Instead of printing hexadecimal digits of boot sectors, microsoft visual studio prints out stream of "CDs". What is the mistake and how to fix the problem? Can anyone help?
Photo of output
First of all, start it as Administrator
remove space from
char *dsk=" \\\\.\\PhysicalDrive0";
must be
char *dsk="\\\\.\\PhysicalDrive0";
And, if you use char *dsk, make modification in:
hDisk=CreateFile(_dsk,GENERIC_READ,FILE_SHARE_READ,0,OPEN_EXISTING,0,0);
must be:
CreateFileA (......)
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <math.h>
using namespace std;
short ReadSect
(const char * _dsk, // disk to access
char *&_buff, // buffer where sector will be stored
unsigned int _nsect // sector number, starting with 0
)
{
DWORD dwRead;
HANDLE
hDisk = CreateFileA( _dsk, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, 0);
//CreateFile()
if (hDisk == INVALID_HANDLE_VALUE) // this may happen if another program is already reading from disk
{
CloseHandle(hDisk);
return 1;
}
SetFilePointer(hDisk, _nsect * 512, 0, FILE_BEGIN); // which sector to read
ReadFile(hDisk, _buff, 512, &dwRead, 0); // read sector
CloseHandle(hDisk);
return 0;
}
int main()
{
char * drv = "\\\\.\\C:";
char *dsk = "\\\\.\\PhysicalDrive0";
int sector = 0;
int b = 1;
char *buff = new char[512];
ReadSect(dsk, buff, sector);
if ((unsigned char)buff[510] == 0x55 && (unsigned char)buff[511] == 0xaa) cout
<< "Disk is bootable!" << endl;
else printf("%02hhX\n", (unsigned int)(unsigned char)buff[511]);
printf("\n");
while (b<513)
{
if (b % 16 == 0)
printf(" %02hhX\n", (unsigned int)(unsigned char)buff[b - 1]);
else
printf(" %02hhX ", (unsigned int)(unsigned char)buff[b - 1]);
b++;
}
getchar();
}
WinAPI Unicode and ANSI functions

Unix file descriptor

Today I found very interesting behavior of file descriptors in Linux. Look at that code:
#include <dirent.h> /* Defines DT_* constants */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <errno.h>
#define handle_error(msg) \
do { trace(msg); exit(0); } while (0)
#define trace printf
int createFile(const char* name) {
int r;
r = ::open( name, 0 );
if (r < 0)
{
trace("create file : %s\n", name);
r = ::open( name, O_CREAT, 0666 );
if (r < 0)
trace("error r < 0 %d\n",errno);
}
return r;
}
int createDir(const char* name) {
int r = ::mkdir( name, 0777 );
if (r != 0) {
trace("error r!=0\n");
}
r = open(name, 0);
if (r < 0) {
trace("error create dir r <0\n");
}
return r;
}
struct linux_dirent {
long d_ino;
off_t d_off;
unsigned short d_reclen;
char d_name[];
};
#include <sys/types.h>
#include <dirent.h>
void test123(int fd) {
int nread;
char buf[1024];
unsigned char buffer[1024];
struct linux_dirent *d;
int bpos,r;
char d_type;
if (fd == -1)
handle_error("open");
for ( ; ; ) {
nread = syscall(SYS_getdents, fd, buf, 1024);
if (nread == -1)
handle_error("getdents");
if (nread == 0)
break;
trace("--------------- nread=%d ---------------\n", nread);
trace("i-node# file type d_reclen d_off d_name\n");
for (bpos = 0; bpos < nread;) {
d = (struct linux_dirent *) (buf + bpos);
trace("%8ld ", d->d_ino);
d_type = *(buf + bpos + d->d_reclen - 1);
trace("%4d %10lld %s\n", d->d_reclen,
(long long) d->d_off, d->d_name);
bpos += d->d_reclen;
}
}
}
int main(int argc, const char * argv[]) {
int dir = createDir("test");
int file = createFile("test/file.gg");
test123(dir);
close(dir);
close(file);
return 0;
}
in that code I create folder, save its file descriptor, create file in that folder and after I want to print all files in that directory via file descriptors. However I get this output:
create file : test/file.gg
--------------- nread=32 ---------------
i-node# file type d_reclen d_off d_name
48879 16 1 .
48880 16 2 ..
There is no file.gg file in that folder. So, my question is - how it can be and how to work correctly with file descriptors? As I understand file descriptor is just an index in local for process table with all opened files and directories. But it is looks like that folder descriptor caches somehow files in that folder.
How to work correctly with descriptors in my case?
Try to do an fsync on your directory. You should open directory with O_RDONLY flags. O_WRONLY will fail. Create a file and sync may not sync metadata for this file. More informations in this article

Bad address in my write-function

I have to code this little task and can´t find my mistake. It should just read some data from a file and copy it in opposite order into another one. The first part seems to work, but the while-part gives me "Bad Address" for every time the write function is used. I´m grateful for every idea!
#include <iostream>
#include <cerrno>
#include <fcntl.h>
#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#define TRY(cmd,msg) { \
if ((cmd) < 0) { \
std::cerr << (msg) << ": " << strerror(errno) << std::endl; \
} \
}
int main(int argc, char *argv[]){
int fd_in, fd_out, rest;
off_t map_size, offset, length;
char * addr;
struct stat sb;
if (argc != 3) {
std::cerr << "Usage: kopfstand in out" << std::endl;
return -1;
}
if ((fd_in = open(argv[1],O_RDONLY)) == -1){
perror("open");
return -1;
}
if ((fd_out = open(argv[2],O_WRONLY|O_CREAT|O_TRUNC,S_IRUSR|S_IWUSR)) == -1){
close(fd_in);
perror("open");
return -1;
}
fstat(fd_in, &sb);
length = sb.st_size;
map_size = sysconf(_SC_PAGESIZE);
rest = length % map_size;
offset = length -rest;
if(rest != 0){
addr = (char*)mmap(NULL, rest, PROT_READ, MAP_PRIVATE, fd_in, offset);
if(addr == MAP_FAILED){
perror("Error mmaping the File");
}
for (int off = rest-1; off >= 0; --off){
TRY(write(fd_out, addr+off, 1),"write");
}
if(munmap((char*)addr, rest)== -1){
perror("munmap");
}
}
while(offset > 0){
offset =- map_size;
addr = (char*)mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd_in, offset);
if(addr == MAP_FAILED){
perror("Error mmaping the File");
}
for(int off = map_size-1; off >= 0; --off){
TRY(write(fd_out, addr+off, 1),"write");
}
if(munmap((char*)addr, map_size) == -1){
perror("munmap");
}
}
close(fd_in);
close(fd_out);
return 0;
}
You're going to kick yourself! (I kicked myself when I finally spotted the problem!)
This line:
offset =- map_size;
should be:
offset -= map_size;
See also What does =+ mean in C?
One of the comments to the question by Andrew Medico observes that clang identifies the problem immediately — and it does:
$ clang -O3 -g -std=c++11 -Wall -Wextra -Werror badadd.cpp -o badadd
badadd.cpp:65:16: error: use of unary operator that may be intended as compound assignment (-=)
[-Werror]
offset =- map_size;
G++ (GCC) 4.9.1 (home-built on Mac OS X 10.9 and running on 10.10.1) with the same compiler options does not identify the problem.
JFTR: I found the problem the hard way (with print statements tracing the value of offset), and only read Andrew's comment after finding and posting the answer.

linux writing string value as hex to serial

here's my problem
i have a string with hex value such as
std::string str ="8C000002008E"
and i want to write this as hex to serial out by using the
write()
I have a sony Display which i want to control with.
Passing a
unsigned char test[6] = {0x8c, 0x00, 0x00, 0x02, 0x00, 08E};
to the write method works.
But i dont know how to convert the string to such a char array especially that the size of the char array has to be calculated on runtime.
Thanks for your help.
Here is my full code
#include <stdio.h>
#include <iostream>
#include <cstring>
#include <fcntl.h>
#include <termios.h>
#include <sstream>
using namespace std;
#define TERM_DEVICE "/dev/ttyUSB0"
#define TERM_SPEED "B9600"
int main() {
std::string teststr = "8C000002008E";
int fd, old_flags;
ssize_t length;
char buffer[16];
struct termios term_attr;
fd_set input_fdset;
if ((fd = open(TERM_DEVICE, O_RDWR)) == -1)
{
perror("terminal: Can't open device " TERM_DEVICE);
return(1);
}
/* RS232 konfigurieren */
if (tcgetattr(fd, &term_attr) != 0)
{
perror("terminal: tcgetattr() failed");
return(1);
}
cfsetispeed(&term_attr, B9600);
cfsetospeed(&term_attr, B9600);
term_attr.c_cflag &= ~PARENB;
term_attr.c_cflag &= CS8;
term_attr.c_cflag &= CSIZE;
term_attr.c_cflag &= CSTOPB;
term_attr.c_iflag = 0;
term_attr.c_oflag = OPOST | ONLCR;
term_attr.c_lflag = 0;
if (tcsetattr(fd, TCSAFLUSH, &term_attr) != 0)
perror("terminal: tcsetattr() failed");
if (tcgetattr(STDIN_FILENO, &term_attr) != 0)
{
perror("terminal: tcgetattr() failed");
return(1);
}
old_flags = term_attr.c_lflag;
term_attr.c_lflag &= ~(ICANON | ECHO);
if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term_attr) != 0)
perror("terminal: tcsetattr() failed");
while (1)
{
FD_ZERO(&input_fdset);
FD_SET(STDIN_FILENO, &input_fdset);
FD_SET(fd, &input_fdset);
if (select(fd+1, &input_fdset, NULL, NULL, NULL) == -1)
perror("terminal: select() failed");
unsigned char test[6] = {0x8c, 0x00, 0x00, 0x02, 0x00, 0x8E};
if (FD_ISSET(STDIN_FILENO, &input_fdset)){
if ((length = read(STDIN_FILENO, buffer, 16)) == -1)
perror("terminal: read() failed");
else
if (buffer[0] == '\33')
break;
else{
write(fd, test , sizeof(test));
}
}
if (FD_ISSET(fd, &input_fdset))
{
if ((length = read(fd, buffer, 16)) == -1)
perror("terminal: read() failed");
else
cout << std::hex << buffer<< endl;
}
}
term_attr.c_lflag = old_flags;
if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term_attr) != 0)
perror("terminal: tcsetattr() failed");
printf("Aborted.\n");
close(fd);
return 0;
}
If the problem is only one of converting the string to a char array, you can try the following:
#include <iostream>
#include <sstream>
int main(int argc, char **argv)
{
const std::string str ="8C000002008E";
// count the number of character pairs (i.e. bytes) in the string
// and dynamically allocate an array of the required size
const int numBytes = str.size() / 2;
unsigned char* bytes = new unsigned char[numBytes];
for (int i = 0; i < numBytes; ++i)
{
// grab two characters from the string...
std::string twoChars = str.substr(2 * i, 2);
// ...and convert them to an integer using a stringstream
int byte;
std::stringstream ss(twoChars);
ss >> std::hex >> byte;
// store the result in our char array
bytes[i] = byte;
}
//
// do something with the byte array here
//
// deallocate array before exiting
delete[] bytes;
}
Please note that this assumes that the initial string always contains an even number of characters. Some extra code would be needed for handling input strings of an odd size.
Probably fast solution I can't offer to you, but you may try this:
std::string str ="8C000002008E"
std::vector<unsigned char> vect;
vect.resize(str.length() / 2);
for (size_t i = 0; i < str.length(); ++i)
{
vect[i] = str[2*i] - ('0' - 7 * (str[2*i] >= '9')); //it is possible to optimize this
vect[i] <<= 4;
vect[i] = str[2*i+1] - ('0' - 7 * (str[2*i+1] >= '9')); //it is possible to optimize this
}
Didn't tried that, but should work, however, code is really not optimized, I have left this part for yourself.
Also this assumes that the initial string always contains an even number of characters.