Fail to query via move_pages() - c++

#include <cstdint>
#include <iostream>
#include <numaif.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <limits>
int main(int argc, char** argv) {
const constexpr uint64_t size = 16lu * 1024 * 1024;
const constexpr uint32_t nPages = size / (4lu * 1024 * 1024);
int32_t status[nPages];
std::fill_n(status, nPages, std::numeric_limits<int32_t>::min());
void* pages[nPages];
auto fd = shm_open("test_shm", O_RDWR|O_CREAT, 0666);
void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (ptr == MAP_FAILED) {
if (fd > 0) close(fd);
throw "failed to map hugepages";
}
for (uint32_t i = 0; i < nPages; i++) {
pages[i] = (char*)ptr + 4 * 1024 * 1024;
}
if (0 != move_pages(0, nPages, pages, nullptr, status, 0)) {
std::cout << "failed to inquiry pages because " << strerror(errno) << std::endl;
}
else {
for (uint32_t i = 0; i < nPages; i++) {
std::cout << "page # " << i << " locates at numa node " << status[i] << std::endl;
}
}
munmap(ptr, size);
close(fd);
}
And it prints:
page # 0 locates at numa node -2
page # 1 locates at numa node -2
page # 2 locates at numa node -2
page # 3 locates at numa node -2
According to the manpage, it states:
nodes is an array of integers that specify the desired location for each page.
Each element in the array is a node number. nodes can also be NULL, in which
case move_pages() does not move any pages but instead will return the node where
each page currently resides, in the status array. Obtaining the status of each
page may be necessary to determine pages that need to be moved.
Why does it print negative values although querying return success? My machine only has 2 NUMAs -- 0 and 1.
kernel version: 3.10.0-862.2.3.el7.x86_64
Here is the version for hugepages:
#include <cstdint>
#include <iostream>
#include <numaif.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <limits>
int main(int argc, char** argv) {
const int32_t dst_node = strtoul(argv[1], nullptr, 10);
const constexpr uint64_t size = 4lu * 1024 * 1024;
const constexpr uint64_t pageSize = 2lu * 1024 * 1024;
const constexpr uint32_t nPages = size / pageSize;
int32_t status[nPages];
std::fill_n(status, nPages, std::numeric_limits<int32_t>::min());
void* pages[nPages];
int32_t dst_nodes[nPages];
void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);
if (ptr == MAP_FAILED) {
throw "failed to map hugepages";
}
memset(ptr, 0x41, nPages*pageSize);
for (uint32_t i = 0; i < nPages; i++) {
pages[i] = &((char*)ptr)[i*pageSize];
dst_nodes[i] = dst_node;
}
std::cout << "Before moving" << std::endl;
if (0 != move_pages(0, nPages, pages, nullptr, status, 0)) {
std::cout << "failed to inquiry pages because " << strerror(errno) << std::endl;
}
else {
for (uint32_t i = 0; i < nPages; i++) {
std::cout << "page # " << i << " locates at numa node " << status[i] << std::endl;
}
}
// real move
if (0 != move_pages(0, nPages, pages, dst_nodes, status, MPOL_MF_MOVE_ALL)) {
std::cout << "failed to move pages because " << strerror(errno) << std::endl;
exit(-1);
}
const constexpr uint64_t smallPageSize = 4lu * 1024;
const constexpr uint32_t nSmallPages = size / smallPageSize;
void* smallPages[nSmallPages];
int32_t smallStatus[nSmallPages] = {std::numeric_limits<int32_t>::min()};
for (uint32_t i = 0; i < nSmallPages; i++) {
smallPages[i] = &((char*)ptr)[i*smallPageSize];
}
std::cout << "after moving" << std::endl;
if (0 != move_pages(0, nSmallPages, smallPages, nullptr, smallStatus, 0)) {
std::cout << "failed to inquiry pages because " << strerror(errno) << std::endl;
}
else {
for (uint32_t i = 0; i < nSmallPages; i++) {
std::cout << "page # " << i << " locates at numa node " << smallStatus[i] << std::endl;
}
}
}
The interesting thing is that move_pages() seems to understand hugepages as after the hugepages are moved, I query based on small page size, and it populates the expected NUMA IDs.

Your usage of shm_open and mmap probably will not get huge pages as you want.
move_pages syscall (and libnuma wrapper) works on standard pages of 4096 bytes for x86_64.
And you use move_pages in wrong way with incorrect 3rd argument "pages". It should be not pointer to memory; but pointer to array which itself will contain nPages pointers:
http://man7.org/linux/man-pages/man2/move_pages.2.html
long move_pages(int pid, unsigned long count, void **pages,
const int *nodes, int *status, int flags);
pages is an array of pointers to the pages that should be moved.
These are pointers that should be aligned to page boundaries.
Addresses are specified as seen by the process specified by pid.
Without correct pointers in the "pages' you will get -14 which is EFAULT according to errno 14 (from moreutils package).
//https://stackoverflow.com/questions/54546367/fail-to-query-via-move-pages
//g++ 54546367.move_pages.cc -o 54546367.move_pages -lnuma -lrt
#include <cstdint>
#include <iostream>
#include <numaif.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <limits>
int main(int argc, char** argv) {
const constexpr uint64_t size = 256lu * 1024;// * 1024;
const constexpr uint32_t nPages = size / (4lu * 1024);
void * pages[nPages];
int32_t status[nPages];
std::fill_n(status, nPages, std::numeric_limits<int32_t>::min());
// auto fd = shm_open("test_shm", O_RDWR|O_CREAT, 0666);
// void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
std::cout << "Ptr is " << ptr << std::endl;
if (ptr == MAP_FAILED) {
// if (fd > 0) close(fd);
throw "failed to map hugepages";
}
memset(ptr, 0x41, nPages*4096);
for(uint32_t i = 0; i<nPages; i++) {
pages[i] = &((char*)ptr)[i*4096];
}
if (0 != move_pages(0, nPages, pages, nullptr, status, 0)) {
std::cout << "failed to inquiry pages because " << strerror(errno) << std::endl;
}
else {
for (uint32_t i = 0; i < nPages; i++) {
std::cout << "page # " << i << " locates at numa node " << status[i] << std::endl;
}
}
munmap(ptr, size);
// close(fd);
}
With NUMA machine it outputs same node when started as taskset -c 7 ./54546367.move_pages and interleaved (0 1 0 1) when numactl -i all ./54546367.move_pages.

Related

Getting memory usage of program from another program in C++ (LINUX)

I would like to measure the maximum memory usage of abc.exe on random tests generated by gen.exe. How could I do that?
My code that runs abc.exe on tests from gen.exe looks like this:
#include <bits/stdc++.h>
using namespace std;
int main()
{
int i = 0;
while (true)
{
string si = to_string(i);
cout << i << "\n";
if (system(("echo " + si + "| ./gen.exe > test.in").c_str())) // gen.exe is test generator
{
cout << "gen error\n";
break;
}
if (system(("./abc.exe < test.in > a.out"))) // abc.exe is the program I want to test
{
cout << "abc error\n";
break;
}
i++;
}
}
I know that i can use time -v ./abc.exe but then the used memory is printed in the terminal but I'd like to be able to save it to a variable.
You can use getrusage( RUSAGE_CHILDREN, ... ) to obtain the maximum resident memory. Note that this call will return the maximum memory used by the biggest child at that point in time.
In the example below I used boost::process because it gives better control but it's up to you to use std::system or not, works the same way.
#include <string>
#include <cstdint>
#include <string.h>
#include <iostream>
#include <boost/process/child.hpp>
#include <sys/resource.h>
namespace bp = boost::process;
int parent( const std::string& exename )
{
// Loop from 0 to 10 megabytes
for ( int j=0; j<10; ++j )
{
// Command name is the name of this executable plus one argument with size
std::string gencmd = exename + " " + std::to_string(j);
// Start process
bp::child child( gencmd );
// Wait for it to allocate memory
sleep(1);
// Query the memory usage at this point in time
struct rusage ru;
getrusage( RUSAGE_CHILDREN, &ru );
std::cerr << "Loop:" << j << " mem:"<< ru.ru_maxrss/1024. << " MB" << std::endl;
// Wait for process to quit
child.wait();
if ( child.exit_code()!=0 )
{
std::cerr << "Error executing child:" << child.exit_code() << std::endl;
return 1;
}
}
return 0;
}
int child( int size ) {
// Allocated "size" megabites explicitly
size_t memsize = size*1024*1024;
uint8_t* ptr = (uint8_t*)malloc( memsize );
memset( ptr, size, memsize );
// Wait for the parent to sample our memory usage
sleep( 2 );
// Free memory
free( ptr );
return 0;
}
int main( int argc, char* argv[] )
{
// Without arguments, it is the parent.
// Pass the name of the binary
if ( argc==1 ) return parent( argv[0] );
return child( std::atoi( argv[1] ) );
}
It prints
$ ./env_test
Loop:0 mem:0 MB
Loop:1 mem:3.5625 MB
Loop:2 mem:4.01953 MB
Loop:3 mem:5.05469 MB
Loop:4 mem:6.04688 MB
Loop:5 mem:7.05078 MB
Loop:6 mem:7.78516 MB
Loop:7 mem:8.97266 MB
Loop:8 mem:9.82031 MB
Loop:9 mem:10.8867 MB
If you cannot use boost libraries, you'd got to work a little more but it is still feasible.
If you just want to know the maximum size ever of your children processes then the following works with std::system:
#include <cstdio>
#include <string>
#include <iostream>
#include <sstream>
#include <string.h>
#include <unistd.h>
#include <sys/resource.h>
int main(int argc, char* argv[]) {
if (argc > 1) {
size_t size = ::atol(argv[1]);
size_t memsize = size * 1024 * 1024;
void* ptr = ::malloc(memsize);
memset(ptr, 0, memsize);
::sleep(2);
::free(ptr);
return 0;
}
for (int j = 0; j < 10; ++j) {
std::ostringstream cmd;
cmd << argv[0] << " " << j;
int res = std::system(cmd.str().c_str());
if (res < 0) {
fprintf(stderr, "ERROR system: %s\n", strerror(errno));
break;
}
struct rusage ru;
res = getrusage(RUSAGE_CHILDREN, &ru);
size_t maxmem = ru.ru_maxrss;
fprintf(stderr, "Loop:%d MaxMem:%ld\n", j, maxmem);
}
return 0;
}
It prints
Loop:0 MaxMem:3552
Loop:1 MaxMem:4192
Loop:2 MaxMem:5148
Loop:3 MaxMem:6228
Loop:4 MaxMem:7364
Loop:5 MaxMem:8456
Loop:6 MaxMem:9120
Loop:7 MaxMem:10188
Loop:8 MaxMem:11324
Loop:9 MaxMem:12256
However if you want to keep track of the memory usage during the child process execution you cannot use std::system(). First, you need to call fork() to spawn a new process and then execv() to execute a bash command.
#include <string>
#include <cstdint>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <sys/resource.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <vector>
int parent(const std::string& exename) {
// Loop from 0 to 10 megabytes
for (int j = 0; j < 10; ++j) {
// Command name is the name of this executable plus one argument with size
std::string gencmd = exename + " " + std::to_string(j);
// Start process
pid_t pid = fork();
if (pid == 0) { // child
const char* args[] = {"/bin/bash", "-c", gencmd.c_str(), (char*)0};
int res = execv("/bin/bash", (char**)args);
// Should never return
std::cerr << "execv error: " << strerror(errno) << std::endl;
return 1;
}
// parent
long maxmem = 0;
while (true) {
int status;
pid_t rid = ::waitpid(pid, &status, WNOHANG);
if (rid < 0) {
if (errno != ECHILD) {
std::cerr << "waitpid:" << strerror(errno) << std::endl;
return 2;
}
break;
}
if (rid == pid) {
if (WIFEXITED(pid)) {
break;
}
}
// Wait for it to allocate memory
usleep(10000);
// Query the memory usage at this point in time
struct rusage ru;
int res = getrusage(RUSAGE_CHILDREN, &ru);
if (res != 0) {
if (errno != ECHILD) {
std::cerr << "getrusage:" << errno << strerror(errno) << std::endl;
}
break;
}
if (maxmem < ru.ru_maxrss) {
maxmem = ru.ru_maxrss;
}
}
std::cerr << "Loop:" << j << " mem:" << maxmem / 1024. << " MB" << std::endl;
}
return 0;
}
int child(int size) {
// Allocated "size" megabites explicitly
size_t memsize = size * 1024 * 1024;
uint8_t* ptr = (uint8_t*)malloc(memsize);
memset(ptr, size, memsize);
// Wait for the parent to sample our memory usage
sleep(2);
// Free memory
free(ptr);
return 0;
}
int main(int argc, char* argv[]) {
// Without arguments, it is the parent.
// Pass the name of the binary
if (argc == 1) return parent(argv[0]);
return child(std::atoi(argv[1]));
}
The result on my machine is:
$ ./fork_test
Loop:0 mem:3.22656 MB
Loop:1 mem:3.69922 MB
Loop:2 mem:4.80859 MB
Loop:3 mem:5.92578 MB
Loop:4 mem:6.87109 MB
Loop:5 mem:8.05469 MB
Loop:6 mem:8.77344 MB
Loop:7 mem:9.71875 MB
Loop:8 mem:10.7422 MB
Loop:9 mem:11.6797 MB
There is a video about this post.

Using mmap memory for a circular buffer with very low overhead

I have a debugging tool which in order to register its acquired data uses a data structure called DiskPool (code follows). At start, this data structure mmaps a certain amount of data (backed by a file on disk). Clients can allocate memory via a simple bump pointer mechanism (implemented using std::atomic<size_t>.
As the volume of acquired data is massive I have decided to have a window over a time period instead of registering and keeping all the data. To fulfil such a purpose I have to change the disk pool into a circular buffer but this should not impose a considerable overhead as this overhead affects the measurement.
I wanted to ask you if anybody has any idea? (For example, using an atomic interface of STL).
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <atomic>
#include <memory>
#include <signal.h>
#include <chrono>
#include <thread>
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
class DiskPool {
char* addr_; // Initialized by mmap()
size_t len_; // Given by the user as many as memory pages as needed
std::atomic<size_t> top_; // Offset from address_
int fd_;
public:
DiskPool(size_t l, const char* file) : len_(l), top_(0),fd_(-1)
{
struct stat st;
fd_= open(file, O_CREAT|O_RDWR, S_IREAD | S_IWRITE);
if (fd_ == -1)
handle_error("open");
if (ftruncate(fd_, len_* sysconf(_SC_PAGE_SIZE)) != 0)
handle_error("ftruncate() error");
else {
fstat(fd_, &st);
printf("the file has %ld bytes\n", (long) st.st_size);
}
addr_ = static_cast<char*>( mmap(NULL, (len_* sysconf(_SC_PAGE_SIZE)),
PROT_READ | PROT_WRITE, MAP_SHARED|MAP_NORESERVE, fd_,0));
if (addr_ == MAP_FAILED)
handle_error("mmap failed.");
}
~DiskPool()
{
close(fd_);
if( munmap(addr_, len_)< 0) {
handle_error("Could not unmap file");
exit(1);}
std::cout << "Successfully unmapped the file. " << std::endl;
}
void* allocate(size_t s)
{
size_t t = std::atomic_fetch_add(&top_, s);
return addr_+t;
}
void flush() {madvise(addr_, len_, MADV_DONTNEED);}
};
As an example, I created sample code that uses this disk pool to record data at the creation and destruction of an object (AutomaticLifetimeCollector).
static const std::string RECORD_FILE = "Data.txt";
static const size_t DISK_POOL_NUMBER_OF_PAGES = 10000;
static std::shared_ptr<DiskPool> diskPool =
std::shared_ptr <DiskPool> (new DiskPool(DISK_POOL_NUMBER_OF_PAGES,RECORD_FILE.c_str()));
struct TaskRecord
{
uint64_t tid; // Thread id
uint64_t tag; // User-given identifier (“f1”)
uint64_t start_time; // nanoseconds
uint64_t stop_time;
uint64_t cpu_time;
TaskRecord(int depth, size_t tag, uint64_t start_time) :
tid(pthread_self()), tag(tag),
start_time(start_time), stop_time(0), cpu_time(0) {}
};
class AutomaticLifetimeCollector
{
TaskRecord* record_;
public:
AutomaticLifetimeCollector(size_t tag) :
record_(new(diskPool->allocate(sizeof(TaskRecord)))
TaskRecord(2, tag, (uint64_t)1000000004L))
{
}
~AutomaticLifetimeCollector() {
record_->stop_time = (uint64_t)1000000000L;
record_->cpu_time = (uint64_t)1000000002L;
}
};
inline void DelayMilSec(unsigned int pduration)
{
std::this_thread::sleep_until(std::chrono::system_clock::now() +
std::chrono::milliseconds(pduration));
}
std::atomic<bool> LoopsRunFlag {true};
void sigIntHappened(int signal)
{
std::cout<< "Application was terminated.";
LoopsRunFlag.store(false, std::memory_order_release);
}
int main()
{
signal(SIGINT, sigIntHappened);
unsigned int i = 0;
while(LoopsRunFlag)
{
AutomaticLifetimeCollector alc(i++);
DelayMilSec(2);
}
diskPool->flush();
return(0);
}
So accounting only for the handing out of variable-sized slices for a variable buffer, I believe a Compare-And-Swap loop should work.
The basic idea here is to read a value (which is atomic), do some computation with it, then write the value, if it did not change since reading. If it did change (another thread/process), the computation must be redone with the new value.
Since you have variable sized objects, I think actually simply slicing it into n array elements with (i + 1) % n won't work, as given (i + item_len) % capacity, it would split the allocation between the end and start of the buffer, and while that can be correct and working, I think maybe not what you wanted. So that means a condition, but I think the CPU should predict it pretty well.
#include <iostream>
#include <atomic>
std::atomic<size_t> next_index = 0;
const size_t len = 100; // small for demo purpose
size_t alloc(size_t required_size)
{
if (required_size > len) std::terminate(); // do something, would cause a buffer overflow
size_t i, ret_index, new_index;
i = next_index.load();
do
{
auto space = len - i;
ret_index = required_size <= space ? i : 0; // Wrap if needed
new_index = ret_index + required_size;
} while (next_index.compare_exchange_weak(i, new_index)); // succeed if value did of i not change
return ret_index;
}
int main()
{
std::cout << alloc(4) << std::endl; // 0 - 3
std::cout << alloc(8) << std::endl; // 4 - 11
std::cout << alloc(32) << std::endl; // 12 - 43
std::cout << alloc(32) << std::endl; // 44 - 75
std::cout << alloc(32) << std::endl; // 0 - 31 (76 - 107 would overflow)
std::cout << alloc(32) << std::endl; // 32 - 63
std::cout << alloc(32) << std::endl; // 64 - 95
std::cout << alloc(32) << std::endl; // 0 - 31 (96 - 127 would overflow)
}
Which should be fairly simple to plug in to your class:
void* allocate(size_t s)
{
if (s > len_ * sysconf(_SC_PAGE_SIZE)) std::terminate(); // do something, would cause a buffer overflow
size_t i, ret_index, new_index;
i = top_.load();
do
{
auto space = len_ * sysconf(_SC_PAGE_SIZE) - i;
ret_index = s <= space ? i : 0; // Wrap if needed
new_index = ret_index + s;
} while (top_.compare_exchange_weak(i, new_index)); // succeed if value did of i not change
return addr_+ ret_index;
}
len_ * sysconf(_SC_PAGE_SIZE) is in a few places, so might be the more useful value to store in len_ itself.

OPUS decode raw PCM data

I am trying to compress and decompress raw PCM (16-Bit) audio, using OPUS.
Here below is my code for opus_encoder.c. If I remove my decoder.c, the buffer works just fine as in the microphone is able to take in raw PCM data. However, once I have implemented my decoder class, it gave me a lot of errors such as memory allocation, heap corruption and so on. Here are some of my errors:
std::bad_alloc at memory location 0x0031D4BC
Stack overflow (parameters: 0x00000000, 0x05122000)
Access violation reading location 0x04A40000.
Based on my understanding, I think my decoder size cannot allocate the memory properly. Can you take a look at my codes and see what went wrong?
Opus_encoder.c
#include "opusencoder.h"
#include <QtConcurrent/QtConcurrent>
opusencoder::opusencoder(){
}
opusencoder::~opusencoder(){
}
OpusEncoder *enc;
int error;
unsigned char *compressedbuffer;
opus_uint32 enc_final_range;
short pcm = 0;
unsigned char *opusencoder::encodedata(const char *audiodata, const unsigned int& size) {
if (size == 0)
return false;
enc = (OpusEncoder *)malloc(opus_encoder_get_size(1));
enc = opus_encoder_create(8000, 1, OPUS_APPLICATION_VOIP, &error);
if (enc == NULL)
{
exit;
}
opus_int32 rate;
opus_encoder_ctl(enc, OPUS_GET_BANDWIDTH(&rate));
this->encoded_data_size = rate;
int len;
for (int i = 0; i < size / 2; i++)
{
//combine pairs of bytes in the original data into two-byte number
//convert const char to short
pcm= audiodata[2 * i] << 8 | audiodata[(2 * i) + 1];
}
qDebug() << "audiodata: " << pcm << endl;
compressedbuffer = new (unsigned char[this->encoded_data_size]);
len = opus_encode(enc, &pcm, 320, compressedbuffer, this->encoded_data_size);
len = opus_packet_unpad(compressedbuffer, len);
len++;
if (len < 0)
{
qDebug() << "Failure to compress";
return NULL;
}
qDebug() << "COmpressed buffer:" << compressedbuffer << endl;
qDebug() << "opus_encode() ................................ OK.\n" << endl;
}
Opus_decoder.c
##include "opusdecoder.h"
#include <QtConcurrent/QtConcurrent>
#define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
int num_channels = 1;
opusdecoder::opusdecoder(){
}
opusdecoder::~opusdecoder(){
}
opus_int16* opusdecoder::decodedata(int frame_size, const unsigned char *data)
{
dec = opus_decoder_create(8000, 1, &err);
if (dec == NULL)
{
exit;
}
opus_int32 rate;
opus_decoder_ctl(dec, OPUS_GET_BANDWIDTH(&rate));
rate = decoded_data_size;
this->num_channels = num_channels;
int decodedatanotwo;
opus_int16 *decompress = new (opus_int16[frame_size * this->num_channels]);
opus_packet_get_nb_channels(data);
decodedatanotwo= opus_decode(dec, data, this->decoded_data_size, decompress, 320, 0);
if (decodedatanotwo < 0)
{
qDebug() << "Failure to decompress";
return NULL;
}
qDebug() << "opus_decode() ................................ OK.\n" << decodedatanotwo << endl;
if (decodedatanotwo != frame_size)
{
exit;
}
}

Why is MD5Sum so fast

I've been studying hashing in C/C++ and tried to replicate the md5sum command in Linux. After analysing the source code, it seems that md5sum relies on the md5 library's md5_stream. I've approximated the md5_stream function from the md5.h library into the code below, and it runs in ~13-14 seconds. I've tried to call the md5_stream function directly and got ~13-14 seconds. The md5sum runs in 4 seconds. What have the GNU people done to get the speed out of the code?
The md5.h/md5.c code is available in the CoreUtils source code.
#include <QtCore/QCoreApplication>
#include <QtCore/QDebug>
#include <iostream>
#include <iomanip>
#include <fstream>
#include "md5.h"
#define BLOCKSIZE 32784
int main()
{
FILE *fpinput, *fpoutput;
if ((fpinput = fopen("/dev/sdb", "rb")) == 0) {
throw std::runtime_error("input file doesn't exist");
}
struct md5_ctx ctx;
size_t sum;
char *buffer = (char*)malloc (BLOCKSIZE + 72);
unsigned char *resblock = (unsigned char*)malloc (16);
if (!buffer)
return 1;
md5_init_ctx (&ctx);
size_t n;
sum = 0;
while (!ferror(fpinput) && !feof(fpinput)) {
n = fread (buffer + sum, 1, BLOCKSIZE - sum, fpinput);
if (n == 0){
break;
}
sum += n;
if (sum == BLOCKSIZE) {
md5_process_block (buffer, BLOCKSIZE, &ctx);
sum = 0;
}
}
if (n == 0 && ferror (fpinput)) {
free (buffer);
return 1;
}
/* Process any remaining bytes. */
if (sum > 0){
md5_process_bytes (buffer, sum, &ctx);
}
/* Construct result in desired memory. */
md5_finish_ctx (&ctx, resblock);
free (buffer);
for (int x = 0; x < 16; ++x){
std::cout << std::setfill('0') << std::setw(2) << std::hex << static_cast<uint16_t>(resblock[x]);
std::cout << " ";
}
std::cout << std::endl;
free(resblock);
return 0;
}
EDIT: Was a default mkspec problem in Fedora 19 64-bit.
fread() is convenient, but don't use fread() if you care about performance. fread() will copy from the OS to a libc buffer, then to your buffer. This extra copying cost CPU cycles and cache.
For better performance use open() then read() to avoid the extra copy. Make sure your read() calls are multiples of the block size, but lower than your CPU cache size.
For best performance use mmap() map the disk directly to RAM.
If you try something like the below code, it should go faster.
// compile gcc mmap_md5.c -lgcrypt
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <gcrypt.h>
#include <linux/fs.h> // ioctl
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
int main(int argc, char *argv[])
{
char *addr;
int fd;
struct stat sb;
off_t offset, pa_offset;
size_t length;
ssize_t s;
unsigned char digest[16];
char digest_ascii[32+1] = {0,};
int digest_length = gcry_md_get_algo_dlen (GCRY_MD_MD5);
int i;
if (argc < 3 || argc > 4) {
fprintf(stderr, "%s file offset [length]\n", argv[0]);
exit(EXIT_FAILURE);
}
fd = open(argv[1], O_RDONLY);
if (fd == -1)
handle_error("open");
if (fstat(fd, &sb) == -1) /* To obtain file size */
handle_error("fstat");
offset = atoi(argv[2]);
pa_offset = offset & ~(sysconf(_SC_PAGE_SIZE) - 1);
if (sb.st_mode | S_IFBLK ) {
// block device. use ioctl to find length
ioctl(fd, BLKGETSIZE64, &length);
} else {
/* offset for mmap() must be page aligned */
if (offset >= sb.st_size) {
fprintf(stderr, "offset is past end of file size=%zd, offset=%d\n", sb.st_size, (int) offset);
exit(EXIT_FAILURE);
}
if (argc == 4) {
length = atoi(argv[3]);
if (offset + length > sb.st_size)
length = sb.st_size - offset;
/* Canaqt display bytes past end of file */
} else { /* No length arg ==> display to end of file */
length = sb.st_size - offset;
}
}
printf("length= %zd\n", length);
addr = mmap(NULL, length + offset - pa_offset, PROT_READ,
MAP_PRIVATE, fd, pa_offset);
if (addr == MAP_FAILED)
handle_error("mmap");
gcry_md_hash_buffer(GCRY_MD_MD5, digest, addr + offset - pa_offset, length);
for (i=0; i < digest_length; i++) {
sprintf(digest_ascii+(i*2), "%02x", digest[i]);
}
printf("hash=%s\n", digest_ascii);
exit(EXIT_SUCCESS);
}
It turned out to be an error in the Qt mkspecs regarding an optimization flag not being set properly.

Vlan id is set to 0 when TPACKET_V2 is used

I have a problem about the usage of this TPACKET_V2 .
My problem is that after setting of this type of packet on socket, when I try to receive some packets I can't read the vlan id from the packet (of course from the header of the packet) the vlan_tci is ever 0.
Now I'm using open suse sp1 and when I run my program on sless sp2 I 'm able to get the vlan id with the same program that doesn't work on sless sp1 but the weird thing is that tcpdump is able to get the vlan id (on this sless) and tcpdump set the TPACKET_V2 (so this means that TPACKET_2 is supported)
My simple project is based on these functions , all called by the function createSocket , then there is a simple method that is reading packets on the socket and there I try to get informations on vlan id (there there is also the relative part used before with the TPACKET_V1)
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ether.h>
#include <linux/filter.h>
#include <net/if.h>
#include <arpa/inet.h>
enum INTERFACE_T
{
RX_INTERFACE,
TX_INTERFACE
};
static const char* PKT_TYPE[];
// BLOCK_NUM*BLOCK_SIZE = FRAME_NUM*FRAME_SIZE
enum { RX_BLOCK_SIZE = 8192,
RX_BLOCK_NUM = 256,
RX_FRAME_SIZE = 2048,
RX_FRAME_NUM = 1024
};
enum { TX_BLOCK_SIZE = 8192,
TX_BLOCK_NUM = 256,
TX_FRAME_SIZE = 2048,
TX_FRAME_NUM = 1024
};
struct RxFrame {
struct tpacket2_hdr tp_h; // Packet header
uint8_t tp_pad[TPACKET_ALIGN(sizeof(tpacket2_hdr))-sizeof(tpacket2_hdr)];
struct sockaddr_ll sa_ll; // Link level address information
uint8_t sa_ll_pad[14]; //Alignment padding
struct ethhdr eth_h;
} __attribute__((packed));
struct TxFrame
{
struct tpacket_hdr tp_h; // Packet header
uint8_t tp_pad[TPACKET_ALIGN(sizeof(tpacket_hdr))-sizeof(tpacket_hdr)];
// struct vlan_ethhdr vlan_eth_h;
// struct arp arp;
} __attribute__((packed));
struct ring_buff {
struct tpacket_req req;
size_t size; // mmap size
size_t cur_frame;
struct iovec *ring_buffer_;
void *buffer; // mmap
};
int setIfFlags(short int flags)
{
struct ifreq ifr;
memset(&ifr, 0, sizeof(ifr));
strncpy(ifr.ifr_name, if_name_.c_str(), sizeof(ifr.ifr_name));
ifr.ifr_hwaddr.sa_family=getIfArptype();
ifr.ifr_flags |= flags;
if ( ioctl(socket_, SIOCSIFFLAGS, &ifr) == -1)
{
std::cout << "Error: ioctl(SIOSIFFLAGS) failed!" << std::endl;
return 1;
}
return 0;
}
int bindSocket()
{
struct sockaddr_ll sll;
memset(&sll, 0, sizeof(sll));
sll.sll_family = AF_PACKET;
sll.sll_protocol = htons(ETH_P_ALL);
sll.sll_ifindex = ifIndex_;
sll.sll_hatype = 0;
sll.sll_pkttype = 0;
sll.sll_halen = 0;
if (bind(socket_, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
std::cout << "Error: bind() failed!" << std::endl;
return 1;
}
return 0;
}
int packetMmap(ring_buff * rb)
{
assert(rb);
rb->buffer = mmap(0, rb->size, PROT_READ | PROT_WRITE, MAP_SHARED, socket_, 0);
if (rb->buffer == MAP_FAILED) {
std::cout << "Error: mmap() failed!" << std::endl;
return 1;
}
return 0;
}
void packetMunmap(ring_buff * rb)
{
assert(rb);
if (rb->buffer)
{
munmap(rb->buffer, rb->size);
rb->buffer = NULL;
rb->size = 0;
}
}
int frameBufferCreate(ring_buff * rb)
{
assert(rb);
rb->ring_buffer_ = (struct iovec*) malloc(rb->req.tp_frame_nr * sizeof(*rb->ring_buffer_));
if (!rb->ring_buffer_) {
std::cout << "No memory available !!!" << std::endl;
return 1;
}
memset(rb->ring_buffer_, 0, rb->req.tp_frame_nr * sizeof(*rb->ring_buffer_));
for (unsigned int i = 0; i < rb->req.tp_frame_nr; i++) {
rb->ring_buffer_[i].iov_base = static_cast<void *>(static_cast<char *>(rb->buffer)+(i*rb->req.tp_frame_size));
rb->ring_buffer_[i].iov_len = rb->req.tp_frame_size;
}
return 0;
}
void setRingBuffer(struct ring_buff *ringbuf) { rb_ = ringbuf; }
int setVlanTaggingStripping()
{
socklen_t len;
int val;
unsigned int sk_type, tp_reserve, maclen, tp_hdrlen, netoff, macoff;
unsigned int tp_hdr_len;
unsigned int frame_size = RX_FRAME_SIZE;
val = TPACKET_V2;
len = sizeof(val);
if (getsockopt(socket_, SOL_PACKET, PACKET_HDRLEN, &val, &len) < 0) {
std::cout << "Error: getsockopt(SOL_PACKET, PACKET_HDRLEN) failed (can't get TPACKET_V2 header len on packet)" << std::endl;
return 1;
}
tp_hdr_len = val;
std::cout << "TPACKET_V2 header is supported (hdr len is " << val << ")"<< std::endl;
std::cout << "tpacket2_hdrs header is supported (hdr len is " << sizeof(tpacket2_hdr) << ")"<< std::endl;
val = TPACKET_V2;
if (setsockopt(socket_, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)) < 0) {
std::cout << "Error: setsockopt(SOL_PACKET, PACKET_VERSION) failed (can't activate TPACKET_V2 on packet)" << std::endl;
return 1;
}
std::cout << "TPACKET_V2 version is configured !!! " << std::endl;
/* Reserve space for VLAN tag reconstruction */
val = VLAN_TAG_LEN;
if (setsockopt(socket_, SOL_PACKET, PACKET_RESERVE, &val, sizeof(val)) < 0) {
std::cout << "Error: setsockopt(SOL_PACKET, PACKET_RESERVE) failed (can't set up reserve on packet)" << std::endl;
return 1;
}
std::cout<< "Reserve space for VLAN tag reconstruction is configured !!! " << std::endl;
return 0;
}
int setSoBufforce(int optname, int buffSize)
{
if (setsockopt(socket_, SOL_SOCKET, SO_SNDBUFFORCE, &buffSize, sizeof(buffSize)) == -1)
{
std::cout << "Error: setsocketopt("<< (optname == SO_SNDBUFFORCE ? "SO_SNDBUFFORCE" : "SO_RCVBUFFORCE") << ") failed!" << std::endl;
return 1;
}
return 0;
}
createSocket(std::string ifName, INTERFACE_T ifType)
{
if (ifName.empty())
{
std::cout << "Error: interface is empty!" << std::endl;;
return NULL;
}
//Create the socket
if ( (socket_ = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))) == -1 )
{
std::cout << "Error: calling socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)) failed!" << std::endl;
}
std::cout << "Creating Socket on interface= " << ifName << " to listen to ETH_P_ALL"<<std::endl;;
s->setIfFlags(IFF_PROMISC|IFF_BROADCAST);
//allocate space for ring buffer
ring_buff *rb = (ring_buff *) malloc(sizeof(ring_buff));
// use the same size for RX/TX ring
//set the version , here I insert the use of TPACKET_V2!
setVlanTaggingStripping();
rb->req.tp_block_size = RX_BLOCK_SIZE;
rb->req.tp_block_nr = RX_BLOCK_NUM;
rb->req.tp_frame_size = RX_FRAME_SIZE;
rb->req.tp_frame_nr = RX_FRAME_NUM;
setPacketRing(PACKET_RX_RING,&rb->req);
rb->size = (rb->req.tp_block_size)*(rb->req.tp_block_nr);
rb->cur_frame = 0;
// Tweak send/rcv buffer size
int sndBufSz = 4194304; // Send buffer in bytes
int rcvBufSz = 4194304; // Receive buffer in bytes
if (setSoBufforce(SO_SNDBUFFORCE, sndBufSz))
{
//close socket
}
if (setSoBufforce(SO_RCVBUFFORCE, rcvBufSz))
{
//close socket
}
// Add ARP filter so we will only receive ARP packet on this socket
struct sock_filter BPF_code[6];
struct sock_fprog filter;
bindSocket();
if (packetMmap(rb))
{
std::cout << "Error: mmap() failed!" << std::endl;
//close socket
}
frameBufferCreate(rb);
setRingBuffer(rb);
}
and in my function for receive packets and I try to read informations and in particular h_vlan_TCI from but I receive ever 0x00 !!! Any suggestions?
struct vlan_ethhdr* vlan_eth_h = (struct vlan_ethhdr*)&frame->eth_h
void readRawSocket(socket_)
{
while (*(unsigned long*)rb->ring_buffer_[rb->cur_frame].iov_base)
{
RxFrame* frame = (RxFrame *)rb->ring_buffer_[rb->cur_frame].iov_base;
#if 0
tpacket_hdr* h = &frame->tp_h;
char buffer[256];
sprintf (buffer, " -tpacket(v1): status=%ld,len=%d,snaplen=%d,mac=%d,net=%d,sec=%d,usec=%d",
h->tp_status, h->tp_len, h->tp_snaplen, h->tp_mac,h->tp_net, h->tp_sec, h->tp_usec);
std::cout << std::string(buffer) << std::endl;
#else
tpacket2_hdr* h = &frame->tp_h;
char buffer[512];
sprintf (buffer, " -tpacket(v2): status=%d,len=%d,snaplen=%d,mac=%d,net=%d,sec=%d,nsec=%d,vlan_tci=%d (vlan_tci=0x%04x)",
h->tp_status, h->tp_len, h->tp_snaplen, h->tp_mac, h->tp_net, h->tp_sec, h->tp_nsec, h->tp_vlan_tci, ntohs(h->tp_vlan_tci));
std::cout << std::string(buffer) << std::endl;
#endif
if ( ETH_P_8021Q == ntohs(frame->eth_h.h_proto) )
{
struct vlan_ethhdr* vlan_eth_h = (struct vlan_ethhdr*)&frame->eth_h;
int vlan_tag = VLAN_TAG(ntohs(vlan_eth_h->h_vlan_TCI));
std::cout << " -Vlan " << vlan_tag << " packet to this host received";
}
rb->cur_frame = ( rb->cur_frame+1) % rx_socket_->getFrameNum();
} // while()
}
When the kernel removes the vlan it also changes eth_h.h_proto to the protocol after de vlan tag so ETH_P_8021Q == ntohs(frame->eth_h.h_proto) will most probably be false.
Also, if you are listening in the tagged interface (ie. eth0.100) instead of the physical interface (eth0) you will not see the tags.