I am writing an application that continuously writes files to a drive (whether a hard drive, an SD card, or whatever) and reads them back: it writes a known pattern and then reads it back as verification. I want to output some kind of blaring error the moment the app fails. Basically we're hitting the hardware with radiation and need to detect when it fails. I have the app reading and writing the files just fine so far, but I can yank the SD card mid-execution and it keeps on running as if the card were still there. I really need to detect the moment the SD card is removed. I've seen suggestions to use libudev, but I can't: this is an embedded Linux system that doesn't have it. Here's the code I have so far:
#include <stdio.h>
#include <time.h>

const unsigned long long size = 16ULL*1024;
#define NANOS 1000000000LL
#define KB 1024

long long CreateFile(char* filename)
{
    struct timespec time_start;
    struct timespec time_stop;
    long long start, elapsed, microseconds;
    int timefail = 0;
    size_t stat;

    if(clock_gettime(CLOCK_REALTIME, &time_start) < 0)
        timefail = 1;
    start = time_start.tv_sec*NANOS + time_start.tv_nsec;

    int a[size];
    int i, j;
    for(i=0;i<size;i++)
        a[i] = i;

    FILE* pFile;
    pFile = fopen(filename, "wb");
    if(pFile < 0)
    {
        perror("fopen");
        return -1;
    }

    for(j=0; j < KB; j++)
    {
        stat = fwrite(a, sizeof(int), size, pFile);
        if(stat < 0)
            perror("fwrite");
        stat = fsync(pFile);
        //if(stat)
        //    perror("fysnc");
    }
    fclose(pFile);

    if(clock_gettime(CLOCK_REALTIME, &time_stop) < 0)
        timefail = 1;
    elapsed = time_stop.tv_sec*NANOS + time_stop.tv_nsec - start;
    microseconds = elapsed / 1000 + (elapsed % 1000 >= 500);
    if(timefail)
        return -1;

    return microseconds / 1000;
}
long long ReadFile(char* filename)
{
    struct timespec time_start;
    struct timespec time_stop;
    long long start, elapsed, microseconds;
    int timefail = 0;

    if(clock_gettime(CLOCK_REALTIME, &time_start) < 0)
        timefail = 1;
    start = time_start.tv_sec*NANOS + time_start.tv_nsec;

    FILE* pFile;
    pFile = fopen(filename, "rb");
    int a[KB];
    int i=0, j=0;
    for(i=0; i<size; i++)
    {
        if(ferror(pFile) != 0)
        {
            fprintf(stderr, "**********************************************");
            fprintf(stderr, "READ FAILURE\n");
            fclose(pFile);
            return -1;
        }
        fread(a, sizeof(a), 1, pFile);
        for(j=0; j<KB;j++)
        {
            if(a[0] != a[1]-1)
            {
                fprintf(stderr, "**********************************************");
                fprintf(stderr, "DATA FAILURE, %d != %d\n", a[j], a[j+1]-1);
                fclose(pFile);
                return -1;
            }
        }
    }
    fclose(pFile);

    if(clock_gettime(CLOCK_REALTIME, &time_stop) < 0)
        timefail = 1;
    if(timefail)
        return -1;

    elapsed = time_stop.tv_sec*NANOS + time_stop.tv_nsec - start;
    microseconds = elapsed / 1000 + (elapsed % 1000 >= 500);
    return microseconds/1000;
}
int main(int argc, char* argv[])
{
    char* filenamebase = "/tmp/media/mmcblk0p1/test.file";
    char filename[100] = "";
    int i=0;
    long long tmpsec = 0;
    long long totalwritetime = 0;
    int totalreadtime = 0;
    int numfiles = 10;
    int totalwritten = 0;
    int totalread = 0;

    for(i=0;i<numfiles;i++)
    {
        sprintf(filename, "%s%d", filenamebase, i);
        fprintf(stderr, "Writing File: %s ...", filename);
        tmpsec = CreateFile(filename);
        if(tmpsec < 0)
            return 0;
        totalwritetime += tmpsec;
        totalwritten++;
        fprintf(stderr, "completed in %lld seconds\n", tmpsec);

        fprintf(stderr, "Reading File: %s ...", filename);
        tmpsec = ReadFile(filename);
        if(tmpsec < 0)
            return 0;
        totalreadtime += tmpsec;
        totalread++;
        fprintf(stderr, "completed in %lld seconds\n", tmpsec);
    }

    fprintf(stderr, "Test Complete\nTotal Files: %d written, %d read\n", totalwritten, totalread);
    fprintf(stderr, "File Size: %lld KB\n", size);
    fprintf(stderr, "Total KBytes Written: %lld\n", size*totalwritten);
    fprintf(stderr, "Average Write Speed: %0.2f KBps\n", (double)size*totalwritten/(totalwritetime/1000));
    fprintf(stderr, "Total KBytes Read: %lld\n", size*totalread);
    fprintf(stderr, "Average Read Speed: %0.2f KBps\n", (double)size*totalread/(totalreadtime/1000));
    return 0;
}
You'll need to change your approach.
If you yank out media that has been mounted, you're likely to panic your kernel (as it keeps complex data structures that represent the mounted filesystem in memory), and break the media itself.
I've destroyed quite a few USB memory sticks that way -- the small internal logic that handles allocation and wear leveling does not like to lose power mid-run, and the cheapest sticks do not seem to have capacitors able to keep them running long enough to reach a consistent state -- but SD cards and more expensive USB sticks might survive better.
Depending on the drivers used, the kernel may let you keep reading and writing the media, but simply keep the changes in the page cache. Furthermore, your stdio.h I/O likely only reaches the page cache, not the actual media, depending on the mount options (whether the filesystem is mounted sync/direct or not). Your approach simply does not provide the behaviour you assume it does.
Instead, you should use low-level I/O (unistd.h, see man 2 open and related calls, none of stdio.h) with the O_RDWR|O_DIRECT|O_SYNC flags, to make sure your reads and writes hit the hardware, and access the raw media directly via the block device node instead of mounting it at all. You can also read/write to random locations on the device, in the hope that wear leveling does not affect your radiation resistance checks too much.
(Edited to add: If you write in blocks exactly the size of the native allocation block for the tested media device, you'll avoid the slow read-modify-write cycles on the device. The device will still do wear leveling, but that just means the block you wrote ends up at some random physical location in the flash chip. The native block size depends on the media device. It is possible to measure it by observing how long it takes to read and write blocks of different sizes, but I think for damage testing a large enough power of two should work best -- say 256k, or 262144 bytes. It's probably best to let the user set it for each device separately, using either manufacturer-provided information or a separate test program to find the proper value.)
You do not want to use mmap() for this: the SIGBUS signal raised when the media errors out or becomes unavailable is very tricky to handle correctly. Low-level unistd.h I/O is best for this, in my opinion.
I believe, but have not verified, that yanking out the media mid-read/write on the unmounted low-level device should simply yield a read/write error. (I don't have any media I'm willing to risk right now to check it, though :)
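A minimal sketch of that low-level approach (the /dev/mmcblk0 device node, the 4096-byte alignment, and the 256k block size are assumptions -- adjust them for your hardware):
#define _GNU_SOURCE   /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const size_t block = 262144;  /* assumed native block size; tune per device */

    /* Open the whole, unmounted device node (path is an assumption) */
    int fd = open("/dev/mmcblk0", O_RDWR | O_DIRECT | O_SYNC);
    if (fd < 0) { perror("open"); return 1; }

    /* O_DIRECT requires an aligned buffer (and aligned offset/length) */
    void *buf;
    if (posix_memalign(&buf, 4096, block)) { perror("posix_memalign"); return 1; }
    memset(buf, 0xA5, block);

    /* With the page cache out of the way, yanking the card should make
       these calls fail immediately with an error. */
    if (pwrite(fd, buf, block, 0) != (ssize_t)block) { perror("pwrite"); return 1; }
    if (pread(fd, buf, block, 0) != (ssize_t)block) { perror("pread"); return 1; }

    free(buf);
    close(fd);
    return 0;
}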
Answer from my comment:
In your write function you should have:
for(j=0; j < KB; j++)
{
    size_t items_written = fwrite(a, sizeof(int), size, pFile);
    if(items_written < size)
    {
        perror("fwrite");
        break;
    }
    /* fsync() takes a file descriptor, not a FILE*, so flush stdio
       first and then sync the underlying descriptor */
    fflush(pFile);
    int rc = fsync(fileno(pFile));
    if(rc < 0)
    {
        perror("fsync");
        break;
    }
}
and in your read function:
size_t items_read = fread(a, sizeof(a), 1, pFile);
if(items_read < 1)    /* fread returns the number of complete items read, not bytes */
    break;
Also, if you have a C99 compiler, please use the fixed-size types available in stdint.h, e.g. uint32_t, ...
Related
void demodlg::printData(short* data)
{
    FILE* pF;
    char buf[50];
    snprintf(buf, sizeof(buf), "%s\\%s\\%s%d.binary", "test", "data", "data", frameNum++);
    pF = fopen(buf, "wb");

    int lines = frameDescr->m_numLines;
    int samples = frameDescr->m_pLineTypeDescr[0].m_numSamples;
    int l, s;

    fprintf(pF, "\t");
    for (l = 0; l < lines; l++)
    {
        fprintf(pF, "%d\t", l);
    }
    fprintf(pF, "\n");
    for (s = 0; s < samples; s++)
    {
        fprintf(pF, "%d)\t", s);
        for (l = 0; l < lines; l++)
        {
            fprintf(pF, "%d\t", *(data + l * samples + s));
        }
        fprintf(pF, "\n");
    }
    fclose(pF);
}
I have the code snippet above which just takes in some data and then writes it out to a binary file. This function gets called about 20-30 times per second, so I'm trying to optimize it as much as possible. Each file that it writes to is about 1 MB in size. Ideally, I'd be able to write 20-30 MB per second. As of now, it's not at that rate.
Does anyone have any ideas on how I can optimize this further?
I was originally writing to a txt file before changing to a binary file, but the difference isn't too noticeable, surprisingly.
Also, frameDescr gets updated for every frame, so I believe I do need to read the lines and samples variables from it each time, unfortunately.
I found this post to refer to (Writing a binary file in C++ very fast) but I'm not sure how I can apply it to mine.
Here is a short example of how I would write an array of data to a binary file and how I would read it back.
I do not understand the concept or purpose of lines in your code, so I did not attempt to replicate it. If you have additional data that needs to be written so the file can be reconstructed when read, I have placed comments noting where you could insert that code.
Keep in mind that data written as binary must be read back the same way, so if you were writing text in a particular format for another program to consume, a binary file will not work for you unless you modify that other program or create an additional step that reads the binary data and writes the text format before consumption.
Assuming there is a speed advantage to writing the data as binary, adding such an offline conversion step is beneficial because you can run it when you're not trying to maintain a particular frame rate.
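A sketch of what that offline conversion step could look like (the filenames are hypothetical; it assumes the count-then-samples layout used by the writeData/readData example below):
#include <cstdio>
#include <stdint.h>

// Offline converter sketch: reads the binary layout (a size_t entry count
// followed by int16_t samples) and dumps the samples as tab-separated text.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        fprintf(stderr, "Usage: %s in.bin out.txt\n", argv[0]);
        return 1;
    }
    FILE *in = fopen(argv[1], "rb");
    FILE *out = fopen(argv[2], "w");
    if (!in || !out)
    {
        fprintf(stderr, "Error opening files\n");
        return 1;
    }
    size_t numEntries = 0;
    if (fread(&numEntries, sizeof(numEntries), 1, in) != 1)
        return 1;
    int16_t sample;
    for (size_t i = 0; i < numEntries; ++i)
    {
        if (fread(&sample, sizeof(sample), 1, in) != 1)
            break;
        fprintf(out, "%d\t", (int)sample);
    }
    fclose(in);
    fclose(out);
    return 0;
}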
Normally, since you tagged this C++, I would prefer manipulating the data in a vector and perhaps using C++ streams to write and read it, but I tried to keep this as similar to your code as possible.
#include <cstdio>
#include <stdint.h>

const size_t kNumEntries = 128 * 1024;

void writeData(const char *filename, int16_t *data, size_t numEntries)
{
    FILE *f = fopen(filename, "wb");
    if (!f)
    {
        fprintf(stderr, "Error opening file: '%s'\n", filename);
        return;
    }
    //If you have additional data that must be in the file write it here
    //either as individual items that are mirrored in the reader,
    //or using the pattern shown below for variable sized data.

    //Write the number of entries to the file so the reader
    //will know how much memory to allocate and how many entries to read.
    fwrite(&numEntries, sizeof(numEntries), 1, f);
    //Write the actual data
    fwrite(data, sizeof(*data), numEntries, f);
    fclose(f);
}

int16_t* readData(const char *filename)
{
    FILE *f = fopen(filename, "rb");
    if (!f)
    {
        fprintf(stderr, "Error opening file: '%s'\n", filename);
        return 0;
    }
    //If you have additional data to read, do it here.
    //This code should mirror the writing function.

    //Read the number of entries in the file.
    size_t numEntries;
    fread(&numEntries, sizeof(numEntries), 1, f);
    //Allocate memory for the entries and read them into it.
    int16_t *data = new int16_t[numEntries];
    fread(data, sizeof(*data), numEntries, f);
    fclose(f);
    return data;
}

int main()
{
    int16_t *dataToWrite = new int16_t[kNumEntries];
    for (size_t i = 0; i < kNumEntries; ++i)
        dataToWrite[i] = (int16_t)i;

    writeData("test.bin", dataToWrite, kNumEntries);
    int16_t *dataRead = readData("test.bin");
    for (size_t i = 0; i < kNumEntries; ++i)
    {
        if (dataToWrite[i] != dataRead[i])
        {
            fprintf(stderr,
                    "Data mismatch at entry %zu: dataToWrite = %d, dataRead = %d\n",
                    i, dataToWrite[i], dataRead[i]);
        }
    }
    delete[] dataToWrite;
    delete[] dataRead;
    return 0;
}
I've created a very simple mp3 player in C++ using mpg123 and out123, by following this tutorial: https://www.kasrajamshidi.com/blog/audio. However, I have an issue when the player starts: for the first few seconds it plays garbage noise. This sound isn't the same every time. Sometimes it lasts longer, sometimes it's almost inaudible.
I thought it was because I write to the buffer and read from it at almost the same time, so I modified the code to use a circular buffer, where I first read some data into the buffer and then, while playing it, keep writing bytes at some offset ahead. Unfortunately the problem remains the same and I'm confused.
Here is my code:
#include <iostream>
#include <mpg123.h>
#include <out123.h>

// If I print messages, music starts much smoother...
// (But the problem itself does not disappear.)
#define PRINT_BUFFER_STATE false

// Size of the circular buffer.
const int BUFFER_SIZE = 4;

int main(int argc, char **argv)
{
    // Initialize mpg123
    if(mpg123_init() != MPG123_OK)
        return 255;

    mpg123_handle *mpg_handle = mpg123_new(nullptr, nullptr);
    out123_handle *out_handle = out123_new();

    // This buffer is circular. The index we write to is far ahead of
    // the place we play from. bytes_written[i] tells us how many bytes
    // were written to buffer[i]. This is the number of bytes we need to output
    // when we play from it.
    unsigned char **buffer =
        (unsigned char **)alloca(sizeof(unsigned char *) * BUFFER_SIZE);
    std::size_t *bytes_written =
        (std::size_t *)alloca(sizeof(std::size_t) * BUFFER_SIZE);

    // These indices tell us where we write bytes to,
    // and where we play them from.
    int play_buffer = 0;
    int write_buffer = 0;

    int number_of_clips = 2;
    char **clip_path = (char **)alloca(sizeof(char *) * number_of_clips);
    clip_path[0] = "/home/mateusz/Music/guitar.mp3";
    clip_path[1] = "/home/mateusz/Music/drums.mp3";

    for (int clip = 0; clip < number_of_clips; ++clip)
    {
        // Open a given file with mpg123 (to get the format,
        // which will then be passed to the out device).
        mpg123_open(mpg_handle, clip_path[clip]);

        // Set format details:
        int channels = 0;
        int encoding = 0;
        long rate = 0;
        mpg123_getformat(mpg_handle, &rate, &channels, &encoding);

        // Allocate the contents of the circular buffer.
        std::size_t buffer_size = mpg123_outblock(mpg_handle);
        for (int i = 0; i < BUFFER_SIZE; ++i)
            buffer[i] = (unsigned char *)malloc(sizeof(unsigned char) * buffer_size);

        // Start by filling part of the buffer so that the index we write to
        // is ahead.
        for (write_buffer = 0;
             write_buffer < BUFFER_SIZE-1;
             ++write_buffer)
        {
            mpg123_read(mpg_handle, buffer[write_buffer], buffer_size,
                        &bytes_written[write_buffer]);
            // If there is nothing to read we break the loop.
            if (!bytes_written[write_buffer])
                break;
        }

        // Set the out handle to the device we'll play from, with default parameters.
        out123_open(out_handle, nullptr, nullptr);
        out123_start(out_handle, rate, channels, encoding);

        play_buffer = 0;
        // play_buffer should never catch write_buffer unless
        // the latter finished its job.
        while (write_buffer != play_buffer)
        {
#if PRINT_BUFFER_STATE
            std::cout << "W: " << write_buffer << " R: " << play_buffer << "\n";
#endif
            out123_play(out_handle, buffer[play_buffer],
                        bytes_written[play_buffer]);
            mpg123_read(mpg_handle, buffer[write_buffer], buffer_size,
                        &bytes_written[write_buffer]);
            play_buffer = (play_buffer + 1) % BUFFER_SIZE;
            if (bytes_written[write_buffer])
                write_buffer = (write_buffer + 1) % BUFFER_SIZE;
        }

        for (int i = 0; i < BUFFER_SIZE; ++i)
            free(buffer[i]);
        out123_stop(out_handle);
        out123_close(out_handle);
        mpg123_close(mpg_handle);
    }

    out123_del(out_handle);
    mpg123_delete(mpg_handle);
    mpg123_exit();
    return 0;
}
Changing BUFFER_SIZE to a really big number doesn't help, so the problem is not there. Surprisingly enough, when I print some stuff to the console it seems to run much smoother, but the problem does not disappear.
My guess is that something is not synchronized, but that's really all I can tell... Should my program sleep in the loop after playing each chunk of sound? Well, I tried putting a sleep call almost everywhere, but it didn't achieve much. There must be something I'm doing wrong, but I can't figure it out. So my question is: how do I prevent my player from playing this terrible sound every time it starts a new file?
Our software builds a data structure in memory that is about 80 gigabytes in size. It can then either use this data structure directly to do its computation, or dump it to disk so it can be reused several times afterwards. A lot of random memory accesses happen in this data structure.
For larger inputs this data structure can grow even larger (our largest one was over 300 gigabytes), and our servers have enough memory to hold everything in RAM.
If the data structure is dumped to disk, it gets loaded back into the address space with mmap, forced into the OS page cache, and lastly mlocked (code at the end).
The problem is that there is about a 16% performance difference between using the computed data structure directly on the heap (see the malloc version) and mmapping the dumped file (see the mmap version).
I don't have a good explanation why this is the case. Is there a way to find out why mmap is so much slower? Can I close this performance gap somehow?
I did the measurements on a server running Scientific Linux 7.2 with a 3.10 kernel; it has 128GB RAM (enough to fit everything), and I repeated them several times with similar results. Sometimes the gap is a bit smaller, but not by much.
New Update (2017/05/23):
I produced a minimal test case, where the effect can be seen. I tried the different flags (MAP_SHARED etc.) without success. The mmap version is still slower.
#include <random>
#include <iostream>
#include <cstdio>
#include <sys/time.h>
#include <ctime>
#include <omp.h>
#include <sys/mman.h>
#include <unistd.h>

constexpr size_t ipow(int base, int exponent) {
    size_t res = 1;
    for (int i = 0; i < exponent; i++) {
        res = res * base;
    }
    return res;
}

size_t getTime() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    size_t ret = tv.tv_usec;
    ret /= 1000;
    ret += (tv.tv_sec * 1000);
    return ret;
}

const size_t N = 1000000000;
const size_t tableSize = ipow(21, 6);

size_t* getOffset(std::mt19937 &generator) {
    std::uniform_int_distribution<size_t> distribution(0, N);
    std::cout << "Offset Array" << std::endl;
    size_t r1 = getTime();
    size_t *offset = (size_t*) malloc(sizeof(size_t) * tableSize);
    for (size_t i = 0; i < tableSize; ++i) {
        offset[i] = distribution(generator);
    }
    size_t r2 = getTime();
    std::cout << (r2 - r1) << std::endl;
    return offset;
}

char* getData(std::mt19937 &generator) {
    std::uniform_int_distribution<char> datadist(1, 10);
    std::cout << "Data Array" << std::endl;
    size_t o1 = getTime();
    char *data = (char*) malloc(sizeof(char) * N);
    for (size_t i = 0; i < N; ++i) {
        data[i] = datadist(generator);
    }
    size_t o2 = getTime();
    std::cout << (o2 - o1) << std::endl;
    return data;
}

template<typename T>
void dump(const char* filename, T* data, size_t count) {
    FILE *file = fopen(filename, "wb");
    fwrite(data, sizeof(T), count, file);
    fclose(file);
}

template<typename T>
T* read(const char* filename, size_t count) {
#ifdef MMAP
    FILE *file = fopen(filename, "rb");
    int fd = fileno(file);
    T *data = (T*) mmap(NULL, sizeof(T) * count, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
    size_t pageSize = sysconf(_SC_PAGE_SIZE);
    char bytes = 0;
    for(size_t i = 0; i < (sizeof(T) * count); i+=pageSize){
        bytes ^= ((char*)data)[i];
    }
    mlock(((char*)data), sizeof(T) * count);
    std::cout << bytes;
#else
    T* data = (T*) malloc(sizeof(T) * count);
    FILE *file = fopen(filename, "rb");
    fread(data, sizeof(T), count, file);
    fclose(file);
#endif
    return data;
}

int main (int argc, char** argv) {
#ifdef DATAGEN
    std::mt19937 generator(42);
    size_t *offset = getOffset(generator);
    dump<size_t>("offset.bin", offset, tableSize);
    char* data = getData(generator);
    dump<char>("data.bin", data, N);
#else
    size_t *offset = read<size_t>("offset.bin", tableSize);
    char *data = read<char>("data.bin", N);
#ifdef MADV
    posix_madvise(offset, sizeof(size_t) * tableSize, POSIX_MADV_SEQUENTIAL);
    posix_madvise(data, sizeof(char) * N, POSIX_MADV_RANDOM);
#endif
#endif
    const size_t R = 10;
    std::cout << "Computing" << std::endl;
    size_t t1 = getTime();
    size_t result = 0;
#pragma omp parallel reduction(+:result)
    {
        size_t magic = 0;
        for (int r = 0; r < R; ++r) {
#pragma omp for schedule(dynamic, 1000)
            for (size_t i = 0; i < tableSize; ++i) {
                char val = data[offset[i]];
                magic += val;
            }
        }
        result += magic;
    }
    size_t t2 = getTime();
    std::cout << result << "\t" << (t2 - t1) << std::endl;
}
Please excuse the C++; its random class is easier to use. I compiled it like this:
# The version that writes down the .bin files and also computes on the heap
g++ bench.cpp -fopenmp -std=c++14 -O3 -march=native -mtune=native -DDATAGEN
# The mmap version
g++ bench.cpp -fopenmp -std=c++14 -O3 -march=native -mtune=native -DMMAP
# The fread/heap version
g++ bench.cpp -fopenmp -std=c++14 -O3 -march=native -mtune=native
# For madvise add -DMADV
On this server I get the following times (ran all of the commands a few times):
./mmap
2030ms
./fread
1350ms
./mmap+madv
2030ms
./fread+madv
1350ms
numactl --cpunodebind=0 ./mmap
2600 ms
numactl --cpunodebind=0 ./fread
1500 ms
The malloc() back-end can make use of THP (Transparent Huge Pages), which is not possible when mmap() is backed by a regular file.
Using huge pages (even transparently) can reduce drastically the number of TLB misses while running your application.
An interesting test could be to disable transparent hugepages and run your malloc() test again.
echo never > /sys/kernel/mm/transparent_hugepage/enabled
You could also measure TLB misses using perf:
perf stat -e dTLB-load-misses,iTLB-load-misses ./command
For more infos on THP please see:
https://www.kernel.org/doc/Documentation/vm/transhuge.txt
People have been waiting a long time for a page cache that is huge-page capable, which would allow files to be mapped using huge pages (or a mix of huge pages and standard 4K pages).
There are a bunch of articles on LWN about the transparent huge page cache, but it has not reached the production kernel yet.
Transparent huge pages in the page cache (May 2016):
https://lwn.net/Articles/686690
There is also a presentation from January this year about the future of Linux page cache:
https://youtube.com/watch?v=xxWaa-lPR-8
Additionally, you can avoid the manual page-touching loop and the mlock() call in your mmap() implementation by using the MAP_LOCKED flag.
If you are not privileged, this may require adjusting the memlock limit.
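For example, a sketch of the mapping step with that flag (using data.bin from the benchmark above; MAP_LOCKED is Linux-specific and faults the pages in as part of the mmap() call):
#define _GNU_SOURCE   /* for MAP_LOCKED */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("data.bin", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }
    size_t len = (size_t)lseek(fd, 0, SEEK_END);

    /* MAP_LOCKED populates and locks the whole range up front,
       replacing the touch-every-page loop plus mlock(). */
    void *p = mmap(NULL, len, PROT_READ,
                   MAP_SHARED | MAP_NORESERVE | MAP_LOCKED, fd, 0);
    if (p == MAP_FAILED) { perror("mmap"); return 1; }

    /* ... run the computation on p ... */

    munmap(p, len);
    close(fd);
    return 0;
}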
I might be wrong, but...
It seems to me that the issue isn't with mmap itself, but with the fact that the code maps the memory to a file.
The Linux malloc falls back to mmap for large allocations, so both memory allocation flavors essentially use the same backend (mmap)... the only difference is that malloc uses mmap without backing it with a specific file on the hard drive.
The syncing of the memory information to the disk might be what's causing the "slower" performance. It's similar to saving the file almost constantly.
You might test mmap without the file, by using the MAP_ANONYMOUS flag (and fd == -1 on some systems), and see if there is any difference.
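A sketch of that experiment (the size is the same N as in the benchmark above; the fill and timing code is elided):
#define _GNU_SOURCE   /* for MAP_ANONYMOUS on older glibc */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    const size_t len = 1000000000;  /* same N as in the benchmark */

    /* Anonymous mapping: the same mmap backend malloc uses for large
       allocations, but with no file behind it, so there is nothing
       that could ever be written back to disk. */
    char *data = (char *)mmap(NULL, len, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (data == MAP_FAILED) { perror("mmap"); return 1; }

    /* ... fill data and run the benchmark loop as before ... */

    munmap(data, len);
    return 0;
}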
On the other hand, I'm not sure the "slower" memory access isn't actually faster in the long run -- would you lock the whole thing just to save 300GB to the disk? How long would that take? ...
... the fact that you're doing it automatically in small increments might be a performance gain rather than a penalty.
I have a C++ program in which I want to parse a huge file, looking for some regex that I've implemented. The program was working ok when executed sequentially but then I wanted to run it using MPI.
I started the adaptation to MPI by differentiating the master (the one who coordinates the execution) from the workers (the ones that parse the file in parallel) in the main function. Something like this:
MPI::Init(argc, argv);
...
if(rank == 0) {
    ...
    // Master sends initial and ending byte to every worker
    for(int i = 1; i < total_workers; i++) {
        array[0] = (i-1) * first_worker_file_part;
        array[1] = i * first_worker_file_part;
        MPI::COMM_WORLD.Send(array, 2, MPI::INT, i, 1);
    }
}
if(rank != 0)
    readDocument();
...
MPI::Finalize();
The master sends every worker an array with 2 positions: position 0 holds the byte where the worker should start reading the file, and position 1 holds the byte where it should stop.
The readDocument() function looks like this by now (not parsing, just each worker reading his part of the file):
void readDocument()
{
    array = new int[2];
    MPI::COMM_WORLD.Recv(array, 2, MPI::INT, 0, 1, status);
    int read_length = array[1] - array[0];
    char* buffer = new char [read_length];
    if (infile)
    {
        infile.seekg(array[0]); // Start reading at the assigned byte
        infile.read(buffer, read_length);
    }
}
I've tried different examples, from writing the output of the read to a file, to running it with different numbers of processes. What happens is that when I run the program with 20 processes instead of 10, for example, it takes twice as long to read the file. I expected it to take nearly half the time, and I can't figure out why this is happening.
Also, on a different matter, I want to make the master wait for all the workers to complete their execution and then print the final time. Is there any way to "block" it while the workers are processing? Like a cond_wait in C pthreads?
In my experience, people working on computer systems with parallel file systems tend to know about those parallel file systems, so your question marks you out, initially, as someone not working on such a system.
Without specific hardware support reading from a single file boils down to the system positioning a single read head and reading a sequence of bytes from the disk to memory. This situation is not materially altered by the complex realities of many modern file systems, such as RAID, which may in fact store a file across multiple disks. When multiple processes ask the operating system for access to files at the same time the o/s parcels out disk access according to some notion, possibly of fairness, so that no process gets starved. At worst the o/s spends so much time switching disk access from process to process that the rate of reading drops significantly. The most efficient, in terms of throughput, approach is for a single process to read an entire file in one go while other processes do other things.
This situation, multiple processes contending for scarce disk i/o resources, applies whether or not those processes are part of a parallel, MPI (or similar) program or entirely separate programs running concurrently.
The impact is what you observe -- instead of 10 processes each waiting to get their own 1/10th share of the file you have 20 processes each waiting for their 1/20th share. Oh, you cry, but each process is only reading half as much data so the whole gang should take the same amount of time to get the file. No, I respond, you've forgotten to add the time it takes the o/s to position and reposition the read/write heads between accesses. Read time comprises latency (how long does it take reading to start once the request has been made) and throughput (how fast can the i/o system pass the bytes to and fro).
It should be easy to come up with some reasonable estimates of latency and bandwidth that explain why reading by 20 processes takes twice as long as by 10.
How can you solve this? You can't, not without a parallel file system. But you might find that having the master process read the whole file and then parcel it out is faster than your current approach. You might not; you might just find that the current approach is the fastest for your whole computation. If read time is, say, 10% of total computation time, you might decide it's a reasonable overhead to live with.
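If you want to try that master-reads-everything variant, here is a minimal sketch (the filename is a placeholder, the even split ignores the remainder, and most error checking is elided):
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    long fsize = 0;
    char *whole = NULL;
    if (rank == 0) {                          /* only one process touches the disk */
        FILE *f = fopen("input.txt", "rb");   /* placeholder filename */
        fseek(f, 0, SEEK_END);
        fsize = ftell(f);
        rewind(f);
        whole = (char *)malloc(fsize);
        if (fread(whole, 1, fsize, f) != (size_t)fsize)
            MPI_Abort(MPI_COMM_WORLD, 1);
        fclose(f);
    }
    MPI_Bcast(&fsize, 1, MPI_LONG, 0, MPI_COMM_WORLD);

    /* Even split; the remainder is ignored here for brevity */
    int chunk = (int)(fsize / size);
    char *part = (char *)malloc(chunk);
    MPI_Scatter(whole, chunk, MPI_CHAR, part, chunk, MPI_CHAR, 0, MPI_COMM_WORLD);

    printf("Rank %d received %d bytes\n", rank, chunk);

    free(part);
    free(whole);   /* free(NULL) is fine on the workers */
    MPI_Finalize();
    return 0;
}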
To add to High Performance Mark's correct answer, one can use MPI-IO to do the file reading, providing (in this case) hints to the I/O routines not to read from every processor; and the same code, with a modified (or empty) MPI_Info, would be able to take advantage of a parallel file system as well, should you move to a cluster that has one. For the most common MPI-IO implementation, ROMIO, the manual describing the available hints is here; in particular, we're using
MPI_Info_set(info, "cb_config_list","*:1");
to set the number of readers to be one per node. The code below will let you try reading the file using MPI-IO or POSIX (e.g., seek).
#include <iostream>
#include <fstream>
#include <string>
#include <mpi.h>

void partitionFile(const int filesize, const int rank, const int size,
                   const int overlap, int *start, int *end) {
    int localsize = filesize/size;
    *start = rank * localsize;
    *end = *start + localsize-1;
    if (rank != 0) *start -= overlap;
    if (rank != size-1) *end += overlap;
}

void readdataMPI(MPI_File *in, const int rank, const int size, const int overlap,
                 char **data, int *ndata) {
    MPI_Offset filesize;
    int start;
    int end;

    // figure out who reads what
    MPI_File_get_size(*in, &filesize);
    partitionFile((int)filesize, rank, size, overlap, &start, &end);
    *ndata = end - start + 1;

    // allocate memory
    *data = new char[*ndata + 1];

    // everyone reads in their part
    MPI_File_read_at_all(*in, (MPI_Offset)start, *data,
                         (MPI_Offset)(*ndata), MPI_CHAR, MPI_STATUS_IGNORE);
    (*data)[*ndata] = '\0';
}

void readdataSeek(std::ifstream &infile, int array[2], char *buffer)
{
    int read_length = array[1] - array[0];
    if (infile)
    {
        infile.seekg(array[0]); // Start reading at the assigned byte
        infile.read(buffer, read_length);
    }
}

int main(int argc, char **argv) {
    MPI_File in;
    int rank, size;
    int ierr;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (argc != 3) {
        if (rank == 0)
            std::cerr << "Usage: " << argv[0] << " infilename [MPI|POSIX]" << std::endl;
        MPI_Finalize();
        return -1;
    }

    std::string optionMPI("MPI");
    if ( !optionMPI.compare(argv[2]) ) {
        MPI_Info info;
        MPI_Info_create(&info);
        MPI_Info_set(info, "cb_config_list","*:1"); // ROMIO: one reader per node
        // Eventually, should be able to use io_nodes_list or similar

        ierr = MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY, info, &in);
        if (ierr) {
            if (rank == 0)
                std::cerr << "Usage: " << argv[0] << " Couldn't open file " << argv[1] << std::endl;
            MPI_Finalize();
            return -1;
        }

        const int overlap=1;
        char *data;
        int ndata;
        readdataMPI(&in, rank, size, overlap, &data, &ndata);
        std::cout << "MPI: Rank " << rank << " has " << ndata << " characters." << std::endl;
        delete [] data;

        MPI_File_close(&in);
        MPI_Info_free(&info);
    } else {
        int fsize;
        if (rank == 0) {
            std::ifstream file( argv[1], std::ios::ate );
            fsize=file.tellg();
            file.close();
        }
        MPI_Bcast(&fsize, 1, MPI_INT, 0, MPI_COMM_WORLD);

        int start, end;
        partitionFile(fsize, rank, size, 1, &start, &end);
        int array[2] = {start, end};

        char *buffer = new char[end-start+2];
        std::ifstream infile;
        infile.open(argv[1], std::ios::in);
        readdataSeek(infile, array, buffer);
        buffer[end-start+1] = '\0';
        std::cout << "Seeking: Rank " << rank << " has " << end-start+1 << " characters." << std::endl;

        infile.close();
        delete [] buffer;
    }

    MPI_Finalize();
    return 0;
}
On my desktop, I don't get much of a performance difference, even oversubscribing the cores (e.g., using lots of seeks):
$ time mpirun -np 20 ./read-chunks moby-dick.txt POSIX
Seeking: Rank 0 has 62864 characters.
[...]
Seeking: Rank 8 has 62865 characters.
real 0m1.250s
user 0m0.290s
sys 0m0.190s
$ time mpirun -np 20 ./read-chunks moby-dick.txt MPI
MPI: Rank 1 has 62865 characters.
[...]
MPI: Rank 4 has 62865 characters.
real 0m1.272s
user 0m0.337s
sys 0m0.265s
I have an SSD and I am trying to use it to measure my program's I/O performance; however, the IOPS calculated by my program are much, much higher than IOMeter's.
My SSD is a PLEXTOR PX-128M3S; according to IOMeter, its max 512B random read IOPS is around 94k (queue depth is 32).
However, my program (32 Windows threads) can reach around 500k 512B IOPS, around 5 times IOMeter's figure! I did data validation but didn't find any errors in the fetched data. Is it because my program fetches data in order?
I paste my code below (it mainly fetches 512B from the file and releases it; I did use 4 bytes (an int) to validate the program logic and didn't find a problem). Can anybody help me figure out where I am wrong?
Thanks so much in advance!!
#include <stdio.h>
#include <Windows.h>

//Global variables
long completeIOs = 0;
long completeBytes = 0;
int threadCount = 32;
unsigned long long length = 1073741824; //4G test file
int interval = 1024;
int resultArrayLen = 320000;
int *result = new int[resultArrayLen];

//Method declarations
double GetSecs(void); //Calculate out duration
int InitPool(long long,char*,int); //Initialize test data for testing; if successful, return 1, otherwise return a non-1 value.
int * FileRead(char * path);
unsigned int DataVerification(int*, int sampleItem); //Verify data fetched from pool

int main()
{
    int sampleItem = 0x1;
    char * fPath = "G:\\workspace\\4G.bin";
    unsigned int invalidIO = 0;
    if (InitPool(length,fPath,sampleItem)!= 1)
        printf("File write err... \n");

    //start doing random I/Os from the initialized file
    double start = GetSecs();
    int * fetchResult = FileRead(fPath);
    double end = GetSecs();
    printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - start));

    //start data validation, for 4-byte fetch only
    // invalidIO = DataVerification(fetchResult,sampleItem);
    // if (invalidIO !=0)
    // {
    //     printf("Total invalid data fetch IOs are %d", invalidIO);
    // }
    return 0;
}
int InitPool(long long length, char* path, int sample)
{
    printf("Start initializing test data ... \n");
    FILE * fp = fopen(path,"wb");
    if (fp == NULL)
    {
        printf("file open err... \n");
        exit (-1);
    }
    else //initialize file for testing
    {
        fseek(fp,0L,SEEK_SET);
        for (int i=0; i<length; i++)
        {
            fwrite(&sample,sizeof(int),1,fp);
        }
        fclose(fp);
        fp = NULL;
        printf("Data initialization is complete...\n");
        return 1;
    }
}

double GetSecs(void)
{
    LARGE_INTEGER frequency;
    LARGE_INTEGER start;
    if(! QueryPerformanceFrequency(&frequency))
        printf("QueryPerformanceFrequency Failed\n");
    if(! QueryPerformanceCounter(&start))
        printf("QueryPerformanceCounter Failed\n");
    return ((double)start.QuadPart/(double)frequency.QuadPart);
}
class input
{
public:
    char *path;
    int starting;
    input (int st, char * filePath):starting(st),path(filePath){}
};

//Workers
DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter)
{
    input * in = (input*) lpThreadParameter;
    char* path = in->path;
    FILE * fp = fopen(path,"rb");
    int sPos = in->starting;
    // int * result = in->r;
    if(fp != NULL)
    {
        fpos_t pos;
        for (int i=0; i<resultArrayLen/threadCount;i++)
        {
            pos = i * interval;
            fsetpos(fp,&pos);
            //For 512-byte fetch each time
            unsigned char *c = new unsigned char [512];
            if (fread(c,512,1,fp) == 1)
            {
                InterlockedIncrement(&completeIOs);
                delete [] c;
            }
            //For 4-byte fetch each time
            /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1)
            {
                InterlockedIncrement(&completeIOs);
            }*/
            else
            {
                printf("file read err...\n");
                exit(-1);
            }
        }
        fclose(fp);
        fp = NULL;
    }
    else
    {
        printf("File open err... \n");
        exit(-1);
    }
    return 0;
}
int * FileRead(char * p)
{
    printf("Starting reading file ... \n");
    HANDLE mWorkThread[256]; //max 256 threads
    completeIOs = 0;
    int slice = int (resultArrayLen/threadCount);
    for(int i = 0; i < threadCount; i++)
    {
        mWorkThread[i] = CreateThread(
            NULL,
            0,
            FileReadThreadEntry,
            (LPVOID)(new input(i*slice,p)),
            0,
            NULL);
    }
    WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE);
    printf("File read complete... \n");
    return result;
}

unsigned int DataVerification(int* result, int sampleItem)
{
    unsigned int invalid = 0;
    for (int i=0; i< resultArrayLen/interval;i++)
    {
        if (result[i]!=sampleItem)
        {
            invalid ++;
            continue;
        }
    }
    return invalid;
}
I didn't look in enough detail to be certain, but I didn't see any code there to flush the data to the disk and/or ensure your reads actually came from the disk. That being the case, it appears that what you're measuring is primarily the performance of the operating system's disk caching. While the disk might contribute a little to the performance you're measuring, it's probably only a small contributor, with other factors dominating.
Since the code is apparently written for Windows, you might consider (for one example) opening the file with CreateFile, and passing the FILE_FLAG_NO_BUFFERING flag when you do so. This will (at least mostly) remove the operating system cache from the equation, and force each read or write to deal directly with the disk itself.
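For example, a minimal sketch of that (the path comes from the question's code; the 512-byte sector size is an assumption -- query the volume for the real value before relying on this):
#include <Windows.h>
#include <stdio.h>
#include <malloc.h>

int main()
{
    const DWORD kSector = 512; /* assumed sector size */
    HANDLE h = CreateFileA("G:\\workspace\\4G.bin", GENERIC_READ, FILE_SHARE_READ,
                           NULL, OPEN_EXISTING, FILE_FLAG_NO_BUFFERING, NULL);
    if (h == INVALID_HANDLE_VALUE)
    {
        fprintf(stderr, "CreateFile failed: %lu\n", GetLastError());
        return 1;
    }

    /* FILE_FLAG_NO_BUFFERING requires the buffer, the file offset, and the
       read size to all be multiples of the sector size. */
    void *buf = _aligned_malloc(kSector, kSector);
    DWORD got = 0;
    if (!ReadFile(h, buf, kSector, &got, NULL))
        fprintf(stderr, "ReadFile failed: %lu\n", GetLastError());

    _aligned_free(buf);
    CloseHandle(h);
    return 0;
}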