I'm trying to read a file consists of 100000000 float numbers like 0.12345678 or -0.1234567 separated by space in c++. I used fscanf() to read the file and the codes is like this:
FILE *fid = fopen("testingfile.txt", "r");
if (fid == NULL)
return false;
float v;
for (int i = 0; i < 100000000; i++)
fscanf(fid, "%f", &v);
fclose(fid);
The file is 1199999988 bytes in size and took around 18 seconds to finish reading using fscanf().Therefore, I would like to use mmap() to speed up the reading and code is like this:
#define FILEPATH "testingfile.txt"
char text[10] = {'\0'};
struct stat s;
int status = stat(FILEPATH, &s);
int fd = open(FILEPATH, O_RDONLY);
if (fd == -1)
{
perror("Error opening file for reading");
return 0;
}
char *map = (char *)mmap(NULL, s.st_size, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
if (map == MAP_FAILED)
{
perror("Error mmapping the file");
return 0;
}
for (int i = 0,j=0; i < s.st_size; i++)
{
if (isspace(map[i]))
{
text[j] = '\0';
j = 0;
float v = atof(text);
for (int j = 0; j < 10; j++)
text[j] = '\0';
continue;
}
text[j] = map[i];
j++;
}
if (munmap(map, s.st_size) == -1)
{
return 0;
}
However, it still takes around 14.5 seconds to finish reading. I found the most time consuming part is converting array to float,which consumes around 10 seconds
So I have three questions:
Is there any way I can directly read float instead of char or
Is there any better method to convert char array to float
How does fscanf recognize floating point value and read it, which is much faster than atof().
Thanks in advance!
Based on the advice given, here are two possible solutions to this problem:
The first approach would be a bit "stupid". Since the format of floating number values stored is known, conversion from char array to float number can be easily done without usingatof().
By removing atof(), it only takes 8 seconds to finish reading and conversion for the same file.
The second approach is to change the store format of float numbers in the file (as advised by Jeremy Friesner). Floating number values are stored in binary format so that conversion part for mmap() is not required. The code becomes something like this:
#define FILEPATH "myfile.bin"
int main()
{
int start_s = clock();
struct stat s;
int status = stat(FILEPATH, &s);
int fd = open(FILEPATH, O_RDONLY);
if (fd == -1)
{
perror("Error opening file for reading");
return 0;
}
float *map = (float *)mmap(NULL, s.st_size, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
if (map == MAP_FAILED)
{
perror("Error mmapping the file");
return 0;
}
for (int i = 0; i < s.st_size / 4; i++)
{
float v = map[i];
}
if (munmap(map, s.st_size) == -1)
{
return 0;
}
}
This would dramatically reduce the time required to read the file in same size.
Related
I have a homework about WAV files and FIR filters for a Digital Signal Processing class.
My program must read a WAV file, apply a filter to the data and write the output data to another WAV file again.
I have completed reading and applying filters but I can't write the WAV file. The program doesn't give any errors while compiling but the WAV file doesn't play.
If I write "temp" to the WAV, it runs properly. But if I write "data", it doesn't.
How can I write a WAV file properly?
#define _CRT_SECURE_NO_WARNINGS
#define PI 3.14f
#define WAV_HEADER_LENGTH 44
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <fstream>
char* read_wav(const char* filename, short*, short*, int*);
void write_wav(const char* filename, const char*, int);
using namespace std;
int main()
{
short nchannel, ssample;
int csample;
//Reading WAV file and returning the data.
char* temp = read_wav("sum.wav", &nchannel, &ssample, &csample);
short* data = (short*)&temp[WAV_HEADER_LENGTH];
cout << "How many coefficients are there in filter ?" << endl;
int N;
cin >> N ;
float filter[N];
cout << "Type coefficients in filter." << endl;
for(int i=0; i<N;i++){
cin >> filter[i];
}
short* output = (short*)&temp[WAV_HEADER_LENGTH];
for(int i=0; i < csample; i++){
double sum = 0;
for(int j=0; j < N; j++){
if((i - j) >= 0)
sum += filter[j] * data[i-j];
}
output[i] = (short) sum;
}
write_wav("test.wav", out, csample * ssample + WAV_HEADER_LENGTH);
}
char* read_wav(const char* filename, short* nchannel, short* ssample, int* csample) {
//Reading the file.
FILE* fp = fopen(filename, "rb");
if (!fp) {
fprintf(stderr, "Couldn't open the file \"%s\"\n", filename);
exit(0);
}
fseek(fp, 0, SEEK_END);
int file_size = ftell(fp);
fseek(fp, 0, SEEK_SET);
printf("The file \"%s\" has %d bytes\n\n", filename, file_size);
char* buffer = (char*)malloc(sizeof(char) * file_size);
fread(buffer, file_size, 1, fp);
// Dump the buffer info.
*nchannel = *(short*)&buffer[22];
*ssample = *(short*)&buffer[34] / 8;
*csample = *(int*)&buffer[40] / *ssample;
printf("ChunkSize :\t %u\n", *(int*)&buffer[4]);
printf("Format :\t %u\n", *(short*)&buffer[20]);
printf("NumChannels :\t %u\n", *(short*)&buffer[22]);
printf("SampleRate :\t %u\n", *(int*)&buffer[24]); // number of samples per second
printf("ByteRate :\t %u\n", *(int*)&buffer[28]); // number of bytes per second
printf("BitsPerSample :\t %u\n", *(short*)&buffer[34]);
printf("Subchunk2ID :\t \"%c%c%c%c\"\n", buffer[36], buffer[37], buffer[38], buffer[39]); // marks beginning of the data section
printf("Subchunk2Size :\t %u\n", *(int*)&buffer[40]); // size of data (byte)
printf("Duration :\t %fs\n\n", (float)(*(int*)&buffer[40]) / *(int*)&buffer[28]);
fclose(fp);
return buffer;
}
void write_wav(const char* filename, const char* data, int len) {
FILE* fp = fopen(filename, "wb");
if (!fp) {
fprintf(stderr, "Couldn't open the file \"%s\"\n", filename);
exit(0);
}
fwrite(data, len, 1, fp);
fclose(fp);
}
This works for me:
int main()
{
short nchannel, ssample;
int csample;
// Reading WAV file and returning the data.
char* temp = read_wav("sum.wav", &nchannel, &ssample, &csample);
short* data = (short*)&temp[WAV_HEADER_LENGTH];
// cout << "How many coefficients are there in filter ?" << endl;
const int N = 2;
// cin >> N;
float filter[N] = {0.5, 0.75};
// cout << "Type coefficients in filter." << endl;
// for (int i = 0; i < N; i++)
// {
// cin >> filter[i];
// }
short* output = (short*)&temp[WAV_HEADER_LENGTH];
for (int i = 0; i < csample; i++)
{
double sum = 0;
for (int j = 0; j < N; j++)
{
if ((i - j) >= 0) sum += filter[j] * data[i - j];
}
output[i] = (short)sum;
}
write_wav("test.wav", (char*)temp, csample * ssample + WAV_HEADER_LENGTH);
}
My changes:
The major change is to use the full buffer, with extremely misleading name: temp, instead of your out that does not compile, as the argument of write_wav.
I applied "my" filter coefficients (the sound from the output file is really distorted),
I applied my favorite indentation
If the code is to be portable, you need to check the endiannes and act accordingly.
I would expect the input and output files to be of the same length, but they're not. Please check it yourself why this is not the case.
Example:
-rw-r--r-- 1 zkoza zkoza 787306 06-23 14:09 sum.wav
-rw-r--r-- 1 zkoza zkoza 787176 06-23 14:16 test.wav
It looks like 130 bytes are missing in the output file.
Your float filter[N] with N not known at compile time is a C++ extension: please use std::vector in your final code instead.
Next time please provide also a link for any input files. For my tests, I used https://freewavesamples.com/alesis-fusion-clean-guitar-c3 , but all these little things, like finding an input file (WAV format has several flavors, I could have missed the correct one), guessing filter parameters etc. take time and effort.
Your condition if ((i - j) >= 0) can be written in a way easier to understand; preferably by changing the inner loop "header".
I'm trying to compact a raster file in a way that is easy to read without GDAL library (my web server cannot install GDAL). Following this question, I'm doing the following to convert a raster's bytes (only 0 and 1 values) to bits:
int main(int argc,char *argv[]) {
if (argc < 3) {
return 1;
}
GDALDataset *poDataset;
GDALAllRegister();
poDataset = (GDALDataset*)GDALOpen(argv[1],GA_ReadOnly);
if (poDataset == NULL) {
return 2;
}
int tx=poDataset->GetRasterXSize(), ty=poDataset->GetRasterYSize();
GDALRasterBand *poBand;
int nBlockXSize,nBlockYSize;
poBand = poDataset->GetRasterBand(1);
printf("Type: %s\n",GDALGetDataTypeName(poBand->GetRasterDataType()));
// Type: Byte
poBand->GetBlockSize(&nBlockXSize,&nBlockYSize);
int i, nX = tx/nBlockXSize, nY = ty/nBlockYSize;
char *data = (char*)CPLMalloc(nBlockXSize*nBlockYSize + 1);
uint32_t out[nBlockXSize*nBlockYSize/32];
char temp;
CPLErr erro;
FILE* pFile;
pFile = fopen(argv[2],"wb");
for (y=0; y<nY; y++) {
for (x=0; x<nX; x++) {
erro = poBand->ReadBlock(x,y,data);
if (erro > 0) {
return 3;
}
for (i=0; i<nBlockXSize*nBlockYSize; i+=32) {
temp = data[i+32];
data[i+32] = 0;
out[i/32] = strtoul(&data[i],0,2);
if (data[i] != 0) {
printf("%u/%u ",data[i],out[i/32]);
}
data[i+32] = temp;
}
ch = getchar(); // for debugging
}
fwrite(out,4,nBlockXSize*nBlockYSize/32,pFile);
}
fclose(pFile);
CPLFree(data);
return 0;
}
After the first set of bytes is read (for (i=0; i<nBlockXSize*nBlockYSize; i+=32)), I can see that printf("%u/%u ",data[i],out[i/32]); is printing some "1/0", meaning that, where my raster has a 1 value, this is being passed to strtoul, which is returning 0. Obviously I'm messing with something (pointers, probably), but can't find where. What am I doing wrong?
strtoul is for converting printable character data to an integer. The string should contain character codes for digits, e.g. '0', '1' etc.
Apparently in your case the source data is actually the integer value 1 and so strtoul finds there are no characters of the expected form and returns 0 .
Hello I am trying to write 8 bits from std::vector to binary file and read them back . Writing works fine , have checked with binary editor and all values are correct , but once I try to read I got bad data .
Data that i am writing :
11000111 //bits
Data that i got from reading:
11111111 //bits
Read function :
std::vector<bool> Read()
{
std::vector<bool> map;
std::ifstream fin("test.bin", std::ios::binary);
int size = 8 / 8.0f;
char * buffer = new char[size];
fin.read(buffer, size);
fin.close();
for (int i = 0; i < size; i++)
{
for (int id = 0; id < 8; id++)
{
map.emplace_back(buffer[i] << id);
}
}
delete[] buffer;
return map;
}
Write function(just so you guys know more whats going on)
void Write(std::vector<bool>& map)
{
std::ofstream fout("test.bin", std::ios::binary);
char byte = 0;
int byte_index = 0;
for (size_t i = 0; i < map.size(); i++)
{
if (map[i])
{
byte |= (1 << byte_index);
}
byte_index++;
if (byte_index > 7)
{
byte_index = 0;
fout.write(&byte, sizeof(byte));
}
}
fout.close();
}
Your code spreads out one byte (the value of buffer[i], where i is always 0) over 8 bools. Since you only read one byte, which happens to be non-zero, you now end up with 8 trues (since any non-zero integer converts to true).
Instead of spreading one value out, you probably want to split one value into its constituent bits:
for (int id = 0; id < 8; id++)
{
map.emplace_back((static_cast<unsigned char>(buffer[i]) & (1U << id)) >> id);
}
I am using libsndfile to read .caf file. I am able to read the file properly with number of items in the audio file. However, when I save those numbers in a text file and try to verify my values with MATLAB, they look a lot different. I have attached the code in C++ and the values I obtain from C++ and MATLAB.
void ofApp::setup(){
const char* fn = "/Users/faiyadhshahid/Desktop/Desktopdemo.caf";
SNDFILE *sf;
SF_INFO info;
int num_channels, num, num_items, *buf, f, sr,c, i , j;
FILE *out;
/* Open the WAV file. */
info.format = 0;
sf = sf_open(fn,SFM_READ,&info);
if (sf == NULL)
{
printf("Failed to open the file.\n");
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
c = info.channels;
printf("frames=%d\n",f);
printf("samplerate=%d\n",sr);
printf("channels=%d\n",c);
num_items = f*c;
printf("num_items=%d\n",num_items);
/* Allocate space for the data to be read, then read it. */
buf = (int *) malloc(num_items*sizeof(int));
num = sf_read_int(sf,buf,num_items);
sf_close(sf);
printf("Read %d items\n",num);
/* Write the data to filedata.out. */
out = fopen("/Users/faiyadhshahid/Desktop/filedata.txt","w");
for (i = 0; i < num; i += c)
{
for (j = 0; j < c; ++j)
fprintf(out,"%d ",buf[i+j]);
fprintf(out,"\n");
}
fclose(out);
return 0;
}
Values of C++ (on left) vs MATLAB (on right):
I figured it out by myself. I was comparing apples with oranges.
The changes I needed to make were to convert the buffer saving the values to read float values. `int num_channels, num, num_items,f, sr,c, i , j;
float *buf;
FILE *out;
/* Open the WAV file. */
info.format = 0;
sf = sf_open(fn,SFM_READ,&info);
if (sf == NULL)
{
printf("Failed to open the file.\n");
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
c = info.channels;
printf("frames=%d\n",f);
printf("samplerate=%d\n",sr);
printf("channels=%d\n",c);
num_items = f*c;
printf("num_items=%d\n",num_items);
/* Allocate space for the data to be read, then read it. */
buf = (float *) malloc(num_items*sizeof(float));
num = sf_read_float(sf,buf,num_items);
sf_close(sf);
printf("Read %d items\n",num);
/* Write the data to filedata.out. */
out = fopen("/Users/faiyadhshahid/Desktop/filedata.txt","w");
for (i = 0; i < num; i += c)
{
for (j = 0; j < c; ++j)
fprintf(out,"%f \n",buf[i]);
// fprintf(out,"\n");
}
fclose(out);
`
I need to read huge 35G file from disc line by line in C++. Currently I do it the following way:
ifstream infile("myfile.txt");
string line;
while (true) {
if (!getline(infile, line)) break;
long linepos = infile.tellg();
process(line,linepos);
}
But it gives me about 2MB/sec performance, though file manager copies the file with 100Mb/s speed. I guess that getline() is not doing buffering correctly. Please propose some sort of buffered line-by-line reading approach.
UPD: process() is not a bottleneck, code without process() works with the same speed.
You won't get anywhere close to line speed with the standard IO streams. Buffering or not, pretty much ANY parsing will kill your speed by orders of magnitude. I did experiments on datafiles composed of two ints and a double per line (Ivy Bridge chip, SSD):
IO streams in various combinations: ~10 MB/s. Pure parsing (f >> i1 >> i2 >> d) is faster than a getline into a string followed by a sstringstream parse.
C file operations like fscanf get about 40 MB/s.
getline with no parsing: 180 MB/s.
fread: 500-800 MB/s (depending on whether or not the file was cached by the OS).
I/O is not the bottleneck, parsing is. In other words, your process is likely your slow point.
So I wrote a parallel parser. It's composed of tasks (using a TBB pipeline):
fread large chunks (one such task at a time)
re-arrange chunks such that a line is not split between chunks (one such task at a time)
parse chunk (many such tasks)
I can have unlimited parsing tasks because my data is unordered anyway. If yours isn't then this might not be worth it to you.
This approach gets me about 100 MB/s on an 4-core IvyBridge chip.
I've translated my own buffering code from my java project and it does what I need. I had to put defines to overcome problems with M$VC 2010 compiler tellg, that always gives wrong negative values on huge files. This algorithm gives desired speed ~100MB/s, though it does some usless new[].
void readFileFast(ifstream &file, void(*lineHandler)(char*str, int length, __int64 absPos)){
int BUF_SIZE = 40000;
file.seekg(0,ios::end);
ifstream::pos_type p = file.tellg();
#ifdef WIN32
__int64 fileSize = *(__int64*)(((char*)&p) +8);
#else
__int64 fileSize = p;
#endif
file.seekg(0,ios::beg);
BUF_SIZE = min(BUF_SIZE, fileSize);
char* buf = new char[BUF_SIZE];
int bufLength = BUF_SIZE;
file.read(buf, bufLength);
int strEnd = -1;
int strStart;
__int64 bufPosInFile = 0;
while (bufLength > 0) {
int i = strEnd + 1;
strStart = strEnd;
strEnd = -1;
for (; i < bufLength && i + bufPosInFile < fileSize; i++) {
if (buf[i] == '\n') {
strEnd = i;
break;
}
}
if (strEnd == -1) { // scroll buffer
if (strStart == -1) {
lineHandler(buf + strStart + 1, bufLength, bufPosInFile + strStart + 1);
bufPosInFile += bufLength;
bufLength = min(bufLength, fileSize - bufPosInFile);
delete[]buf;
buf = new char[bufLength];
file.read(buf, bufLength);
} else {
int movedLength = bufLength - strStart - 1;
memmove(buf,buf+strStart+1,movedLength);
bufPosInFile += strStart + 1;
int readSize = min(bufLength - movedLength, fileSize - bufPosInFile - movedLength);
if (readSize != 0)
file.read(buf + movedLength, readSize);
if (movedLength + readSize < bufLength) {
char *tmpbuf = new char[movedLength + readSize];
memmove(tmpbuf,buf,movedLength+readSize);
delete[]buf;
buf = tmpbuf;
bufLength = movedLength + readSize;
}
strEnd = -1;
}
} else {
lineHandler(buf+ strStart + 1, strEnd - strStart, bufPosInFile + strStart + 1);
}
}
lineHandler(0, 0, 0);//eof
}
void lineHandler(char*buf, int l, __int64 pos){
if(buf==0) return;
string s = string(buf, l);
printf(s.c_str());
}
void loadFile(){
ifstream infile("file");
readFileFast(infile,lineHandler);
}
Use a line parser or write the same. here is a sample in the sourceforge http://tclap.sourceforge.net/ and put in a buffer if necessary.