I am attempting to read parquet data from a binary stream (via API POST requests). For example: I have a rather large parquet file on the other side of a REST API and need to fetch parts of the file. I have been attempting to follow the file spec here: https://github.com/apache/parquet-format; however, the pattern seems to be failing (or I am misunderstanding part of it).
For my test, I have moved a parquet file onto my local system and am reading in binary data from the file using ifstream. My steps are as follows:
Read in magic number from header
Read in magic number from footer
Read in FileMetaData length
Read in FileMetaData (from bottom of file)
Convert stream to FileMetaData Type using:
std::shared_ptr<parquet::FileMetaData> _metadata = parquet::FileMetaData::Make(metadataBuffer.data(), &metadataLength);
Read in RowGroup(0) and RowGroup(1) file_offset and total_byte_size from the FileMetaData like this:
_metadata->RowGroup(x)->file_offset();
_metadata->RowGroup(x)->total_byte_size();
After storing this data, I proceed to read in each RowGroup from the file using ifstream again. My start position is the file_offset from the beginning of the file.
Once my RowGroup data is read in to a vector of objects, I attempt to convert the buffered data into RowGroupMetaData
std::shared_ptr<parquet::RowGroupMetaData> _rowGroupMetaData = parquet::RowGroupMetaData::Make(rowGroupData[x].rowGroupBuffer.data(), rowGroupData[x].schema);
This is where I get stuck. When I try to access parts of the _rowGroupMetaData, I am getting junk back. It seems I must be skipping a step or overlooking part of the file spec.
I noticed that there is data between the magic number PAR1 at the top of the file and the file offset of RowGroup(0). The magic number is 4 characters long, but the RowGroup(0) file_offset = 113. I am not sure what the data between bytes 4 and 113 is, and I cannot find information on it in the spec.
My parquet file is rather simple. 2 RowGroups with 2 columns. Total of 5 rows across both RowGroups.
Code:
// Test script: reads a local Parquet file through ifstream, prints the magic
// markers and footer metadata, then reads each row group's byte range and
// hands it to the Parquet C++ metadata classes.
// The stream is opened positioned at the end (std::ios::ate) so that tellg()
// reports the file size.
// NOTE(review): no check that the file actually opened; every read below
// would silently fail on a bad stream.
ifstream inFile("parquet-arrow-example.parquet", std::ofstream::binary | std::ios::ate);
// fileSize is captured here but not used again in this snippet.
std::streamsize fileSize = inFile.tellg();
inFile.seekg(0, std::ios::beg);
std::vector<char> headBuffer;
std::vector<char> tailBuffer;
std::vector<uint8_t> metadataBuffer;
// Both magic markers are read as 4 bytes.
headBuffer.resize(4);
tailBuffer.resize(4);
// Per-row-group bookkeeping: the values copied out of the file metadata plus
// the raw bytes read from the file for that row group.
struct RowGroupData {
int groupId;
int64_t byteLength;
int64_t offset;
const parquet::SchemaDescriptor* schema;
vector<uint8_t> rowGroupBuffer;
};
uint32_t metadataLength = 0;
string header;
string footer;
//Header
inFile.read((char*)&headBuffer[0], headBuffer.size()); //PAR1
header = string(headBuffer.begin(), headBuffer.end());
cout << header << endl;
//Footer
inFile.seekg(-4, std::ios::end);
inFile.read((char*)&tailBuffer[0], tailBuffer.size()); //PAR1
footer = string(tailBuffer.begin(), tailBuffer.end());
cout << footer << endl;
//Metadata Size
// The 4 bytes immediately before the trailing magic are copied straight into
// metadataLength.
// NOTE(review): this relies on the host byte order matching the file's —
// confirm against the Parquet format spec.
inFile.seekg(-8, std::ios::end);
inFile.read((char*)&metadataLength, 4);
cout << "Metadata Length: " << metadataLength << endl;
// Offset of the metadata start, measured back from the end of the file.
// NOTE(review): metadataLength is a uint32_t, so this subtraction has an
// unsigned operand before the result is converted to int — confirm this is
// safe for large footers.
int len = -8 - metadataLength;
//Get MetaData
inFile.seekg(len, std::ios::end);
metadataBuffer.resize(metadataLength);
inFile.read((char*)&metadataBuffer[0], metadataBuffer.size());
// Dumps the raw footer bytes to stdout as if they were text.
cout << string(metadataBuffer.begin(), metadataBuffer.end()) << endl;
// Hand the footer bytes and their length to parquet::FileMetaData::Make.
std::shared_ptr<parquet::FileMetaData> _metadata = parquet::FileMetaData::Make(metadataBuffer.data(), &metadataLength);
cout << "Num Rows: " << _metadata->num_rows() << endl;
cout << "Num Columns: " << _metadata->num_columns() << endl;
cout << "Num RowGroups: " << _metadata->num_row_groups() << endl;
vector<RowGroupData> rowGroupData;
//int seeqPos = 4;
// First pass: print each row group's offsets/sizes and read
// total_byte_size() bytes starting at file_offset() into a buffer.
// NOTE(review): ColumnChunk(0) and ColumnChunk(1) are hard-coded, so this
// assumes every row group has at least two columns.
for (int x = 0; x < _metadata->num_row_groups(); x++) {
cout << "RowGroup " << x << " Byte Size: " << _metadata->RowGroup(x)->total_byte_size() << endl;
cout << "RowGroup " << x << " File Offset: " << _metadata->RowGroup(x)->file_offset() << endl;
cout << "RowGroup " << x << " Column 0 File Offset: " << _metadata->RowGroup(x)->ColumnChunk(0)->file_offset() << endl;
cout << "RowGroup " << x << " Column 0 Byte Size: " << _metadata->RowGroup(x)->ColumnChunk(0)->total_compressed_size() << endl;
cout << "RowGroup " << x << " Column 1 File Offset: " << _metadata->RowGroup(x)->ColumnChunk(1)->file_offset() << endl;
cout << "RowGroup " << x << " Column 1 Byte Size: " << _metadata->RowGroup(x)->ColumnChunk(1)->total_compressed_size() << endl;
RowGroupData rgData;
rgData.groupId = x;
rgData.byteLength = _metadata->RowGroup(x)->total_byte_size();
rgData.offset = _metadata->RowGroup(x)->file_offset();
// NOTE(review): this stores a raw pointer obtained from a temporary
// RowGroup(x) result; presumably the schema is owned by _metadata — verify
// its lifetime.
rgData.schema = _metadata->RowGroup(x)->schema();
rgData.rowGroupBuffer.resize(rgData.byteLength);
//Store rowGroup Length
//Store rowGroup Data
inFile.seekg(rgData.offset, std::ios::beg);
inFile.read((char*)&rgData.rowGroupBuffer[0], rgData.rowGroupBuffer.size());
rowGroupData.push_back(rgData);
//seeqPos = seeqPos + rgData.byteLength;
}
cout << endl;
// Second pass: print each buffered row group and try to interpret its bytes
// as row-group metadata.
// NOTE(review): x is int while rowGroupData.size() is unsigned.
for (int x = 0; x < rowGroupData.size(); x++) {
// rgBuffer is declared but never used (its only use is commented out below).
vector<uint8_t> rgBuffer;
//rgBuffer = rowGroupData[x].rowGroupBuffer;
cout << "RowGroupId: " << rowGroupData[x].groupId << endl;
cout << "RowGroupData: " << string(rowGroupData[x].rowGroupBuffer.begin(), rowGroupData[x].rowGroupBuffer.end()) << endl;
// NOTE(review): the buffer holds whatever bytes sit at the row group's
// file_offset in the file. Whether RowGroupMetaData::Make accepts such bytes
// is not established by this code — confirm against the Parquet C++ API
// before trusting the values printed below.
std::shared_ptr<parquet::RowGroupMetaData> _rowGroupMetaData = parquet::RowGroupMetaData::Make(rowGroupData[x].rowGroupBuffer.data(), rowGroupData[x].schema);
cout << "RowGroup Rows: " << _rowGroupMetaData->num_rows() << endl;
cout << "Byte Size: " << _rowGroupMetaData->total_byte_size() << endl;
}
The data between the file header and the file_offset is the column_chunk metadata for the first column.
The parquet spec is a little confusing because there are two different file offsets: the one on the RowGroup, which is an offset to the first page of data in the row group, and the column chunk's file_offset, which points to the column chunk's metadata.
To my knowledge the first offset is mostly used for splitting files, I think most other readers use the latter offset for parsing columns.
Also note that, in C++ at least, file_offset was being written out incorrectly prior to the release of Arrow 6.0 (it pointed to the same byte offset as the column chunk offset did).
Lastly, parquet is a non-trivial format and it is easy to introduce subtle bugs. I'd strongly recommend using a standard implementation that has been battle-tested rather than creating your own. If something is missing from the API, it might be simpler to contribute it to an existing implementation instead of trying to build everything from scratch.
Related
I have a vector of objects with quite a few variables (name, type, length etc) which I am trying to write to file.
vector <Boat> berths;
void Boat::write_boats()
{
ofstream file("records_file.txt");
for (Boat b : berths)
{
file << owner_name << "; " << boat_name << "; " << type << "; " << length << "; " << draft << '\n';
}
file.close();
}
void save_records()
{
for (unsigned int i = 1; i < berths.size(); i++)
{
berths[i].write_boats();
}
}
I call the save_records() function with a menu option that ends the application.
The output i get is:
1) If I register a boat object, close the app and open the text file, I can see the object written twice.
2) If I register 2 objects and open the text file, only the last (second) object has been written to the file, and it shows up 3 times.
Now my questions are:
What causes the double output?
Why is only the last object written to file? I thought the loop would fix that but it didn't
One problem I can spot: "i = 1" in the loop should be "i = 0", because array indexes start from 0. The second: you iterate 'berths' array, so you will get N * N boats saved, if you have N boats in 'berths'.
The simple solution would be
void save_all()
{
ofstream file("records_file.txt");
for (Boat b : berths)
{
file << b.owner_name << "; " << b.boat_name << "; " << b.type << "; " << b.length << "; " << b.draft << '\n';
}
}
If you have to make 'owner_name', 'type' and the rest of the fields as private, then you would have to declare
// Appends this boat's record ("owner; boat; type; length; draft" followed by
// a newline) to the already-open output stream `f`.
void Boat::save(std::ofstream& f) const
{
    // Write to the parameter `f`; no stream named `file` exists in this scope.
    f << owner_name << "; " << boat_name << "; " << type << "; " << length << "; " << draft << '\n';
}
and modify 'save_all' to
// Writes every boat in `berths` to records_file.txt by asking each boat to
// save itself, replacing any previous content of the file.
void save_all()
{
    ofstream file("records_file.txt");
    for (const Boat& b : berths)
    {
        // Pass the stream declared above; it is named `file`, not `f`.
        b.save(file);
    }
}
Every time ofstream file("records_file.txt"); is called, it creates a new file or overwrites the existing one. If you want to append to the file, you have to open it this way:
ofstream file("records_file.txt", ios::app);
See: http://www.cplusplus.com/doc/tutorial/files/
I guess you are using something like while(!bla.eof()), if so then it reaches the end of the buffer but it needs to go past it to raise the flag, so you have the same output twice at the end.
I've been trying to read in a *.csv file to a vector, however it doesn't seem to work as such, it behaves that there are objects but doesn't display them and acts if they're empty.
This is the function to load the *.csv file.
void loadShoes()
{
fstream shoes;
shoes.open("shoes.txt", ios::in);
string shoeId;
string valueId;
while (getline(shoes, shoeId, ','))
{
getline(shoes, valueId, ',');
ShoeMap[shoeId] = valueId;
if (shoeId == "ShoeLaceStyle")
{
thefootwear.addShoe(ShoeMap);
};
}
}
This code is from the main that calls the function to load into a vector and then be displayed in a simple UI.
// Menu option 3: list every stored shoe by ID, ask which one to view, then
// print all of its attributes.
else if (userInput == 3)
{
// Numbered listing (1-based) of the shoe IDs held in thefootwear.vecNewShoe.
for (int i = 0; i < thefootwear.vecNewShoe.size(); i++)
{
cout << i + 1 << " - " << thefootwear.vecNewShoe[i]->getShoeID() << "\n";
}
cout << "\nWhich Shoe would you like to view from the store?\n";
cin >> userInput1;
// The user's 1-based choice is converted to a 0-based index with "- 1".
// NOTE(review): userInput1 is not range-checked before indexing, and a
// failed read from cin is not detected.
cout << "ShoeID - " << thefootwear.vecNewShoe[userInput1 - 1]->getShoeID() << "\n" << "ShoeName - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeName() << "\n" << "ShoeType - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeType() << "\n" << "ShoeSize - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeSize() << "\n" << "ShoeSoleStyle - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeSoleStyle() << "\n" << "ShoeColour - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeColour() << "\n" << "ShoeLaceStyle - "
<< thefootwear.vecNewShoe[userInput1 - 1]->getShoeLaceStyle() << "\n";
}
This is the method and function to add a new shoe with the required variables, in the Footwear class.
// Builds a Shoe from the seven named entries of ShoeMap, stores the pointer
// in vecNewShoe and returns it. Keys that are absent from the map are
// default-inserted by operator[] and therefore read as empty strings.
Shoe* Footwear::addShoe(map<string, string> ShoeMap)
{
    string& id = ShoeMap["ShoeID"];
    string& name = ShoeMap["ShoeName"];
    string& type = ShoeMap["ShoeType"];
    string& size = ShoeMap["ShoeSize"];
    string& soleStyle = ShoeMap["ShoeSoleStyle"];
    string& colour = ShoeMap["ShoeColour"];
    string& laceStyle = ShoeMap["ShoeLaceStyle"];

    Shoe* created = new Shoe(id, name, type, size, soleStyle, colour, laceStyle);
    vecNewShoe.push_back(created);
    return created;
}
When I run the program, it displays the correct number of shoes currently in the *.csv file, however it fails to display their corresponding Shoe ID and I am, therefore, unable to access the variables related to the Shoes.
Speaking to your need rather than your specific question: Consider not writing your own custom CSV parser, but rather using a pre-existing C++ CSV parser library. There are several popular ones on GitHub for example.
This has the added benefit of being more robust against unexpected input contrivances (such as quoted numbers for example).
I need to extract all the cell data from a .vtu (XML unstructured grid) for further manipulations in a c++ program. I am quite new to VTK...
//read all the data from the file
// Create an XML unstructured-grid reader, point it at `filename`, and run it.
vtkSmartPointer<vtkXMLUnstructuredGridReader> reader =
vtkSmartPointer<vtkXMLUnstructuredGridReader>::New();
reader->SetFileName(filename.c_str());
reader->Update();
// Report how many cells the reader's output contains.
// NOTE(review): the count is stored in an unsigned int; confirm the return
// type of GetNumberOfCells() fits for very large grids.
unsigned int cellNumber = reader->GetOutput()->GetNumberOfCells();
cout << "There are " << cellNumber << " input cells." << endl;
This is correct - the cell number is displayed correctly. How do I now access the names of the different CellArrays properties stored in the .vtu file, and then their actual numeric values? Any help is appreciated!
Cheers,
Domanov
//read all the data from the file
// Create an XML unstructured-grid reader, point it at `filename`, and run it.
vtkSmartPointer<vtkXMLUnstructuredGridReader> reader =
vtkSmartPointer<vtkXMLUnstructuredGridReader>::New();
reader->SetFileName(filename.c_str());
reader->Update();
// Report how many cells the reader's output contains.
// NOTE(review): the count is stored in an unsigned int; confirm the return
// type of GetNumberOfCells() fits for very large grids.
unsigned int cellNumber = reader->GetOutput()->GetNumberOfCells();
cout << "There are " << cellNumber << " input cells." << endl;
To access the cell data of an unstructured grid, you can do the following:
// Walk every array attached to the grid's cells and print its name followed
// by one value per tuple.
vtkUnstructuredGrid* ugrid = reader->GetOutput();
vtkCellData* cellData = ugrid->GetCellData();
for (int i = 0; i < cellData->GetNumberOfArrays(); i++)
{
    // Index with the array counter i; j is only declared by the inner loop.
    vtkDataArray* data = cellData->GetArray(i);
    if (data == nullptr)
    {
        // No numeric data array at this index; nothing to print.
        continue;
    }
    // Guard against unnamed arrays: streaming a null char* is undefined.
    const char* arrayName = data->GetName();
    cout << "name " << (arrayName ? arrayName : "(unnamed)") << endl;
    // NOTE(review): GetTuple1 reads a single component per tuple; arrays
    // with several components (e.g. vectors) presumably need a
    // per-component read instead — confirm against the VTK docs.
    for (vtkIdType j = 0; j < data->GetNumberOfTuples(); j++)
    {
        double value = data->GetTuple1(j);
        cout << " value " << j << "th is " << value << endl;
    }
}
I am sure someone somewhere has had this same issue but I have looked far and wide (including here on the stackoverflow) to find out how to properly align my columns in an output file. The following is the complete code I am using (for an event generator called Pythia 8 of which C++ is the primary language):
using namespace Pythia8;
int main()
{
Pythia pythia;
pythia.readString("Top:gg2ttbar = 1");
pythia.init(2212, 2212, 14000.);
ofstream myfile;
myfile.open("ttbar.txt");
for (int iEvent = 0; iEvent < 1; ++iEvent)
{
if (!pythia.next()) continue;
vector<double> part;
for (int i = 0; i < pythia.event.size(); ++i)
{
if (pythia.event[i].status() == 91) part.push_back(i);
}
myfile << "N = " << part.size() << endl;
for (int j = 0; j < (int(part.size()) - 1); ++j)
{
myfile << left << setw(4) << int(part[j]);
myfile << setw(4) << left << pythia.event[part[j]].name() << " "
<< right << pythia.event[part[j]].id() << " "
<< pythia.event[part[j]].px() << " " << pythia.event[part[j]].py()
<< " " << pythia.event[part[j]].pz() << " "
<< pythia.event[part[j]].m() << " " << pythia.event[part[j]].pT() << endl;
}
}
pythia.stat();
myfile.close();
return 0;
}
The issue occurs near the bottom where the loop that writes out the text file starts, as it is currently written in the above code, the first two columns are mashed together:
N = 665
1777pi- -211 1.19978 0.715507 32.7878 0.13957 1.39694
1779pi+ 211 -8.24173 6.07047 -31.6818 0.13957 10.2361
That is the first couple lines of the output (the program shows the line number where a certain particle is produced and relevant information about it like the name, mass...etc.). I cannot seem to format it so I don't have to use the inserted spaces that I put in by hand.
as it is currently written in the above code, the first two columns are mashed together
well, yes, you explicitly wrote the first two columns with no whitespace between them:
myfile << left << setw(4) << int(part[j]);
myfile << setw(4) << left << pythia.event[part[j]].name() << ...
If you want a general way to format this without worrying about adding manual whitespace, split it into two steps:
create a vector<string> containing the columns for each line (you can just use an ostringstream to format each column individually)
write a function to take that vector and write it to an ostream, with spaces between.
// ostream_iterator needs its element type spelled out; the columns are strings.
std::copy(begin, end, std::ostream_iterator<std::string>(myfile, " "));
will be sufficient if you just want a fixed number of spaces between each column
I am doing a little game and I am saving the player details in a txt file.
Example of that txt file:
Eric 13 8 10 10 30 10 10 50 0 0 0 0
William 1 0 10 30 30 10 10 50 0 0 0 0
John 1 0 10 30 30 10 10 50 0 0 0 0
This is what I had in mind: when the player chooses to save the game while playing, the save_game function should check if there is already any saved data. If there is, instead of appending the data to the end of the txt, it should overwrite that specific line.
Here is my current function:
// SAVE GAME
// Stores `player`'s record in the SaveDestiny file, one record per line with
// the player name as the first field. If a record with the same name already
// exists it is replaced at its original position; otherwise the new record
// is added at the end. On success a "GAME SAVED!" box is drawn; if the file
// cannot be opened for writing, an error is shown instead and nothing is
// reported as saved.
void save_game(Player player)
{
    // A line of a text file cannot be replaced in place (the new line may
    // have a different length), so the file is read into memory, split
    // around the player's old record, and then written back as a whole.
    string before;      // lines that precede the player's old record
    string after;       // lines that follow it
    bool found = false; // true once a record with the player's name was seen
    {
        ifstream cinfile (SaveDestiny); // opens file that contains the saved games
        string line;
        while (getline(cinfile, line))
        {
            // The name is the first whitespace-delimited field of the line.
            string imported_name;
            const string::size_type start = line.find_first_not_of(" \t\r");
            if (start != string::npos)
            {
                const string::size_type stop = line.find_first_of(" \t\r", start);
                imported_name = line.substr(start, stop == string::npos ? string::npos : stop - start);
            }
            if (start != string::npos && player.name.compare(imported_name) == 0)
            {
                // Drop every stale record of this player; the fresh record
                // is written where the first one was.
                found = true;
                continue;
            }
            string& target = found ? after : before;
            target += line;
            target += '\n';
        }
    } // the input stream is closed here, before the file is rewritten

    ofstream coutfile (SaveDestiny); // recreates the file
    if (!coutfile.is_open())
    {
        cout << "Unable to open file";
        cin.get();
        return; // do not claim the game was saved
    }
    coutfile << before
             << player.name << " " << player.level << " " << player.exp << " " << player.max_exp << " "
             << player.hp << " " << player.max_hp << " " << player.mp << " " << player.max_mp << " "
             << player.gold << " " << player.weapon << " " << player.shield << " " << player.heal_spell << " "
             << player.attack_spell << '\n'
             << after;
    coutfile.close();

    draw_rectangle(37,8,72,14,15); // white limits
    draw_rectangle(39,9,70,13,9); // blue background
    cor(9,15);
    gotoxy(50,10);
    cout << "GAME SAVED!";
    gotoxy(41,12);
    cor(9,14);
    cout << "Press <Enter> to continue... ";
    cin.get();
}
On most modern filesystems files are not "line-based" (or "record-based") they are character-based so you can't "overwrite a line". The old line might be 20 characters long and the new one would be 24 characters, in which case it would overwrite the old line and the first 4 characters of the next line. To make this work you would have to "push" everything after the line later in the file, which isn't possible with C++ (or C) IO facilities.
One option would be to write all lines with a fixed length, say 50 characters, so that overwriting the 3rd line involves replacing characters 100 to 149, even if the line only actually needs 24 characters.
Another option would be to keep the file in memory in a record-based form and write out the entire file every time you change it (or at least write out the new line and all lines that come after it)
Ok I've managed to get around the problem and now it's working brilliantly! :D
First, the function checks if the player name is already in the txt file. I created an enable variable j. When j=1, the name exists and the data needs to be overwritten! When j=0, the function will append the data to the txt file right away.
Ok, let's say j=1. The function determines the number of lines in txt. It then creates a vector with two vectors inside: the name, and the game variables.
After that, the function deletes the previous content of the txt file and writes the content of the vector to the txt file, except the data that needs to be overwritten (it will skip writing that part to the txt file), because at the end of the function, that new data will be written. :D I hope I made myself clear enough. Sorry if someone doesn't understand what I wrote...
Here is my new save_game function:
// SAVE GAME
// Stores `player`'s record in the SaveDestiny file, one record per line with
// the player name as the first field. Any existing record with the same name
// is removed and the fresh record is written at the end of the file. On
// success a "GAME SAVED!" box is drawn; if the file cannot be opened for
// writing, an error is shown instead and nothing is reported as saved.
void save_game(Player player)
{
    // Collect every saved line that belongs to another player. Reading whole
    // lines (instead of counting lines and then re-reading tokens) keeps the
    // records intact even when the file contains blank lines.
    vector<string> temp_saves;
    {
        ifstream cinfile (SaveDestiny); // opens file that contains the saved games
        string line;
        while (getline(cinfile, line))
        {
            const string::size_type start = line.find_first_not_of(" \t\r");
            if (start == string::npos)
            {
                continue; // blank line: not a record
            }
            // The name is the first whitespace-delimited field of the line.
            const string::size_type stop = line.find_first_of(" \t\r", start);
            const string imported_name = line.substr(start, stop == string::npos ? string::npos : stop - start);
            if (player.name.compare(imported_name) != 0)
            {
                temp_saves.push_back(line);
            }
        }
    } // the input stream is closed here, before the file is rewritten

    ofstream coutfile (SaveDestiny); // recreates the file
    if (!coutfile.is_open())
    {
        cout << "Unable to open file";
        cin.get();
        return; // do not claim the game was saved
    }

    // Write back the other players' records, then this player's new data.
    for (const string& saved : temp_saves)
    {
        coutfile << saved << '\n';
    }
    coutfile << player.name << " " << player.level << " " << player.exp << " " << player.max_exp << " "
             << player.hp << " " << player.max_hp << " " << player.mp << " " << player.max_mp << " "
             << player.gold << " " << player.weapon << " " << player.shield << " " << player.heal_spell << " "
             << player.attack_spell << '\n';
    coutfile.close();

    draw_rectangle(37,8,72,14,15); // white limits
    draw_rectangle(39,9,70,13,9); // blue background
    cor(9,15);
    gotoxy(50,10);
    cout << "GAME SAVED!";
    gotoxy(41,12);
    cor(9,14);
    cout << "Press <Enter> to continue... ";
    cin.get();
}