I created this code to read and filter my csv files.
It works like I want it to work for small files.
But I just tried out a file of size 200k lines and it takes around 4 minutes, which is too long for my use case.
After testing a bit and fixing some quite stupid things I got the time down a little to 3 minutes.
I found out about half of the Time is spent reading in the file and half of the Time is spend generating the Result Vector.
Is there any way to Improve the speed of my Programm?
Especially the Reading from csv part?
I do not really have an Idea at the moment.
I'd appreciate any help.
EDIT:The filter is filtering the data by either a timeframe or timeframe and filterword in specific columns and outputting the data into a resulting vector of strings.
My CSV files look like this->
Headers are:
ID;Timestamp;ObjectID;UserID;Area;Description;Comment;Checksum
Data is:
523;19.05.2021 12:15;####;admin;global;Parameter changed to xxx; Comment;x3J2j4
std::ifstream input_file(strComplPath, std::ios::in);
int counter = 0;
while (std::getline(input_file, record))
{
istringstream line(record);
while (std::getline(line, record, delimiter))
{
record.erase(remove(record.begin(), record.end(), '\"'), record.end());
items.push_back(record);
//cout << record;
}
csv_contents[counter] = items;
items.clear();
++counter;
}
for (int i = 0; i < csv_contents.size(); i++) {
string regexline = csv_contents[i][1];
string endtime = time_upper_bound;
string starttime = time_lower_bound;
bool checkline = false;
bool isInRange = false, isLater = false, isEarlier = false;
// Check for faulty Data and replace it with an empty string
for (int oo = 0; oo < 8; oo++) {
if (csv_contents[i][oo].rfind("#", 0) == 0) {
csv_contents[i][oo] = "";
}
}
if ((regex_search(starttime, m, timestampformat) && regex_search(endtime, m, timestampformat))) {
filtertimeboth = true;
}
else if (regex_search(starttime, m, timestampformat)) {
filterfromstart = true;
}
else if (regex_search(endtime, m, timestampformat)) {
filtertoend = true;
}
}
I'm not sure exactly what the bottleneck is in your program (I copied your code from an earlier version of the question) but you have a lot of regex:es and mix reading records with post processing. I suggest that you create a class to hold one of these records, called record, overload operator>> for record and then use std::copy_if from the file with a filter that you can design separately from the reading. Do post processing after you've read the records that passes the filter.
I made a small test and it takes 2 seconds to read 200k records on my old spinning disk while doing filtering. I only used time_lower_bound and time_upper_bound to filter and additional checks will of course make it a little slower, but it should not take minutes.
Example:
#include <algorithm>
#include <chrono>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
// the suggested class to hold a record
struct record {
int ID;
std::chrono::system_clock::time_point Timestamp;
std::string ObjectID;
std::string UserID;
std::string Area;
std::string Description;
std::string Comment;
std::string Checksum;
};
// A free function to read a time_point from an `istream`:
std::chrono::system_clock::time_point to_tp(std::istream& is, const char* fmt) {
std::chrono::system_clock::time_point tp{};
// C++20:
// std::chrono::from_stream(is, tp, fmt, nullptr, nullptr);
// C++11 to C++17 version:
std::tm tmtp{};
tmtp.tm_isdst = -1;
if(is >> std::get_time(&tmtp, fmt)) {
tp = std::chrono::system_clock::from_time_t(std::mktime(&tmtp));
}
return tp;
}
// The operator>> overload to read one `record` from an `istream`:
std::istream& operator>>(std::istream& is, record& r) {
is >> r.ID;
r.Timestamp = to_tp(is, ";%d.%m.%Y %H:%M;"); // using the helper function above
std::getline(is, r.ObjectID, ';');
std::getline(is, r.UserID, ';');
std::getline(is, r.Area, ';');
std::getline(is, r.Description, ';');
std::getline(is, r.Comment, ';');
std::getline(is, r.Checksum);
return is;
}
// An operator<< overload to print one `record`:
std::ostream& operator<<(std::ostream& os, const record& r) {
std::ostringstream oss;
oss << r.ID;
{ // I only made a C++11 to C++17 version for this one:
std::time_t time = std::chrono::system_clock::to_time_t(r.Timestamp);
std::tm ts = *std::localtime(&time);
oss << ';' << ts.tm_mday << '.' << ts.tm_mon + 1 << '.'
<< ts.tm_year + 1900 << ' ' << ts.tm_hour << ':' << ts.tm_min << ';';
}
oss << r.ObjectID << ';' << r.UserID << ';' << r.Area << ';'
<< r.Description << ';' << r.Comment << ';' << r.Checksum << '\n';
return os << oss.str();
}
// The reading and filtering part of `main` would then look like this:
int main() { // not "void main()"
std::istringstream time_lower_bound_s("20.05.2019 16:40:00");
std::istringstream time_upper_bound_s("20.05.2021 09:40:00");
// Your time boundaries as `std::chrono::system_clock::time_point`s -
// again using the `to_tp` helper function:
auto time_lower_bound = to_tp(time_lower_bound_s, "%d.%m.%Y %H:%M:%S");
auto time_upper_bound = to_tp(time_upper_bound_s, "%d.%m.%Y %H:%M:%S");
// Verify that the boundaries were parsed ok:
if(time_lower_bound == std::chrono::system_clock::time_point{} ||
time_upper_bound == std::chrono::system_clock::time_point{}) {
std::cerr << "failed to parse boundaries\n";
return 1;
}
std::ifstream is("data"); // whatever your file is called
if(is) {
std::vector<record> recs; // a vector with all the records
// create your filter
auto filter = [&time_lower_bound, &time_upper_bound](const record& r) {
// Only copy those `record`s within the set boundaries.
// You can add additional conditions here too.
return r.Timestamp >= time_lower_bound &&
r.Timestamp <= time_upper_bound;
};
// Copy those records that pass the filter:
std::copy_if(std::istream_iterator<record>(is),
std::istream_iterator<record>{}, std::back_inserter(recs),
filter);
// .. post process `recs` here ...
// print result
for(auto& r : recs) std::cout << r;
}
}
Answer is already given by Ted. I made a solution in the same time. So let me show it additionally.
I created test data with 500k records and all parsing an stuff was done in below 3 seconds on my machine.
Additionally, I also created classes.
Speed will be gained by using std::move, increasing the input buffer size and using reservefor the std::vector.
Please see yet another solution below. I omitted filtering. Ted showed it already.
#include <iostream>
#include <fstream>
#include <iomanip>
#include <string>
#include <ctime>
#include <vector>
#include <chrono>
#include <sstream>
#include <algorithm>
#include <iterator>
constexpr size_t MaxLines = 600'000u;
constexpr size_t NumberOfLines = 500'000u;
const std::string fileName{ "test.csv" };
// Dummy rtoutine for writing a test file
void createFile() {
if (std::ofstream ofs{ fileName }; ofs) {
std::time_t ttt = 0;
for (size_t k = 0; k < NumberOfLines; ++k) {
std::time_t time = static_cast<time_t>(ttt);
ttt += 1000;
ofs << k << ';'
#pragma warning(suppress : 4996)
<< std::put_time(std::localtime(&time), "%d.%m.%Y %H:%M") << ';'
<< k << ';'
<< "UserID" << k << ';'
<< "Area" << k << ';'
<< "Description" << k << ';'
<< "Comment" << k << ';'
<< "Checksum" << k << '\n';
}
}
else std::cerr << "\n*** Error: Could not open '" << fileName << "' for writing\n\n";
}
// We will create a bigger input buffer for our stream
constexpr size_t ifStreamBufferSize = 100'000u;
static char buffer[ifStreamBufferSize];
// Object oriented Model. Class for one record
struct Record {
// Data
long id{};
std::tm time{};
long objectId{};
std::string userId{};
std::string area{};
std::string description{};
std::string comment{};
std::string checkSum{};
// Methods
// Extractor operator
friend std::istream& operator >> (std::istream& is, Record& r) {
// Read one complete line
if (std::string line; std::getline(is, line)) {
// Here we will stor the parts of the line after the split
std::vector<std::string> parts{};
// Convert line to istringstream for further extraction of line parts
std::istringstream iss{ line };
// One part of a line
std::string part{};
bool wrongData = false;
// Split
while (std::getline(iss, part, ';')) {
// Check fpor error
if (part[0] == '#') {
is.setstate(std::ios::failbit);
break;
}
// add part
parts.push_back(std::move(part));
}
// If all was OK
if (is) {
// If we have enough parts
if (parts.size() == 8) {
// Convert parts to target data in record
r.id = std::strtol(parts[0].c_str(), nullptr, 10);
std::istringstream ss{parts[1]};
ss >> std::get_time(& r.time, "%d.%m.%Y %H:%M");
if (ss.fail())
is.setstate(std::ios::failbit);
r.objectId = std::strtol(parts[2].c_str(), nullptr, 10);
r.userId = std::move(parts[3]);
r.area = std::move(parts[4]);
r.description = std::move(parts[5]);
r.comment = std::move(parts[6]);
r.checkSum = std::move(parts[7]);
}
else is.setstate(std::ios::failbit);
}
}
return is;
}
// Simple inserter function
friend std::ostream& operator << (std::ostream& os, const Record& r) {
return os << r.id << " "
#pragma warning(suppress : 4996)
<< std::put_time(&r.time, "%d.%m.%Y %H:%M") << " "
<< r.objectId << " " << r.userId << " " << r.area << " " << r.description << " " << r.comment << " " << r.checkSum;
}
};
// Data will hold all records
struct Data {
// Data part
std::vector<Record> records{};
// Constructor will reserve space to avaoid reallocation
Data() { records.reserve(MaxLines); }
// Simple extractor. Will call Record's exractor
friend std::istream& operator >> (std::istream& is, Data& d) {
// Set bigger file buffer. This is a time saver
is.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);
std::copy(std::istream_iterator<Record>(is), {}, std::back_inserter(d.records));
return is;
}
// Simple inserter
friend std::ostream& operator >> (std::ostream& os, const Data& d) {
std::copy(d.records.begin(), d.records.end(), std::ostream_iterator<Record>(os, "\n"));
return os;
}
};
int main() {
// createFile();
auto start = std::chrono::system_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start);
if (std::ifstream ifs{ fileName }; ifs) {
Data data;
// Start time measurement
start = std::chrono::system_clock::now();
// Read and parse complete data
ifs >> data;
// End of time measurement
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start);
std::cout << "\nReading and splitting. Duration: " << elapsed.count() << " ms\n";
// Some debug output
std::cout << "\n\nNumber of read records: " << data.records.size() << "\n\n";
for (size_t k{}; k < 10; ++k)
std::cout << data.records[k] << '\n';
}
else std::cerr << "\n*** Error: Could not open '" << fileName << "' for reading\n\n";
}
And yes, I used "ctime".
Related
I am trying to parse through a gcode and I want to extract only the x & y coordinates of G1 from each line
GCODE EXAMPLE
G1 X123.456 Y125.425 Z34.321
I tried the basic getline() function but it prints the whole line, don't understand how to add filters to the getline() to just extract only x & y numerical values and only for lines with G1 on start.
#include <iostream>
#include <fstream>
#include <vector>
#include <sstream>
#include <fstream>
using std::cout; using std::cerr;
using std::endl; using std::string;
using std::ifstream; using std::vector;
int main()
{
string filename("test1.gcode");
vector<string> lines;
string line;
ifstream input_file(filename);
if (!input_file.is_open()) {
cerr << "Could not open the file - '"
<< filename << "'" << endl;
return EXIT_FAILURE;
}
while (getline(input_file, line)){
lines.push_back(line);
}
for (const auto &i : lines)
cout << i << endl;
input_file.close();
return EXIT_SUCCESS;
}
You cannot add filters to getline(). It will always return the complete next line in the input.
What you can do is parse this line yourself, and extract the values that you need.
This is can be done in multiple ways. One of them is demonstrated below.
I used std::string::find to get an offset for the character 'X'/'Y' marking the x/y coordinates.
Then I used std::atof to convert the relevant part of the line to a double value.
I also used std::string::find to check whether the line starts with the required prefix for this command.
#include <string>
#include <iostream>
#include <cstdlib>
int main()
{
std::string line = "G1 X123.456 Y125.425 Z34.321";
if (line.find("G1") == 0)
{
size_t xpos = line.find('X');
if (xpos == std::string::npos) { /* handle error */ }
double x = std::atof(line.data() + xpos + 1); // skip over the 'X'
size_t ypos = line.find('Y');
if (ypos == std::string::npos) { /* handle error */ }
double y = std::atof(line.data() + ypos + 1); // skip over the 'Y'
std::cout << "X: " << x << ", Y: " << y << std::endl;
}
}
Output:
X: 123.456, Y: 125.425
The usual way to handle structured data is... with a struct.
struct line_item {
int g;
double x;
double y;
double z;
};
std::istream& operator>>(std::istream& in, line_item& item) {
line_item temp;
item = {};
in >> 'G' >> temp.g >> 'X' >> temp.x >> 'Y' >> temp.y >> 'Z' >> temp.z;
if (in)
item = temp;
return in;
}
std::ostream& operator<<(std::ostream& out, const line_item& item) {
return out << 'G' << item.g << " X" << item.x << " Y" << item.y << " Z" << item.z;
}
Normally one can't istream >> 'G', so I have a helper for that.
//helper function for "reading in" character literals
template<class e, class t>
std::basic_istream<e,t>& operator>>(std::basic_istream<e,t>& in, const e& char_literal) {
e buffer; //get buffer
in >> buffer; //read data
if (buffer != char_literal) //if it failed
in.setstate(in.rdstate() | std::ios::failbit); //set the state
return in;
}
http://coliru.stacked-crooked.com/a/6af1f3c881cc9e6e
Then you read in the items using the normal istream>>line_item, and from there, you have structured data that you can do whatever you want with, such as creating a secondary struct that only stores the items you care about.
I'm tackling a exercise which is designed to cause exactly this problem, of overloading the memory. Pretty much I'm loading various file sizes from 1,000 to 5 million lines of entries like this in a txt file (1 line = 1 entry):
SHFIv,aiSdG
PlgNB,bPHoP
ZHWJU,gfwgC
UAygL,Vqvhi
BlyzX,LLbCo
jbvrT,Utblj
...
pretty much every entry has 2 values separated by comma, in my code, I separate these values and try to find another matching value, there are always only 2 exactly matching values and each time 1 value is found the other one with which it is paired points to another pair, and so on until the final one gets found.
For example SHFIv,aiSdG would point to aiSdG,YDUVo.
I know my code is not very efficient, partly due to using recursion, but I could'nt figure out a better way to do the job, so any suggestions on how to possibly improve it to handle larger inputs would be greatly appriciated
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
using namespace std;
template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
int index = 0;
for (const auto& pair : vec) {
//cout << "iteration " << index << endl;
if (pair.second == resume) {
vec2.insert(vec2.begin(), resume);
cout << "found " << resume << " and " << pair.second << endl;
search_bricks_backwards(pair.first, vec, vec2);
}
if (index + 1 == vec.size()) {
cout << "end of backward search, exitting..." << endl;
}
index++;
}
}
template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
int index = 0;
for (const auto& pair : vec) {
//cout << "iteration " << index << endl;
if (pair.first == start) {
vec2.push_back(start);
cout << "found " << start << " and " << pair.first << endl;
search_bricks(pair.second, vec, vec2);
}
if (index + 1 == vec.size()) {
//search_bricks_backwards(start, vec, vec2);
// this also gets called on every recursion rather than just once
// as I originally intended when the forward iteration gets finished
}
index++;
}
}
template<typename T> // printing function
void printVectorElements(vector<T>& vec)
{
for (auto i = 0; i < vec.size(); ++i) {
cout << "(" << vec.at(i).first << ","
<< vec.at(i).second << ")" << endl ;
}
cout << endl;
}
vector<string> split(string s, string delimiter) { // filtering function
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
string token;
vector<string> res;
while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
}
int main()
{
vector<pair<string, string>> bricks;
vector<string> sorted_bricks;
ifstream inFile;
inFile.open("input-pairs-5K.txt"); // transferring data from .txt to a string
stringstream strStream;
strStream << inFile.rdbuf();
string str = strStream.str();
istringstream iss(str);
for (string line; getline(iss, line); )
// filtering data from string and dividing on ","
{
string delimiter = ",";
string s = line;
vector<string> v = split(s, delimiter);
string s1 = v.at(0);
string s2 = v.at(1);
bricks.push_back(make_pair(s1, s2));
}
search_bricks(bricks[0].second, bricks, sorted_bricks);
//printVectorElements(bricks);
//for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
//cout << *i << " "; // this is just to check if vectors have data
}
Here is link to the 1k test data that works for me (only for the search bricks without backwards searching since it triggers on every recursion) again thanks for any suggestions on how to improve or get rid of the recursion. I don't code in c++ often and don't really know how else to tackle this.
Although implementing non-recursive version of your algorithm is canonical solution, if you really need to solve the problem without code modification, you can increase the stack size by modifying compiler option. ~100Mb will be usually sufficient.
In MSVC : /STACK:commit 104857600
In gcc : --stack, 104857600
Below code is the normal way to get the input from a text and store it in an array in a structure.
Wanted to ask how can i use pointer to store all these data into the array of structure ? Like p1->Years (this is without array, but how can i apply this to way of writing in below code)
Any better suggestion to use pointer to take in the input?
int years = 4;
struct maju_company {
int Year;
float quarter1, quarter2, quarter3, quarter4, total_sales, average_sales;
};
int main() {
string line;
maju_company p1[years];
fstream yeecinnfile("MajuSales.txt");
if(yeecinnfile.is_open()) {
//ignoring the first four line of code and store the rest of the code
string line1,line2,line3,line4;
getline(yeecinnfile,line1);
getline(yeecinnfile,line2);
getline(yeecinnfile,line3);
getline(yeecinnfile,line4);
while(!yeecinnfile.eof()) {
for(int i =0; i<years; i++) {
yeecinnfile>>p1[i].Year>>p1[i].quarter1>>p1[i].quarter2>>p1[i].quarter3>>p1[i].quarter4;
}
}
for(int i =0; i<years; i++) {
cout<<p1[i].Year<<setw(10)<<p1[i].quarter1<<setw(10)<<p1[i].quarter2<<setw(10)<<p1[i].quarter3<<setw(10)<<p1[i].quarter4<<endl;
}
cout<<endl;
}
}
I see nothing wrong with the way you do this.
However, you could create a pointer to each record inside the loop
maju_company* p = &p1[i];
and then use p-> instead of p1[i]., but I really don't see this as an improvement.
If the reading loop looks too complicated, I would rather move the code to a separate function, perhaps
void read_record(maju_company& company);
or maybe
maju_company read_record();
and then only have to handle a single company inside the function (so no indexing and no ponters there).
I think you wouldn't need pointers at all for your example.
Use a std::vector to hold all your data and then there are other
things from C++ I think you should learn to use, example here :
(if you have questions let me know)
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
// dont use : using namespace std;
struct maju_company_data
{
int year;
float quarter1, quarter2, quarter3, quarter4, total_sales, average_sales;
};
// describe how to stream data to an output stream (like std::cout)
std::ostream& operator<<(std::ostream& os, const maju_company_data& data)
{
os << "-----------------------------------------------------\n";
os << "Company data for year : " << data.year << "\n";
os << "Quarter 1 : " << data.quarter1 << "\n";
os << "Quarter 2 : " << data.quarter1 << "\n";
os << "Quarter 3 : " << data.quarter1 << "\n";
os << "Quarter 4 : " << data.quarter1 << "\n";
os << "\n";
return os;
}
int main()
{
// no need to manage pointers yourself use a vector
std::vector<maju_company_data> company_yearly_data; // give variables a meaningful name
std::ifstream ifile("MajuSales.txt"); // ifstream your using file as input
std::string line1, line2, line3, line4;
// ignore first line
ifile >> line1;
while (ifile >> line1 >> line2 >> line3 >> line4) // probably you need to read a few more lines here
{
maju_company_data data;
// convert read strings to numbers
data.year = std::stoi(line1);
data.quarter1 = std::stof(line2);
data.quarter2 = std::stof(line3);
data.quarter3 = std::stof(line4);
//..
//data.quarter4 = std::stof(line5);
//data.total_sales = std::stof(line6);
company_yearly_data.push_back(data);
};
// this is a range based for loop
// it is prefered since you cant go out of bounds
// const auto& means that data will be an unmodifiable
// reference to each of the structs stored in the vector
for (const auto& data : company_yearly_data)
{
std::cout << data; // since we overloaded << this loop will be nice and clean
}
return 0;
}
A C++ approach to this to overload the istream operator>> and ostream operator<< for your specific type. E.g.
#include <algorithm>
#include <array>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <string>
static constexpr auto years{4};
struct maju_company {
int Year{};
float quarter1{}, quarter2{}, quarter3{}, quarter4{};
float total_sales{}, average_sales{}; // ALWAYS init your floats.
};
auto& operator>>(std::istream& is, maju_company& mc) {
is >> mc.Year
>> mc.quarter1 >> mc.quarter2 >> mc.quarter3 >> mc.quarter4
>> mc.total_sales >> mc.average_sales;
return is;
}
auto& operator<<(std::ostream& os, maju_company const& mc) {
os << mc.Year
<< std::setw(10) << mc.quarter1
<< std::setw(10) << mc.quarter2
<< std::setw(10) << mc.quarter3
<< std::setw(10) << mc.quarter4;
return os;
}
You can then go on to use the type using the std library, e.g.
int main() {
auto p1{std::array<maju_company, years>{}};
{
auto fs{std::fstream("MajuSales.txt")};
if (!fs.is_open()) return -1;
{
// throw away 4 lines
auto dummy{std::string{}};
for (auto i{0}; i < 4; ++i) getline(fs, dummy);
}
std::copy_n(std::istream_iterator<maju_company>{fs},
years,
begin(p1));
}
std::copy(cbegin(p1), cend(p1),
std::ostream_iterator<maju_company>{std::cout, "\n"});
}
In this code I have an overloaded constructor Record::Record(string s)that reads in a string, I am trying to create a string stream from 's' and use getline(stringStream, line, ",") to read each element from the string with "," as the delimiter, then assign each element to the appropriate object variable. The end goal of this assignment is to open a file, read in its data, assign the data appropriately in a vector, then write and parse the data to a new file.
My understanding of working with private class members is limited. I am unsure how to go about writing the constructor. In the past I've used a pointer for private members (e.g 'this-> foo;), at this point I just need to understand how to implement the contents of Record, so far what I've tried has been incorrect and I can only find references to using pointers to int's.
Normally I would go to my comp-sci lab and ask a TA but it is currently close due to COVID.
Here is the code for my constuctors and overloaded operators.
Record.cpp
#include <string>
#include <sstream>
#include "Record.h"
using namespace std;
Record::Record(string s) {
/** here is where I need to assign data to the following.
std::string department;
std::string item_code;
int quantity;
double cost; **/
}
Record::~Record() {
// TODO Auto-generated destructor stub
}
//overloaded "==" and "<" comparison operators
bool operator ==(const Record &lhs, const Record &rhs){
return (lhs.cost == rhs.cost && lhs.department == rhs.department &&
lhs.item_code == rhs.item_code && lhs.quantity == rhs.quantity);
}
/**bool operator <(const Record &a, const Record &b){ //do not need at this time
}
**/
//Overloaded "<<" operator
std::ostream& operator <<(std::ostream& os, const Record& r){
os << r.department << ',' << r.item_code << ',' << r.quantity << ',' << r.cost;
return os;
}
Record.h
#ifndef RECORD_H_
#define RECORD_H_
#include <iostream>
#include <string>
class Record {
public:
//Constructor
Record(std::string s); //pass this string to our Record class
//De-constructor
virtual ~Record();
//overloaded "==" and "<" comparison operators
friend bool operator ==(const Record &a, const Record &b);
//friend bool operator <(const Record &a, const Record &b); //Do not need at this time.
//Overloaded "<<" operator
friend std::ostream& operator <<(std::ostream&, const Record&);
private:
std::string department;
std::string item_code;
int quantity;
double cost;
};
#endif /* RECORD_H_ */
Main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <libgen.h>
#include <fstream>
#include <sstream>
#include <vector>
#include <algorithm>
#include "Record.h"
using namespace std;
int main() {
vector<Record> records; //vector of type Records to hold each "Line" of input file
string filename; // File name and path stored as string
/**
* Prompts user for the name of input file and stores in string variable filename
*
*/
cout << "please enter the name of your file with file path:" << endl;
cin >> filename;
ifstream ifs { filename.c_str() };
if (!ifs) {
cerr << " Can't open file " << filename << endl;
return 1;
}
string path = filename.substr(0, filename.find_last_of("\\/"));
string file = filename.substr(filename.find_last_of("\\/") + 1,
filename.length());
if (path.compare(file) == 0) {
path == "";
}
//test for file and file path
cout << "Path portion of " << filename << " is " << path << endl;
cout << "File portion of " << filename << " is " << file << endl; // path + "new_" + file + ".cvs", make new file with new path
/**
* Put each line of input file in to the records vector
*/
string line; //strings for each parameter of the vector object
while (getline(ifs, line)) {
Record newRecord(line); //this should check for duplicates and ignore any found.
int i = 0;
int n = 0;
if((newRecord == records[i]) || (i < n) ){
i++;
}
else{
records.push_back(newRecord);
n++;
i = 0;
}
}
ifs.close(); //closes the stream
//create new file and output data to it
string newFile = ("new_" + file + ".cvs");
//check to see if file path and file name are correct
cout << (path + newFile);
//Open stream to new file and write to it
ofstream ofs(path + newFile);
ofs.open(newFile);
for(size_t i = 0; i < records.size(); i++){
ofs << (i+1) << ' ' << records[i];
}
ofs.close(); //close output stream
return 0;
}
You can do something like:
Record::Record(std::string s){
std::string word;
std::vector<std::string> temp;
std::stringstream ss(s);
while(getline(ss, word, ','))
temp.push_back(word);
department = temp.at(0);
item_code = temp.at(1);
quantity = stoi(temp.at(2));
cost = stod(temp.at(3));
}
I'm assuming you mean that each parmeter is separated by ',', not each line, if that's not the case, say something.
Edit
So there are a couple of issues in your main function, namely, the while getline cycle will probably have out_of_range access, you can use a range-based for loop, which avoids container overflow:
while (getline(ifs, line))
{
bool flag = 1;
Record newRecord(line);
for(Record r : records){
if(r == newRecord){
flag = 0;
break;
}
}
if(flag)
records.push_back(newRecord);
}
The ofstream file is not being properly opened and tested, you can do something like:
ofstream ofs;
ofs.open(path + newFile);
if (!ofs)
{
cerr << "Can't open file " << filename << endl;
return 1;
}
for (size_t i = 0; i < records.size(); i++)
{
ofs << (i + 1) << ' ' << records[i] << endl;
}
This line:
path == "";
I'm sure you meant:
path = "";
Running sample
One last note, using namespace std is not a very good practice.
The content of my test csv file looks as follows:
*test.csv*
name;age;weight;height;test
Bla;32;1.2;4.3;True
Foo;43;2.2;5.3;False
Bar;None;3.8;2.4;True
Ufo;32;1.5;5.4;True
I load the test.csv file with the following C++ program that prints the file's content on the screen:
#include <iostream>
#include <vector>
#include <string>
#include <iomanip>
#include <fstream>
#include <sstream>
void readCSV(std::vector<std::vector<std::string> > &data, std::string filename);
void printCSV(const std::vector<std::vector<std::string>> &data);
int main(int argc, char** argv) {
std::string file_path = "./test.csv";
std::vector<std::vector<std::string> > data;
readCSV(data, file_path);
printCSV(data);
return 0;
}
void readCSV(std::vector<std::vector<std::string> > &data, std::string filename) {
char delimiter = ';';
std::string line;
std::string item;
std::ifstream file(filename);
while (std::getline(file, line)) {
std::vector<std::string> row;
std::stringstream string_stream(line);
while (std::getline(string_stream, item, delimiter)) {
row.push_back(item);
}
data.push_back(row);
}
file.close();
}
void printCSV(const std::vector<std::vector<std::string> > &data) {
for (std::vector<std::string> row: data) {
for (std::string item: row) {
std::cout << item << ' ';
}
std::cout << std::endl;
}
}
I have the following questions:
How can I load only rows where, for example, age == 32?
How can I load, for example, only the name, weight columns?
How can I exclude rows that contain None?
How can I skip the first row of the document?
Does it make more sense to extract the desired information after I loaded the entire csv file (if memory is not a problem)? If possible, I want to use only the STL.
Any assistance you can provide would be greatly appreciated
you can try some csv libraries, but if you want to do this with custom code then
Inside printCSV you ask the cin to enter column names
Maintain it in a variable
In this code for (std::vector<std::string> row: data)
Check the item again each of those input when first the loop runs
then inside the second loop keep an index, accordingly you skip the column number
Example code to print only two columns
void printCSV(const std::vector<std::vector<std::string> > &data) {
int col = 0;
std::vector<std::string> column_filter;
std::vector<int> column_index;
column_filter.push_back("name");
column_filter.push_back("weight");
int row1 =0;
for (std::vector<std::string> row: data) {
col = 0;
if(row1==0) {
int col1 = 0;
for (std::string item: row) {
for (std::string colname: column_filter){
if(item.compare(colname)==0) {
column_index.push_back(col1);
}
}
col1++;
}
}
for (std::string item: row) {
int found =0;
for (int index: column_index) {
if(index==col) found = 1;
}
if(found==1)
std::cout << item << ' ';
col++;
}
std::cout << std::endl;
row1++;
}
}
Output
name weight
Bla 1.2
Foo 2.2
Bar 3.8
Ufo 1.5
Before you close. Here all answers in one file. But I will then explain in your single questions then later.
#include <iostream>
#include <vector>
#include <string>
#include <iomanip>
#include <fstream>
#include <regex>
#include <algorithm>
#include <iterator>
// Save typing Work
using Rec = std::vector<std::string>;
std::regex delimiter{ ";" };
// Proxy class for easier input and output
struct Record {
// Our data for one record
Rec data{};
// Overwrite extractor operator
friend std::istream& operator >> (std::istream& is, Record& r) {
// Read on complete line from the input stream, and check, if the read was successfull
if (std::string line{}; std::getline(is, line)) {
// If there is something in our data vector already, delete it
r.data.clear();
// Now, in one statement, split the string into tokens and copy the result into our data vector
std::copy(std::sregex_token_iterator(line.begin(), line.end(), delimiter, -1), {}, std::back_inserter(r.data));
}
return is;
}
// Overwrite inserter for easier output
friend std::ostream& operator << (std::ostream& os, const Record& r) {
std::copy(r.data.begin(), r.data.end(), std::ostream_iterator<std::string>(os,"\t"));
return os;
}
};
// Proxy for the complete CSV file
struct Roster {
// The header
Rec header{};
// All records of the CSV file
std::vector<Record> records{};
friend std::istream& operator >> (std::istream& is, Roster& r) {
// Read on complete line from the input stream, and check, if the read was successfull
if (std::string line{}; std::getline(is, line)) {
// So, we just have read the header
// Now, in one statement, split the string into tokens and copy the result into the header
r.header.clear();
std::copy(std::sregex_token_iterator(line.begin(), line.end(), delimiter, -1), {}, std::back_inserter(r.header));
// Now, in one statement, read all lines, split the string into tokens and copy the result into our record vector
r.records.clear();
std::copy(std::istream_iterator<Record>(is), {}, std::back_inserter(r.records));
}
return is;
}
// Overwrite inserter for easier output
friend std::ostream& operator << (std::ostream& os, const Roster& r) {
std::copy(r.records.begin(), r.records.end(), std::ostream_iterator<Record>(os, "\n"));
return os;
}
};
int main() {
// Open CSV file and check, if it could be opened
if (std::ifstream csvFileStream("r:\\test.csv"); csvFileStream) {
Roster roster{};
// Read the complete CSV file
csvFileStream >> roster;
// Show all read data on std::cout
std::cout << roster;
// All records with age ==32
std::cout << "\n\nAge 32\n";
std::vector<Record> age32{};
std::copy_if(roster.records.begin(), roster.records.end(), std::back_inserter(age32), [](const Record& r) { return r.data[1] == "32"; });
for (const Record& r : age32) std::cout << r << "\n";
// Or
std::cout << "\n\nAge 32 Option 2\n";
csvFileStream.clear(); csvFileStream.seekg(std::ios::beg); age32.clear();
std::copy_if(std::istream_iterator<Record>(csvFileStream), {}, std::back_inserter(age32), [](const Record& r) { return r.data[1] == "32"; });
for (const Record& r : age32) std::cout << r << "\n";
// Get Name and weight columns
std::cout << "\n\nweight and columns\n";
std::vector<std::vector<std::string>> nameAndWeight{};
std::transform(roster.records.begin(), roster.records.end(), std::back_inserter(nameAndWeight),
[](const Record& r) { std::vector<std::string>rec{ r.data[0], r.data[2] } ; return rec; });
for (const std::vector<std::string>& r : nameAndWeight) std::cout << r[0] << "\t" << r[1] << "\n";
// Everything but none
std::cout << "\n\nEverything but none\n";
std::vector<Record> notNone{};
std::copy_if(roster.records.begin(), roster.records.end(), std::back_inserter(notNone), [](const Record& r) { return r.data[1] != "None"; });
for (const Record& r : notNone) std::cout << r << "\n";
}
else {
std::cerr << "\n*** Error: Could not open source file\n";
}
return 0;
}