As I was writing the in the other post, I am trying to implement the K-means algorithm in C++. When debugging, I get no errors, but when trying to run the program, I get the error I mentioned in the title:
Terminate called after throwing an instance of 'std:invalid_argument' what():stof
I know that this error comes when the file can't convert to float. Then, what I wanted to ask is, is there something that I should change in my code, or would it be better to use another file format as input, instead of a *.csv file ? (obviosuly, I'm making the hypothesis that the program can't read something from the file. I don't know if that's the right reasoning though.)
Thank you everyone!
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
using namespace std;
//Inizializzare il punto
struct Point {
double x, y; // Coordinate del punto
int cluster; // Cluster di default
double minDist; // Distanza minima
Point()
: x(0.0)
, y(0.0)
, cluster(-1)
, minDist(__DBL_MAX__)
{
}
Point(double x, double y)
: x(x)
, y(y)
, cluster(-1)
, minDist(__DBL_MAX__)
{
}
double distance(Point p)
{
return (p.x - x) * (p.x - x) + (p.y - y) * (p.y - y);
}
};
vector<Point> readcsv()
{
vector<Point> points;
string line;
ifstream file("Mall_Customers.csv");
while (getline(file, line)) {
stringstream lineStream(line);
string bit;
double x, y;
getline(lineStream, bit, ',');
x = stof(bit);
getline(lineStream, bit, '\n');
y = stof(bit);
points.push_back(Point(x, y));
}
return points;
}
vector<Point> points = readcsv();
void kMeansClustering(vector<Point>* points, int epochs, int k)
{
int n = points->size();
vector<Point> centroids;
srand(time(0));
for (int i = 0; i < k; ++i) {
centroids.push_back(points->at(rand() % n));
}
for (vector<Point>::iterator c = begin(centroids); c != end(centroids); ++c) {
int clusterId = c - begin(centroids);
{
for (vector<Point>::iterator it = points->begin(); it != points->end(); ++it) {
Point p = *it;
double dist = c->distance(p);
if (dist < p.minDist) {
p.minDist = dist;
p.cluster = clusterId;
}
*it = p;
}
}
}
vector<int> nPoints;
vector<double> sumX, sumY;
for (int j = 0; j < k; j++) {
nPoints.push_back(0.0);
sumX.push_back(0.0);
sumY.push_back(0.0);
}
for (vector<Point>::iterator it = points->begin(); it != points->end(); ++it) {
int clusterId = it->cluster;
nPoints[clusterId] += 1;
sumX[clusterId] += it->x;
sumY[clusterId] += it->y;
it->minDist = __DBL_MAX__; // reset distance
}
// Compute the new centroids
for (vector<Point>::iterator c = begin(centroids); c != end(centroids); ++c) {
int clusterId = c - begin(centroids);
c->x = sumX[clusterId] / nPoints[clusterId];
c->y = sumY[clusterId] / nPoints[clusterId];
}
// Write to csv
ofstream myfile;
myfile.open("output.csv");
myfile << "x,y,c" << endl;
for (vector<Point>::iterator it = points->begin(); it != points->end();
++it) {
myfile << it->x << "," << it->y << "," << it->cluster << endl;
}
myfile.close();
}
int main()
{
vector<Point> points = readcsv();
// Run k-means with 100 iterations and for 5 clusters
kMeansClustering(&points, 100, 5);
}
It seems to work now, thank you everyone for the help you gave me, in particular #Yksisarvinen and #molbdnilo : the problem was in the fact that I blindly imported all the csv file in the program, but I was supposed to import only two columns: one was the income, the other one was the spending column.
Although now I have to figure why it shows me the final result, not in the form of "bubbles",as one would expect from the algorithm,but in the form of points being clustered on a straight line.
When trying to open the original file, the original dataset, I get the error shown in the image below.
Related
#include <iostream>
#include <fstream>
#include <cmath>
using namespace std;
double* cal1(double* all1)
{
int t,count=0;
ifstream srcFile("in.txt", ios::in);
if (!srcFile)
{
cout << "error opening source file." << endl;
return 0;
}
char x;
while (srcFile >> x)
{
t = x - 'a' ;
count++;
if (t >= 0 && t <= 25)
all1[t]++;
else
all1[26]++;
}
all1[27] =count ;
srcFile.close();
/* for (t = 0; t <= 26; t++)
{
cout << all1[t] / all1[27]<<endl;
}
cout << all1[27] << endl;*/
return all1;
}
double finalcal1(double* all)
{
int t;
double p,cal1=0;
for (t = 0; t <= 26; t++)
{
p = (all[t] / all[27]);
all[t] = p * log(p);
}
for (t = 0; t <= 26; t++)
{
cal1 -= all[t];
}
return cal1;
}
int main()
{
double *all =new double[28]; //1
double t;
all = cal1(all);
t = finalcal1(all);
cout << t << endl;
delete[] all;
return 0;
}
enter code here
instead of receiving a number from the result, I just got a “-nan.(ind)” which is not even a number. Besides, when I change the number from mark 1 to *all =new double[27] which is what it supposed to be, there would be error or bugs showing up.
double *all =new double[28];
You probably want to initialise all these values to zero to start with since, otherwise, they'll have arbitrary values.
And, if those arbitrary values consist of any NaN items, that will propagate when you add things to them, or divide by some count.
Something like this will do the trick:
double *all = new double[28]();
You may also want to consider the possibility that log(x) is not actually defined for all values of x (such as zero or negative values) - that may be another way in which you could get a NaN.
question: Dean of MAIT is going to visit Hostels of MAIT. As you know that he is a very busy person so he decided to visit only the first "K" nearest Hostels. Hostels are situated on a 2D plane. You are given the coordinates of hostels and you have to answer the Rocket distance of Kth nearest hostel from the origin ( Dean's place )
Input Format
The first line of input contains Q Total no. of queries and K There are two types of queries:
first type: 1 x y For query of 1st type, you came to know about the coordinates ( x, y ) of the newly constructed hostel. second type: 2 For query of 2nd type, you have to output the Rocket distance of Kth nearest hostel till now.
//The Dean will always stay at his place ( origin ). It is guaranteed that there will be at least k queries of type 1 before the first query of type 2.
Rocket distance between two points ( x2 , y2 ) and ( x1 , y1 ) is defined as (x2 - x1)2 + (y2 - y1)2
Constraints
1 < = k < = Q < = 10^5 -10^6 < = x , y < = 10^6
Output Format
For each query of type 2 output the Rocket distance of Kth nearest hostel from Origin.//
This is my code:
#include <iostream>
#include <queue>
#include <vector>
#include <algorithm>
using namespace std;
class roomno
{
public:
int x;
int y;
roomno(int x,int y)
{
this->x=x;
this->y=y;
}
void print()
{
cout<<"location"<<"("<<x<<","<<y<<")"<<endl;
}
int distance ()
{
return (x*x+y*y);
}
};
class roomcompare
{
public:
bool operator() (roomno r1,roomno r2)
{
return r1.distance()>r2.distance();
}
};
int main()
{
int x[1000]},y[1000];
int l,k=0;
priority_queue<roomno,vector<roomno>,roomcompare> pq;
int n,i,j;
cin>>n>>l;
//cin>>n;
cin.ignore();
for( i=0;i<n;i++)
{
cin>>x[i];
}
cin.ignore();
for( j=0;j<n;j++)
{
cin>>y[j];
}
cin.ignore();
for(i=0;i<n;i++ )
{
roomno r1(x[i],y[i]);
pq.push(r1);
}
while(!pq.empty()&&k!=l)
{ k++;
roomno r2=pq.top();
r2.print();
pq.pop();
}
return 0;
}
Original link to code: https://codeshare.io/2j1bkA
What's wrong with my code?
Please consider adding what problem you are facing in your post. I've seen your code but I couldn't help with your code as I even saw a syntax error in your code and didn't know if it was the problem.
If I didn't misunderstand your question, std::set will fit better than priority queue as it is not possible to remove the largest item in a priority queue(min heap). The following code should work:
#include <iostream>
#include <set>
using namespace std;
using ll = long long;
multiset<ll> distances;
int main() {
ll n, k;
cin >> n >> k;
for(ll i = 0; i < n; ++i) {
ll query;
cin >> query;
if (query == 1) {
ll x, y;
cin >> x >> y;
distances.insert(x * x + y * y);
if (distances.size() > k) {
distances.erase(--distances.end());
}
} else {
cout << *distances.rbegin() << '\n';
}
}
return 0;
}
How do I merge certain elements in a vector or similar data structure by only iterating over the complete list once? Is there a more efficient way than what I have got?
I have a vector of vectors of points: std::vector<std::vector<cv::Point>> contours
And I need to compare always two of them and then decide if I want to merge them or continue comparing.
ContourMoments has some helper functions to calculate the distance between points for example. The function merge() only takes all the points from one ContourMoments object and adds them to the calling ContourMoments object.
#include <iostream>
#include <cmath>
#include <ctime>
#include <cstdlib>
#include <opencv/cv.h>
#include <opencv2/imgproc/imgproc.hpp>
class ContourMoments
{
private:
void init()
{
moments = cv::moments(contour);
center = cv::Point2f(moments.m10 / moments.m00, moments.m01 / moments.m00);
float totalX = 0.0, totalY = 0.0;
for (auto const&p : contour)
{
totalX += p.x;
totalY += p.y;
}
gravitationalCenter = cv::Point2f(totalX / contour.size(), totalY / contour.size());
}
public:
cv::Moments moments;
std::vector<cv::Point2f> contour;
cv::Point2f center;
cv::Point2f gravitationalCenter;
ContourMoments(const std::vector<cv::Point2f> &c)
{
contour = c;
init();
}
ContourMoments(const ContourMoments &cm)
{
contour = cm.contour;
init();
}
void merge(const ContourMoments &cm)
{
contour.insert(contour.end(), std::make_move_iterator(cm.contour.begin()), std::make_move_iterator(cm.contour.end()));
init();
}
float horizontalDistanceTo(const ContourMoments &cm)
{
return std::abs(center.x - cm.center.x);
}
float verticalDistanceTo(const ContourMoments &cm)
{
return std::abs(center.y - cm.center.y);
}
float centerDistanceTo(const ContourMoments &cm)
{
return std::sqrt(std::pow(center.x - cm.center.x, 2) + std::pow(center.y - cm.center.y, 2));
}
ContourMoments() = default;
~ContourMoments() = default;
};
float RandomFloat(float a, float b) {
float random = ((float) rand()) / (float) RAND_MAX;
float diff = b - a;
float r = random * diff;
return a + r;
}
std::vector<std::vector<cv::Point2f>> createData()
{
std::vector<std::vector<cv::Point2f>> cs;
for (int i = 0; i < 2000; ++i) {
std::vector<cv::Point2f> c;
int j_stop = rand()%11;
for (int j = 0; j < j_stop; ++j) {
c.push_back(cv::Point2f(RandomFloat(0.0, 20.0), RandomFloat(0.0, 20.0)));
}
cs.push_back(c);
}
return cs;
}
void printVectorOfVectorsOfPoints(const std::vector<std::vector<cv::Point2f>> &cs) {
std::cout << "####################################################" << std::endl;
for (const auto &el : cs) {
bool first = true;
for (const auto &pt : el) {
if (!first) {
std::cout << ", ";
}
first = false;
std::cout << "{x: " + std::to_string(pt.x) + ", y: " + std::to_string(pt.y) + "}";
}
std::cout << std::endl;
}
std::cout << "####################################################" << std::endl;
}
void merge(std::vector<std::vector<cv::Point2f>> &contours, int &counterMerged){
for(auto it = contours.begin() ; it < contours.end() ; /*++it*/)
{
int counter = 0;
if (it->size() < 5)
{
it = contours.erase(it);
continue;
}
for (auto it2 = it + 1; it2 < contours.end(); /*++it2*/)
{
if (it2->size() < 5)
{
it2 = contours.erase(it2);
continue;
}
ContourMoments cm1(*it);
ContourMoments cm2(*it2);
if (cm1.centerDistanceTo(cm2) > 4.0)
{
++counter;
++it2;
continue;
}
counterMerged++;
cm1.merge(std::move(cm2));
it2 = contours.erase(it2);
}
if (counter > 0)
{
std::advance(it, counter);
}
else
{
++it;
}
}
}
int main(int argc, const char * argv[])
{
srand(time(NULL));
std::vector<std::vector<cv::Point2f>> contours = createData();
printVectorOfVectorsOfPoints(contours);
int counterMerged = 0;
merge(contours, counterMerged);
printVectorOfVectorsOfPoints(contours);
std::cout << "Merged " << std::to_string(counterMerged) << " vectors." << std::endl;
return 0;
}
Thanks in advance!
Best wishes
Edit
posted the complete example - install opencv first
This generates 2000 vectors of up to 10 points and merges them if they are close together.
For each of your contours, you can pre-compute the centers. Then, what you want to do is to cluster the contours. Each pair of contours whose center is at most a distance of d apart should belong to the same cluster.
This can be done with a simple radius query. Like this:
for each contour ci in all contours
for each contour cj with cluster center at most d away from ci
merge cluster ci and cj
For the radius query, I would suggest something like a k-d tree. Enter all your contours into the tree and then you can query for the neighbors in O(log n) instead of O(n) like you are doing now.
For the merging part, I would suggest a union-find data structure. This lets you do the merge in practically constant time. At the end, you can just gather all your contour data into a big cluster contour.
I didn't scrutinize your code. If what you want to do is to scan the vector, form groups of contiguous points and emit a single point per group, in sequence, e.g.
a b c|d e|f g h|i j k|l => a b c d f i l
the simplest is to perform the operation in-place.
Keep an index after the last emitted point, let n (initially 0), and whenever you emit an new one, store it at array[n++], overwriting this element, which has already been processed. In the end, resize the array.
One might think of a micro-optimization to compare n to the current index, and avoid overwriting an element with itself. Anyway, as soon as an element has been dropped, the test becomes counter-productive.
I am working on escape-time fractals as my 12th grade project, to be written in c++ , using the simple graphics.h library that is outdated but seems sufficient.
The code for generating the Mandelbrot set seems to work, and I assumed that Julia sets would be a variation of the same. Here is the code:
(Here, fx and fy are simply functions to convert the actual complex co-ordinates like (-0.003,0.05) to an actual value of a pixel on the screen.)
int p;
x0=0, y0=0;
long double r, i;
cout<<"Enter c"<<endl;
cin>>r>>i;
for(int i= fx(-2); i<=fx(2); i++)
{
for(int j= fy(-2); j>=fy(2); j--)
{
long double x=0.0, y= 0.0,t;
x= gx(i), y= gy(j);
int k= -1;
while(( x*x + y*y <4)&& k<it-1)
{
t= x*x - y*y + r;
y= 2*x*y + i ;
x=t;
k++;
}
p= k*pd;
setcolor(COLOR(colour[p][0],colour[p][1],colour[p][2]));
putpixel(i,j,getcolor());
}
}
But this does not seem to be the case. The output window shows the entire circle of radius=2 with the colour corresponding to an escape time of 1 iteration.
Also, on trying to search for a solution to this problem, I've seen that all the algorithms others have used initializes the initial co-ordinates somewhat like this:
x = (col - width/2)*4.0/width;
y = (row - height/2)*4.0/width;
Could somebody explain what I'm missing out?
I guess that the main problem is that the variable i (imaginary part) is mistakenly overridden by the loop variable i. So the line
y= 2*x*y + i;
gives the incorrect result. This variable should be renamed as, say im. The corrected version is attached below, Since I don't have graphics.h, I used the screen as the output.
#include <iostream>
using namespace std;
#define WIDTH 40
#define HEIGHT 60
/* real to screen */
#define fx(x) ((int) ((x + 2)/4.0 * WIDTH))
#define fy(y) ((int) ((2 - y)/4.0 * HEIGHT))
/* screen to real */
#define gx(i) ((i)*4.0/WIDTH - 2)
#define gy(j) ((j)*4.0/HEIGHT - 2)
static void julia(int it, int pd)
{
int p;
long double re = -0.75, im = 0;
long double x0 = 0, y0 = 0;
cout << "Enter c" << endl;
cin >> re >> im;
for (int i = fx(-2.0); i <= fx(2.0); i++)
{
for (int j = fy(-2.0); j >= fy(2.0); j--)
{
long double x = gx(i), y = gy(j), t;
int k = 0;
while (x*x + y*y < 4 && k < it)
{
t = x*x - y*y + re;
y = 2*x*y + im;
x = t;
k++;
}
p = (int) (k * pd);
//setcolor(COLOR(colour[p][0],colour[p][1],colour[p][2]));
//putpixel(i,j,getcolor());
cout << p; // for ASCII output
}
cout << endl; // for ASCII output
}
}
int main(void)
{
julia(9, 1);
return 0;
}
and the output with input -0.75 0 is given below.
0000000000000000000000000000000000000000000000000000000000000
0000000000000000000001111111111111111111000000000000000000000
0000000000000000011111111111111111111111111100000000000000000
0000000000000001111111111111111111111111111111000000000000000
0000000000000111111111111122222222211111111111110000000000000
0000000000011111111111122222349432222211111111111100000000000
0000000001111111111112222233479743322222111111111111000000000
0000000011111111111222222334999994332222221111111111100000000
0000000111111111112222223345999995433222222111111111110000000
0000011111111111122222234479999999744322222211111111111100000
0000011111111111222222346899999999986432222221111111111100000
0000111111111111222223359999999999999533222221111111111110000
0001111111111112222233446999999999996443322222111111111111000
0011111111111112222233446999999999996443322222111111111111100
0011111111111122222333456899999999986543332222211111111111100
0111111111111122223334557999999999997554333222211111111111110
0111111111111122233345799999999999999975433322211111111111110
0111111111111122233457999999999999999997543322211111111111110
0111111111111122334469999999999999999999644332211111111111110
0111111111111122345999999999999999999999995432211111111111110
0111111111111122379999999999999999999999999732211111111111110
0111111111111122345999999999999999999999995432211111111111110
0111111111111122334469999999999999999999644332211111111111110
0111111111111122233457999999999999999997543322211111111111110
0111111111111122233345799999999999999975433322211111111111110
0111111111111122223334557999999999997554333222211111111111110
0011111111111122222333456899999999986543332222211111111111100
0011111111111112222233446999999999996443322222111111111111100
0001111111111112222233446999999999996443322222111111111111000
0000111111111111222223359999999999999533222221111111111110000
0000011111111111222222346899999999986432222221111111111100000
0000011111111111122222234479999999744322222211111111111100000
0000000111111111112222223345999995433222222111111111110000000
0000000011111111111222222334999994332222221111111111100000000
0000000001111111111112222233479743322222111111111111000000000
0000000000011111111111122222349432222211111111111100000000000
0000000000000111111111111122222222211111111111110000000000000
0000000000000001111111111111111111111111111111000000000000000
0000000000000000011111111111111111111111111100000000000000000
0000000000000000000001111111111111111111000000000000000000000
0000000000000000000000000000000000000000000000000000000000000
would you please tell how you display the image by using these graphics.h library
//setcolor(COLOR(colour[p][0],colour[p][1],colour[p][2]));
//putpixel(i,j,getcolor());
This is my code I have, which when I build it works and creates the .exe file, however throughout the process I print the function (i.e. mean, covarience, coefficient) and they all come back as -1#IND. Think it may not be pulling in the data from the CSV files correctly?
// basic file operations
#include <iterator>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
using namespace std;
typedef vector<double> Prices;
Prices parse_csv_line(string& line)
{
Prices result;
string datum;
stringstream ss(line);
int count=0;
while(getline(ss,datum,','))
{
// convert string to
count++;
if (count%2 == 0)
result.push_back(atof(datum.c_str()));
}
return result;
}
Prices parse_csv_file(const char* filename)
{
ifstream file(filename);
Prices prices;
string line;
// This will discard the header line
getline(file, line);
// This will get each line in the file, and collate its values
while (getline(file, line))
{
Prices v = parse_csv_line(line);
prices.insert(prices.end(), v.begin(), v.end());
}
for(Prices::iterator it=prices.begin(); it != prices.end(); it++)
cout << " " << *it;
return prices;
}
//Calculate Correlation of series A and B, then return
/* Calculatethe mean averages for A and B.
(For each series, add each sample and then divide by the number of samples.) */
double CalculateMean(Prices x)
{
double sum = 0;
for(size_t i = 0; i < x.size(); i++)
sum += x[i];
return (sum / x.size());
}
/* Calculate the variance for A and B.
(First calculate the difference from the mean for each sample number. Square each number then divide by the number of samples (n).
If the numbers you are calculating represent a sample of a larger group, then you would divide by n – 1.) */
double CalculateVariance(Prices x)
{
double mean = CalculateMean(x);
double temp = 0;
for(size_t i = 0; i < x.size(); i++)
{
temp += (x[i] - mean) * (x[i] - mean) ;
}
return temp / x.size();
}
/* calculate the standard deviation for A and B, which is the square root of the variance.
(This number will tell you how closely your samples are located to the mean.) */
double Calculate_StandardDeviation(Prices x)
{
return sqrt(CalculateVariance(x));
}
/* Lastly, calculate the Covariance of the 2 series.
(This value can be used to represent the linear relationship between two variables.) */
double Calculate_Covariance(Prices x, Prices y)
{
double meanX = CalculateMean(x);
double meanY = CalculateMean(y);
cout << "mean x = " << meanX << "\n";
cout << "mean y = " << meanY << "\n";
double total = 0;
for(size_t i = 0; i < x.size(); i++)
{
total += (x[i] - meanX) * (y[i] - meanY);
}
return total / x.size();
}
// Using the calculated values, these can then be inputted into the Correlation Coefficient formula to find the correlation of series A and B.
double Calculate_Correlation(Prices x, Prices y)
{
double covariance = Calculate_Covariance(x, y);
cout << "covariance =" << covariance << "\n";
double correlation = covariance / (Calculate_StandardDeviation(x) * Calculate_StandardDeviation(y));
return correlation;
};
int main()
{
Prices a = parse_csv_file("PC1_A.CSV");
Prices b = parse_csv_file("PC1_B.CSV");
double correlation = Calculate_Correlation(a, b);
cout << "Correlation is: " << correlation;
cin.get();
}