I'm making a program in C++ which counts NGS read alignments against a reference annotation. Basically the program reads both the annotation and alignment file into memory, iterates through the annotation, binary searches the alignment file for a probable location, upon finding this location linear searches a frame that is around that probable location.
Typically I want to keep this frame somewhat large (10000 alignments), so I had the idea to split the frame up and throw parts of it into separate threads.
Everything compiles and runs, but it doesn't look like my multithreading is working as intended because my comp is using one core for the job. Would anyone be kind enough to help me figure this out where I implemented the threading wrong.
https://sourceforge.net/projects/fast-count/?source=directory
#include <iostream>
#include <cstdlib>
#include <vector>
#include <string>
#include <thread>
#include <sstream>
#include <fstream>
#include <math.h>
#include "api/BamReader.h"
using namespace std;
using namespace BamTools;
int hit_count = 0;
struct bam_headers{
string chr;
int start;
};
struct thread_data{
int thread_id;
int total_thread;
int start_gtf;
int stop_gtf;
};
struct gtf_headers{
string chr;
string source;
string feature;
string score;
string strand;
string frame;
string annotation;
int start;
int end;
};
void process(int* start_holder, int size, int gtf_start, int gtf_stop){
//threaded counter process
for (int t = 0; t < size; t++){
if((start_holder[t] >= gtf_start) && (start_holder[t] <= gtf_stop)){
hit_count++;
}
}
}
vector <string> find_index(vector <vector <bam_headers> > bams){
//define vector for bam_index to chromosome
vector <string> compute_holder;
for (int bam_idx = 0; bam_idx < bams.size();bam_idx++){
compute_holder.push_back(bams[bam_idx][0].chr);
}
return compute_holder;
}
vector <gtf_headers> load_gtf(char* filename){
//define matrix to memory holding gtf annotations by assoc. header
vector<gtf_headers> push_matrix;
gtf_headers holder;
ifstream gtf_file(filename);
string line;
cout << "Loading GTF to memory" << "\n";
if (gtf_file.is_open()){
int sub_count = 0;
string transfer_hold[8];
while(getline(gtf_file,line)){
//iterate through file
istringstream iss(line);
string token;
//iterate through line, and tokenize by tab delimitor
while(getline(iss,token,'\t')){
if (sub_count == 8){
//assign to hold struct, and push to vector
holder.chr = transfer_hold[0];
holder.source = transfer_hold[1];
holder.feature = transfer_hold[2];
holder.start = atoi(transfer_hold[3].c_str());
holder.end = atoi(transfer_hold[4].c_str());
holder.score = transfer_hold[5];
holder.strand = transfer_hold[6];
holder.frame = transfer_hold[7];
holder.annotation = token;
push_matrix.push_back(holder);
sub_count = 0;
} else {
//temporarily hold tokens
transfer_hold[sub_count] = token;
++sub_count;
}
}
}
cout << "GTF successfully loaded to memory" << "\n";
gtf_file.close();
return(push_matrix);
}else{
cout << "GTF unsuccessfully loaded to memory. Check path to file, and annotation format. Exiting" << "\n";
exit(-1);
}
}
vector <vector <bam_headers>> load_bam(char* filename){
//parse individual bam file to chromosome bins
vector <vector <bam_headers> > push_matrix;
vector <bam_headers> iter_chr;
int iter_refid = -1;
bam_headers bam_holder;
BamReader reader;
BamAlignment al;
const vector<RefData>& references = reader.GetReferenceData();
cout << "Loading " << filename << " to memory" << "\n";
if (reader.Open(filename)) {
while (reader.GetNextAlignmentCore(al)) {
if (al.IsMapped()){
//bam file must be sorted by chr. otherwise the lookup will segfault
if(al.RefID != iter_refid){
//check if chr. position has advanced in the bam file, if true, push empty vector
iter_refid++;
push_matrix.push_back(iter_chr);
}else{
//if chr. position hasn't advanced push to current index in 2d vector
bam_holder.chr = references[al.RefID].RefName;
bam_holder.start = al.Position;
push_matrix.at(iter_refid).push_back(bam_holder);
}
}
}
reader.Close();
cout << "Successfully loaded " << filename << " to memory" << "\n";
return(push_matrix);
}else{
cout << "Could not open input BAM file. Exiting." << endl;
exit(-1);
}
}
short int find_bin(const string & gtf_chr, const vector <string> mapping){
//determines which chr. bin the gtf line is associated with
int bin_compare = -1;
for (int i = 0; i < mapping.size(); i++){
if(gtf_chr == mapping[i]){
bin_compare = i;
}
}
return(bin_compare);
}
int find_frame(gtf_headers gtf_matrix, vector <bam_headers> bam_file_bin){
//binary search to find alignment index with greater and less than gtf position
int bin_size = bam_file_bin.size();
int high_end = bin_size;
int low_end = 0;
int binary_i = bin_size / 2;
int repeat = 0;
int frame_start;
bool found = false;
while (found != true){
if ((bam_file_bin[binary_i].start >= gtf_matrix.start) && (bam_file_bin[binary_i].start <= gtf_matrix.end)){
frame_start = binary_i;
found = true;
}else{
if(repeat != binary_i){
if(bam_file_bin[binary_i].start > gtf_matrix.end){
if(repeat != binary_i){
repeat = binary_i;
high_end = binary_i;
binary_i = ((high_end - low_end) / 2) + low_end;
}
}else{
if(repeat != binary_i){
repeat = binary_i;
low_end = binary_i;
binary_i = ((high_end - low_end) / 2) + low_end;
}
}
}else{
frame_start = low_end;
found = true;
}
}
}
return(frame_start);
}
vector <int > define_frame(int frame_size, int frame_start, int bam_matrix){
//define the frame for the search
vector <int> push_ints;
push_ints.push_back(frame_start - (frame_size / 2));
push_ints.push_back(frame_start + (frame_size / 2));
if(push_ints[0] < 0){
push_ints[0] = 0;
push_ints[1] = frame_size;
if(push_ints[1] > bam_matrix){
push_ints[1] = frame_size;
}
}
if(push_ints[1] > bam_matrix){
push_ints[1] = bam_matrix;
push_ints[0] = bam_matrix - (frame_size / 2);
if(push_ints[0] < 0){
push_ints[0] = 0;
}
}
return(push_ints);
}
void thread_handler(int nthread, vector <int> frame, vector <bam_headers> bam_matrix, gtf_headers gtf_record){
int thread_divide = frame[1]-frame[0];//frame_size / nthread;
int thread_remain = (frame[1]-frame[0]) % nthread;
int* start_holder = new int[thread_divide];
for(int i = 0; i < nthread; i++){
if (i < nthread - 1){
for (int frame_index = 0; frame_index < thread_divide; frame_index++){
start_holder[frame_index] = bam_matrix[frame[0]+frame_index].start;
}
frame[0] = frame[0] + thread_divide;
thread first(process, start_holder,thread_divide,gtf_record.start,gtf_record.end);
first.join();
}else{
for (int frame_index = 0; frame_index < thread_divide + thread_remain; frame_index++){
start_holder[frame_index] = bam_matrix[frame[0]+frame_index].start;
}
thread last(process, start_holder,thread_divide + thread_remain,gtf_record.start,gtf_record.end);
last.join();
}
}
}
int main (int argc, char *argv[])
{
// usage
// ./count threads frame_size gtf_file files
//define matrix to memory holding gtf annotations by assoc. header
vector <gtf_headers> gtf_matrix = load_gtf(argv[3]);
//load bam, perform counts
for(int i = 4;i < argc;i++){
//iterate through filenames in argv, define matrix to memory holding bam alignments chr and bp position
vector <vector <bam_headers> > bam_matrix = load_bam(argv[i]);
//map chromosome to bam matrix index
vector <string> index_mapping = find_index(bam_matrix);
//iterate through gtf matrix, find corresponding bins for chr, set search frames, and count
for(int gtf_i = 0; gtf_i < gtf_i < gtf_matrix.size();gtf_i++){ //gtf_i < gtf_matrix.size()
hit_count = 0;
//find corresponding bins for gtf chr
short int bin_compare = find_bin(gtf_matrix[gtf_i].chr,index_mapping);
if(bin_compare != -1){
//find start of search frame
int frame_start = find_frame(gtf_matrix[gtf_i], bam_matrix[bin_compare]);
//get up lower bounds of search frame;
vector <int> full_frame = define_frame(atoi(argv[2]),frame_start,bam_matrix[bin_compare].size());
//create c array of bam positional data for the frame, and post to thread process
thread_handler(atoi(argv[1]),full_frame,bam_matrix[bin_compare],gtf_matrix[gtf_i]);
}
//counts displayed in STOUT
cout << gtf_matrix[gtf_i].chr << "\t" << gtf_matrix[gtf_i].source << "\t" << gtf_matrix[gtf_i].feature << "\t" << gtf_matrix[gtf_i].start << "\t" << gtf_matrix[gtf_i].end << "\t" << gtf_matrix[gtf_i].score << "\t" << gtf_matrix[gtf_i].strand << "\t" << gtf_matrix[gtf_i].frame << "\t" << gtf_matrix[gtf_i].annotation << "\t" << hit_count << "\n";
}
}
}
The answer to your question is very simple:
thread last(process, start_holder,thread_divide + thread_remain,gtf_record.start,gtf_record.end);
last.join();
Here, the parent task creates a new thread, and ... immediately waits for the thread to finish. That's what join() does, it waits for the thread to terminate.
So, your code starts a new thread, and immediately waits for it to finish, before doing anything else, like starting the next thread.
You need to rewrite thread_handler() to instantiate all std::thread instances, and then after instantiating all of them, call join() on each one, to wait for all of them to finish.
The typical approach is to precreate a std::vector of all thread instances, using std::thread's default constructor, then loop over them to initialize each one, then loop over them again, calling join() on each one.
Related
The following C++ program takes two text files, stop_words.txt, and story.txt. It then removes all the stop word occurrences in the story.txt file. For instance,
Monkey is a common name that may refer to groups or species of mammals, in part, the simians of infraorder L. The term is applied descriptively to groups of primates, such as families of new world monkeys and old world monkeys. Many monkey species are tree-dwelling (arboreal), although there are species that live primarily on the ground, such as baboons. Most species are also active during the day (diurnal). Monkeys are generally considered to be intelligent, especially the old world monkeys of Catarrhini.
the text above is story.txt, and the stop_words.txt file is given below:
is
are
be
When I run my code, it doesn't delete all the stop words and keeps some of them. The code also creates a file called stop_words_counter.txt which should display the number of stop word occurrences like so:
is 2
are 4
b 1
But my output file shows the following:
is 1
are 4
be 1
I would be very grateful for some help regarding this code! I have posted it below for your reference.
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
const int MAX_NUM_STOPWORDS = 100;
struct Stop_word
{
string word; // stop word
int count; // removal count
};
int stops[100];
string ReadLineFromStory(string story_filename )
{
string x = "";
string b;
ifstream fin;
fin.open(story_filename);
while(getline(fin, b))
{
x += b;
}
return x;
}
void ReadStopWordFromFile(string stop_word_filename, Stop_word words[], int &num_words)
{
ifstream fin;
fin.open(stop_word_filename);
string a;
int i = 0;
if (fin.fail())
{
cout << "Failed to open "<< stop_word_filename << endl;
exit(1);
}
words[num_words].count = 0;
while (fin >> words[num_words].word)
{
++num_words;
}
fin.close();
}
void WriteStopWordCountToFile(string wordcount_filename, Stop_word words[], int num_words)
{
ofstream fout;
fout.open(wordcount_filename);
for (int i = 0; i < 1; i++)
{
fout << words[i].word << " "<< stops[i] + 1 << endl;
}
for (int i = 1; i < num_words; i++)
{
fout << words[i].word << " "<< stops[i] << endl;
}
fout.close();
}
int RemoveWordFromLine(string &line, string word)
{
int length = line.length();
int counter = 0;
int wl = word.length();
for(int i=0; i < length; i++)
{
int x = 0;
if(line[i] == word[0] && (i==0 || (i != 0 && line[i-1]==' ')))
{
for(int j = 1 ; j < wl; j++)
if (line[i+j] != word[j])
{
x = 1;
break;
}
if(x == 0 && (i + wl == length || (i + wl != length && line[i+wl] == ' ')))
{
for(int k = i + wl; k < length; k++)
line[k -wl] =line[k];
length -= wl;
counter++;
}
}
}
line[length] = 0;
char newl[1000] = {0};
for(int i = 0; i < length; i++)
newl[i] = line[i];
line.assign(newl);
return counter;
}
int RemoveAllStopwordsFromLine(string &line, Stop_word words[], int num_words)
{
int counter[100];
int final = 0;
for(int i = 1; i <= num_words; i++)
{
counter[i] = RemoveWordFromLine(line, words[i].word);
final += counter[i];
stops[i] = counter[i];
}
return final;
}
int main()
{
Stop_word stopwords[MAX_NUM_STOPWORDS]; // an array of struct Stop_word
int num_words = 0, total = 0;
// read in two filenames from user input
string a, b, c;
cin >> a >> b;
// read stop words from stopword file and
// store them in an array of struct Stop_word
ReadStopWordFromFile(a, stopwords, num_words);
// open text file
c = ReadLineFromStory(b);
// open cleaned text file
ofstream fout;
fout.open("story_cleaned.txt");
// read in each line from text file, remove stop words,
// and write to output cleaned text file
total = RemoveAllStopwordsFromLine(c, stopwords, num_words) + 1 ;
fout << c;
// close text file and cleaned text file
fout.close();
// write removal count of stop words to files
WriteStopWordCountToFile("stop_words_count.txt", stopwords, num_words);
// output to screen total number of words removed
cout << "Number of stop words removed = " << total << endl;
return 0;
}
There is one major bug in your code.
in function RemoveAllStopwordsFromLine
you are using the wrong array indices. In C++ the first element in an array has the index 0. Also you must compare with "less" than the size.
for (int i = 1; i <= num_words; i++)
So the first stop word "is", will never be checked and counted.
Please modify to
for (int i = 0; i < num_words; i++)
But then you need also to remove your patch in function WriteStopWordCountToFile . You made a special case for element 0. That is wrong.
Please remove
for (int i = 0; i < 1; i++)
{
fout << words[i].word << " " << stops[i] + 1 << endl;
}
and start the next for with 0. And remove the "+" while calculating the total.
Because you are using C-Style arrays, magic numbers and ultra complex code, I will show you a modern C++ solution.
In C++ you have many useful algorithms. Some are specifically designed to address your requirments. So, please use them. Try to get away from C and migrate to C++.
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <iterator>
#include <algorithm>
#include <regex>
#include <sstream>
// The filenames. Whatever you want
const std::string storyFileName{ "r:\\story.txt" };
const std::string stopWordFileName{ "r:\\stop_words.txt" };
const std::string stopWordsCountFilename{ "r:\\stop_words_count.txt" };
const std::string storyCleanedFileName{ "r:\\story_cleaned.txt" };
// Becuase of the simplicity of the task, put everything in main
int main() {
// Open all 4 needed files
std::ifstream storyFile(storyFileName);
std::ifstream stopWordFile(stopWordFileName);
std::ofstream stopWordsCountFile(stopWordsCountFilename);
std::ofstream storyCleanedFile(storyCleanedFileName);
// Check, if the files could be opened
if (storyFile && stopWordFile && stopWordsCountFile && storyCleanedFile) {
// 1. Read the complete sourcefile with the story into a std::string
std::string story( std::istreambuf_iterator<char>(storyFile), {} );
// 2. Read all "stop words" into a std::vector of std::strings
std::vector stopWords(std::istream_iterator<std::string>(stopWordFile), {});
// 3. Count the occurences of the "stop words" and write them into the destination file
std::for_each(stopWords.begin(), stopWords.end(), [&story,&stopWordsCountFile](std::string& sw) {
std::regex re{sw}; // One of the "stop words"
stopWordsCountFile << sw << " --> " << // Write count to output
std::distance(std::sregex_token_iterator(story.begin(), story.end(), re, 1), {}) << "\n";});
// 4. Replace "stop words" in story and write new story into file
std::ostringstream wordsToReplace; // Build a list of all stop words, followed by an option white space
std::copy(stopWords.begin(), stopWords.end(), std::ostream_iterator<std::string>(wordsToReplace, "\\s?|"));
storyCleanedFile << std::regex_replace(story,std::regex(wordsToReplace.str()), "");
}
else {
// In case that any of the files could not be opened.
std::cerr << "\n*** Error: Could not open one of the files\n";
}
return 0;
}
Please try to study and understand this code. This is a very simple solution.
I have the fallowing code. I read the guide for what a segmentation fault is, but I'm not 100% sure where its actually happening within my code. It works until I start working with the dynamic array (histogram), more specifically at the //set all initial values to be zero. Within that mess after I'm not sure. Thanks!
The instructor asked to "Use a dynamic array to store the histogram.", Which I think is my issue here.
-Solved-
thanks for the help, the error was in how I initialized the array pointer
rather than
const int hSize = 10;
IntArrayPtr histogram;
histogram = new int[hSize];
I used
const int hSize = 10;
int hValues[hSize] = { 0 };
IntArrayPtr histogram;
histogram = hValues;
Which worked as the instructor wanted.
#include <iostream>
#include <vector>
using namespace std;
typedef int* IntArrayPtr;
int main() {
vector<int>grades;
int newGrade;
cout << "Input grades between 0 and 100. Input -1 to calculate histogram: " << endl;
cin >> newGrade;
grades.push_back(newGrade);
while (newGrade > 0) {
cin >> newGrade;
while (newGrade > 100) {
cout << "less than 100 plz: ";
cin >> newGrade;
}
grades.push_back(newGrade);
}
grades.pop_back();
int size = grades.size();
cout << "Calculating histogram with " << size << " grades." << endl;
//Create dynamic array for the histogram of 10 sections.
const int hSize = 10;
IntArrayPtr histogram;
histogram = new int[hSize];
}
//Make the historgram
int stackValue = 0;
for (int j = 0; j < hSize; j++) {
//Loop through the grade vector slots
for (int i = 0; i < size; i++) {
int testValue = grades[i];
//If the grade at the index is between the stack values of the histogram add one to the value of the slot
if (testValue > stackValue && testValue < stackValue + 10) {
histogram[j]++;
}
}
//After looping through the vector jump up to the next histogram slot and corresponding stack value.
stackValue += 10;
}
//Histogram output. Only output the stacks with values
for (int i = 0; i < 10; i++) {
if (histogram[i] != 0) {
cout << "Number of " << (i + 1) * 10 << "'s: " << histogram[i];
}
}
return 0;
}
Working Code:
#include <iostream>
#include <vector>
using namespace std;
typedef int* IntArrayPtr;
int main() {
vector<int>grades;
int newGrade;
cout << "Input grades between 0 and 100. Input -1 to calculate histogram: " << endl;
cin >> newGrade;
grades.push_back(newGrade);
while (newGrade > 0) {
cin >> newGrade;
while (newGrade > 100) {
cout << "less than 100 plz: ";
cin >> newGrade;
}
grades.push_back(newGrade);
}
grades.pop_back();
int size = grades.size();
cout << "Calculating histogram with " << size << " grades." << endl;
//Create dynamic array for the histogram of 10 sections.
const int hSize = 10;
int hValues[hSize] = { 0 };
IntArrayPtr histogram;
histogram = hValues;
//Make the historgram
int stackValue = 0;
for (int j = 0; j < hSize; j++) {
//Loop through the grade vector slots
for (int i = 0; i < size; i++) {
int testValue = grades[i];
//If the grade at the index is between the stack values of the histogram add one to the value of the slot
if (testValue > stackValue && testValue < stackValue + 10) {
histogram[j]++;
}
}
//After looping through the vector jump up to the next histogram slot and corresponding stack value.
stackValue += 10;
}
//Histogram output. Only output the stacks with values
for (int i = 0; i < 10; i++) {
if (histogram[i] != 0) {
cout << "Number of " << (i + 1) * 10 << "'s: " << histogram[i] << endl;
}
}
return 0;
}
histogram is a pointer, not an array.
While
int histogram[hSize] = {0};
would create a zero-initialised array, your
histogram = { 0 };
does not set any elements to zero (it couldn't, because histogram points to one int, not many).
The braces are ignored – a pretty confusing behaviour inherited from C – and it is equivalent to
histogram = 0;
that is,
histogram = nullptr;
You want
int* histogram = new int[hSize]();
The parentheses value-initialises the array, and in turn its elements.
Value-initialising integers sets them to zero.
(By the way: the habit of typedeffing away asterisks causes more problems than it solves. Don't do it.)
Seg faults are problems with accessing regions of memory you don't have access to, so you need to look at your use of pointers. It often means you have a pointer with a bad value that you just dereferenced.
In this case, the problem is this line:
histogram = { 0 };
This is not setting the histogram values to zero as you think: it's resetting the historgram pointer to zero. Then you later dereference that pointer causing your SegFault (note that this line doesn't even compile with clang, so your compiler isn't helping you any on this one).
Changing that line to:
memset(histogram, 0, hSize);
Will sort the problem in this case.
More generally, to diagnose a segfault there are two tricks I use regularly (though avoidance is better than cure):
Run the program under a debugger: the debugger will likely stop the program at the point of the fault and you can see exactly where it failed
Run the program under Valgrind or similar - that will also tell you where the error surfaced but in more complex failures can also tell you where it was caused (often not the same place).
I am new to c++ programming and am taking a computational physics class where we are analyzing the problem of percolation on a square lattice using a single-cluster algorithm. My professor has given us some base code, and asked us to modify it as well as write some additional code and scripts within and without this specific program. I have written the majority of the code and scripts necessary to solve and plot this problem, but I am having an issue with my main data output program, specifically that of an infinite loop when I set an input parameter to any value other than 0.
Three main function comprise this program, namely LATTICE::LATTICE, CLUSTER::grow, and CUSTER::print, and also uses a standard Mersenne Twister header file. The heavily modified, commented, and toyed with c++ program is as follows:
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>
#include <sstream>
#include <iomanip>
#include <vector>
#include <cstdlib>
#include "MersenneTwister.h"
using namespace std;
class PARAMS
{
public:
int Nlin; // linear size of lattice
double pr; // probability for a site
double Nclust; // number of clusters in a bin
double Nbin; // number of bins of data to output
int SEED; // seed for mersenne twister
string latt_; // which lattice
PARAMS();//constructor
};
class LATTICE
{
public:
LATTICE(const PARAMS&);//constructor
int Nsite;// number of lattice sites
int Lx,Ly;
vector<vector<int> > nrnbrs;
void print ();
};
class CLUSTER
{
public:
CLUSTER(const PARAMS&, const LATTICE&);//constructor
void grow(const PARAMS&, const LATTICE&, MTRand&);
void meas_clear(const LATTICE&);
void meas(const LATTICE&);
void binwrite(const PARAMS&, const LATTICE&);
//void print(const LATTICE& latt, int index);
void print(const PARAMS& p, const LATTICE& latt);
~CLUSTER();// destructor
//private:
int size;
vector <int> conf;
vector <int> stack;
double pr;
//int stck_pnt,stck_end;
double avg_size;
ofstream dfout;
vector <int> stck_pnt;
vector <int> stck_end;
int z, pnt, prob, val, row, column;
vector< vector< vector <int> > > imax;
};
int main(void)
{
PARAMS p;
LATTICE latt(p);
CLUSTER cluster(p,latt);
MTRand ran(p.SEED);
latt.print();
/*for (int bin=0;bin<p.Nbin;bin++)
{
cluster.meas_clear(latt);
for(int clust=0;clust<p.Nclust;clust++)
{
cluster.grow(p,latt,ran);
cluster.meas(latt);
}
cluster.binwrite(p,latt);
}
*/
cluster.grow(p, latt, ran);
cluster.print(p,latt);
}
PARAMS::PARAMS(){
//initializes commonly used parameters from a file
ifstream pfin;
pfin.open("param.dat");
if (pfin.is_open()) {
pfin >> Nlin;
pfin >> pr;
pfin >> Nclust;
pfin >> Nbin;
pfin >> SEED;
pfin >> latt_;
}
else
{cout << "No input file to read ... exiting!"<<endl;exit(1);}
pfin.close();
// print out all parameters for record
cout << "--- Parameters at input for percolation problem ---"<<endl;
cout <<"Nlin = "<<Nlin<<"; prob. of site = "<<pr<<endl;
cout <<"Number of clusters in a bin = "<<Nclust<<"; Number of bins = "<<Nbin<<endl;
cout <<"RNG will be given SEED of = "<<SEED<<endl;
cout <<"Percolation problem on lattice --> "<<latt_<<endl;
};//constructor
LATTICE::LATTICE (const PARAMS& p)
{
string latt_=p.latt_;
if(p.latt_=="sqlatt_PBC")
{
Lx=p.Nlin;Ly=p.Nlin;
Nsite=Lx*Ly;
int i;
nrnbrs = vector<vector<int> >(Nsite, vector<int>(4));
for (i=0; i<Nsite; i++){
if((i+1) % p.Nlin != 0) nrnbrs[i][0] = i+1;
else nrnbrs[i][0] = i - p.Nlin + 1 ;
if(i + p.Nlin < Nsite ) nrnbrs[i][1] = i+p.Nlin;
else nrnbrs[i][1] = i - (Nsite-p.Nlin);
if(i % p.Nlin > 0) nrnbrs[i][2] = i-1;
else nrnbrs[i][2] = i-1+p.Nlin;
if(i - p.Nlin >= 0) nrnbrs[i][3] = i-p.Nlin;
else nrnbrs[i][3] = i + (Nsite-p.Nlin);
}
}
else if(p.latt_=="sqlatt_OBC")
{
Lx=p.Nlin;Ly=p.Nlin;
Nsite=Lx*Ly;
nrnbrs = vector<vector<int> >(Nsite, vector<int>(0));
for (int i=0; i<Nsite; i++){
if((i+1) % p.Nlin != 0){
nrnbrs[i].push_back(i+1);
}
if(i + p.Nlin < Nsite ){
nrnbrs[i].push_back(i+p.Nlin);
}
if(i % p.Nlin > 0){
nrnbrs[i].push_back(i-1);
}
if(i - p.Nlin >= 0){
nrnbrs[i].push_back(i-p.Nlin);
}
}
}
else
{cout <<"Dont know your option for lattice in param.dat .. exiting"<<endl;exit(1);}
}
void LATTICE::print()
{
//THIS FUNCTIONS MAY BE CALLED DURING DEBUGGING TO MAKE SURE LATTICE HAS BEEN DEFINED CORRECTLY
cout <<"---printing out properties of lattice ---"<<endl;
cout<<"size is "<<Lx<<"x"<<Ly<<endl;
cout <<"neighbors are"<<endl;
for (int site=0;site<Nsite;site++)
{
cout <<site<<" : ";
for (size_t nn=0;nn<nrnbrs.at(site).size();nn++)
cout<<nrnbrs.at(site).at(nn)<<" ";
cout <<endl;
}
cout << endl;
}
CLUSTER::CLUSTER(const PARAMS& p, const LATTICE& latt)
{
conf.resize(latt.Nsite);
stack.resize(latt.Nsite);
pr=p.pr;// store prob in a private member of cluster
dfout.open("data.out");
}
CLUSTER::~CLUSTER()
{
dfout.close();
}
void CLUSTER::grow(const PARAMS& p, const LATTICE& latt, MTRand& ran)
{
conf.resize(latt.Nsite); // Initalize Nsite elements of lattice to 0 in conf
// 0 = Not Asked; 1 = Asked, Joined; 2 = Asked, Refused
for (int i = 0; i < p.Nclust; ++i) { // Iterate for Nclust values
z = ran.randInt(latt.Nsite - 1); // Random integer between 0 and Nsite; Selects first lattice element in the cluster algorithm per Nclus
stck_pnt.resize(0); // Set stck_pnt and stck_end vectors to size 0; Will be filled when iterating through each Nclust
stck_end.resize(0); //-----------------------------------------------------------------------------------------------
//while (conf[z] != 0) { z = ran.randInt(latt.Nsite - 1); } // Iterate through lattice elements until we select one that has not been asked to join
conf[z] = 1; // Set element z in conf to have been asked to join and accepted
stck_pnt.push_back(z); // Add z to both stck_pnt and stck_end
stck_end.push_back(z);
for (int j = 0; j = 3; ++j) { // Add z's nearest neighbors to stck_end; Ignore if already been asked
if (conf[latt.nrnbrs[z][j] == 0]) {
stck_end.push_back(latt.nrnbrs[z][j]);
}
}
pnt = 1; // Initialize pnt for trasnferral of stack_end values to stck_pnt
while (stck_pnt.size() < stck_end.size()) {
stck_pnt.push_back(stck_end[pnt]); // Add pnt element of stck_end to stck_pnt
double prob = ran.rand(); // Get probability value for testing if cluster grows
if (prob <= pr) {
conf[stck_pnt[pnt]] = 1; // Set the current stck_pnt element to joined in conf
for (int j = 0; j = 3; ++j) { // Add z's nearest neighbors to stck_end; Ignore if already been asked
if (find(stck_end.begin(), stck_end.end(), latt.nrnbrs[stck_pnt[pnt]][j]) != stck_end.end()) {
// The given value already exists in stck_end, don't add it again
}
else { // The given value is not contained in stck_end, add it to stck_end
stck_end.push_back(latt.nrnbrs[z][j]);
}
}
}
else {
conf[stck_pnt[pnt]] = 2; // Set the given value to haven been asked and refused in conf
}
++pnt; // Increment pnt; ++p is more efficient then p++ due to lack of copying value
}
}
}
/*
void CLUSTER::print(const LATTICE& latt, int index)
{
stringstream ss;
string file_name;
ss << index << ".clust";
file_name = ss.str();
ofstream clout;
clout.open(file_name.c_str());
clout << "#" << latt.Lx << " x " << latt.Ly << endl;
for (int y = 0; y < latt.Ly; y++)
{
for (int x = 0; x < latt.Lx; x++)
clout << conf[x + y*latt.Lx] << " ";
clout << endl;
}
clout.close();
}
*/
void CLUSTER::print(const PARAMS& p, const LATTICE& latt)
{
//vector< vector< vector<int> > > imax(latt.Lx, vector< vector<int>>(latt.Ly, vector<int>(1)));
// Resize and allocate memeory for imax
//-------------- Row = y-position = i/Lx --------------- Column = x-position = i%Lx ---------------- val = conf[i]
ofstream myFile;
myFile.open("imax.out");
cout << "THe following output was calculated for the input parameters; Recorded to 'imax.out'" << endl;
cout <<"[index]" << "\t" << "[x-position]" << "\t" << "[y-position]" << "\t" << "[conf val]" << endl << endl;
for (int i = 0; i < latt.Nsite; ++i) {
val = conf[i]; // Find color value
row = i / latt.Lx; // Find row number
column = i%latt.Lx; // Find column number
cout << i << "\t" << column << "\t" << row << "\t" << val << endl;
myFile << i << "\t" << column << "\t" << row << "\t" << val << endl;
}
myFile.close();
double size = 0.0; // Initialize size
for (int i = 0; i < latt.Nsite; ++i) {
if (conf[i] == 1) {
size += 1;
}
}
double avg_size = size / p.Nclust; // Find avg_size
}
void CLUSTER::meas(const LATTICE& latt)
{
avg_size+=(double)size;
}
void CLUSTER::meas_clear(const LATTICE& latt)
{
avg_size=0.;
}
void CLUSTER::binwrite(const PARAMS& p, const LATTICE& latt)
{
dfout << avg_size/((double)p.Nclust)<<endl;
}
When I set Nclust=0 in the input file, the code runs as expected and gives the proper output in the file and console. However, when I set Nclust equal to any other value, I get the proper lattice console output but the program hangs for the cluster algorithm. I at first assumed that my computer and algorithm were slow and inefficient and that the program was working in some non-linear time. However, after leaving the program running for around 30 minutes for a 4x4 lattice (only 16 elements in the conf[] vector), no progress had been made and I assumed that the program was stuck in a loop.
After spending several hours going over the CLUSTER::grow() method line-by-line and experimenting with changing various bits of code, I have been unable to resolve where this loop error originates from. I would assume it is somewhere in the while loop that compares the size of stck_pnt and stck_end, but I cannot figure out why or where this is. Any help with this would be very greatly appreciated.
Tl;dr: For Nclust !=0, CLUSTER:grow gets stuck in an infinite loop
You have infinite loop here:
stck_end.push_back(z);
for (int j = 0; j = 3; ++j) { // <======== HERE
and here:
conf[stck_pnt[pnt]] = 1; // Set the current stck_pnt element to joined in conf
for (int j = 0; j = 3; ++j) { // <======== HERE
I have been working with a program that will read through multiple text files, record the number of words in them, and write to a file all of the words and the frequency of them. However, I have encounter a segmentation fault somewhere in my code. I have tried using tools such as Valgrind to help me debug it, however it only points to where I say int i = 0 in the main loop. I apologize for posting a large portion of my code but I have spent hours trying to find where the bug is and cannot seem to find it for the life of me. The issues began when I started passing a structure in pthread_exit().
#include <iostream>
#include <fstream>
#include <string>
#include <pthread.h>
#include <vector>
#include <algorithm>
#include <sstream>
#include <iterator>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
using namespace std;
// Create a structure that we can store information in
typedef struct info{
int words;
string dictionary[500000];
} info;
// Counts the number of words in the text file so we know how big to make our array
int countWord(char *arg){
char words[25000];
int count = 0;
ifstream check;
check.open(arg);
while(!check.eof()){
check>>words;
count++;
}
cout<<"Word Count: "<< count << '\n';
check.close();
return count;
}
// Checks to see if the word exists in our dictionary or not
int findWord(string array[], string target, int wordCount){
for(int i = 0; i < wordCount; ++i){
if(array[i] == target){
return 1;
}
}
return 0;
}
// Checks to see how many times a word is repeated
int checkWord(string array[], string target, int wordCount){
int number = 0;
for(int i = 0; i < wordCount; i++){
if(array[i] == target){
number++;
}
}
return number;
}
void *threads(void *arg){
info information;
char *fileName = (char *)arg;
ifstream myfile (fileName);
string line;
string fullText[15000];
string dictionary[500000];
int wordCount = countWord(fileName);
int i = 0;
int find;
int check;
int x = 0;
int checkingStart = 0;
// Opens and reads the file word by word removing any symbols that we dislike
if (myfile.is_open()){
while(myfile >> line){
transform(line.begin(), line.end(), line.begin(), ::tolower);
line.erase(remove(line.begin(), line.end(), ','), line.end());
fullText[i] = line;
i++;
}
}
else cout << "Unable to Open the File";
myfile.close();
// Goes through and adds all the words to our dictionary
for(i = 0; i < wordCount; ++i){
find = findWord(dictionary, fullText[i], wordCount);
if(find == 0){
dictionary[x] = {fullText[i]};
++x;
checkingStart = 1;
}
}
// Sets each section of dictionary equal to the one in the structure
for(i = 0; i < wordCount; ++i){
information.dictionary[i] = dictionary[i];
}
// Sets words equal to word count and then passes the structure information out of the thread
information.words = wordCount;
pthread_exit(&information);
return NULL;
}
int main(){
int i = 0;
int x = 0;
int y = 0;
int z = 0;
int a = 0;
int b = 0;
int add = 0;
int currentSize = 0;
int checkingStart = 0;
int wordCount;
int find;
string fullDictionary[500000];
string dict[500000];
ofstream writeFile;
info information;
char *fileName;
char *fileList[2];
pthread_t threadCount[2];
int frequency[500000];
int check;
fileList[0] = "text1";
fileList[1] = "text2";
// Creates a loop that creates and joins threads for each text file
for(a = 0; a < 1; ++a){
fileName = fileList[a];
pthread_create(&threadCount[a], NULL, threads, &fileName);
pthread_join(threadCount[a], (void **)&information);
wordCount = information.words;
// Sets each part of dict equal to the same slot on info.dict
for(b = 0; b < wordCount; ++b){
dict[b] = information.dictionary[b];
}
// Adds to a complete list of all the text files added together
for(y = 0, z = currentSize; z < wordCount; ++z, ++y){
fullDictionary[z] = dict[y];
}
currentSize = (currentSize + wordCount);
}
// Goes through and adds all the words to our dictionary
for(i = 0; i < wordCount; ++i){
find = findWord(dict, fullDictionary[i], currentSize);
if(find == 0){
dict[x] = {fullDictionary[i]};
cout << "Added the Word: " << fullDictionary[i] << "\n";
add = 1;
checkingStart = 1;
}
// Checks the number of times each word appears in the text file
if(checkingStart == 1){
check = checkWord(fullDictionary, dict[x], wordCount);
frequency[x] = {check};
}
// Checks to see if it needs to move to the next open dictionary spot
if(add == 1){
++x;
add = 0;
}
}
return 0;
}
These were the changes that were needed to get the program working.
1) One issue seems to be that the size of the variables in the function threads. Looks like every thread that is spawned has some default limit . You could read up on pthread_attr_setstacksize. but the simplest solution was to reduce the size of the strings in thread.So the size of the variables are why it's gives a segmentation fault as soon as the threads function is called.
As already mention in the comments above usage of vector/maps classes will help reduce the need for large local variables.
2) The return variable needs to be a non-local variable else the return value does not make it back successfully.
3) just noticed the main loop ( variable a ) is running only once . Also once the thread is launched(pthread_create) the loop is waiting for the join . This will result in serialization of the threads. The create can be done first and then the join can be in called in a separate loop after that.
Changes are given below ..
In function - threads
info *information;
//changed to pointer
// info information;
char *fileName = (char *)arg;
ifstream myfile (fileName);
string line;
string fullText[1500];
string dictionary[5000];
// reduced size
//string fullText[15000];
//string dictionary[500000];
.....
information = new info; // create an instance
........
// change to pointer
information->dictionary[i] = dictionary[i];
}
// Sets words equal to word count and then passes the structure information out of the thread
information->words = wordCount;
pthread_exit(information); // return pointer
in function - main
info *information; // change to pointer
....
for(a = 0; a < 2; ++a){ // loop to 2
.....
pthread_create(&threadCount[a], NULL, threads, (void *)fileName); // changed file name
// pthread_create(&threadCount[a], NULL, threads, &fileName);
wordCount = information->words; // changed for pointer
...
dict[b] = information->dictionary[b] // changed for pointer
After the edits you should be able to run to debug the rest of the functionality.
Currently I am getting an runtime "assertation error"
Here is the error:
I'm reading words from a text file into dynamically allocated arrays.
this block of code is where I am filling the new arrays.
I know the problem is being caused by this block of code and something about my logic is off just can't see what it is.
//fill new arrays
for( int y = 0; y < new_numwords; y++)
{
for( int i = 0; i < NUM_WORDS; i++)
{
if (!strcmp(SentenceArry[i], EMPTY[0]) == 0)
{
New_SentenceArry[y] = SentenceArry[i];
New_WordCount[y] = WordCount[i];
y++;
}
}
}
Also how would I pass this dynamically allocated 2D array to a function? (the code really needs to be cleaned up as a whole)
char** SentenceArry = new char*[NUM_WORDS]; //declare pointer for the sentence
for( int i = 0; i < NUM_WORDS; i++)
{
SentenceArry[i] = new char[WORD_LENGTH];
}
Here is the full extent of the code.. help would be much appreciated!
Here is what is being read in:
and the current output (the output is how it's suppose to be ):
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <cstring>
#include <cctype>
#include <iomanip>
using std::setw;
using std::left;
using std::cout;
using std::cin;
using std::endl;
using std::ifstream;
int main()
{
const int NUM_WORDS = 17;//constant for the elements of arrays
const int WORD_LENGTH = 50;//constant for the length of the cstrings (NEED TO GIVE THE VALUE ZERO STILL!)
short word_entry = 0; //declare counter
short new_numwords= 0; //declare new word count
char EMPTY[1][4]; //NULL ARRAY
EMPTY[0][0] = '\0';//define it as null
char** SentenceArry = new char*[NUM_WORDS]; //declare pointer for the sentence
for( int i = 0; i < NUM_WORDS; i++)
{
SentenceArry[i] = new char[WORD_LENGTH];
}
int WordCount[NUM_WORDS];//declare integer array for the word counter
for(int i = 0; i < NUM_WORDS; i++)//fill int array
{
WordCount[i] = 1;
}
int New_WordCount[NUM_WORDS] = {0};
ifstream read_text("DataFile.txt"); //read in our text file
if (read_text.is_open()) //check if the the file was opened
{
read_text >> SentenceArry[word_entry];
//REMOVE PUNCTUATION BEFORE BEING READ INTO THE ARRAY
while (!read_text.eof())
{
word_entry++; //increment counter
read_text >> SentenceArry[word_entry]; //read in single words of the text file into the array SentenceArry
char* ptr_ch;//declare our pointer that will find chars
ptr_ch = strstr( SentenceArry[word_entry], ",");//look for "," within the array
if (ptr_ch != NULL)//if true replace it with a null character
{
strncpy( ptr_ch, "\0" , 1);
}//end if
else
{
ptr_ch = strstr( SentenceArry[word_entry], ".");//look for "." within the array
if (ptr_ch != NULL)//if true replace it with a null character
{
strncpy( ptr_ch, "\0" , 1);
}//end if
}//end else
} //end while
}//end if
else
{
cout << "The file could not be opened!" << endl;//display error message if file doesn't open
}//end else
read_text.close(); //close the text file after eof
//WORD COUNT NESTED FOR LOOP
for(int y = 0; y < NUM_WORDS; y++)
{
for(int i = y+1; i < NUM_WORDS; i++)
{
if (strcmp(SentenceArry[y], EMPTY[0]) == 0)//check if the arrays match
{
y++;
}
else
{
if (strcmp(SentenceArry[y], SentenceArry[i]) == 0)//check if the arrays match
{
WordCount[y]++;
strncpy(SentenceArry[i], "\0" , 3);
}//end if
}//end if
}//end for
}//end for
//find how many arrays still contain chars
for(int i = 0; i < NUM_WORDS; i++)
{
if (!strcmp(SentenceArry[i], EMPTY[0]) == 0)
{
new_numwords++;
}
}
//new dynamic array
char** New_SentenceArry = new char*[new_numwords]; //declare pointer for the sentence
for( int i = 0; i < new_numwords; i++)
{
New_SentenceArry[i] = new char[new_numwords];
}
//fill new arrays
for( int y = 0; y < new_numwords; y++)
{
for( int i = 0; i < NUM_WORDS; i++)
{
if (!strcmp(SentenceArry[i], EMPTY[0]) == 0)
{
New_SentenceArry[y] = SentenceArry[i];
New_WordCount[y] = WordCount[i];
y++;
}
}
}
//DISPLAY REPORT
cout << left << setw(15) << "Words" << left << setw(9) << "Frequency" << endl;
for(int i = 0; i < new_numwords; i++) //compare i to the array constant NUM_WORDS
{
cout << left << setw(15) << New_SentenceArry[i] << left << setw(9) << New_WordCount[i] << endl; //display the contents of the array SentenceArry
}
//DEALLOCATION
for( int i = 0; i < NUM_WORDS; i++)//deallocate the words inside the arrays
{
delete [] SentenceArry[i];
}
for(int i = 0; i < new_numwords; i++)
{
delete [] New_SentenceArry[i];
}
delete [] SentenceArry; //deallocate the memory allocation made for the array SentenceArry
delete [] New_SentenceArry;//deallocate the memory allocation made for the array New_SentenceArry
}//end main
There are several issues with the code, not withstanding that this could be written using C++, not C with a sprinkling of C++ I/O..
Issue 1:
Since you're using c-style strings, any copying of string data will require function calls such as strcpy(), strncpy(), etc. You failed in following this advice in this code:
for( int y = 0; y < new_numwords; y++)
{
for( int i = 0; i < NUM_WORDS; i++)
{
if (!strcmp(SentenceArry[i], EMPTY[0]) == 0)
{
New_SentenceArry[y] = SentenceArry[i]; // This is wrong
New_WordCount[y] = WordCount[i];
y++;
}
}
}
You should be using strcpy(), not = to copy strings.
strcpy(New_SentenceArry[y], SentenceArry[i]);
Issue 2:
You should allocate WORD_LENGTH for both the original and new arrays. The length of the strings is independent of the number of strings.
char** New_SentenceArry = new char*[new_numwords]; //declare pointer for the sentence
for( int i = 0; i < new_numwords; i++)
{
New_SentenceArry[i] = new char[new_numwords];
}
This should be:
char** New_SentenceArry = new char*[new_numwords]; //declare pointer for the sentence
for( int i = 0; i < new_numwords; i++)
{
New_SentenceArry[i] = new char[WORD_LENGTH];
}
Issue 3:
Your loops do not check to see if the index is going out of bounds of your arrays.
It seems that you coded your program in accordance to the data that you're currently using, instead of writing code regardless of what the data will be. If you have limited yourself to 17 words, where is the check to see if the index goes above 16? Nowhere.
For example:
while (!read_text.eof() )
Should be:
while (!read_text.eof() && word_entry < NUM_WORDS)
Issue 4:
You don't process the first string found correctly:
read_text >> SentenceArry[word_entry]; // Here you read in the first word
while (!read_text.eof() )
{
word_entry++; //increment counter
read_text >> SentenceArry[word_entry]; // What about the first word you read in?
Summary:
Even with these changes, I can't guarantee that the program won't crash. Even it it doesn't crash with these changes, I can't guarantee it will work 100% of the time -- a guarantee would require further analysis.
The proper C++ solution, given what this assignment was about, is to use a std::map<std::string, int> to keep the word frequency. The map would automatically store similar words in one entry (given that you remove the junk from the word), and would bump up the count to 1 automatically, when the entry is inserted into the map.
Something like this:
#include <string>
#include <map>
#include <algorithm>
typedef std::map<std::string, int> StringMap;
using namespace std;
bool isCharacterGarbage(char ch)
{ return ch == ',' || ch == '.'; }
int main()
{
StringMap sentenceMap;
//...
std::string temp;
read_text >> temp;
temp.erase(std::remove_if(temp.begin(), temp.end(), isCharacterGarbage),temp.end());
sentenceMap[temp]++;
//...
}
That code alone does everything your original code did -- keep track of the strings, bumps up the word count, removes the junk characters from the word before being processed, etc. But best of all, no manual memory management. No calls to new[], delete[], nothing. The code just "works". That is effectively 5 lines of code that you would just need to write a "read" loop around.
I won't go through every detail, you can do that for yourself since the code is small, and there are vast amounts of resources available explaining std::map, remove_if(), etc.
Then printing out is merely going through the map and printing each entry (string and count). If you add the printing, that may be 4 lines of extra code. So in all, practically all of the assignment is done with effectively 10 or so lines of code.
Remove below code.
for(int i = 0; i < new_numwords; i++)
{
delete [] New_SentenceArry[i];
}