I faced a little trouble. I'm not sure if I can understand it.
So, I have some code. And I'm trying to add #pragma loop(hint_parallel(8)) statement for a few loops in the code.
When I compile that using necessary compilation options which are actually like this:
gcc -w -funroll-loops -O2 -fno-inline -fipa-pta -msse2
-funsafe-math-optimizations -ftree-vectorizer-verbose=1 -fopt-info-optimized=logs/optOpt.txt -shared -fPIC singleThread.cpp
I get segmentation fault.
fish: './a.out' terminated by signal SIGSEGV (Address boundary error)
The point is that I have no idea why it is. I suspected that it could be a problem with a constant that is used in these loops. But I don't think that this is related. if I just compile this code using -O0 optimisation it works fine (because complier doesn't vectorise something I guess).
Could you please take a look on the code below and suggest me in which direction I should check.
Thanks.
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <random>
#include <cstdio>
#include <set>
#include <fstream>
#include <cstdint>
#include <climits>
using namespace std;
const int STRING_HASH_SIZE = 32;
int convert(vector<string> &inputVector, const char **outputArray);
void printCollisions(const char **charArray, int size);
void printArray(const char **arrayToPrint, int size);
int getHashCode(const char *characters, unsigned long size);
string getRandomString();
void writeFileIfNeeded(vector<string> &vector, bool needToWrite);
vector<string> generateStringsVector(int size, bool isNeedToWriteFile);
/**
* main method is present to test these native code.
* to perform some external operation we should use another method.
* #return
*/
int main() {
/**
* The constant represents number of strings that will be generated
* in the string vector generation.
*/
const int STRING_NUMBERS = 100000;
vector<string> inputVector = generateStringsVector(STRING_NUMBERS, false);
#pragma pack 8
const char *charArray[inputVector.size()];
int hashResult = convert(inputVector, charArray);
if (hashResult != 0) {
return 0;
}
printCollisions(charArray, STRING_NUMBERS);
}
/**
* Converts an input vector to char array.
* Getting a hash of
* Returns 0 if conversion from vector to array has been successfully performed.
* #param inputVector [ input array reference ]
* #param outputArray [ a char array that would contain char sequences from vector ]
* #return [ hash sum (int)]
*/
int convert(vector<string> &inputVector, const char **outputArray) {
int hashSum = 0;
#pragma loop(hint_parallel(8))
for (int i = 0; i < inputVector.size(); i++) {
outputArray[i] = inputVector[i].c_str();
}
#pragma loop(hint_parallel(8))
for (auto &i : inputVector) {
hashSum += getHashCode(i.c_str(), i.length());
}
int stringHashSize = STRING_HASH_SIZE;
#pragma loop(hint_parallel(8))
for (int i = 0; i < inputVector.size(); i++) {
hashSum -= getHashCode(outputArray[i], stringHashSize);
}
if (hashSum != 0) {
cout << "\nConversion isn't succeeded, hash = " << hashSum << endl;
} else {
cout << "\nConversion succeeded" << endl;
}
return hashSum;
}
/**
* Prints count and percentage of collisions in array hash codes
* #param charArray
* #param size
*/
void printCollisions(const char **charArray, int size) {
set<int> setOfHashes;
int stringHashSize = STRING_HASH_SIZE;
#pragma loop(hint_parallel(8))
for (int i = 0; i < size; i++) {
setOfHashes.insert(getHashCode(charArray[i], stringHashSize));
}
unsigned long collisions = size - setOfHashes.size();
cout << collisions << "/" << size << " " << 100.0 * collisions / size << "% of collisions";
}
/**
* Prints input char array
* #param arrayToPrint
*/
void printArray(const char **arrayToPrint, int size) {
cout << "\nPrinted array size = " << size << endl;
for (int i = 0; i < size; i++) {
cout << arrayToPrint[i] << ":" << getHashCode(arrayToPrint[i], STRING_HASH_SIZE) << endl;
}
}
/**
*
* #param characters
* #return
*/
int getHashCode(const char *characters, unsigned long size) {
int hash = 0;
#pragma loop(hint_parallel(8))
for (int i = 0; i < size; i++) {
hash = (31 + hash) * (characters[i]);
}
return hash;
}
/**
* Get a random String from alphabetical char sequence.
* #return a randomized string according to an alphabet.
*/
string getRandomString() {
string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
random_device rd;
mt19937 generator(rd());
shuffle(str.begin(), str.end(), generator);
return str.substr(0, STRING_HASH_SIZE);
}
/**
* Generates a vector with random strings
* #param size - an int value that will be used as size of a generated vector
* #return reference to generated vector.
*/
vector<string> generateStringsVector(int size, bool isNeedToWriteFile) {
vector<string> charArray;
#pragma loop(hint_parallel(8))
for (int i = 0; i < size; i++) {
string str = getRandomString();
charArray.push_back(str);
}
writeFileIfNeeded(charArray, isNeedToWriteFile);
return charArray;
}
/**
* Writes file with name according to vector size (e.g. 100000.csv)
* if needToWrite is true
* #param vector
* #param needToWrite
*/
void writeFileIfNeeded(vector<string> &vector, bool needToWrite) {
if (needToWrite) {
ofstream csvFile;
string filename = to_string(vector.size()) + ".csv";
csvFile.open(filename, fstream::out);
for (const auto &i : vector) {
csvFile << i << "\n";
}
csvFile.close();
}
}
What is causing the segmentation fault is the way you compile your code and not the pragmas (which don't have any effect in gcc anyway, see below):
gcc -w -funroll-loops -O2 -fno-inline -fipa-pta -msse2
-funsafe-math-optimizations -ftree-vectorizer-verbose=1 -fopt-info-optimized=logs/optOpt.txt -shared -fPIC singleThread.cpp
By using -shared -fPIC you are creating a DSO (dynamic shared object). If you try to execute this file, you'll get an invalid PC (program counter) and your program will crash immediately. You must compile your code without -shared -fPIC (and use -pie -fPIE if you need a position-independent executable).
Also, for compiling C++ code you should normally use g++ instead of gcc.
The given pragmas should not have any effect on your code, as these ones are only understood by Microsoft Visual Studio. Add -Wall to your compile options and gcc will show you the respective warnings.
In any case, you should get rid of vendor-specific pragmas and use standardized solutions like OpenMP instead (compile with -fopenmp). That way, you are a step closer to writing compiler-independent code.
As for the parallelized loops, you should make sure you don't run into race conditions or other synchronization failures. For example, to compute a sum, #pragma omp parallel for reduction(+: sum) is your friend in OpenMP (reference sheet).
Disclaimer: I have used gcc 7.3.0 on x86_64 (CentOS Linux).
Related
I was trying to write an array of ints to a binary file and then read the just written file and write it in another array (of the same size of the first), but i don't understand why the second array contains the correct numbers only until 25 (its 26th element, since numbers i wrote in the first start from 0).
A very weird thing i noticed, is that if i replace 'x[i] = i;' with 'x[i] = i * 3;' in the first for cycle in main, i obtain the correct numbers printed until 279 instead of 25 (and 25*3 != 279).
How could I write/read binary files to/from raw arrays in C++?
main.cpp:
#include "arrays_binary_files.hpp"
#include <iostream>
int main()
{
int x[1000];
//#if 0
for (size_t i{ 0 }; i != sizeof x / sizeof * x; ++i)
x[i] = i;
//#endif
int y[sizeof x / sizeof *x];
std::cout << "scrivo x su x.bin? [invio per continuare] _";//write?
(void)getchar();
std::cout << "\nscrivo x su x.bin...";//writing...
arrToBinFile(x, sizeof x / sizeof * x, "x.bin");
std::cout << "\nscritto x su x.bin";//written!
std::cout << "\n\nscrivo x.bin su y? [invio per continuare] _";//read?
(void)getchar();
std::cout << "scrivo x.bin su y...";//reading...
binFileToArr("x.bin", y, sizeof y / sizeof * x);
std::cout << "\nscritto x.bin su y";//read!
std::cout << "\n\nvisualizzo y? [invio per continuare] _";//show?
for (size_t i{ 0 }; i != sizeof y / sizeof * y; ++i) {
std::cout << '\n' << y[i];//stampa bene solo fino a 25
(void)getchar();
}
return 0;
}
arrays_binary_files.hpp:
#ifndef arrays_binary_files_hpp_included
#define arrays_binary_files_hpp_included
#include <fstream>
#include <filesystem>
//namespace {
/*
* gives internal linkage (like specifying static for everything), so that each
* function is "local" in each translation unit which is pasted in by #include
*/
//i tried to inline in order to debug using breakpoints, but i didn't understand the same where the bug is
inline char arrToBinFile(const int inputArray[], const size_t inputArrayLength, const std::string& fileName) {
std::ofstream outputData;
outputData.open(fileName);
if (outputData) {
outputData.write(reinterpret_cast<const char*>(inputArray), sizeof(int) * inputArrayLength);
outputData.close();
return 0;
}
else {
outputData.close();
return -1;
}
}
//i tried to inline to debug using breakpoints, but i didn't understand the same where the bug is
inline char binFileToArr(const std::string& fileName, int outputArray[], const size_t outputArrayLength) {
std::ifstream inputData;
inputData.open(fileName);
if (inputData /*&& std::filesystem::file_size(fileName) <= outputArrayLength*/) {
inputData.read(reinterpret_cast<char*>(outputArray), sizeof(int) * outputArrayLength);
inputData.close();
return 0;
}
else {
inputData.close();
return -1;
}
}
//}
#endif
screenshot of the console in case of leaving the main function in main.cpp as it is:
screenshot of the console in case of replacing 'x[i] = i;' with 'x[i] = i * 3;' in the main function in main.cpp:
I am fairly new to c++, I was making a encryptor to imrpove my c++, at first I kept my Cryptographer class in cryptographer.hpp and then added function body in cryptographer.cpp and then included cryptographer.hpp in main.cpp it gave me a compiler error, so I just pasted the code in main.cpp like this
#include <iostream>
#include <map>
class Cryptographer{
public:
int n_factor;
std::string text;
Cryptographer(std::string user_arg, int user_n_factor);
struct cryptographer
{
std::string encrypted_text;
std::string generated_key="";
};
cryptographer crypted_text;
void generate_key();
void encrypt();
void decrypt();
std::string get_key();
std::string get_text();
};
using key_map = std::map<char, std::string[5]>;
void Cryptographer::generate_key(){
for (int _ = 0; _ < 5; _++){
crypted_text.generated_key += rand() % 26 + 65;
}
}
void Cryptographer::encrypt(){
generate_key();
key_map keyHashMap;
for (auto key_letter: crypted_text.generated_key){
int key_letter_int = (int) key_letter;
std::string key_letter_arr[5];
int memory_number = key_letter_int;
for (int index=0; index < 5; index++){
if (memory_number+n_factor > 91){
memory_number = 65;
}else{
key_letter_arr[5] = std::string(1, char (memory_number + n_factor));
memory_number += n_factor;
}
}
keyHashMap.emplace(key_letter, key_letter_arr);
}
for(int index=0; index<text.size(); index++){
int key = index %4;
int key_patter = rand()% 4;
int checking_index = 0;
for (auto &elem: keyHashMap){
if (checking_index == key){
std::cout << elem.second[1];
}
}
}
crypted_text.encrypted_text = "test";
}
Cryptographer::Cryptographer(std::string user_arg, int user_n_factor):
text(user_arg), n_factor(user_n_factor)
{}
int main(){
Cryptographer crypter("hello guys", 3);
crypter.encrypt();
std::cout << crypter.get_text();
return 0;
}
and ran this in my terminal
g++ main.cpp -o test
and it popped this large error
https://hastebin.com/cibeyanoro.cpp
I am on ubuntu 20.04, I also tried removing and reinstalling latest version of g++ but the same error pops up.
g++ error in compiling while using std::string[5] as a type in std::map<>
Arrays cannot be stored as elements of std::map. You can store classes though, and arrays can be members of a class. The standard library provides a template for such array wrapper. It's called std::array. You can use that as the element of the map instead.
Sidenote: std::map isn't the only standard container with such limitation. std::array is the only standard container that can itself contain arrays as elements.
I am trying to write an Octave C++ .oct function that uses the linasm-1.13 library but I cannot seem to get even basic loading of tzdata from /usr/share/zoneinfo/ to work. My simple test function so far is
#include <octave/oct.h>
#include <Time.h> // the linasm-1.13 library
DEFUN_DLD ( tz, args, nargout,
"-*- texinfo -*-\n\
#deftypefn {Function File} {} tz (#var{YYYYMMDDHHMMSS})\n\
\n\
#end deftypefn" )
{
octave_value_list retval_list ;
unsigned int tz ;
const char *ny_time = "/usr/share/zoneinfo/America/New_York" ;
tz = Time::LoadTimeZone( ny_time ) ;
return retval_list ;
which, on compiling with mkoctfile, gives this error
>> mkoctfile tz.cc
tz.cc: In function ‘octave_value_list Ftz(const octave_value_list&, int)’:
tz.cc:24:34: error: cannot call member function ‘unsigned int Time::LoadTimeZone(const char*)’ without object
tz = Time::LoadTimeZone( ny_time ) ;
^
warning: mkoctfile: building exited with failure status
My understanding of this is that ny_time is not an object that is recognised, but I have tried casting ny_time as a string literal as detailed in this accepted SO answer.
I am doing things this way because the input for LoadTimeZone according to the linasm page should be a "path to tzfile, which describes required time zone." Where am I going wrong?
I think you have to #include "source.cc" files also, not just the #include "header.h" files. In your case, I guess you should add: #include "Time.cc" or something like that. I don't know why but this worked for me when working with Rafat's Hussain wavemin library, but I had only 4 files, it must be incredibly tedious with lots of files.
This is what I did (it's a modified version of the test code provided by Rafat with his library).
#include "wavemin.h"
#include "waveaux.h"
#include "wavemin.cc"
#include "waveaux.cc"
#include <octave/oct.h>
double ensayo();
double absmax(double *array, int N);
DEFUN_DLD(helloctave2, argv, , "Usage: hello()"){
wave_object obj;
wt_object wt;
double *inp, *out, *diff;
int N, i, J;
char *name = "db4";
obj = wave_init(name);// Initialize the wavelet
N = 14; //Length of Signal
inp = (double*)malloc(sizeof(double)* N); //Input signal
out = (double*)malloc(sizeof(double)* N);
diff = (double*)malloc(sizeof(double)* N);
//wmean = mean(temp, N);
for (i = 0; i < N; ++i) {
inp[i] = i;
}
J = 1; //Decomposition Levels
wt = wt_init(obj, "dwt", N, J);// Initialize the wavelet transform object
setDWTExtension(wt, "sym");// Options are "per" and "sym". Symmetric is the default option
setWTConv(wt, "direct");
dwt(wt, inp);// Perform DWT
//DWT output can be accessed using wt->output vector. Use wt_summary to find out how to extract appx and detail coefficients
for (i = 0; i < wt->outlength; ++i) {
octave_stdout << wt->output[i];
octave_stdout << "\n";
}
idwt(wt, out);// Perform IDWT (if needed)
// Test Reconstruction
for (i = 0; i < wt->siglength; ++i) {
diff[i] = out[i] - inp[i];
}
octave_stdout << absmax(diff, wt->siglength);
octave_stdout << "\n";
octave_value_list retval;
return retval;
}
double
absmax(double *array, int N) {
double max;
int i;
max = 0.0;
for (i = 0; i < N; ++i) {
if (fabs(array[i]) >= max) {
max = fabs(array[i]);
}
}
return max;
}
I'm Java user coming over to C++, and I am having a hard time understanding what is going wrong with this statement. My program has been segfaulting anywhere I put the push_back command. So I'm wondering what exactly is going on.
class Process {
public:
int nice;
int arrivalTime;
int cpuBursts;
list<int> burstList;
Process() {
burstList.push_back(10); // Segfaults here...
}
};
Here is the full code:
#include<iostream>
#include<stdlib.h>
#include<fstream>
#include<list>
#include<string.h>
using namespace std;
int calcTimeslice(int priority);
int calcOriginalPrio(int nice);
int readFile(int ,char **);
int calcPrioBonus(int,int);
void tokenizeAndAdd(char *);
class Bursts {
public:
int isCPUBurst;
int time;
Bursts() {}
// Constructor to make it easier to add to list
Bursts(int tempIsCPU, int tempTime) {
isCPUBurst = tempIsCPU;
time = tempTime;
}
};
class Process {
public:
int nice;
int arrivalTime;
int cpuBursts;
list<int> burstList;
Process() {
burstList.push_back(10);
}
};
int main(int arg, char **argv) {
// This is if the file was not correctly read into the program
// or it doesnt exist ...
if(readFile(arg,argv)==-1) {
cout << "File could not be read. \n";
return -1;
}
//cout << "Original Calc Whatever: " << calcOriginal(19) << '\n';
return 0;
}
/*
* Calculates the timeslice based on the priority
*/
int calcTimeslice(int priority) {
double finalCalc;
// This is the given function in the prompt
finalCalc = ( (1 - (priority / 140)) * 290 + (.5) ) + 10;
// Cast to int, this will be a truncate
return ((int)finalCalc);
}
int readFile(int arg, char **argv) {
char *temp,*pointer;
int endOfFile = 1;
// While its not the end of the file
while(endOfFile) {
// Read in the input from stdin
fgets(temp,256,stdin);
// Check to see if this line had a * in it
if(*temp =='*')
endOfFile = 0;
else
tokenizeAndAdd(temp);
}
return 0;
}
void tokenizeAndAdd(char *string) {
char *token = strtok(string," \n");
int i = 0;
Process p;
while(token != NULL) {
cout << token << endl;
if(i>2) { // If it is odd (CPU burst)
if(i%2 == 1) {
int tempInt = atoi(token);
//p.burstList.push_back(tempInt);
}
else { // If it is even (IO burst)
int tempInt = atoi(token);
//p.burstLis.push_back(tempInt);
}
}
else if(i==0)
p.nice = atoi(token);
else if(i==1)
p.arrivalTime = atoi(token);
else if(i==2)
p.cpuBursts = atoi(token);
token = strtok(NULL," \n");
i++;
}
//cout << p.nice << " " << p.arrivalTime << " " << p.cpuBursts << "\n";
//i = 0;
//cout << p.burstList.size() << "\n";
// cout <<
//}
return;
}
/*
* Calculates and returns the original priority based on the nice number
* provided in the file.
*/
int calcOriginalPrio(int nice) {
double finalCalc;
// This is the given function from the prompt
finalCalc = (( nice + 20 ) / 39 ) * 30 + 105.5;
// Cast to int, this is a truncate in C++
return ((int)finalCalc);
}
/*
* Calculates the bonus time given to a process
*/
int calcPrioBonus(int totalCPU, int totalIO) {
double finalCalc;
// How to calculate bonus off of the prompt
if(totalCPU < totalIO)
finalCalc = ( (1 - (totalCPU / (double)totalIO)) * (-5)) - .5;
else
finalCalc = ( (1 - (totalIO / (double)totalCPU)) * 5) + .5;
// Cast to int
return ((int)finalCalc);
}
You are using temp uninitialized in the following code:
char *temp;
...
while(endOfFile) {
fgets(temp,256,stdin);
...
This can have any side effect, since it most likely destroys your stack or parts of the heap memory. It could fail immediately (when calling the fgets() function), it could fail later (as in your sample) or it could even run fine - maybe until you upgrade your OS, your compiler or anything else, or until you want to run the same executable on another machine. This is called undefined behaviour.
You need to allocate space for the temp variable, not a pointer only. Use something like
char temp[256];
...
while(endOfFile) {
fgets(temp,256,stdin);
...
For more information, see the fgets() documentation. The first parameter is a pointer to a char array - that is where fgets() will store the bytes which have been read. In your code, you pass an uninitialized pointer which means that fgets() will store the bytes to an undefined memory location - this is catched by the OS which terminates your application with a segmentation fault.
BTW: You should consider enabling pedantic warnings when compiling - I compiled with
g++ -Wall -pedantic -o list list.cpp
which gave me the following warning:
list.cpp: In function 'int readFile(int, char**)':
list.cpp:76:26: warning: 'temp' may be used uninitialized in this function [-Wuninitialized]
This is probably not the actual code with the error you report. But here is one of the problems with give you UB.
char *temp,*pointer; // uninicialized pointer char temp[1000]; could work?
int endOfFile = 1;
// While its not the end of the file
while(endOfFile) {
// Read in the input from stdin
fgets(temp,256,stdin);
The last function call will read a maximum of 256 bytes from stdin and will write it in the memory pointed by pointer tmp. So, you need to first "prepare" that memory. But with char *tmp; you only define a pointer, with no defined value, that is, with point to some possible unexisting or illegal/inaccessible for you memory. In contrary, char tmp[1000]; will define in the "stack memory" a block of 1000 bytes, with you can point to using simple the variable tmp. Hope this is clear for you.
EDIT:
I don't know why that would change the behavior of the list,
You are right. That is Undefined Behavior (UB). When you write in some unknown memory (pointed by an uninitialized pointer) you may overwrite data or even code that will broke somewhere the correct function of your program in an unpredicted way.
You will need to learn more about pointers but better you use std::string, and look how parse your file using string and stringstream. That will manage for you the memmory,
I am working on a C++ program that uses some external C libraries. As far as I can tell though that is not the cause of the problem, and the issue is with my C++ code. The program runs fine with no errors or anything on my test datasets, but after going through nearly the entire full dataset, I get a segfault. Running GDB gives me this segfault:
(gdb) run -speciesMain=allMis1 -speciesOther=anoCar2 -speciesMain=allMis1 -speciesOther=anoCar2 /hive/data/genomes/allMis1/bed/lastz.anoCar2/mafRBestNet/*.maf.gz
Starting program: /cluster/home/jstjohn/bin/mafPairwiseSyntenyDecay -speciesMain=allMis1 -speciesOther=anoCar2 -speciesMain=allMis1 -speciesOther=anoCar2 /hive/data/genome
s/allMis1/bed/lastz.anoCar2/mafRBestNet/*.maf.gz
Detaching after fork from child process 3718.
Program received signal SIGSEGV, Segmentation fault.
0x0000003009cb7672 in __gnu_cxx::__exchange_and_add(int volatile*, int) () from /usr/lib64/libstdc++.so.6
(gdb) up
#1 0x0000003009c9db59 in std::basic_string, std::allocator >::~basic_string() () from /usr/lib64/libstdc++.so.6
(gdb) up
#2 0x00000000004051e7 in PairAlnInfo::~PairAlnInfo (this=0x7fffffffcd70, __in_chrg=) at mafPairwiseSyntenyDecay.cpp:37
(gdb) up
#3 0x0000000000404eb0 in main (argc=2, argv=0x7fffffffcf78) at mafPairwiseSyntenyDecay.cpp:260
It looks like something is going on with a double free of my PairAlnInfo class. The weird thing is that I don't define a destructor, and I am not allocating anything with new. I have tried this both with g++44 and g++4.1.2 on the linux machine and have had the same results.
To make things even weirder, on my linux box (with more available RAM and everything, not that RAM is an issue with this program, but it is a beefy system) the seg fault happens as described above before the program reaches the loop to print output. On my much smaller macbook air using either g++ or clang++, the program still segfaults, but it doesn't do that until after the results are printed, right before the final return(0) out of the main function. Here is what the GDB trace looks like on my mac running on the same file after compiling with Mac's default g++4.2:
(more results)...
98000 27527 162181 0.83027
99000 27457 161467 0.829953
100000 27411 160794 0.829527
Program received signal EXC_BAD_ACCESS, Could not access memory.
Reason: KERN_INVALID_ADDRESS at address: 0x00004a2c00106077
0x00007fff9365a6e5 in std::string::_Rep::_M_dispose ()
(gdb) up
#1 0x00007fff9365a740 in std::basic_string, std::allocator >::~basic_string ()
(gdb) up
#2 0x0000000100003938 in main (argc=1261, argv=0x851d5fbff533) at mafPairwiseSyntenyDecay.cpp:301
(gdb)
Just in case you didn't notice the time of my posting, it's about 2:30AM now... I have been hacking away at this problem for about 10 hours now. Thanks so much for taking the time to look at this and help me out! The code and some instructions for replicating my situation follow.
If you are interested in downloading and installing the whole thing with dependencies then download my KentLib repository, make in the base directory, and then go to examples/mafPairwiseSyntenyDecay and run make there. An example (rather large) that causes the bug I am discussing is the gziped file available here: 100Mb file that the program crashes on. Then execute the program with these arguments -speciesMain=allMis1 -speciesOther=anoCar2 anoCar2.allMis1.rbest.maf.gz.
/**
* mafPairwiseSyntenyDecay
* Author: John St. John
* Date: 4/26/2012
*
* calculates the mean synteny decay in different range bins
*
*
*/
//Kent source C imports
extern "C" {
#include "common.h"
#include "options.h"
#include "maf.h"
}
#include <map>
#include <string>
#include <set>
#include <vector>
#include <sstream>
#include <iostream>
//#define NDEBUG
#include <assert.h>
using namespace std;
/*
Global variables
*/
class PairAlnInfo {
public:
string oname;
int sstart;
int send;
int ostart;
int oend;
char strand;
PairAlnInfo(string _oname,
int _sstart, int _send,
int _ostart, int _oend,
char _strand):
oname(_oname),
sstart(_sstart),
send(_send),
ostart(_ostart),
oend(_oend),
strand(_strand){}
PairAlnInfo():
oname("DUMMY"),
sstart(-1),
send(-1),
ostart(-1),
oend(-1),
strand(-1){}
};
vector<string> &split(const string &s, char delim, vector<string> &elems) {
stringstream ss(s);
string item;
while(getline(ss, item, delim)) {
elems.push_back(item);
}
return(elems);
}
vector<string> split(const string &s, char delim) {
vector<string> elems;
return(split(s, delim, elems));
}
#define DEF_MIN_LEN (200)
#define DEF_MIN_SCORE (200)
typedef map<int,PairAlnInfo> PairAlnInfoByPos;
typedef map<string, PairAlnInfoByPos > ChromToPairAlnInfoByPos;
ChromToPairAlnInfoByPos pairAlnInfoByPosByChrom;
void usage()
/* Explain usage and exit. */
{
errAbort(
(char*)"mafPairwiseSyntenyDecay -- Calculates pairwise syntenic decay from maf alignment containing at least the two specified species.\n"
"usage:\n"
"\tmafPairwiseSyntenyDecay [options] [*required options] file1.maf[.gz] ... \n"
"Options:\n"
"\t-help\tPrints this message.\n"
"\t-minScore=NUM\tMinimum MAF alignment score to consider (default 200)\n"
"\t-minAlnLen=NUM\tMinimum MAF alignment block length to consider (default 200)\n"
"\t-speciesMain=NAME\t*Name of the main species (exactly as it appears before the '.') in the maf file (REQUIRED)\n"
"\t-speciesOther=NAME\t*Name of the other species (exactly as it appears before the '.') in the maf file (REQUIRED)\n"
);
}//end usage()
static struct optionSpec options[] = {
/* Structure holding command line options */
{(char*)"help",OPTION_STRING},
{(char*)"minScore",OPTION_INT},
{(char*)"minAlnLen",OPTION_INT},
{(char*)"speciesMain",OPTION_STRING},
{(char*)"speciesOther",OPTION_STRING},
{NULL, 0}
}; //end options()
/**
* Main function, takes filenames for paired qseq reads
* and outputs three files.
*/
int iterateOverAlignmentBlocksAndStorePairInfo(char *fileName, const int minScore, const int minAlnLen, const string speciesMain, const string speciesOther){
struct mafFile * mFile = mafOpen(fileName);
struct mafAli * mAli;
//loop over alignment blocks
while((mAli = mafNext(mFile)) != NULL){
struct mafComp *first = mAli->components;
int seqlen = mAli->textSize;
//First find and store set of duplicates in this block
set<string> seen;
set<string> dups;
if(mAli->score < minScore || seqlen < minAlnLen){
//free here and pre-maturely end
mafAliFree(&mAli);
continue;
}
for(struct mafComp *item = first; item != NULL; item = item->next){
string tmp(item->src);
string tname = split(tmp,'.')[0];
if(seen.count(tname)){
//seen this item
dups.insert(tname);
}else{
seen.insert(tname);
}
}
for(struct mafComp *item1 = first; item1->next != NULL; item1 = item1->next){
//stop one before the end
string tmp1(item1->src);
vector<string> nameSplit1(split(tmp1,'.'));
string name1(nameSplit1[0]);
if(dups.count(name1) || (name1 != speciesMain && name1 != speciesOther)){
continue;
}
for(struct mafComp *item2 = item1->next; item2 != NULL; item2 = item2->next){
string tmp2(item2->src);
vector<string> nameSplit2(split(tmp2,'.'));
string name2 = nameSplit2[0];
if(dups.count(name2) || (name2 != speciesMain && name2 != speciesOther)){
continue;
}
string chr1(nameSplit1[1]);
string chr2(nameSplit2[1]);
char strand;
if(item1->strand == item2->strand)
strand = '+';
else
strand = '-';
int start1,end1,start2,end2;
if(item1->strand == '+'){
start1 = item1->start;
end1 = start1 + item1->size;
}else{
end1 = item1->start;
start1 = end1 - item1->size;
}
if(item2->strand == '+'){
start2 = item2->start;
end2 = start2+ item2->size;
}else{
end2 = item2->start;
start2 = end2 - item2->size;
}
if(name1 == speciesMain){
PairAlnInfo aln(chr2,start1,end1,start2,end2,strand);
pairAlnInfoByPosByChrom[chr1][start1] = aln;
}else{
PairAlnInfo aln(chr1,start2,end2,start1,end1,strand);
pairAlnInfoByPosByChrom[chr2][start2] = aln;
}
} //end loop over item2
} //end loop over item1
mafAliFree(&mAli);
}//end loop over alignment blocks
mafFileFree(&mFile);
return(0);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if(optionExists((char*)"help") || argc <= 1){
usage();
}
int minAlnScore = optionInt((char*)"minScore",DEF_MIN_SCORE);
int minAlnLen = optionInt((char*)"minAlnLen",DEF_MIN_LEN);
string speciesMain(optionVal((char*)"speciesMain",NULL));
string speciesOther(optionVal((char*)"speciesOther",NULL));
if(speciesMain.empty() || speciesOther.empty())
usage();
//load the relevant alignment info from the maf(s)
for(int i = 1; i<argc; i++){
iterateOverAlignmentBlocksAndStorePairInfo(argv[i], minAlnScore, minAlnLen, speciesMain, speciesOther);
}
const int blockSize = 1000;
const int blockCount = 100;
int totalWindows[blockCount] = {0};
int containBreak[blockCount] = {0};
//we want the fraction of windows of each size that contain a break
//
for(ChromToPairAlnInfoByPos::iterator mainChromItter = pairAlnInfoByPosByChrom.begin();
mainChromItter != pairAlnInfoByPosByChrom.end();
mainChromItter++){
//process the alignments shared by this chromosome
//note that map stores them sorted by begin position
vector<int> keys;
for(PairAlnInfoByPos::iterator posIter = mainChromItter->second.begin();
posIter != mainChromItter->second.end();
posIter++){
keys.push_back(posIter->first);
}
for(int i = 0; i < keys.size(); i++){
//first check for trivial window (ie our block)
PairAlnInfo pi1 = mainChromItter->second[keys[i]];
assert(pi1.send > pi1.sstart);
assert(pi1.sstart == keys[i]);
int numBucketsThisWindow = (pi1.send - pi1.sstart) / blockSize;
for(int k = 0; k < numBucketsThisWindow && k < blockCount; k++)
totalWindows[k]++;
for(int j = i+1; j < keys.size(); j++){
PairAlnInfo pi2 = mainChromItter->second[keys[j]];
assert(pi2.sstart == keys[j]);
assert(pi2.send > pi2.sstart);
assert(pi2.sstart > pi1.sstart);
if(pi2.oname == pi1.oname){
int moreToInc = (pi2.send - pi1.sstart) / blockSize;
for(int k = numBucketsThisWindow; k < moreToInc && k < blockCount; k++)
totalWindows[k]++;
numBucketsThisWindow = moreToInc; //so we don't double count
}else{
int numDiscontigBuckets = (pi2.send - pi1.sstart) / blockSize;
for(int k = numBucketsThisWindow; k < numDiscontigBuckets && k < blockSize; k++){
containBreak[k]++;
totalWindows[k]++;
}
numBucketsThisWindow = numDiscontigBuckets;
}
if((keys[j] - keys[i]) >= (blockSize * blockCount)){
//i = j;
break;
}
}
}
}
cout << "#WindowSize\tNumContainBreak\tNumTotal\t1-(NumContainBreak/NumTotal)" << endl;
for(int i = 0; i < blockCount; i++){
cout << (i+1)*blockSize << '\t';
cout << containBreak[i] << '\t';
cout << totalWindows[i] << '\t';
cout << (totalWindows[i] > 0? 1.0 - (double(containBreak[i])/double(totalWindows[i])): 0) << endl;
}
return(0);
} //end main()
Try running your program under valgrind. This will give you a report of possibly or actually lost memory, uninitialised, etc.
Your issues are probably due to due memory corruption occurring at some point in the program sometime prior to the actual errors you are seeing.
One potential issue in the code you posted is the loop:
for(int k = numBucketsThisWindow; k<numDiscontigBuckets && k < blockSize; k++){
which uses blockSize instead of the correct blockCount which leads to a possible overflow of both the totalWindows[] and containBreak[] arrays. This would overwrite the speciesMain and speciesOther strings, alonth with anything else on the stack, which might very well result in the errors you are seeing.