I have written a small perl "hack" to replace 1's with alphabets in a range of columns in a tab delimited file. The file looks like this:
Chr Start End Name Score Strand Donor Acceptor Merged_Transcript Gencode Colon Heart Kidney Liver Lung Stomach
chr10 100177483 100177931 . . - 1 1 1 1 1 0 1 1 0 0
chr10 100178014 100179801 . . - 1 1 1 1 1 1 1 1 1 0
chr10 100179915 100182125 . . - 1 1 1 1 1 1 1 0 1 0
chr10 100182270 100183359 . . - 1 1 1 1 0 0 1 0 1 0
chr10 100183644 100184069 . . - 1 1 1 1 0 0 1 0 1 0
The gola is to take columns 11 through 16 and append letters A to Z if a value of 1 is seen in those columns. My code so far is producing an empty output and this is my first time doing regular expressions.
cat infile.txt \
| perl -ne '#alphabet=("A".."Z");
$is_known_intron = 0;
$is_known_donor = 1;
$is_known_acceptor = 1;
chomp;
$_ =~ s/^\s+//;
#d = split /\s+/, $_;
#d_bool=#d[$11-$16];
$ct=1;
$known_intron = $d[$10];
$num_of_overlapping_gene = $d[$9];
$known_acceptor = $d[$8];
$known_donor = $d[$7];
$k="";
if (($known_intron == $is_known_intron) and ($known_donor == $is_known_donor) and ($known_acceptor == $is_known_acceptor)) {
for ($i = 0; $i < scalar #d_bool; $i++){
$k.=$alphabet[$i] if ($d_bool[$i])
}
$alphabet_ct{$k}+=$ct;
}
END
{
foreach $k (sort keys %alphabet_ct){
print join("\t", $k, $alphabet_ct{$k}), "\n";
}
} '\
> Outfile.txt
What should I be doing instead?
Thanks!
* Edit *
Expected Output
ABCD 45
BCD 23
ABCDEF 1215
so on and so forth.
I converted your code into a script for ease of debugging. I've put comments in the code to point out dodgy bits:
use strict;
use warnings;
my %alphabet_ct;
my #alphabet = ( "A" .. "Z" );
my $is_known_intron = 0;
my $is_known_donor = 1;
my $is_known_acceptor = 1;
while (<DATA>) {
# don't process the first line
next unless /chr10/;
chomp;
# this should remove whitespace at the beginning of the line but is doing nothing as there is none
$_ =~ s/^\s+//;
my #d = split /\s+/, $_;
# the range operator in perl is .. (not "-")
my #d_bool = #d[ 10 .. 15 ];
my $known_intron = $d[9];
my $known_acceptor = $d[7];
my $known_donor = $d[6];
my $k = "";
# this expression is false for all the data in the sample you provided as
# $is_known_intron is set to 0
if ( ( $known_intron == $is_known_intron )
and ( $known_donor == $is_known_donor )
and ( $known_acceptor == $is_known_acceptor ) )
{
for ( my $i = 0; $i < scalar #d_bool; $i++ ) {
$k .= $alphabet[$i] if $d_bool[$i];
}
# it is more idiomatic to write $alphabet_ct{$k}++;
# $alphabet_ct{$k} += $ct;
$alphabet_ct{$k}++;
}
}
foreach my $k ( sort keys %alphabet_ct ) {
print join( "\t", $k, $alphabet_ct{$k} ) . "\n";
}
__DATA__
Chr Start End Name Score Strand Donor Acceptor Merged_Transcript Gencode Colon Heart Kidney Liver Lung Stomach
chr10 100177483 100177931 . . - 1 1 1 1 1 0 1 1 0 0
chr10 100178014 100179801 . . - 1 1 1 1 1 1 1 1 1 0
chr10 100179915 100182125 . . - 1 1 1 1 1 1 1 0 1 0
chr10 100182270 100183359 . . - 1 1 1 1 0 0 1 0 1 0
chr10 100183644 100184069 . . - 1 1 1 1 0 0 1 0 1 0
With $is_known_intron set to 1, the sample data gives the results:
ABCDE 1
ABCE 1
ACD 1
CE 2
I have recently noticed that a quick script I had written in Perl that was designed to be used on sub 10MB files has been modified, re-tasked and used in 40MB+ text files with significant performance issues in a batch environment.
The jobs have been running for about 12 hours per run when encountering a large text file and I am wondering how do I improve the perfomance of the code? Should I slurp the file into memory and if I do it will break the jobs reliance on the line numbers in the file. Any constructive thought would be greatly appreciated, I know the job is looping through the file too many times but how to reduce that?
#!/usr/bin/perl
use strict;
use warnings;
my $filename = "$ARGV[0]"; # This is needed for regular batch use
my $cancfile = "$ARGV[1]"; # This is needed for regular batch use
my #num =();
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
push (#num, $.) if (/^P\|/)
}
close FILE;
my $start;
my $end;
my $loop = scalar(#num);
my $counter =1;
my $test;
open (OUTCANC, ">>$cancfile") || error ("Could not open file: ($!)");
#Lets print out the letters minus the CANCEL letters
for ( 1 .. $loop )
{
$start = shift(#num) if ( ! $start );
$end = shift(#num);
my $next = $end;
$end--;
my $exclude = "FALSE";
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
my $line = $_;
$test = $. if ( eof );
if ( $. == $start && $line =~ /^P\|[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]\|1I\|IR\|/)
{
print OUTCANC "$line";
$exclude = "TRUECANC";
next;
}
if ( $. >= $start && $. <= $end && $exclude =~ "TRUECANC")
{
print OUTCANC "$line";
} elsif ( $. >= $start && $. <= $end && $exclude =~ "FALSE"){
print $_;
}
}
close FILE;
$end = ++$test if ( $end < $start );
$start = $next if ($next);
}
#Lets print the last letter in the file
my $exclude = "FALSE";
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
my $line = $_;
if ( $. == $start && $line =~ /^P\|[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]\|1I\|IR\|/)
{
$exclude = "TRUECANC";
next;
}
if ( $. >= $start && $. <= $end && $exclude =~ "TRUECANC")
{
print OUTCANC "$line";
} elsif ( $. >= $start && $. <= $end && $exclude =~ "FALSE"){
print $_;
}
}
close FILE;
close OUTCANC;
#----------------------------------------------------------------
sub message
{
my $m = shift or return;
print("$m\n");
}
sub error
{
my $e = shift || 'unknown error';
print("$0: $e\n");
exit 0;
}
There are some things that could speed the script up, like removing unneccessary regex usage.
/^P\|/ is equivalent to "P|" eq substr $_, 0, 2.
$foo =~ "BAR" could be -1 != index $foo, "BAR".
Then there is some repeated code. Factoring that out into a sub will not increase performance per se, but makes it easier to reason about the behaviour of the script.
There are a lot of unneccessary stringifications like "$filename" โ $filename alone is sufficient.
But the worst offender would be this:
for ( 1 .. $loop ) {
...
open FILE, "<", $filename or ...
while (<FILE>) {
...
}
...
}
You only need to read that file in once, preferably into an array. You can the loop over the indices:
for ( 1 .. $loop ) {
...
for my $i (0 .. $#file_contents) {
my $line = $file_contents[$i];
... # swap $. for $i, but avoid off-by-one error
}
...
}
Disk IO is slow, so cache where you can!
I also see that you are using the $exclude variable as a boolean with the values FALSE and TRUECANC. Why not 0 and 1, so you can use it directly in a conditional?
You can factor out common tests in if/elsif:
if (FOO && BAR) { THING_A }
elsif (FOO && BAZ) { THING_B }
should be
if (FOO) {
if (BAR) { THING_A }
elsif (BAZ) { THING_B }
}
The $. == $start && $line =~ /^P\|.../ test may be silly, because $start contains only the numbers of lines that start with P| โ so the regex may be sufficient here.
Edit
If I have understood the script correctly then the following should yield a significant performance increase:
#!/usr/bin/perl
use strict;
use warnings;
my ($filename, $cancfile) = #ARGV;
open my $fh, "<", $filename or die "$0: Couldn't open $filename: $!";
my (#num, #lines);
while (<$fh>)
{
push #lines, $_;
push #num, $#lines if "P|" eq substr $_, 0, 2;
}
open my $outcanc, ">>", $cancfile or die "$0: Couldn't open $cancfile: $!";
for my $i ( 0 .. $#num )
{
my $start = $num[$i];
my $end = ($num[$i+1] // #lines) - 1;
# pre v5.10:
# my $end = (defined $num[$i+1] ? $num[$i+1] : #lines) - 1
if ($lines[$start] =~ /^P[|][0-9]{9}[|]1I[|]IR[|]/) {
print {$outcanc} #lines[$start .. $end];
} else {
print STDOUT #lines[$start .. $end];
}
}
The script is cleaned up. The file is cached in an array. Only the parts of the array are iterated that are actually needed โ we are down to O(n) from the previous O(n ยท m).
For your future scripts: Proving behaviour around loops and mutating variables is not impossible, but tedious and annoying. Realizing that
for (1 .. #num) {
$start = shift #num unless $next; # aka "do this only in the first iteration"
$next = shift #num:
$end = $next - 1:
while (<FH>) {
...
$test = $. if eof
...
}
$end = ++test if $end < $start;
$start = $next if $next;
}
is actually all about circumventing a possible undef in the 2nd shift takes some time. Instead of testing for eof in the inner loop, we can just pick the line number after the loop, so we don't need $test. Then we get:
$start = shift #num;
for my $i (1 .. #num) {
$end = $num[$i] - 1:
while (<FH>) { ... }
$end = $. + 1 if $end < $start; # $end < $start only true if not defined $num[$i]
$start = $num[$i] if $num[$i];
}
After translating $i down by one we confine the out-of-bounds problem to one point only:
for my $i (0 .. $#num) {
$start = $num[$i];
$end = $num[$i+1] - 1; # HERE: $end = -1 if $i == $#num
while (<FH>) { ... }
}
$end = $. + 1 if $end < $start;
After replacing the file reading with an array (careful, there is a difference of one between the array index and the line number), we see that the final file reading loop can be avoided if we pull that iteration into the for loop, because we know how many lines there are in total. So to say, we do
$end = ($num[$i+1] // $last_line_number) - 1;
Hopefully my cleaned up code is indeed equivalent to the original.
I have some text that looks like this:
(something1)something2
However something1 and something2 might also have some parentheses inside them such as
(some(thing)1)something(2)
I want to extract something1 (including internal parentheses if there are any) to a variable. Since I can count on the text always starting with an opening parentheses, I'm hoping that I can do something where I match the first parenthesis to the correct closing parentheses, and extract the middle.
Everything I have tried so far has the potential to match the wrong ending parentheses.
If you have perl, the:
perl -MText::Balanced -nlE 'say [Text::Balanced::extract_bracketed( $_, "()" )]->[0]' <<EOF
(something1)something2
(some(thing)1)something(2)
(some(t()()hing)()1)()something(2)
EOF
will prints
(something1)
(some(thing)1)
(some(t()()hing)()1)
Since this is apparently something that is impossible with regular expressions, I have resorted to pickup the the characters 1 by 1:
first=""
count=0
while test -n "$string"
do
char=${string:0:1} # Get the first character
if [[ "$char" == ")" ]]
then
count=$(( $count - 1 ))
fi
if [[ $count > 0 ]]
then
first="$first$char"
fi
if [[ "$char" == "(" ]]
then
count=$(( $count + 1 ))
fi
string=${string:1} # Trim the first character
if [[ $count == 0 ]]
then
second="$string"
string=""
fi
done
You can do it with perl:
echo "(some(thing)1)something(2)" | perl -ne '$_ =~ /(\((?:\(.*\)|[^(])*\))|\w+/s; print $1;'
awk can do it:
#!/bin/awk -f
{
for (i=1; i<=length; ++i) {
if (numLeft == 0 && substr($0, i, 1) == "(") {
leftPos = i
numLeft = 1
} else if (substr($0, i, 1) == "(") {
++numLeft
} else if (substr($0, i, 1) == ")") {
++numRight
}
if (numLeft && numLeft == numRight) {
print substr($0, leftPos, i-leftPos+1)
next
}
}
}
Input:
(something1)something2
(some(thing)1)something(2)
Output:
(something1)
(some(thing)1)
I have written a Perl script for the following bioinformatics question, but unfortunately there is a problem with the output.
Question
1) From a file of 40,000 unique sequences, unique meaning the sequence id numbers, extract the following pattern
$gpat = [G]{3,5}; $npat = [A-Z]{1,25};<br>
$pattern = $gpat.$npat.$gpat.$npat.$gpat.$npat.$gpat;
2) For each sequence, find if $pattern occurs between the values of
0-100
100-200
200-300
...
900-1000
1000
If a certain sequence is <1000 characters long, even then the division must be maintained i.e. 0-100,100-200 etc.
The Issue
The main issue I am having is with counting the number of times $pattern occurs for each sequence subdivision and then adding its count for all the sequences.
For example, for sequence 1, say $pattern occurs 5 times at a length >1000. For sequence 2, say $pattern occurs 3 times at length>1000. Then total count should be 5+3 =8.
Instead, my result is coming like : (5+4+3+2+1) + (3+2+1) = 21 i.e. a cumulative total.
I am facing the same issue with the count for the first 10 subdivisions of 100 characters each.
I would be grateful if a correct code could be provided for this calculation.
The code I have written is as under. It is heavily derived from Borodin's answer to one of my previous questions here : Perl: Search a pattern across array elements
His answer is here: https://stackoverflow.com/a/11206399/1468737
The Code :
use strict;
use warnings;
my $gpat = '[G]{3,5}';
my $npat = '[A-Z]{1,25}';
my $pattern = $gpat.$npat.$gpat.$npat.$gpat.$npat.$gpat;
my $regex = qr/$pattern/i;
open my $fh, '<', 'small.fa' or die $!;
my ($id, $seq);
my #totals = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0); #intialize the #total arrays...
#..it should contain 10 parts for 10 divisions upto 1000bp
my #thousandcounts =(0); #counting total occurrences of $pattern at >1000 length
while (<$fh>) {
chomp;
if (/^>(\w+)/) {
process_seq($seq) if $id;
$id = $1;
$seq = '';
print "$id\n";
}
elsif ($id) {
$seq .= $_;
process_seq($seq) if eof;
}
}
print "Totals : #totals\n";
print "Thousand Counts total : #thousandcounts\n";
##**SUBROUTINE**
sub process_seq {
my $sequence = shift #_;
my $subseq = substr $sequence,0,1000;
my $length = length $subseq;
print $length,"\n";
if ($length eq 1000) {
my #offsets = map {sprintf '%.0f', $length * $_/ 10} 1..10;
print "Offsets of 10 divisions: #offsets\n";
my #counts = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
my #count = (0);
while ($sequence =~ /$regex/g) {
my $place = $-[0];
print $place,"\n\n";
if ($place <=1000){
for my $i (0..9) {
next if $place >= $offsets[$i];
$counts[$i]++;
last;
}
}
print "Counts : #counts\n\n";
$totals[$_] += $counts[$_] for 0..9;
if ($place >1000){
for my $i(0){
$count[$i]++;
last;
}
} print "Count greater than 1000 : #count\n\n";
$thousandcounts[$_] += $count[$_] for 0;
}
}
#This region of code is for those sequences whose total length is less than 1000
#It is working great ! No issues here
elsif ($length != 1000) {
my $substr = join ' ', unpack '(A100)*', $sequence;
my #offsets = map {sprintf '%.0f', $length * $_/ ($length/100)} 1..10;
print "Offsets of 10 divisions: #offsets\n";
my #counts = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0,);
while ($sequence =~ /$regex/g) {
my $place = $-[0];
print "Place : $place","\n\n";
for my $i (0..9) {
next if $place >= $offsets[$i];
$counts[$i]++; .
last;
}
}
print "Counts : #counts\n\n";
$totals[$_] += $counts[$_] for 0..9;
}
}#subroutine ends
I am also attaching a small segment of the file I am working with. This one is titled small.fa and I have been experimenting with this file only before moving onto to the bigger file containing >40,000 sequences.
>NR_037701 1
aggagctatgaatattaatgaaagtggtcctgatgcatgcatattaaaca
tgcatcttacatatgacacatgttcaccttggggtggagacttaatattt
aaatattgcaatcaggccctatacatcaaaaggtctattcaggacatgaa
ggcactcaagtatgcaatctctgtaaacccgctagaaccagtcatggtcg
gtgggctccttaccaggagaaaattaccgaaatcactcttgtccaatcaa
agctgtagttatggctggtggagttcagttagtcagcatctggtggagct
gcaagtgttttagtattgtttatttagaggccagtgcttatttagctgct
agagaaaaggaaaacttgtggcagttagaacatagtttattcttttaagt
gtagggctgcatgacttaacccttgtttggcatggccttaggtcctgttt
gtaatttggtatcttgttgccacaaagagtgtgtttggtcagtcttatga
cctctattttgacattaatgctggttggttgtgtctaaaccataaaaggg
aggggagtataatgaggtgtgtctgacctcttgtcctgtcatggctggga
actcagtttctaaggtttttctggggtcctctttgccaagagcgtttcta
ttcagttggtggaggggacttaggattttatttttagtttgcagccaggg
tcagtacatttcagtcacccccgcccagccctcctgatcctcctgtcatt
cctcacatcctgtcattgtcagagattttacagatatagagctgaatcat
ttcctgccatctcttttaacacacaggcctcccagatctttctaacccag
gacctacttggaaaggcatgctgggtctcttccacagactttaagctctc
cctacaccagaatttaggtgagtgctttgaggacatgaagctattcctcc
caccaccagtagccttgggctggcccacgccaactgtggagctggagcgg
gagggaggagtacagacatggaattttaattctgtaatccagggcttcag
ttatgtacaacatccatgccatttgatgattccaccactccttttccatc
tcccagaagcctgctttttaatgcccgcttaatattatcagagccgagcc
tggaatcaaactgcctctttcaaaacctgccactatatcctggctttgtg
acctcagccaagttgcttgactattctcagtctcagtttctgcacctgtc
aaatagggtttatgttaacctaactttcagggctgtcaggattaaatgag
catgaaccacataaaatgtttggtgtatagtaagtgtacagtaaatactt
ccattatcagtccctgcaattctatttttcttccttctctacacagcccc
tgtctggctttaaaatgtcctgccctgctttttatgagtggataccccca
gccctatgtggattagcaagttaagtaatgacactcagagacagttccat
ctttgtccataacttgctctgtgatccagtgtgcatcactcaaacagact
atctcttttctcctacaaaacagacagctgcctctcagataatgttgggg
gcataggaggaatgggaagcccgctaagagaacagaagtcaaaaacagtt
gggttctagatgggaggaggtgtgcgtgcacatgtatgtttgtgtttcag
gtcttggaatctcagcaggtcagtcacattgcagtgtgtcgcttcacctg
gctccctcttttaaagattttccttccctctttccaactccctgggtcct
ggatcctccaacagtgtcagggttagatgccttttatgggccacttgcat
tagtgtcctgatagaggcttaatcactgctcagaaactgccttctgccca
ctggcaaagggaggcaggggaaatacatgattctaattaatggtccaggc
agagaggacactcagaatttcaggactgaagagtatacatgtgtgtgatg
gtaaatgggcaaaaatcatcccttggcttctcatgcataatgcatgggca
cacagactcaaaccctctctcacacacatacacatatacattgttattcc
acacacaaggcataatcccagtgtccagtgcacatgcatacacgcacaca
ttcccttcctaggccactgtattgctttcctagggcatcttcttataaga
caccagtcgtataaggagcccaccccactcatctgagcttatcaaccaat
tacattaggaaagactgtatttcctagtaaggtcacattcagtagtactg
agggttgggacttcaacacagctttttgggggatcataattcaacccatg
acagccactgagattattatatctccagagaataaatgtgtggagttaaa
aggaagatacatgtggtacaaggggtggtaaggcaagggtaaaaggggag
ggaggggattgaactagacacagacacatgagcaggactttggggagtgt
gttttatatctgtcagatgcctagaacagcacctgaaatatgggactcaa
tcattttagtccccttctttctataagtgtgtgtgtgcggatatgtgtgc
tagatgttcttgctgtgttaggaggtgataaacatttgtccatgttatat
aggtggaaagggtcagactactaaattgtgaagacatcatctgtctgcat
ttattgagaatgtgaatatgaaacaagctgcaagtattctataaatgttc
actgttattagatattgtatgtctttgtgtccttttattcatgaattctt
gcacattatgaagaaagagtccatgtggtcagtgtcttacccggtgtagg
gtaaatgcacctgatagcaataacttaagcacacctttataatgacccta
tatggcagatgctcctgaatgtgtgtttcgagctagaaaatccgggagtg
gccaatcggagattcgtttcttatctataatagacatctgagcccctggc
ccatcccatgaaacccaggctgtagagaggattgaggccttaagttttgg
gttaaatgacagttgccaggtgtcgctcattagggaaaggggttaagtga
aaatgctgtataaactgcatgatgtttgcaggcagttgtggttttcctgc
ccagcctgccaccaccgggccatgcggatatgttgtccagcccaacacca
caggaccatttctgtatgtaagacaattctatccagcccgccacctctgg
actccctcccctgtatgtaagccctcaataaaaccccacgtctcttttgc
tggcaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaa
>NR_002714 1
gttatacatctctaccattacctagcctgaaaagccacctcagattcagc
caacaagtaagtgggcattacaggagaagggtacctttcacaagggctgt
aatctaaaatcttggggaagatacagcgtcatctgtccaagaggtgtcag
cagtaacgaagcctcagtagaagccaaagttattttggattactgagcct
gtatagtttccagattctcaagagaaatatatgggaatgtagatatctca
gaggaccttcctgctgtcaggaattcagaggaggaaataaggaaggtaat
aggtgctctgctctcattctctcaaaccctcttccctgtgttttcctata
gagattgctgatttgctccttaagcaagagattcactgctgctcagcatg
gctcagaccaactcatgcttcatgctgatctcctgcctgatgttcctgtc
tctgagccaaggtgagattgttttccccacacatacctcccacaacccca
gccctgaagccctcactctatcctcatgcatatgagttcacttgagaaaa
agcagagtcaagttcaggggttgttttgtgttgttcagtgatatttattg
ctgatctcatcccattcaaaaacatcctgacctccctaaggagttagaga
tggaacttagcataaccctttatcagtgaccactgcagttggcattggtt
tgtcatattaacactactcatgatgggggtgttgaggatgtctgtttgta
gacagtcattagtggaatggggaactgaggggagctttgtgtgtagagaa
actggacaggcttgagaaagaagcctcagtccttcaaggaagaaaaagcc
ataagtaaaagggacaatggggacacttttcatgagcctattcattgtgt
gctcttgtcttgagcaaagacatcttgagagcctataggtaagatgcaga
agggcagaagtgaccaatcgcttcgtgacctataggatccttctattcct
ataaagaatcctcagaagctcctacctcatattttagcctttaccttgcc
ctgagggtctttcttaattgtctctcttttcccaggacaggaggcccatg
ctgagttgcccaaggcccagatcagctgcccagaaggcaccagtgcctaa
ggctcccactgctactactttaatgaagagcatgagacctgggtttatgc
agatgtgagtgaggagagcagtgtgggaagggaggctcacgaagggaggg
gaagctgccactctccagtgtgttcagtggctgatatgagatgagactaa
tcccctccctatccaatcatcagcccaaaactttccaatctactttatcc
catcattcagcacagagatgctggtggtcagtgacagcatcatcagggac
atttctgtgctgtcctttttctgttacatcctctgggagggctcaatatg
tctcccacactttcctccttcactgagtgctccattttcttctccaacag
ctctactgccagaacatgaattcaggtaacctggtgtctgtgctcaccca
ggctgagggtgcctttgtggcttcgctgattaaagagagtggcaccaagg
atagcaatgtctggattggcctccatgacccccaccggatcagtctgctg
catcttctacctcctgattatcaggttccagagggtctgatgtctggcac
ctcaagcatcagtttttactatattatgataaaagcaacctctctataaa
tcatataatgtaaaggatatcaaggttctccataggttcttcgagataag
cttaaagctgaatttcctgtgtgtttcaggcattcacagataaactcatt
ctctgtacttctagggtagcatctttatgtatctattatgtacctcttat
ctattgtgttatcatctctgttatagaagagccttctgtagaccatatag
aaaaagattatagaggaggagaatctactgctggcaattgggaaccgcaa
ggtatactaaataatatatcaacaactaatggccatctaatgctatgctg
gatatgaacttttggggcctcaggaaagaaaaaccaggaactagtttcaa
taatgaggtgtcatggttccctgtggcaaatttagaacgcttatcgtttg
gcaggacacagagaggtaggtgaacattccaggaaagaagcagcttagag
aaaatgtggaggaaataatatgacacttagagaaaaaggaaggtttattc
ttgtcttatgtcttgacctgtttctgagtgcgaacacaaaccaggtgttt
ctgtctctttctgagtcacgtctgcccctgttctggcccttccccatcta
gaactgccattatcagtggagtagtgggtccctggtctcctacaaatcct
gggacattggatccccaagctgtgccaatactgcctactgtgctagcctg
acttcaagctcaggtgaggggcacagaatccacacacttattgccatcct
ctcctatttatctctgaggatcgaccggggactgggatagaggaagggtg
agctcctcattcaggaaatagaggagtgtttcctctttatttttgctgag
tcctgcagccaggagggtaatacactctgatcccctcagtctgaatcttc
tcattgtcttataggattcaagaaatggaaggatgattcttgtaaggaga
agttctcctttgtttgcaagttcaaatactggaggcaattgtaaaatgga
cgtctagaattggtctaccagttactatggagtaaaagaattaaactgga
ccatctctctccatatcaatctggaccatctctcctctgctaaatttgca
tgactgatctttagtatctttacctacctcaatttctggagccctaaaca
ataaaaataaacatgtttcccccat
>NR_003569 1
ctgggacccacgacgacagaaggcgccgatggccgcgcctgctgagccct
gcgcggggcagggggtctggaaccagacagagcctgaacctgccgccacc
agcctgctgagcctgtgcttcctgagaacagcaggggtctgggtaccccc
catgtacctctgggtccttggtcccatctacctcctcttcatccaccacc
atggccggggctacctccggatgttccccactcttcaaagccaagatggt
gcttggattcgccctcatagtcctgtgtacctccagcgtggctgtcgctc
tttggaaaatccaacagggaacgcctgaggccccagaattcctcattcat
cctactgtgtggctcaccacgatgagcttcgcagtgttcctgattcacac
caagaggaaaaagggagtccagtcatctggagtgctgtttggttactggc
ttctctgctttgtcttgccagctaccaacgctgcccagcaggcctccgga
gcgggcttccagagcgaccctgtccgccacctgtccacctacctatgcct
gtctctggtggtggcacagtttgtgctgtcctgcctggcggatcaacccc
ccttcttccctgaagacccccagcagtctaacccctgtccagagactggg
gcagccttcccctccaaagccacgttctggtgggtttctggcctggtctg
gaggggatacaggaggccactgagaccaaaagacctctggtcgcttggga
gagaaaactcctcagaagaacttgtttcccggcttgaaaaggagtggatg
aggaaccgcagtgcagcccgggggcacaacaaggcaatagcatttaaaag
gaaaggcggcagtggcatggaggctccagagactgagcccttcctacggc
aagaagggagccagtggcgcccactgctgaaggccatctggcaggtgttc
cattctaccttcctcctggggaccctcagcctcgtcatcagtgatgtctt
caggttcactgtccccaagctgctcagccttttcctggagtttattggtg
atcccaagcctccagcctggaagggctacctcctcgccgtgctgatgttc
ctctcggcctgcctgcaaacgctgtttgagcagcagaacatgtacaggct
caaggtgctgtagatgaggctgcggtcggccatcactggcctggtgtaca
gaaaggcatccacagcatatctgaagaaatattcagaagttaactaatct
cagatgatttcagcaggagtaaagaagagaaacagactcagaaatgccat
tacaacagttaattatgtcaaatttatcaccctgattgatcacgcagcat
taacctcaagaacgccaagccaagtttttttgacaaatgtgagccaaggt
ttccgaaaaactagcagatatgactgtgacttacaaaatggaaaaagtaa
acgagaaacacaatttgatatgatttaataaaagatttgtttccaccact
tctcctgggaacctcagcacattttctttccactgacagttattatctct
acctttattgaacaaagacacccggaacacagctgctgaggatcagtaaa
gaaaatcattcttttattaataagactgttattagcaggaaaaaaaaatc
catgtttgggagtttgcactgaagttacaggccattttgaagaaatatgg
ctgactagtgccaacattatttcaggcaatttcatgatcaaatgtcttat
taggttgtttaaaatttttatagagattgtaaatcagaactattttctat
ttgccctaaatatttagatgctacagggaaagcagatcaaattaaagggt
actgtgcacatttttttactgggaactcccagggatataaatcatttcgc
ctgcagcatggaattcttcagtacacatgcttgtggaaacattccacgct
ccgccagcacgctcattaaagtgatgatttgggttgcaacaacagtgcca
agtacttcctgtgttcaactggggaccatgtggcaagacccaaagcttcc
ccagagatcctatgggaataagttttttgagccaccatattccattattt
cagcctaaaataacaccatgggacaagaatcagaagacagaggagcagac
aaatgtgtgtagacatgctggaaggaatctttctttttagaaacagggtc
aatatctattaaactttaagatgtgtatctcttgacctggcagtttctgt
atttgagttttaacctactgatatacccatgcatgtgaataaagtatctt
cctgcatgtaacaggatatttaatgtaaccttgattatagttgcaaatgc
tgggaaacgatccaaatgtctttcaatatggcactgattaaataaattat
ggcacagtctcacaatgaaaaacaaatgtagccattaaacagaatgaaat
gggtctagctaaattgaaataggactacctctaagatatgttgttaaaaa
gaaaaaaaagaaagtgcagaggaacaagtatgataccattttgtattttt
taacatatgcaagcgtgattgtgcccacacagaatacctttgaaaataaa
ctcagtatttgcctcagtggataaaaacaagaaccagccttattttcact
gttatatcttttggtgccactttttgaactttttaccatatgtgcatatg
taactttctaaataaattttgtaaaaaaaaaaaaaaaaaa
>NR_002817 2
aactcggtctccactgcactgctggccagacgagggatgttattttgggc
agtgcatctggacttggttcaagtggcaccagccaaatccctgccttact
gacctctcccctggaggagcaggagcagtgctcaaggccgccctgggagg
gctgagaggcaggctctggactggggacacagggatagctgagccccagc
tgggggtggaagctgagccagggacagtcacagaggaacaagatcaagat
gcgctttaactgagaagcccccaaggcagaggctgagaatcagaagacat
ttcagcagacatctacaaatctgaaggacaaaacatggttcaagcatctg
ggcacaggcggtccacccgtggctccaaaatggtctcctggtccgtgata
gcaaagatccaggaaatatggtgcgaggaagatgagaggaagatggcgcg
agagttcctggccgagttcatgagcacatatgtcatgatggagtggctga
ccgggatgctccagctgtgtctcttcgccatcgtggaccaggagaacaac
ccagcactgccaggaacacacgcactggtgataggcatcctcgtggtcat
catcagggtgtaccatggcatgaacacaggatatgccatcaatccgtccc
gggacctgccccccccccccgcatcttcaccttcattgctggttggggca
aactggtcttcaggtactgcccctgcccaggcccattcctttgagatttt
ctgtggggcccctgtgtgttgaggtgtggggggtgatgtgaggggcagca
caggagggtcctgcagagcccccaggtggcctggggagcaggagtgagtc
ccaacatttccccaggccagtagagatacagatcctgcacctgcactgag
tgtcaaccctgtccctgagtcgggctgaggctgaccagggccccgggttg
ggggtgtttcctgggttagcctgaggatgactcctctgctcaaccagtct
tggcccgaggtggatgagggtgctgtcctgggcatcagccccctcagccg
gcctctgcctcttgcctgcagcgatggggagaacttgtggtgggtgccag
tggtggcaccacttctgggtgcctctctaggtggcatcatctacctggtc
ttcattggctccaccatcccacgggagcccctgaaattggaggactctgt
ggcatatgaagaccacgggataaccgtattgcccaagatgggatctcatg
aacccatgatctctccccttaccctcatctccgtgagccctgccaacaga
tcttcagtccaccctgccccacccttacatgaatccatggccctagagca
cttctaagcagagattatttgtgatcccatcccttccccaataaagagaa
gcttgtcccacagcagtacccccacttcctgggggcctcctgtggttggg
cttccctcctgggttcttccaggagctctagggctatgtcttagcccaag
gtgtagaggtgaggcacctcaagtctttcatgccctgggaactggggtgc
cccagggggagaatggggaagagctgacctgcgccctcagtaggaacaag
gtaagatgaaagaatgacagaaacagaatgagggattttcaggcaagggg
gaaggaagggcagttttggtgaaaggactgtagctgactggtggggggct
ggctttggaaatactttgaggggatcctgagactggactctagactctcc
cctggttgttcccttccccgagttctggccggttcttggaccagacaagg
catggcccaagaaggtagatcagaattttttagcctttttttcattagtg
ccttccctagtataattccagattttttttcttaatcacatgaaatttta
ataccacagatatactatacatctgtttatgttctgtatatgttctgtgc
tttatacgtaaaaaagagtaagattttttttcacctccccttttaagaat
cagttttaattcccttgagaatgcttgttatagattgaaggctggtaagg
ggttgggctcctctttcttcttcctggtgccagagtgctcccacatgaag
gaataggaaaggaagatgcaaagagggaaatccttcgaacacatgaagac
acaggaagaggcctcttagggctccaagggctccagggaagcagctgcag
aggttgggtggggtgaggggccaggatccactgaccctggggccaggcag
gaatcactctgttgcctggggctcagaaggcagtatcacccatggttcct
gtcattgctcatgtattttgcctttcaacaattattgtgcacctactgtg
tgcaggccctgcctggacactggggatgcgcagtggatgcactgggctct
gcctttgagggttgcagtttaatgggtgacaggtaattataaggaagaag
gtgagtgcagagtgggaggcttggaggctgtggggcttggggtgggggag
ctcacatccagcctctgggccaaggccaggaggcttcccagagcaggaga
cagagcagggtattgtggtggggggtgtcctttttggggctgggatctgc
actttacagtttgaggggatgggcagaggaggctgggcttcattctggag
gtggggacatggtgaggtgaggtttagaaagcacacctgagccgcagtgt
gtaggatgctggaaatggtggagatgggcctgcgaagagagtgctgggaa
gtgatgacccaggagcagcagccgggcacctaacaatgggtcagcaccgt
gggcgtggagacaaaggccgggattgatcaatacccgagaagtacaatgt
acaggacttgggctccatttggatggagtgggtgagggaggagtcagaaa
tggcttccgatttccagcttgggcctggggattggagatgtccccactga
gagtagggcacaagtgaggaaatggtttggagaggaagatgataagttac
atcatggatgtgctgagtctgagttgcctatgggacttggaatggggggt
ggcaaaaggtgtgtgatcttgagcaagatattcaactcttctgggccttg
gtcttctcatttgtaaaacggtgataagaatattacttcccatttgtgtt
gctgtgaatattaaatgcgctaccacatgt
Thank you for taking the time to go through my problem.
Any help and input would be deeply appreciated.
Thank you for taking the time to go through my problem!
This is pretty much the same as your previous problem except that the intervals are independent of the length of the sequence and so can be defined just once instead of changing them for every sequence.
This program is a modification of my previous solution. As I described, it starts with a fixed set of values in #offsets from 100 to 1000 in steps of 100, and the final range > 1000 is terminated at 2E9 or 2 billion. This is close to the maximum positive 32-bit integer and serves to catch all offsets above 1000. I assume you won't be dealing with sequences any bigger than this?
The #totals and #counts arrays are initialised to zeroes with the same number of elements as the #offsets array.
Otherwise the functionality is much as before.
use strict;
use warnings;
use List::MoreUtils 'firstval';
my $gpat = '[G]{3,5}';
my $npat = '[A-Z]{1,25}';
my $pattern = $gpat.$npat.$gpat.$npat.$gpat.$npat.$gpat;
my $regex = qr/$pattern/i;
open my $fh, '<', 'small.fa' or die $!;
my #offsets = map $_*100, 1 .. 10;
push #offsets, 2E9;
my #totals = (0) x #offsets;
my ($id, $seq);
while (<$fh>) {
chomp;
if (/^>(\w+)/) {
process_seq($seq) if $id;
$id = $1;
$seq = '';
print "$id\n";
}
elsif ($id) {
$seq .= $_;
process_seq($seq) if eof;
}
}
print "Total: #totals\n";
sub process_seq {
my $sequence = shift;
my #counts = (0) x #offsets;
while ($sequence =~ /$regex/g) {
my $place = $-[0];
my $i = firstval { $place < $offsets[$_] } keys #offsets;
$counts[$i]++;
}
print "Counts: #counts\n\n";
$totals[$_] += $counts[$_] for keys #totals;
}
output
Running this program against your new data file small.fa produces
Total: 1 1 0 0 0 0 0 1 0 1 10
But using the data from the previous question, sample.fa is much more interesting
Total: 5 4 1 0 0 2 2 1 0 0 1
The following seems to work. While playing around, I put the data you posted in the __DATA__ section at the end of the script. To use it with a real data file, you'll need to open it, and pass the file handle to run.
#!/usr/bin/env perl
use strict; use warnings;
use Data::Dumper;
use List::MoreUtils qw( first_index );
if (#ARGV) {
my ($input_file) = #ARGV;
open my $input, '<', $input_file
or die "Cannot open '$input_file': $!";
run($input);
close $input
or die "Cannot close '$input_file': $!";
}
else {
run(\*DATA);
}
sub run {
my ($fh, $start_pat, $stop_pat) = #_;
# These are your patterns. I changed $npat because I don't
# think, e.g., q is a valid character in your input.
my $gpat = '[g]{3,5}';
my $npat = '[acgt]{1,25}';
my $wanted = qr/$gpat$npat$gpat$npat$gpat$npat$gpat/;
# These just tell us where a sequence begins and ends.
my $start = qr/\A>([A-Za-z_0-9]+)/;
my $stop = qr/[^acgt]/;
# Set up the bins and labels for the histogram.
my #bins = map 100 * $_, 1 .. 10;
my #labels = map sprintf('%d - %d', $_ - 100, $_), #bins;
# Initialize the histogram with all zero counts.
my %hist = map { $_ => 0 } #labels;
my $id;
while (my $line = <$fh>) {
# Whenever you see a new sequence, read it completely
# and pass it to build_histogram.
if (($id) = ($line =~ $start)) {
print "Start sequence: '$id':\n";
my $seq_ref;
($line, $seq_ref) = read_sequence($fh, $stop);
my $hist = build_histogram(
$seq_ref,
$wanted,
\#bins,
\#labels,
);
# Add the counts from this sequence to the overall
# histogram.
for my $key ( keys %$hist ) {
$hist{ $key } += $hist->{$key};
}
# exit loop if read_sequence stopped because of EOF.
last unless defined $line;
# else see if the line that stopped input is the start
# of a new sequence.
redo;
}
}
print Dumper \%hist;
}
sub build_histogram {
my ($seq_ref, $wanted, $bins, $labels) = #_;
my %hist;
while ($$seq_ref =~ /$wanted/g) {
# Whenever we find segment which matches what we want,
# store the position,
my $pos = $-[0];
# and find the bin where it fits.
my $idx = first_index { $_ > $pos } #$bins;
# if you do not have List::MoreUtils, you should install it
# however, the grep can be used instead of first_index
# my ($idx) = grep { $bins->[$_] > $pos } 0 .. $#$bins;
# $idx = -1 unless defined $idx;
# if it did not fit in the bins, then the position must
# be greater than the upper limit of the last bin, put
# it in "> than upper limit of last bin".
my $key = ($idx == -1 ? "> $bins->[-1]" : $labels->[$idx]);
$hist{ $key } += 1;
}
# we're done matching, return the histogram for this sequence
return \%hist;
}
sub read_sequence {
my ($fh, $stop) = #_;
my ($line, $seq);
while ($line = <$fh>) {
$line =~ s/\s+\z//;
last if $line =~ $stop;
$seq .= $line;
}
return ($line, \$seq);
}
__DATA__
-- Either paste your data here, or pass the name
-- of your input file on the command line
Output:
Start sequence: 'NR_037701':
Start sequence: 'NR_002714':
Start sequence: 'NR_003569':
Start sequence: 'NR_002817':
$VAR1 = {
'700 - 800' => 0,
'> 1000' => 10,
'200 - 300' => 1,
'900 - 1000' => 1,
'800 - 900' => 1,
'500 - 600' => 0,
'0 - 100' => 0,
'100 - 200' => 1,
'300 - 400' => 0,
'400 - 500' => 0,
'600 - 700' => 0
};
Also, you should take Chris Charley's advice and use Bio::SeqIO to read sequences rather than my homebrewed read_sequence function. I was just too lazy to install BioPerl just for the purpose of answering this question.
Generally, in Perl you can count the occurrence of a pattern by:
$_ = $input;
my $c = 0;
$c++ while s/pattern//s;
I was finally able to figure out where I was going wrong with my code. It turned out to be a looping problem. The following code works perfectly. I have marked it in comments the places where I made the modification.
#!/usr/bin/perl -w
use strict;
use warnings;
my $gpat = '[G]{3,5}';
my $npat = '[A-Z]{1,25}';
my $pattern = $gpat . $npat . $gpat . $npat . $gpat . $npat . $gpat;
my $regex = qr/$pattern/i;
open OUT, ">Quadindividual.refMrna.fa" or die;
open my $fh, '<', 'refMrna.fa' or die $!;
my ( $id, $seq ); # can be written as my $id; my $seq;
my #totals = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ); #intialize the #total arrays.
my #thousandcounts = (0);
while (<$fh>) {
chomp;
if (/^>(\w+)/) {
process_seq($seq) if $id;
$id = $1;
$seq = '';
print "$id\n";
print OUT "$id\n";
}
elsif ($id) {
$seq .= $_;
process_seq($seq) if eof;
}
}
print "Totals : #totals\n";
print OUT "Totals : #totals \n";
print "Thousand Counts total : #thousandcounts\n";
print OUT "Thousand Counts total : #thousandcounts\n";
sub process_seq {
my $sequence = shift #_;
my $subseq = substr $sequence, 0, 1000;
my $length = length $subseq;
print $length, "\n";
my #offsets = map { sprintf '%.0f', $length * $_ / 10 } 1 .. 10;
print "Offsets of 10 divisions: #offsets\n";
my #counts = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, );
my #count = (0);
# *MODIFICATION*
# This if loop was intialized from my #offsets above earlier
if ( $length eq 1000 ) {
while ( $sequence =~ /$regex/g ) {
my $place = $-[0];
print $place, "\n\n";
if ( $place <= 1000 ) {
for my $i ( 0 .. 9 ) {
next if $place >= $offsets[$i];
$counts[$i]++;
last;
}
}
if ( $place > 1000 ) {
for my $i (0) {
$count[$i]++;
last;
}
}
} #*MODIFICATION*
#The following commands were also subsequently shifted to ..
#...properly compute the total
print "Counts : #counts\n\n";
$totals[$_] += $counts[$_] for 0 .. 9;
print "Count : #count\n\n";
$thousandcounts[$_] += $count[$_] for 0;
}
elsif ( $length != 1000 ) {
my $substr = join ' ', unpack '(A100)*', $sequence;
my #offsets =
map { sprintf '%.0f', $length * $_ / ( $length / 100 ) } 1 .. 10;
print "Offsets of 10 divisions: #offsets\n";
my #counts = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, );
while ( $sequence =~ /$regex/g ) {
my $place = $-[0];
print "Place : $place", "\n\n";
for my $i ( 0 .. 9 ) {
next if $place >= $offsets[$i];
$counts[$i]++;
last;
}
}
print "Counts : #counts\n\n";
$totals[$_] += $counts[$_] for 0 .. 9;
}
} #subroutine ends