I have 2 files, as follows:
file1.txt:
0 117nt, >gene_73|GeneMark.hm... *
0 237nt, >gene_3097|GeneMark.... *
0 237nt, >gene_579|GeneMark.h... *
0 237nt, >gene_988|GeneMark.h... *
0 189nt, >gene_97|GeneMark.hm... *
0 183nt, >gene_97|GeneMark.hm... *
file2.fasta:
>gene_735|GeneMark.hmm|237_nt|+|798985|799221
TTGTGGTTCGTGCCGCGCGACGCGTTGCGTCTGCAAACGCCCGACGAAGACATCGCGACCTATCTGTTCAACAAGCATGTGATTCGGCATCGGTTCTGTCCGACCTGCGGGATTCATCCGTTCGCGGAAGGCACGGACCCGAAGGGCAACGCGATGGCGGCCGTCAATCTTCGCTGCGTCGACGGCGTCGATCTCGACGCGTTGAGCGTCCGCCATTTCGACGGGCGCGCGCTCTGA
>gene_579|GeneMark.hmm|237_nt|+|667187|667423
ATGTACCACGGCGCCGAATTTGCCGCTGCCAAGGGCATGCGCTGGCTGCGAGATGCCGCCAACGGCTCTGCCTTCATCGCACCGGGCAGTCCGTGGCAAAACGGTTTCGTCGAGCGTTTCAACGGCAAGCTGCATGACGAATTGCTGAACCGGGAATGGTTCCGCGGCCGTGCCGAGACCAAGATGCTCATCGAACGCTCCGGCTACGGTCCGTCGAGTCTGACCGGATTCCGATGA
>gene_1876|GeneMark.hmm|234_nt|-|2168498|2168731
ATGCTGTTCTTTTCGCGCGCGGGCGTGTCGCGTGCGGCCGGCGGCCAATCATGCGGCGAGTCGTTTTGTCGCGGCTCGCGGCGCTTGCCGACGTTGGAATCGCGCGCGCCGATGCGCGGATCGGGGCGGCAACGTTTGCGTATGAGGAATGATGCGTTTGCGCATCGGGAATGGGCGCCTCGCCCCGGTTTCGCCGCGATTCCGCCCGACTCGAGGCAGTCGTTTTTCCGCTAA
>gene_3097|GeneMark.hmm|237_nt|-|3467022|3467258
GTGTCGAACGAACGTCGCGGCGAACGGCCGCTGCGGGCATCGCCGCAGGACGTCACACGGCGAACGTCGCGCGCGATCCTCGGCGGCCGCGAACGTGGGCCGTCCCGTGGCACGTTCGGCTCGCTCGGCATGGCGAACGACCGCCGCATCGCGCATCGCCGTCGCGCGGCCTCCAAAAAAACGGCGGTCAGCGACCGCCGGCTTTGGCCGAAACCGATGCGTCGTACGAATCAGTGA
>gene_988|GeneMark.hmm|237_nt|+|1121027|1121263
ATGACCTTGTCAGGCAACATCAAGGACGGCGACTGGACGGTCGAGGTGACGACATCGCCGGTGCAGGGCGGTTACGTGTGCGACATCGAGGTGATGCACGGCGCGCCGGGCGGCGCGTTCCGGCACGCGTTCCGGCACGGCGGCACTTATCCGGCCGAGCGCGACGCGATGATCGAGGGGCTGCGCGCGGGCATGACCTGGATCGAGCTGAAGATGTCGAAAGCATTCAATCTGTAA
>gene_97|GeneMark.hmm|105_nt|+|90122|90226
GTGACGCGTTTCGCGACGCGCGTCGATGGGGCGGGCGCGAAACCCGTTCGCCGCGATGCGGCGGACGGGGTATGGCCGAGCGCCGTCCGTCGCGGCGAGAGTTGA
>gene_97|GeneMark.hmm|183_nt|-|107002|107184
ATGGAGGCAATCGTGATCGAGCAAGTGATACTGGGCGTCTTTCTCGTACTGCCGCTTCTCATCGTCGCGGTGCTGTACTCCGACGAACTCTGGCAAGAACACCGCCTGCAGCATCCGCGCGACGAGCACACGCCACATATCGACTGGCGTCATCCGTGGCGGATCCTGCGGCGAGGGCACTAA
>gene_97|GeneMark.hmm|189_nt|-|98624|98812
GTGAAATACACGAGCGACCATTACGCGGGCGTCAAATTTGGCGCGCTGTACGGGTTCTCGAACGCGGCGAACTTCGCCGACAACCGCGCTCGCCGGCGCATGCGCGGCGTTCGCATACGCGATCGGCAAAAGCGGCGTGATGTGCGGTTGCCTGCCGCGCTCGCGCTATGCGCGGCACGCCATCGATGA
>gene_97|GeneMark.hmm|234_nt|+|105494|105727
ATGAAGATTCAAATCGCCATTGTTTATTTTGTCGCCCGTCACGCAAACGAGCAGGCGCGAAGCGGATCGGCGCGCATTGGCGAAGAGCCGGCGCGCATCGGCATCGCGCTCGCGCGACACATGCGCGCCGCGCGCGGCCGGTCGACGCCGGATTCGCCTGTCGATCGATCCGGTGCGCCCCGAGCCGATGAGCGGTACGCTTCGGCGCGCGCGCGACACGCGCGACACGCGTGA
>gene_979|GeneMark.hmm|225_nt|-|1115442|1115666
TTGATCGACGCGCGGGGCCGGCCGGGCCGCGGGGTATCGAAGGCGATCGACGCGCAACACGAATCGCCGCCGCGCGCCGAAACCTCGCTATGCGCGTCGCGCGCACGCGCGGCCGGCGGCGCACGCGCGGGTGTGCGCGGGCCGGCGGCGCGGCCGCTCGCACTGCGCGACCGCTCGCGCGCACGCCTTCCTCGGCACGCGCCGGGAATCCCGGCCCTTCAATGA
The output that I expect is:
>gene_579|GeneMark.hmm|237_nt|+|667187|667423
ATGTACCACGGCGCCGAATTTGCCGCTGCCAAGGGCATGCGCTGGCTGCGAGATGCCGCCAACGGCTCTGCCTTCATCGCACCGGGCAGTCCGTGGCAAAACGGTTTCGTCGAGCGTTTCAACGGCAAGCTGCATGACGAATTGCTGAACCGGGAATGGTTCCGCGGCCGTGCCGAGACCAAGATGCTCATCGAACGCTCCGGCTACGGTCCGTCGAGTCTGACCGGATTCCGATGA
>gene_3097|GeneMark.hmm|237_nt|-|3467022|3467258
GTGTCGAACGAACGTCGCGGCGAACGGCCGCTGCGGGCATCGCCGCAGGACGTCACACGGCGAACGTCGCGCGCGATCCTCGGCGGCCGCGAACGTGGGCCGTCCCGTGGCACGTTCGGCTCGCTCGGCATGGCGAACGACCGCCGCATCGCGCATCGCCGTCGCGCGGCCTCCAAAAAAACGGCGGTCAGCGACCGCCGGCTTTGGCCGAAACCGATGCGTCGTACGAATCAGTGA
>gene_988|GeneMark.hmm|237_nt|+|1121027|1121263
ATGACCTTGTCAGGCAACATCAAGGACGGCGACTGGACGGTCGAGGTGACGACATCGCCGGTGCAGGGCGGTTACGTGTGCGACATCGAGGTGATGCACGGCGCGCCGGGCGGCGCGTTCCGGCACGCGTTCCGGCACGGCGGCACTTATCCGGCCGAGCGCGACGCGATGATCGAGGGGCTGCGCGCGGGCATGACCTGGATCGAGCTGAAGATGTCGAAAGCATTCAATCTGTAA
>gene_97|GeneMark.hmm|183_nt|-|107002|107184
ATGGAGGCAATCGTGATCGAGCAAGTGATACTGGGCGTCTTTCTCGTACTGCCGCTTCTCATCGTCGCGGTGCTGTACTCCGACGAACTCTGGCAAGAACACCGCCTGCAGCATCCGCGCGACGAGCACACGCCACATATCGACTGGCGTCATCCGTGGCGGATCCTGCGGCGAGGGCACTAA
>gene_97|GeneMark.hmm|189_nt|-|98624|98812
GTGAAATACACGAGCGACCATTACGCGGGCGTCAAATTTGGCGCGCTGTACGGGTTCTCGAACGCGGCGAACTTCGCCGACAACCGCGCTCGCCGGCGCATGCGCGGCGTTCGCATACGCGATCGGCAAAAGCGGCGTGATGTGCGGTTGCCTGCCGCGCTCGCGCTATGCGCGGCACGCCATCGATGA
There are 4 sequences with gene number 97, but each has a different length. I want only the sequences whose gene length is listed in file1.txt to be written to the output.fasta file. What I've done so far is as follows (but it fails with some errors):
#!/usr/bin/perl
# Asker's attempt at filtering file2.fasta by the genes/lengths in file1.txt.
# The logic is kept exactly as posted (the post states that it fails); only
# the array sigils are restored. NOTE(review) comments mark visible problems.
use strict;
use warnings;
my @genes;
open my $list, '<file1.txt';
# Collect the gene names (text between '>' and the first '|') from the list.
while (my $line = <$list>) {
push (@genes, $1) if $line =~/\>(.*?)\|/gs;
}
my $tag1 = "0\t";
my $tag2 = "nt";
# NOTE(review): $list is already at end-of-file here, so this loop body never
# runs, and $match1 is scoped to the inner block only.
while (my $line = <$list>) {
if ($line =~ /$tag1(.*?)$tag2/) {
my $match1 = $1;
}
}
my $input;
{
local $/ = undef;
open my $fasta, '<file2.fasta';
my $tag3 = "GeneMark.hmm";
my $tag4 = "_nt";
# NOTE(review): this inner "my $input" shadows the outer one, which therefore
# stays undefined; $match2 is likewise scoped to the inner block only.
while (my $input = <$fasta>) {
if ($input =~ /$tag3(.*?)$tag4/) {
my $match2 = $1; }}
close $fasta;
}
my @lines = split(/>/,$input);
foreach my $l (@lines) {
if ($l =~ /(.+?)\|/) {
my $real_name = $1;
if ($real_name ~~ @genes) {
# NOTE(review): "=" assigns rather than compares, and neither $match1 nor
# $match2 is in scope here, so this does not compile under "use strict".
if ($match2 = $match1) {
open (OUTFILE, '>>output.fasta');
print OUTFILE ">$l"; }
}
}
}
Can anyone give me some guide to correct the code? Or is there any better way to do this? Any help will be very much appreciated! Thanks! :)
Here's an option that uses Bio::SeqIO:
use strict;
use warnings;
use Bio::SeqIO;

# Usage: perl script.pl file1.txt file2.fasta
#
# Prints (to STDOUT, in FASTA format) the records of file2.fasta whose gene id
# AND nucleotide length are both listed in file1.txt.

# %wanted maps gene id => { length => 1 }, e.g. $wanted{gene_97}{183}.
my %wanted;
open my $fh, '<', $ARGV[0] or die "Cannot open '$ARGV[0]': $!";
while ( my $line = <$fh> ) {
    # List lines look like: "0 237nt, >gene_579|GeneMark.h... *"
    next unless $line =~ /\s+(\d+)nt,.+?>(gene_\d+)\|/;
    $wanted{$2}{$1} = 1;
}
close $fh;

my $in  = Bio::SeqIO->new( -file => $ARGV[1], -format => 'Fasta' );
my $out = Bio::SeqIO->new( -fh   => \*STDOUT, -format => 'Fasta' );
while ( my $seq = $in->next_seq() ) {
    # FASTA ids look like: "gene_579|GeneMark.hmm|237_nt|+|667187|667423"
    my ( $gene, $length ) = $seq->id =~ /(gene_\d+)\|.+?\|(\d+)_nt\|/
        or next;

    # Exact lookup: a regex test of the length against the listed lengths
    # would also accept substrings (e.g. 23 against a listed 237).
    $out->write_seq($seq) if $wanted{$gene} && $wanted{$gene}{$length};
}
Usage: perl script.pl file1.txt file2.fasta [>outFile.fasta]
The last, optional parameter (the shell redirection) directs output to a file.
Output from your data:
>gene_579|GeneMark.hmm|237_nt|+|667187|667423
ATGTACCACGGCGCCGAATTTGCCGCTGCCAAGGGCATGCGCTGGCTGCGAGATGCCGCC
AACGGCTCTGCCTTCATCGCACCGGGCAGTCCGTGGCAAAACGGTTTCGTCGAGCGTTTC
AACGGCAAGCTGCATGACGAATTGCTGAACCGGGAATGGTTCCGCGGCCGTGCCGAGACC
AAGATGCTCATCGAACGCTCCGGCTACGGTCCGTCGAGTCTGACCGGATTCCGATGA
>gene_3097|GeneMark.hmm|237_nt|-|3467022|3467258
GTGTCGAACGAACGTCGCGGCGAACGGCCGCTGCGGGCATCGCCGCAGGACGTCACACGG
CGAACGTCGCGCGCGATCCTCGGCGGCCGCGAACGTGGGCCGTCCCGTGGCACGTTCGGC
TCGCTCGGCATGGCGAACGACCGCCGCATCGCGCATCGCCGTCGCGCGGCCTCCAAAAAA
ACGGCGGTCAGCGACCGCCGGCTTTGGCCGAAACCGATGCGTCGTACGAATCAGTGA
>gene_988|GeneMark.hmm|237_nt|+|1121027|1121263
ATGACCTTGTCAGGCAACATCAAGGACGGCGACTGGACGGTCGAGGTGACGACATCGCCG
GTGCAGGGCGGTTACGTGTGCGACATCGAGGTGATGCACGGCGCGCCGGGCGGCGCGTTC
CGGCACGCGTTCCGGCACGGCGGCACTTATCCGGCCGAGCGCGACGCGATGATCGAGGGG
CTGCGCGCGGGCATGACCTGGATCGAGCTGAAGATGTCGAAAGCATTCAATCTGTAA
>gene_97|GeneMark.hmm|183_nt|-|107002|107184
ATGGAGGCAATCGTGATCGAGCAAGTGATACTGGGCGTCTTTCTCGTACTGCCGCTTCTC
ATCGTCGCGGTGCTGTACTCCGACGAACTCTGGCAAGAACACCGCCTGCAGCATCCGCGC
GACGAGCACACGCCACATATCGACTGGCGTCATCCGTGGCGGATCCTGCGGCGAGGGCAC
TAA
>gene_97|GeneMark.hmm|189_nt|-|98624|98812
GTGAAATACACGAGCGACCATTACGCGGGCGTCAAATTTGGCGCGCTGTACGGGTTCTCG
AACGCGGCGAACTTCGCCGACAACCGCGCTCGCCGGCGCATGCGCGGCGTTCGCATACGC
GATCGGCAAAAGCGGCGTGATGTGCGGTTGCCTGCCGCGCTCGCGCTATGCGCGGCACGC
CATCGATGA
Bio::SeqIO lives to parse fasta (and other such) files, so the above leverages this capability. After creating a hash of arrays (HoA) from file1.txt, the fasta file is processed, and only matching fasta records are printed.
Hope this helps!
Related
Using a Perl regex, if two consecutive lines match, then count the number of lines.
I want the number of lines up to the point where the following pattern matches:
D001
0000
# Asker's attempt at counting the lines that precede the "D001" marker.
# The logic is kept as posted; only the array sigils are restored.
# NOTE(review): $file, $FSIZE and $k are not declared anywhere in this excerpt.
open ($file, "$file") || die;
# NOTE(review): this stores the handle itself rather than the file's lines;
# reading the lines would presumably need <$file> here.
my @lines_f = $file;
my $total_size = $#lines_f +1;
foreach my $line (@lines_f)
{
if ($line =~ /D001/) {
$FSIZE = $k + 1;
} else {
$k++;}
}
Instead of just D001, I also want to check if the next line is 0000. If so $FSIZE is the $file size.
The $file would look something like this
00001
00002
.
.
.
D0001
00000
00000
Here is an example. This sets $FSIZE to undef if it cannot find the marker lines:
use strict;
use warnings;

my $fn = 'test.txt';
open( my $fh, '<', $fn ) or die "Could not open file '$fn': $!";
chomp( my @lines = <$fh> );
close $fh;

# $FSIZE becomes the 1-based line number of the first line matching "D0001"
# that is directly followed by a line matching "00000"; it stays undef when
# no such pair of lines exists.
my $FSIZE;

# Stop one short of the last index so that $lines[$i + 1] always exists.
for my $i ( 0 .. $#lines - 1 ) {
    if ( $lines[$i] =~ /D0001/ && $lines[ $i + 1 ] =~ /00000/ ) {
        $FSIZE = $i + 1;
        last;
    }
}
I have a fasta file. I need to remove sequences that contain "N" or that do not contain at least 3 unique bases.
The code so far is below. Also, how would I remove the sequence ID line as I go along for the sequences I delete?
#!/usr/bin/perl
# Asker's attempt at dropping FASTA sequences that contain "N" or have fewer
# than three distinct bases. The logic is kept as posted; only the array sigil
# is restored. NOTE(review) comments mark the visible problems.
use strict;
use warnings;
# NOTE(review): the error message names $ARGV[1], but the file opened is $ARGV[0].
open FILE, '<', $ARGV[0] or die qq{Failed to open "$ARGV[1]" for input: $!\n};
open match_fh, ">$ARGV[0]_trimmed.fasta"
or die qq{Failed to open for output: $!\n};
while ( my $line = <FILE> ) {
chomp($line);
if ( $line =~ m/^>/ ) {
# NOTE(review): the header is written before the sequence is examined, so the
# ID line of a rejected sequence still reaches the output.
print match_fh "$line\n";
my @data = split( /\|/, $line );
my $nextline = <FILE>;
if ( $nextline !~ /N+/g ) {
# NOTE(review): [ATGC]{3} matches any three consecutive bases, not three
# distinct ones, and this "if" has an empty body, so it filters nothing.
if ( $nextline =~ /[ATGC]{3}/g ) {
}
print match_fh "$nextline";
}
}
}
close FILE;
close match_fh;
INPUT
>seq1
ATGCGGGATGATCCGAACGTTTAATCTCGTATGCCGTCTTCTATCTCNNN
>seq2
GATGAGCTTGACTCTAGTCCATCTCGTATGCCGTCTTCTGCTATCTCGTA
>seq3
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTC
>seq4
TGGTACTGTAAGCATGAGAGTAATCTCGTATGCCGTCTTCTGCTTGAAAA
OUTPUT
>seq2
GATGAGCTTGACTCTAGTCCATCTCGTATGCCGTCTTCTGCTATCTCGTA
>seq4
TGGTACTGTAAGCATGAGAGTAATCTCGTATGCCGTCTTCTGCTTGAAAA
# Copy a record (header line + sequence line) from FILE to match_fh only when
# the sequence has no "N" and contains at least three of the bases A, T, G, C.
while (my $head = <FILE>) {
    next unless $head =~ /^>/;    # resynchronise on the next header line
    $_ = <FILE>;                  # the sequence line that follows the header
    next if /N+/;
    # Each match yields 1 or an empty string, so the sum counts distinct bases.
    my $distinct = /A/ + /T/ + /G/ + /C/;
    print match_fh $head, $_ if $distinct >= 3;
}
I am trying to find the pattern "Pattern String"; once it is found, I need to get the line after the pattern, which contains the page number. I need to extract the page number 2 from "Page: 2 of 5" in the sample text file below. Here is my try:
# Asker's attempt at extracting the page number from the line that follows
# "Pattern String". The logic is kept as posted; only the array sigils and the
# closing brace of the loop are restored.
my $filename="sample.txt";
$i=1;
open(FILE, "<$filename") or die "File couldn't be matched $filename\n";
@array = <FILE>;
foreach $line(@array){
chomp($line);
# NOTE(review): $i is never advanced and both tests inspect the same element
# $array[$i], so the "Page:" test is not applied to the following line.
if ($array[$i]=~/(\s+)Pattern String(\s+)/) {
if ($array[$i]=~/(\s+)Page:(\s+)(.*) of (.*)/) {
$page = $3;
}
}
}
Here is my sample text file :
Pattern String
MCN: 349450A0 NCP Account ID: 999 600-0089 Page: 2 of 5
=============================================================================
Customer Name: PCS HEALTH SYSTEMS
Customer Number: 349450A0
What about this? Is that what you want? After a match and if next line is not empty then show the line. Let me know if worked for you.
# Perl:
my $filename="sample.txt";
my $match = undef;
my $line = "";
open(my $fh, "<", $filename) or die "Failed to open file: $!";
foreach (<$fh>) {
$line = $_;
if ( $line =~ /.*Pattern\sString.*/ ) {
$match = 1;
next;
}
if (($match == "1") && ($line !~ /^$/)){
print $line;
$match = undef;
}
}
I think this will solve the problem (I'm assuming that the sample files will always have the same format). I hope this will help you, please let me know if it worked.
# Report the "Page: X of Y" numbers found on the first non-blank line that
# follows each "Pattern String" line.
my $filename = "sample.txt";
open( my $fh, "<", $filename ) or die "Failed to open file: $!";
my @lines = <$fh>;
close $fh;

for my $i ( 0 .. $#lines ) {
    next unless $lines[$i] =~ /Pattern\sString/;

    # Skip any blank lines after the marker rather than assuming a fixed
    # offset, and never index past the end of the file.
    my $target = $i + 1;
    $target++ while $target <= $#lines && $lines[$target] !~ /\S/;
    last if $target > $#lines;

    if ( $lines[$target] =~ /Page:\s+(\d+)\s+of\s+(\d+)\s*$/ ) {
        print "Current page: " . $1 . "\n";
        print "Total page #: " . $2 . "\n";
    }
}
I don't know why you are matching Pattern String if your goal is to get the 2 from "Page: 2 of 5" in your input file. Here is a way to do that:
use warnings;
use strict;

# Print the current page number from every line of sample.txt that ends in
# "Page: <n> of <m>".
my $filename = "sample.txt";
open my $fh, "<", "$filename" or die "Couldn't open $filename: $!";
while ( defined( my $line = <$fh> ) ) {
    next unless $line =~ m/.*Page:\s(\d+)\sof\s(\d+)$/;
    print "$1\n";
}
sample.txt:
Pattern String
MCN: 349450A0 NCP Account ID: 999 600-0089 Page: 2 of 5
=============================================================================
Customer Name: PCS HEALTH SYSTEMS
Customer Number: 349450A0
Output:
2
I have a 100MB plain text database file which I would like to parse and convert into datastructure for easy access. The environment is perl and cygwin. Since we receive the plain text file with data from third party, I am not able to use any existing parser like xml or google protocol buffers.
Text file looks like below.
Class=Instance1
parameterA = <val>
parameterB = <val>
parameterC = <val>
ref = Instance2
Class=Instance2
parameterA = <val>
parameterB = <val>
parameterC = <val>
The file contains a huge number class variants.
What would be the best option to parse this ? Will yacc/lex help me or should i write my own perl parser ?
This should do the trick. It auto-detects the line ending by checking the first one, and the assumption here is a record is separated by a blank line.
Within each record, key/value pairs are assumed to be joined with an equal sign (=), and maybe some whitespace.
Here's my code:
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;

# Path of the database file to parse, supplied as --file <path>.
my $db_file;
GetOptions(
    'file=s' => \$db_file,
) or die "Usage: $0 --file <db_file>\n";

# Fail early with a usage message instead of passing undef on to open().
defined $db_file or die "Usage: $0 --file <db_file>\n";
# Detect the line terminator used by the file behind $fh.
#
# Reads the first line, rewinds the handle to the start, and returns the
# trailing run of form-feed / newline / carriage-return characters of that
# line (e.g. "\n" or "\r\n"). Falls back to "\n" when the file is empty or the
# first line carries no terminator.
sub detect_line_ending {
    my ($fh) = @_;
    my $first_line = <$fh>;

    # Rewind to the beginning so the caller reads the file from the start.
    seek( $fh, 0, 0 );
    return "\n" unless defined $first_line;

    my ($ending) = $first_line =~ m/([\f\n\r]+$)/s;
    return defined $ending ? $ending : "\n";
}
# Turn one record (a chunk of "key = value" lines) into a hash reference.
#
# $chunk       - text of a single record
# $line_ending - terminator separating the lines of the chunk; "\n" is used
#                when it is undefined or empty
#
# Surrounding whitespace on each line is dropped and blank lines are skipped.
# A line is split on its first "=" (with optional spaces/tabs around it); a
# line without "=" is stored as a key whose value is undef.
sub process_chunk {
    my ($chunk, $line_ending) = @_;
    $line_ending = "\n" if !defined $line_ending || $line_ending eq '';

    my %section;

    # \Q...\E: the terminator is literal text, not a pattern.
    for my $line ( split /\Q$line_ending\E/, $chunk ) {
        $line =~ s/^\s+|\s+$//g;
        next if $line eq '';
        my ($key, $value) = split /[ \t]*=[ \t]*/, $line, 2;
        $section{$key} = $value;
    }
    return \%section;
}
# Parse the database file $file into a reference to an array of records.
#
# Records are separated by a blank line (two consecutive line endings); each
# record is converted to a hash reference by process_chunk(). Records that
# yield no keys are skipped. Dies when the file cannot be opened.
sub read_db_file {
    my ($file) = @_;
    my @records;
    open( my $fh, '<', $file ) or die "Cannot open '$file': $!";

    my $line_ending = detect_line_ending($fh);
    $line_ending = "\n" unless defined $line_ending && length $line_ending;

    {
        # Make the blank-line separator the input record separator, so each
        # read returns one whole record.
        local $/ = $line_ending . $line_ending;
        while ( my $chunk = <$fh> ) {
            chomp $chunk;
            my $section = process_chunk( $chunk, $line_ending );
            push @records, $section if %$section;
        }
    }
    close $fh;
    return \@records;
}
# Parse the file named by the --file option and dump the resulting records.
print Dumper read_db_file($db_file);
Is this what you want?
#!/usr/bin/perl
use Data::Dumper;
use Modern::Perl;

# %classes maps each class name to a hash of its "name = value" parameters;
# $current is the class that the parameter lines being read belong to.
my %classes;
my $current;
while (<DATA>) {
    chomp;

    # A "Class = Name" line starts a new, initially empty, class entry.
    if ( my ($class) = /^Class\s*=\s*(\w+)/ ) {
        $classes{$class} = {};
        $current = $class;
        next;
    }

    # Any other "name = value" line is a parameter of the current class.
    my ( $name, $value ) = /^(\w+)\s*=\s*(.+)$/ or next;
    $classes{$current}{$name} = $value;
}
say Dumper\%classes;
Output:
$VAR1 = {
'Instance2' => {
'parameterC' => '<val>',
'parameterB' => '<val>',
'parameterA' => '<val>'
},
'Instance1' => {
'parameterC' => '<val>',
'ref' => 'Instance2',
'parameterB' => '<val>',
'parameterA' => '<val>'
}
};
I have a file which contains information something like this:
TAG1 "file1.txt"
some additional lines
TAG2 "file2.txt"
some more lines
TAG3 "file3.txt".
Now, I want to read what is inside the double quotes and assign it to variables (something like $var1 = file1.txt, $var2 = file2.txt, $var3 = file3.txt). Can anyone guide me on how to do this?
You could achieve your goal by
using regular expression
# Collect the first double-quoted string of every input line; lines without a
# quoted string are skipped.
my @files;
while (my $line = <>) {
    # Bind the match to $line: a bare m// would test $_, which this loop
    # never sets.
    if ($line =~ m/"([^"]+)"/) {
        push @files, $1;
    }
}
using split()
# Collect the text that follows the first double quote of every input line.
my @files;
while (my $line = <>) {
    # Splitting on '"' puts the text between the first two quotes in field 1.
    my (undef, $file) = split /"/, $line, 3;

    # Lines without any quote yield no second field; do not store undef.
    push @files, $file if defined $file;
}
Except for the period after "file3.txt". (which I suspect is an artifact from posting the question), your data appears to be a CSV file with tabs.
If that's the case, I advise you to parse the file with Text::CSV
use strict;
use warnings;
use autodie;
use Text::CSV;

# Read file.csv as tab-separated data and print the second column of every
# row that has one, separated by spaces.
my $csv = Text::CSV->new ( { sep_char => "\t" } )
    or die "Cannot use CSV: ".Text::CSV->error_diag ();
my @files;
open my $fh, '<', 'file.csv';    # autodie reports a failure to open
while ( my $row = $csv->getline( $fh ) ) {
    # Rows without a second column (plain text lines) are skipped.
    push @files, $row->[1] if defined $row->[1];
}

# getline() also returns false on a parse error, so report anything that
# stopped the loop before end-of-file.
$csv->eof or $csv->error_diag();
close $fh;
print "@files";