Strange behavior when parsing output line by line - regex

I am running the following code:
use strict;
use warnings;
use Data::Dumper;
use File::HomeDir;
use File::Temp ();
use File::Spec;
# Read colorsummarizer output from /tmp/cs.txt, parse every line with one
# named-capture regex, and dump one record (hash) per cluster line.
# NOTE(review): the result of open is not checked.
open my $output, '<', '/tmp/cs.txt';
# NOTE(review): "#color_clusters" here and in the push near the end of the loop
# is presumably a garbled "@color_clusters"; as written it is parsed as a comment.
my #color_clusters;
my $image_number = 0;
my $image_name = undef;
my $last_image_name = '';
my $line = undef;
# <$output> is evaluated in list context here, so all lines are read before the loop starts.
for $line (<$output>) {
chomp($line);
print "***${line}***\n";
# image (file) name -> ^\S+
# cluster number -> cluster \d,
# HEX -> hex #([0-9A-Z])6,
# Cluster Color -> cmyk \d+ \d+ \d+ \d+ []
# Color Category -> (empty at the moment)
# Pixels -> f 0.\d+
# R, G, B -> rgb \d+ \d+ \d+
# H, S, V -> hsv \d+ \d+ \d+
$line =~ m/
^(?<IMAGE_NAME>.+) # image file name
\ cluster\ (?<CLUST_NUM>\d+)\ n\ [0-9]+ # cluster number
\ f\ (?<PIXELS>[-]?[0-9]+[,.]?[0-9]*) # percent of pixels belonging to this cluster
\ rgb\ (?<RED>\d+)\ (?<GREEN>\d+)\ (?<BLUE>\d+)
\ hex\ \#(?<HEX>[0-9A-F]+) # Hexadecimal notation used in HTML
\ hsv\ (?<HUE>\d+)\ (?<SATURATION>\d+)\ (?<VALUE>\d+)
\ .+\ (?<CLUSTER_COLOR>\w+)\[
/x;
# NOTE(review): nothing verifies that the match above succeeded before %+ is read.
$image_name = $+{IMAGE_NAME};
# Count a new image each time the name differs from the previous line's name.
if ($last_image_name ne $image_name) {
$last_image_name = $image_name;
$image_number++;
}
my $cluster_number = int($+{CLUST_NUM}) + 1; # convert to 1 based
# $pixels is formatted here, but the hash below recomputes the value from $+{PIXELS}.
my $pixels = $+{PIXELS};
if ($pixels) {
$pixels = ''. int((0 + $pixels) * 100). '%'
}
my $cluster_color = $+{CLUSTER_COLOR};
# NOTE(review): m/_/ is itself a regex match. When it succeeds, the capture
# variables are reset, so every $+{...} read after this point yields undef.
# That fits the dumped records whose cluster_color contains a space.
if ($cluster_color =~ m/_/) {
$cluster_color =~ tr/_/\ /; # replace '_' with space (' ')
}
my %color_cluster = (
image_num => $image_number,
image_name => $image_name,
cluster_number => $cluster_number,
hex_code => $+{HEX},
cluster_color => $cluster_color,
color_category => '', # currently empty, will be calculated from HSV values
pixels => ''. int($+{PIXELS} * 100). '%', # percent of pixels within this cluster
r => $+{RED}, g => $+{GREEN}, b => $+{BLUE},
h => $+{HUE}, s => $+{SATURATION}, v => $+{VALUE}
);
push #color_clusters, %color_cluster;
print Dumper \%color_cluster;
# The result of this final match is not used.
$line =~ m/^.+$/;
} # end of for loop
on input that looks like this:
IMG_0069_result.JPG cluster 0 n 69 f 0.0627272727272727 rgb 248 249 240 hex #F8F9F0 hsv 67 3 98 lab 98 -2 4 lch 98 4 114 xyz 0.88 0.94 0.96 cmyk 0 0 3 2 bianca[1402][252,251,243](1.0):eighth_pearl_lusta[3414][249,248,240](1.1):quarter_bianca[6922][249,248,240](1.1):filmpro_white[3624][249,246,237](1.4):orchid_white[6246][255,253,243](1.8):quarter_pearl_lusta[6978][255,253,244](1.8):twilight_blue[8616][244,246,236](1.8):glistening_white[3874][244,244,236](1.9):quarter_rice_cake[6986][246,244,237](1.9):half_bianca[4292][246,243,233](2.0) 10 bianca:cake:eighth:filmpro:glistening:half:lusta:orchid:pearl:quarter:rice:twilight:blue:white
IMG_0069_result.JPG cluster 1 n 67 f 0.0609090909090909 rgb 251 252 247 hex #FBFCF7 hsv 66 2 99 lab 99 -1 2 lch 99 3 114 xyz 0.92 0.97 1.02 cmyk 0 0 2 1 baby_powder[1248][254,254,250](1.3):ceramic[2174][252,255,249](1.6):hint_of_grey[4499][252,255,249](1.6):sea_fog[7554][252,255,249](1.6):wan_white[8990][252,255,249](1.6):snow_drift[7811][247,250,247](1.7):bianca[1402][252,251,243](1.9):black_white[1483][255,254,246](2.1):romance[7283][255,254,253](2.1):quarter_alabaster[6916][247,246,242](2.2) 10 alabaster:baby:bianca:ceramic:drift:fog:hint:of:powder:quarter:romance:sea:snow:wan:black:grey:white
IMG_0069_result.JPG cluster 2 n 66 f 0.06 rgb 250 250 244 hex #FAFAF4 hsv 65 3 98 lab 98 -1 3 lch 98 3 113 xyz 0.9 0.95 0.99 cmyk 0 0 3 2 bianca[1402][252,251,243](1.1):spring_wood[7933][248,246,241](1.5):eighth_pearl_lusta[3414][249,248,240](1.6):quarter_bianca[6922][249,248,240](1.6):quarter_alabaster[6916][247,246,242](1.8):bridal_heath[1713][255,250,244](2.0):baby_powder[1248][254,254,250](2.1):snow_drift[7811][247,250,247](2.1):ceramic[2174][252,255,249](2.1):hint_of_grey[4499][252,255,249](2.1) 10 alabaster:baby:bianca:bridal:ceramic:drift:eighth:heath:hint:lusta:of:pearl:powder:quarter:snow:spring:wood:grey
IMG_0069_result.JPG cluster 3 n 65 f 0.0590909090909091 rgb 245 247 236 hex #F5F7EC hsv 66 4 97 lab 97 -2 5 lch 97 6 114 xyz 0.86 0.92 0.92 cmyk 0 0 4 3 twilight_blue[8616][244,246,236](1.0):filmpro_white[3624][249,246,237](1.6):half_bianca[4292][246,243,233](1.8):half_orchid_white[4363][247,244,234](1.8):eighth_pearl_lusta[3414][249,248,240](1.9):quarter_bianca[6922][249,248,240](1.9):glistening_white[3874][244,244,236](2.1):quarter_rice_cake[6986][246,244,237](2.1):ecru_white[3358][245,243,229](2.2):joanna[4771][245,243,229](2.2) 10 bianca:cake:ecru:eighth:filmpro:glistening:half:joanna:lusta:orchid:pearl:quarter:rice:twilight:blue:white
IMG_0069_result.JPG cluster 4 n 61 f 0.0554545454545455 rgb 248 249 240 hex #F8F9F0 hsv 65 4 97 lab 98 -2 4 lch 98 5 113 xyz 0.88 0.94 0.96 cmyk 0 0 4 3 bianca[1402][252,251,243](1.0):eighth_pearl_lusta[3414][249,248,240](1.1):quarter_bianca[6922][249,248,240](1.1):filmpro_white[3624][249,246,237](1.4):orchid_white[6246][255,253,243](1.8):quarter_pearl_lusta[6978][255,253,244](1.8):twilight_blue[8616][244,246,236](1.8):glistening_white[3874][244,244,236](1.9):quarter_rice_cake[6986][246,244,237](1.9):half_bianca[4292][246,243,233](2.0) 10 bianca:cake:eighth:filmpro:glistening:half:lusta:orchid:pearl:quarter:rice:twilight:blue:white
IMG_0069_result.JPG cluster 5 n 60 f 0.0545454545454545 rgb 249 249 240 hex #F9F9F0 hsv 63 4 98 lab 98 -2 4 lch 98 5 111 xyz 0.89 0.94 0.96 cmyk 0 0 4 2 bianca[1402][252,251,243](0.7):eighth_pearl_lusta[3414][249,248,240](0.9):quarter_bianca[6922][249,248,240](0.9):filmpro_white[3624][249,246,237](1.1):orchid_white[6246][255,253,243](1.6):quarter_pearl_lusta[6978][255,253,244](1.6):floral_white[3694][255,250,240](1.7):glistening_white[3874][244,244,236](1.8):quarter_rice_cake[6986][246,244,237](1.8):twilight_blue[8616][244,246,236](1.9) 10 bianca:cake:eighth:filmpro:floral:glistening:lusta:orchid:pearl:quarter:rice:twilight:blue:white
IMG_0069_result.JPG cluster 6 n 60 f 0.0545454545454545 rgb 249 250 243 hex #F9FAF3 hsv 67 3 98 lab 98 -2 3 lch 98 4 114 xyz 0.9 0.95 0.98 cmyk 0 0 3 2 bianca[1402][252,251,243](0.9):eighth_pearl_lusta[3414][249,248,240](1.4):quarter_bianca[6922][249,248,240](1.4):spring_wood[7933][248,246,241](1.9):ceramic[2174][252,255,249](2.0):hint_of_grey[4499][252,255,249](2.0):sea_fog[7554][252,255,249](2.0):wan_white[8990][252,255,249](2.0):orchid_white[6246][255,253,243](2.1):quarter_pearl_lusta[6978][255,253,244](2.1) 10 bianca:ceramic:eighth:fog:hint:lusta:of:orchid:pearl:quarter:sea:spring:wan:wood:grey:white
IMG_0069_result.JPG cluster 7 n 58 f 0.0527272727272727 rgb 250 251 246 hex #FAFBF6 hsv 69 2 98 lab 98 -1 2 lch 98 2 116 xyz 0.9 0.96 1.01 cmyk 0 0 2 2 snow_drift[7811][247,250,247](1.6):baby_powder[1248][254,254,250](1.6):bianca[1402][252,251,243](1.8):quarter_alabaster[6916][247,246,242](1.9):ceramic[2174][252,255,249](1.9):hint_of_grey[4499][252,255,249](1.9):sea_fog[7554][252,255,249](1.9):wan_white[8990][252,255,249](1.9):spring_wood[7933][248,246,241](2.0):eighth_pearl_lusta[3414][249,248,240](2.2) 10 alabaster:baby:bianca:ceramic:drift:eighth:fog:hint:lusta:of:pearl:powder:quarter:sea:snow:spring:wan:wood:grey:white
The input is the output of colorsummarizer a program written in Perl that summarizes the colors of images (http://mkweb.bcgsc.ca/color-summarizer/).
Since I am using Perl, I could call the libraries directly rather than run the command line from Perl, but I decided to run the command line since it is easier, or at least was supposed to be easier...
When running the code above, although all of the lines look very similar to each other regarding their structure, some of the lines are being parsed correctly, while others are not.
Here is part of the output I am getting (STDOUT and STDERR interleaved):
Use of uninitialized value $+{"PIXELS"} in multiplication (*) at /tmp/1.pl line 59, <$output> line 8.
Use of uninitialized value $+{"PIXELS"} in multiplication (*) at /tmp/1.pl line 59, <$output> line 8.
Use of uninitialized value $+{"PIXELS"} in multiplication (*) at /tmp/1.pl line 59, <$output> line 8.
***IMG_0069_result.JPG cluster 0 n 69 f 0.0627272727272727 rgb 248 249 240 hex #F8F9F0 hsv 67 3 98 lab 98
-2 4 lch 98 4 114 xyz 0.88 0.94 0.96 cmyk 0 0 3 2 bianca[1402][252,251,243](1.0):eighth_pearl_lusta[3414][
249,248,240](1.1):quarter_bianca[6922][249,248,240](1.1):filmpro_white[3624][249,246,237](1.4):orchid_whit
e[6246][255,253,243](1.8):quarter_pearl_lusta[6978][255,253,244](1.8):twilight_blue[8616][244,246,236](1.8
):glistening_white[3874][244,244,236](1.9):quarter_rice_cake[6986][246,244,237](1.9):half_bianca[4292][246
,243,233](2.0) 10 bianca:cake:eighth:filmpro:glistening:half:lusta:orchid:pearl:quarter:rice:twilight:blue
:white***
$VAR1 = {
'pixels' => '6%',
'b' => '240',
's' => '3',
'image_name' => 'IMG_0069_result.JPG',
'image_num' => 1,
'h' => '67',
'cluster_number' => 1,
'color_category' => '',
'r' => '248',
'v' => '98',
'g' => '249',
'cluster_color' => 'bianca',
'hex_code' => 'F8F9F0'
};
***IMG_0069_result.JPG cluster 1 n 67 f 0.0609090909090909 rgb 251 252 247 hex #FBFCF7 hsv 66 2 99 lab 99
-1 2 lch 99 3 114 xyz 0.92 0.97 1.02 cmyk 0 0 2 1 baby_powder[1248][254,254,250](1.3):ceramic[2174][252,25
5,249](1.6):hint_of_grey[4499][252,255,249](1.6):sea_fog[7554][252,255,249](1.6):wan_white[8990][252,255,2
49](1.6):snow_drift[7811][247,250,247](1.7):bianca[1402][252,251,243](1.9):black_white[1483][255,254,246](
2.1):romance[7283][255,254,253](2.1):quarter_alabaster[6916][247,246,242](2.2) 10 alabaster:baby:bianca:ce
ramic:drift:fog:hint:of:powder:quarter:romance:sea:snow:wan:black:grey:white***
$VAR1 = {
'cluster_number' => 2,
'h' => undef,
'image_num' => 1,
'image_name' => 'IMG_0069_result.JPG',
'b' => undef,
'pixels' => '0%',
's' => undef,
'g' => undef,
'r' => undef,
'color_category' => '',
'v' => undef,
'hex_code' => undef,
'cluster_color' => 'baby powder'
};
***IMG_0069_result.JPG cluster 2 n 66 f 0.06 rgb 250 250 244 hex #FAFAF4 hsv 65 3 98 lab 98 -1 3 lch 98 3
113 xyz 0.9 0.95 0.99 cmyk 0 0 3 2 bianca[1402][252,251,243](1.1):spring_wood[7933][248,246,241](1.5):eigh
th_pearl_lusta[3414][249,248,240](1.6):quarter_bianca[6922][249,248,240](1.6):quarter_alabaster[6916][247,
246,242](1.8):bridal_heath[1713][255,250,244](2.0):baby_powder[1248][254,254,250](2.1):snow_drift[7811][247,250,247](2.1):ceramic[2174][252,255,249](2.1):hint_of_grey[4499][252,255,249](2.1) 10 alabaster:baby:bianca:bridal:ceramic:drift:eighth:heath:hint:lusta:of:pearl:powder:quarter:snow:spring:wood:grey***
$VAR1 = {
'image_name' => 'IMG_0069_result.JPG',
'image_num' => 1,
'h' => '65',
'cluster_number' => 3,
'pixels' => '6%',
'b' => '244',
's' => '3',
'hex_code' => 'FAFAF4',
'cluster_color' => 'bianca',
'g' => '250',
'color_category' => '',
'r' => '250',
'v' => '98'
};
***IMG_0069_result.JPG cluster 3 n 65 f 0.0590909090909091 rgb 245 247 236 hex #F5F7EC hsv 66 4 97 lab 97 -2 5 lch 97 6 114 xyz 0.86 0.92 0.92 cmyk 0 0 4 3 twilight_blue[8616][244,246,236](1.0):filmpro_white[3624][249,246,237](1.6):half_bianca[4292][246,243,233](1.8):half_orchid_white[4363][247,244,234](1.8):eighth_pearl_lusta[3414][249,248,240](1.9):quarter_bianca[6922][249,248,240](1.9):glistening_white[3874][244,244,236](2.1):quarter_rice_cake[6986][246,244,237](2.1):ecru_white[3358][245,243,229](2.2):joanna[4771][245,243,229](2.2) 10 bianca:cake:ecru:eighth:filmpro:glistening:half:joanna:lusta:orchid:pearl:quarter:rice:twilight:blue:white***
$VAR1 = {
'cluster_number' => 4,
'h' => undef,
'image_name' => 'IMG_0069_result.JPG',
'image_num' => 1,
'b' => undef,
'pixels' => '0%',
's' => undef,
'g' => undef,
'r' => undef,
'color_category' => '',
'v' => undef,
'hex_code' => undef,
'cluster_color' => 'twilight blue'
};
***IMG_0069_result.JPG cluster 4 n 61 f 0.0554545454545455 rgb 248 249 240 hex #F8F9F0 hsv 65 4 97 lab 98 -2 4 lch 98 5 113 xyz 0.88 0.94 0.96 cmyk 0 0 4 3 bianca[1402][252,251,243](1.0):eighth_pearl_lusta[3414][249,248,240](1.1):quarter_bianca[6922][249,248,240](1.1):filmpro_white[3624][249,246,237](1.4):orchid_white[6246][255,253,243](1.8):quarter_pearl_lusta[6978][255,253,244](1.8):twilight_blue[8616][244,246,236](1.8):glistening_white[3874][244,244,236](1.9):quarter_rice_cake[6986][246,244,237](1.9):half_bianca[4292][246,243,233](2.0) 10 bianca:cake:eighth:filmpro:glistening:half:lusta:orchid:pearl:quarter:rice:twilight:blue:white***
$VAR1 = {
'b' => '240',
'pixels' => '5%',
's' => '4',
'h' => '65',
'cluster_number' => 5,
'image_num' => 1,
'image_name' => 'IMG_0069_result.JPG',
'r' => '248',
'color_category' => '',
'v' => '97',
'g' => '249',
'hex_code' => 'F8F9F0',
'cluster_color' => 'bianca'
};
***IMG_0069_result.JPG cluster 5 n 60 f 0.0545454545454545 rgb 249 249 240 hex #F9F9F0 hsv 63 4 98 lab 98 -2 4 lch 98 5 111 xyz 0.89 0.94 0.96 cmyk 0 0 4 2 bianca[1402][252,251,243](0.7):eighth_pearl_lusta[3414][249,248,240](0.9):quarter_bianca[6922][249,248,240](0.9):filmpro_white[3624][249,246,237](1.1):orchid_white[6246][255,253,243](1.6):quarter_pearl_lusta[6978][255,253,244](1.6):floral_white[3694][255,250,240](1.7):glistening_white[3874][244,244,236](1.8):quarter_rice_cake[6986][246,244,237](1.8):twilight_blue[8616][244,246,236](1.9) 10 bianca:cake:eighth:filmpro:floral:glistening:lusta:orchid:pearl:quarter:rice:twilight:blue:white***
$VAR1 = {
's' => '4',
'pixels' => '5%',
'b' => '240',
'image_num' => 1,
'image_name' => 'IMG_0069_result.JPG',
'h' => '63',
'cluster_number' => 6,
'v' => '98',
'color_category' => '',
'r' => '249',
'g' => '249',
'hex_code' => 'F9F9F0',
'cluster_color' => 'bianca'
};
...

I found the cause of the problem.
The problem is in these lines:
if ($cluster_color =~ m/_/) {
$cluster_color =~ tr/_/\ /; # replace '_' with space (' ')
}
On output lines where there are underscores in $cluster_color,
the first line (the m/_/ match) succeeds, and a successful match resets the capture variables, hence clearing the other capture groups. The tr/// on the second line is not a regex and does not touch the captures.
The solution was to assign each capture group to its own variable, and add the 3 lines above after all the capture groups have been assigned to their respective variables.
So, assigning directly to the hash was a bad idea :-)
I also improved my regex to be more precise, although this was not the cause of the problem; the cause was simply the ... =~ m/_/ test, which reset the capture variables.
Below is my working code:
use strict;
use warnings;
use Data::Dumper;
use File::HomeDir;
use File::Temp ();
use File::Spec;
# Parse colorsummarizer output (one "<image> cluster N ..." record per line)
# into a list of per-cluster hashes, dumping each record as it is built.
my $input_path = '/tmp/cs8.txt';
open my $output, '<', $input_path
    or die "Cannot open '$input_path' for reading: $!";

my @color_clusters;           # one hash reference per successfully parsed line
my $image_number    = 0;      # incremented whenever the image name changes
my $image_name      = undef;  # image name of the most recently parsed line
my $last_image_name = '';

# Compiled once, outside the loop. Literal spaces and '#' are escaped because
# the pattern uses the x modifier.
my $cluster_line_re = qr/
    ^(?<IMAGE_NAME>.+)                                    # image file name
    \ cluster\ (?<CLUST_NUM>\d+)\ n\ [0-9]+                # cluster number
    \ f\ (?<PIXELS>[0-9]+\.?[0-9]*)                        # fraction of pixels belonging to this cluster
    \ rgb\ (?<RED>[0-9]{1,3})\ (?<GREEN>[0-9]{1,3})\ (?<BLUE>[0-9]{1,3})
    \ hex\ \#(?<HEX>[0-9A-F]{6})                           # hexadecimal notation used in HTML
    \ hsv\ (?<HUE>[0-9]{1,3})\ (?<SATURATION>[0-9]{1,3})\ (?<VALUE>[0-9]{1,3})
    \ .+\ (?<CLUSTER_COLOR>\w+)\[                          # cluster color name
/x;

LINE:
while (my $line = <$output>) {
    chomp $line;
    print "***${line}***\n";

    # Copy the named captures into a plain hash in the same statement as the
    # match: any later successful regex match would reset %+.
    my %field = ($line =~ $cluster_line_re) ? %+ : ();
    if (!%field) {
        warn sprintf("Skipping line %d: not a cluster record\n", $.);
        next LINE;
    }

    $image_name = $field{IMAGE_NAME};
    if ($last_image_name ne $image_name) {
        $last_image_name = $image_name;
        $image_number++;
    }

    # tr is a plain character mapping, not a regex, so no guard match is needed.
    (my $cluster_color = $field{CLUSTER_COLOR}) =~ tr/_/ /;

    my %color_cluster = (
        image_num      => $image_number,
        image_name     => $image_name,
        cluster_number => $field{CLUST_NUM} + 1,            # convert to 1 based
        hex_code       => $field{HEX},
        cluster_color  => $cluster_color,
        color_category => '',    # currently empty, will be calculated from HSV values
        pixels         => int($field{PIXELS} * 100) . '%',  # percent of pixels within this cluster
        r => $field{RED}, g => $field{GREEN},      b => $field{BLUE},
        h => $field{HUE}, s => $field{SATURATION}, v => $field{VALUE},
    );

    # Store a reference so each record stays one element of the list; pushing
    # the hash itself would flatten it into loose key and value scalars.
    push @color_clusters, \%color_cluster;
    print Dumper \%color_cluster;
}

close $output
    or warn "Failed to close '$input_path': $!";
Many thanks @Yunnosch for your comments; they gave me hints pointing in the right direction.
Cheers,
Asaf

Related

How to substitute integers with letters in Ruby

I am new to Ruby and programming. I am working on a card game. I have a variable (straightHigh) currently filled with a number n representing a rank of a card. I want certain numbers (11-14) to be replaced with specific letters (11 => J, 12 => Q, 13 => K, 14 => A).
I've tried gsub and gsub! with and without regular expressions. But regular expressions are very foreign to me.
if y == 5
straightHigh = n + 4
#straightHigh.to_s.gsub!(/[11-14]/, 11 => 'J', 12 => 'Q', 13 => 'k', 14 => 'A')
p straightHigh.to_s
end
I've tried:
straightHigh.to_s.gsub!(/[11-14]/, 14 => 'Ace', 13 => K, 12 => Q, 11 => J)
which resulted in syntax errors.
I've tried
straightHigh.to_s.gsub!(/[11-14]/, 'Ace')
this does not throw an error, but does not seem to alter the values either.
Maybe you should use a case statement:
# Convert a numeric card rank into its display label.
#
# number - Integer rank of the card.
#
# Returns the rank as a String for 2..10, 'J' for 11, 'Q' for 12, 'K' for 13
# and 'Ace' for 14. Returns nil for any other value.
def get_card(number)
  case number
  when 2..10 then number.to_s
  when 11 then 'J'
  when 12 then 'Q'
  when 13 then 'K' # previously returned 'J', duplicating rank 11
  when 14 then 'Ace'
  end
end
I am not sure what you are trying to do, but I believe you are trying to map an integer with a string? If so, you can use a hash:
# Map a card rank to its display label.
#
# straight_high Integer - rank of the card
# returns String - 'J', 'Q', 'K' or 'Ace' for ranks 11..14; for every other
#                  rank, the number itself as a string
def get_card(straight_high)
  card_values = {
    11 => 'J',
    12 => 'Q',
    13 => 'K',
    14 => 'Ace',
  }
  # Fall back to the plain number so ranks without a letter keep their value
  # instead of coming back as nil.
  card_values.fetch(straight_high) { straight_high.to_s }
end

Removing part of a value in a certain column in a dataframe , and returning a DF

I have the following Data Frame named: mydf:
A B
0 3de (1ABS) Adiran
1 3SA (SDAS) Adel
2 7A (ASA) Ronni
3 820 (SAAa) Emili
I want to remove the " (xxxx)" and keeps the values in column A , so the dataframe (mydf) will look like:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili
I have tried :
print mydf['A'].apply(lambda x: re.sub(r" \(.+\)", "", x) )
but then I get a Series object back and not a dataframe object.
I have also tried to use replace:
df.replace([' \(.*\)'],[""], regex=True), But it didn't change anything.
What am I doing wrong?
Thank you!
you can use str.split() method:
In [3]: df.A = df.A.str.split('\s+\(').str[0]
In [4]: df
Out[4]:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili
or using str.extract() method:
In [9]: df.A = df.A.str.extract(r'([^\(\s]*)', expand=False)
In [10]: df
Out[10]:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili

How to make list of lists based on particular character?

I have a file which has the following lines:
B99990001 1 2 3 4
B99990001 1 3 3 4
B99990002 1 2 3 4
B99990002 1 3 3 4
B99990003 1 2 3 4
B99990003 1 3 3 4
So Here my aim is to make a main list which should have three sub lists based on the first columns (B99990001,B99990002,B99990003) of lines:
Mainlist=[
['B99990001 1 2 3 4','B99990001 1 3 3 4'],#sublist1 has B99990001
['B99990002 1 2 3 4','B99990002 1 3 3 4'],#sublist2 has B99990002
['B99990003 1 2 3 4','B99990003 1 3 3 4'] #sublist3 has B99990003
]
I hope my question is understandable. If someone knows how to do this, could you please help me out?
Thanking you in advance
SEE HERE MY REAL EXAMPLE:
import os
import re
pdbPathAndName = ['/Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014 /ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990001.pdb','/Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014/ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990002.pdb']
''' /Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014/ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990001.pdb=[
'ATOM 138 SG CYS 19 4.499 4.286 8.260 1.00 71.96 S',
'ATOM 397 SG CYS 50 14.897 3.238 9.338 1.00 34.60 S',
'ATOM 424 SG CYS 54 5.649 5.914 8.639 1.00 42.68 S',
'ATOM 774 SG CYS 97 12.114 -6.864 23.897 1.00 62.23 S',
'ATOM 865 SG CYS 108 15.200 3.910 11.227 1.00 54.49 S' ]
/Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014/ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990002.pdb=[
'ATOM 929 SG CYS 117 13.649 -6.894 22.589 1.00106.90 S',
'ATOM 138 SG CYS 19 4.499 4.286 8.260 1.00 71.96 S',
'ATOM 397 SG CYS 50 14.897 3.238 9.338 1.00 34.60 S',
'ATOM 424 SG CYS 54 5.649 5.914 8.639 1.00 42.68 S',
'ATOM 774 SG CYS 97 12.114 -6.864 23.897 1.00 62.23 S',
'ATOM 865 SG CYS 108 15.200 3.910 11.227 1.00 54.49 S',
'ATOM 929 SG CYS 117 13.649 -6.894 22.589 1.00106.90 S' ] '''
for path in pdbPathAndName:
f = open(path, 'r').readlines()
f = map(lambda x: x.strip(), f)
for line in f:
if "SG" in line and line.endswith("S"):
print (path.split("/")[-1] + "_" + re.split('\s+', line)[1] + ":" + re.split('\s+', line)[5] + ":" +re.split('\s+', line)[6] + ":" + re.split('\s+', line)[7])
#PRINTED OUTPUT
'''E3P.B99990001.pdb_138:6.923:0.241:6.116
E3P.B99990001.pdb_397:15.856:3.506:8.144
E3P.B99990001.pdb_424:8.558:1.315:6.627
E3P.B99990001.pdb_774:14.204:-5.490:24.812
E3P.B99990001.pdb_865:15.545:4.258:10.007
E3P.B99990001.pdb_929:16.146:-6.081:24.770
E3P.B99990002.pdb_138:4.499:4.286:8.260
E3P.B99990002.pdb_397:14.897:3.238:9.338
E3P.B99990002.pdb_424:5.649:5.914:8.639
E3P.B99990002.pdb_774:12.114:-6.864:23.897
E3P.B99990002.pdb_865:15.200:3.910:11.227
E3P.B99990002.pdb_929:13.649:-6.894:22.589'''
#MY EXPECTED OUTPUT
''' MainlIst=[
['E3P.B99990001.pdb_138:6.923:0.241:6.116'
'E3P.B99990001.pdb_397:15.856:3.506:8.144'
'E3P.B99990001.pdb_424:8.558:1.315:6.627'
'E3P.B99990001.pdb_774:14.204:-5.490:24.812'
'E3P.B99990001.pdb_865:15.545:4.258:10.007'
'E3P.B99990001.pdb_929:16.146:-6.081:24.770']#sublist1
['E3P.B99990002.pdb_138:4.499:4.286:8.260'
'E3P.B99990002.pdb_397:14.897:3.238:9.338'
'E3P.B99990002.pdb_424:5.649:5.914:8.639'
'E3P.B99990002.pdb_774:12.114:-6.864:23.897'
'E3P.B99990002.pdb_929:13.649:-6.894:22.589']#sublist2
]'''
#then use thes sublists to make combinations
for sublists in mainlist:
Combinatedlist=map(dict,itertools.combinations(sublists.iteritems(), 2))
#since it is sublist there wont be any crossing between sublist1 and sublist2 while doing combinations
#but still I didnt get proper result if you can then suggest me your ways
Hi guys, I got an answer for this by marking the boundary between each file's block of entries, splitting on those boundaries to make the sub-lists, and then making combinations out of each sub-list.
My code:
import fileinput
import os
import re
import itertools
import math
import sys
pdbPathAndName = ['/Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014/ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990001.pdb','/Users/Mahesh/Documents/MAHESH_INTERNSHIP_2014/ENZOWP2/2WC5_090715_170128/E3P/E3P.B99990002.pdb']

# Main list: one sub-list per PDB file, holding "<file>_<serial>:<x>:<y>:<z>"
# strings for every line that contains "SG" and ends with "S".
# The sub-lists are built directly per file, so no sentinel entries are needed
# and the list is never modified while it is being iterated.
ATOM_COORDINATE = []
for path in pdbPathAndName:
    file_name = path.split("/")[-1]
    sub = []
    # The with-block closes the file even if a line fails to parse.
    with open(path, 'r') as pdb_file:
        for raw_line in pdb_file:
            line = raw_line.strip()
            if "SG" in line and line.endswith("S"):
                # Split once; fields 1, 5, 6 and 7 are the ones the entry uses.
                fields = line.split()
                sub.append(file_name + "_" + fields[1] + ":" + fields[5] + ":" + fields[6] + ":" + fields[7])
    ATOM_COORDINATE.append(sub)

#Making combinations out of sublists
# Combinations never cross files because each sub-list is processed on its own.
COMBINATION = []
for sublists in ATOM_COORDINATE:
    # Sizes 2, 6, 10, ... strictly below the sub-list length, as in the
    # original loop; a six-entry sub-list therefore yields pairs only.
    for L in range(2, len(sublists), 4):
        for subset in itertools.combinations(sublists, L):
            COMBINATION.append(subset)
OUTPUT:
MainlistWithSublists:
[['E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_397:15.856:3.506:8.144', 'E3P.B99990001.pdb_424:8.558:1.315:6.627', 'E3P.B99990001.pdb_774:14.204:-5.490:24.812', 'E3P.B99990001.pdb_865:15.545:4.258:10.007', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'], ['E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_397:14.897:3.238:9.338', 'E3P.B99990002.pdb_424:5.649:5.914:8.639', 'E3P.B99990002.pdb_774:12.114:-6.864:23.897', 'E3P.B99990002.pdb_865:15.200:3.910:11.227', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589']]
Combination out of sublists:
[('E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_397:15.856:3.506:8.144'), ('E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_424:8.558:1.315:6.627'), ('E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_774:14.204:-5.490:24.812'), ('E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_865:15.545:4.258:10.007'), ('E3P.B99990001.pdb_138:6.923:0.241:6.116', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'), ('E3P.B99990001.pdb_397:15.856:3.506:8.144', 'E3P.B99990001.pdb_424:8.558:1.315:6.627'), ('E3P.B99990001.pdb_397:15.856:3.506:8.144', 'E3P.B99990001.pdb_774:14.204:-5.490:24.812'), ('E3P.B99990001.pdb_397:15.856:3.506:8.144', 'E3P.B99990001.pdb_865:15.545:4.258:10.007'), ('E3P.B99990001.pdb_397:15.856:3.506:8.144', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'), ('E3P.B99990001.pdb_424:8.558:1.315:6.627', 'E3P.B99990001.pdb_774:14.204:-5.490:24.812'), ('E3P.B99990001.pdb_424:8.558:1.315:6.627', 'E3P.B99990001.pdb_865:15.545:4.258:10.007'), ('E3P.B99990001.pdb_424:8.558:1.315:6.627', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'), ('E3P.B99990001.pdb_774:14.204:-5.490:24.812', 'E3P.B99990001.pdb_865:15.545:4.258:10.007'), ('E3P.B99990001.pdb_774:14.204:-5.490:24.812', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'), ('E3P.B99990001.pdb_865:15.545:4.258:10.007', 'E3P.B99990001.pdb_929:16.146:-6.081:24.770'), ('E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_397:14.897:3.238:9.338'), ('E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_424:5.649:5.914:8.639'), ('E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_774:12.114:-6.864:23.897'), ('E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_865:15.200:3.910:11.227'), ('E3P.B99990002.pdb_138:4.499:4.286:8.260', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589'), ('E3P.B99990002.pdb_397:14.897:3.238:9.338', 'E3P.B99990002.pdb_424:5.649:5.914:8.639'), ('E3P.B99990002.pdb_397:14.897:3.238:9.338', 'E3P.B99990002.pdb_774:12.114:-6.864:23.897'), 
('E3P.B99990002.pdb_397:14.897:3.238:9.338', 'E3P.B99990002.pdb_865:15.200:3.910:11.227'), ('E3P.B99990002.pdb_397:14.897:3.238:9.338', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589'), ('E3P.B99990002.pdb_424:5.649:5.914:8.639', 'E3P.B99990002.pdb_774:12.114:-6.864:23.897'), ('E3P.B99990002.pdb_424:5.649:5.914:8.639', 'E3P.B99990002.pdb_865:15.200:3.910:11.227'), ('E3P.B99990002.pdb_424:5.649:5.914:8.639', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589'), ('E3P.B99990002.pdb_774:12.114:-6.864:23.897', 'E3P.B99990002.pdb_865:15.200:3.910:11.227'), ('E3P.B99990002.pdb_774:12.114:-6.864:23.897', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589'), ('E3P.B99990002.pdb_865:15.200:3.910:11.227', 'E3P.B99990002.pdb_929:13.649:-6.894:22.589')]
Thanks to all
If you want to have the exact same output:
from collections import OrderedDict
# Group the lines of file.txt by their first column, keeping first-seen order.
d = OrderedDict()
with open('file.txt') as f:
    for line in f:
        splitted = line.strip().split()
        if not splitted:
            # A blank line has no key column; skip it instead of raising IndexError.
            continue
        key = splitted[0]
        d.setdefault(key, []).append(' '.join(splitted[1:]))

# Rebuild "key rest-of-line" strings, one sub-list per key.
mainList = [[key + ' ' + item for item in d[key]] for key in d]
# The parenthesised form prints the same text on Python 2 and Python 3.
print(mainList)
Output:
[['B99990001 1 2 3 4', 'B99990001 1 3 3 4'],
['B99990002 1 2 3 4', 'B99990002 1 3 3 4'],
['B99990003 1 2 3 4', 'B99990003 1 3 3 4']]
If you can, just use a dictionary:
from collections import defaultdict
s = """B99990001 1 2 3 4
B99990001 1 3 3 4
B99990002 1 2 3 4
B99990002 1 3 3 4
B99990003 1 2 3 4
B99990003 1 3 3 4"""

# Map each first-column identifier to the list of remaining text on its lines.
d = defaultdict(list)
for line in s.splitlines():
    if not line.strip():
        # A blank line cannot be unpacked into (index, values); skip it.
        continue
    # maxsplit=1 keeps everything after the identifier as one string.
    index, values = line.split(maxsplit=1)
    d[index].append(values)
Output (dictionary d):
d = {
'B99990003': ['1 2 3 4', '1 3 3 4'],
'B99990001': ['1 2 3 4', '1 3 3 4'],
'B99990002': ['1 2 3 4', '1 3 3 4'],
}
If you really need to use a list of lists instead of a dict, you can just convert this back to a list:
l = [['%s %s' % (index, value) for value in d[index]] for index in d]
You can sort it using sorted(l) if you prefer a sorted version.

R + converting a integer to a hh:mm format using regex + gsub

interval is a subset of 5 minute intervals for a 25 hour period
> interval
[1] 45 50 55 100 105 110 115 120 125 130 135 2035 2040 2045 2050 2055 2100 2105 2110 2115 2120 2125
I want to insert a : to put it in a format that I can convert to a time format
> gsub('^([0-9]{1,2})([0-9]{2})$', '\\1:\\2', interval)
[1] "45" "50" "55" "1:00" "1:05" "1:10" "1:15" "1:20" "1:25" "1:30" "1:35" "20:35" "20:40" "20:45"
[15] "20:50" "20:55" "21:00" "21:05" "21:10" "21:15" "21:20" "21:25"
I have got it working for nearly all my examples.
How do I get it so that it works on the numbers "5" ... "45" "50" "55"
Found this duplicate here but this does not use gsub
An easy way to do this would be to make sure all the inputs have at least 4 characters:
gsub('^([0-9]{1,2})([0-9]{2})$', '\\1:\\2', sprintf('%04d',interval))
# "00:45" "00:50" "00:55" "01:00" "01:05" "01:10" "01:15" "01:20" "01:25"
# "01:30" "01:35" "20:35" "20:40" "20:45" "20:50" "20:55" "21:00" "21:05"
# "21:10" "21:15" "21:20" "21:25"
Using sub:
> sub('..\\K', ':', sprintf('%04d',interval), perl=T)
# [1] "00:45" "00:50" "00:55" "01:00" "01:05" "01:10" "01:15" "01:20" "01:25"
# [10] "01:30" "01:35" "20:35" "20:40" "20:45" "20:50" "20:55" "21:00" "21:05"
# [19] "21:10" "21:15" "21:20" "21:25"

Easily parsable output from rrdtool

I'm working with a large bunch of RRD-files, where I have to query the data quite a lot - and mostly by reading all the data and pass it on.
Currently, I use rrdtool fetch <filename> CF --start XXX --end YYY, but as it only returns data for one CF at a time, I first have to do a separate query to find the CF's (= run and parse rrdtool info <filename>) and then run rrdtool fetch for each found CF. The output is trivial to parse, though.
Alternately, there is rrdtool xport DEF:XX=<filename>:RRA:CF ... XPORT:XX:XX ... with multiple "sets" of the latter commands for each thing I want. On the upside, this can give me all the data in one go, but I still need to have a fairly good idea about what data I want beforehand. Also, it only spits out XML (always a hassle to parse).
I have a feeling I'm missing something very obvious, as it simply can't be such a big hassle to get a list of timestamp → numbers out of a file... Any clues?
While there are patches around for adding JSON-support, there is currently no way around:
Parsing at least two different output formats (rrdtool info's ASCII and then either XML from rrdtool xport or tabular data from rrdtool fetch).
Dumping the entire contents of the file to XML via rrdtool dump and then re-implementing quite a bit of librrd's internals.
I've written a parser that turns the output of rrdtool info /tmp/pb_1_amp.rrd into a nested array. So from:
filename = "/tmp/pb_1_amp.rrd"
rrd_version = "0003"
step = 1800
last_update = 1372685403
header_size = 1208
ds[amp].index = 0
ds[amp].type = "GAUGE"
ds[amp].minimal_heartbeat = 3200
ds[amp].min = 0.0000000000e+00
ds[amp].max = 1.0000000000e+02
ds[amp].last_ds = "5.6"
ds[amp].value = 1.6800000000e+01
ds[amp].unknown_sec = 0
rra[0].cf = "AVERAGE"
rra[0].rows = 576
rra[0].cur_row = 385
rra[0].pdp_per_row = 1
rra[0].xff = 5.0000000000e-01
rra[0].cdp_prep[0].value = NaN
rra[0].cdp_prep[0].unknown_datapoints = 0
rra[1].cf = "AVERAGE"
rra[1].rows = 672
rra[1].cur_row = 159
rra[1].pdp_per_row = 6
rra[1].xff = 5.0000000000e-01
rra[1].cdp_prep[0].value = 1.6999833333e+01
rra[1].cdp_prep[0].unknown_datapoints = 0
rra[2].cf = "AVERAGE"
rra[2].rows = 732
rra[2].cur_row = 639
rra[2].pdp_per_row = 24
rra[2].xff = 5.0000000000e-01
rra[2].cdp_prep[0].value = 1.6999833333e+01
rra[2].cdp_prep[0].unknown_datapoints = 0
rra[3].cf = "AVERAGE"
rra[3].rows = 1460
rra[3].cur_row = 593
rra[3].pdp_per_row = 144
rra[3].xff = 5.0000000000e-01
rra[3].cdp_prep[0].value = 6.6083527778e+02
rra[3].cdp_prep[0].unknown_datapoints = 0
to:
Array
(
[filename] => /tmp/pb_1_amp.rrd
[rrd_version] => 0003
[step] => 1800
[last_update] => 1372685403
[header_size] => 1208
[ds] => Array
(
[amp] => Array
(
[index] => 0
[type] => GAUGE
[minimal_heartbeat] => 3200
[min] => 0.0000000000e+00
[max] => 1.0000000000e+02
[last_ds] => 5.6
[value] => 1.6800000000e+01
[unknown_sec] => 0
)
)
[rra] => Array
(
[0] => Array
(
[cf] => AVERAGE
[rows] => 576
[cur_row] => 385
[pdp_per_row] => 1
[xff] => 5.0000000000e-01
[cdp_prep] => Array
(
[0] => Array
(
[value] => NaN
[unknown_datapoints] => 0
)
)
)
[1] => Array
(
[cf] => AVERAGE
[rows] => 672
[cur_row] => 159
[pdp_per_row] => 6
[xff] => 5.0000000000e-01
[cdp_prep] => Array
(
[0] => Array
(
[value] => 1.6999833333e+01
[unknown_datapoints] => 0
)
)
)
[2] => Array
(
[cf] => AVERAGE
[rows] => 732
[cur_row] => 639
[pdp_per_row] => 24
[xff] => 5.0000000000e-01
[cdp_prep] => Array
(
[0] => Array
(
[value] => 1.6999833333e+01
[unknown_datapoints] => 0
)
)
)
[3] => Array
(
[cf] => AVERAGE
[rows] => 1460
[cur_row] => 593
[pdp_per_row] => 144
[xff] => 5.0000000000e-01
[cdp_prep] => Array
(
[0] => Array
(
[value] => 6.6083527778e+02
[unknown_datapoints] => 0
)
)
)
)
)
It's in PHP but it should be easy to port to any other language. Here's the code:
/**
 * Turn the lines printed by `rrdtool info <file>` into a nested array.
 *
 * Each line has the form `key = value`. The key is split on '.', '[' and ']'
 * and every resulting part becomes one nesting level. Values are stored as
 * strings with surrounding whitespace and double quotes removed.
 *
 * @param array $lines Output lines, one "key = value" entry per element.
 * @return array Nested array keyed by the parts of each key.
 */
function parse_rrdtool_info($lines) {
    $store = array();
    foreach ($lines as $line) {
        // Split on the first ' = ' only, so a value that itself contains
        // ' = ' is kept whole.
        $parts = explode(' = ', $line, 2);
        if (count($parts) !== 2) {
            // Blank or malformed line: nothing to store.
            continue;
        }
        list($raw_key, $raw_val) = $parts;
        $keys = preg_split('/[\.\[\]]/', trim($raw_key), -1, PREG_SPLIT_NO_EMPTY);
        if (empty($keys)) {
            continue;
        }
        // Walk (and create) the nested levels by reference, one key part at a time.
        $pointer = &$store;
        foreach ($keys as $key) {
            if (!array_key_exists($key, $pointer)) {
                $pointer[$key] = array();
            }
            $pointer = &$pointer[$key];
        }
        // $pointer now refers to the leaf slot. Strip whitespace first so a
        // trailing "\r" or space does not stop the quotes from being removed.
        $pointer = trim(trim($raw_val), '"');
        // Break the reference so later assignments cannot write into $store.
        unset($pointer);
    }
    return $store;
}

// Same contract as before: reads $lines, leaves the result in $store.
$store = parse_rrdtool_info(isset($lines) ? $lines : array());
It assumes the rrdtool info output is split by newline (\n) and found in $lines. Hope this helps.
If you want the 'table of contents' use rrdtool info, if you want the whole content, use rrdtool dump.
BUT ... why would you want that?
cheers
tobi