awk regex for nested big brackets - regex

I have a structure like this:
label1 {
label1_1 {
item1_1_1: "value1_1_1";
label1_1_2:{ item1_1_2_1: "value1_1_2_1";};
item1_1_3: "value1_1_3";
};
label1_2 {...};
...
};
label2 {
item2_1: "value2_1";
label2_1:{
item2_1_1: "value2_1_1";
...
};
};
A section can be on one line or span multiple lines, and empty lines may be present. I'm trying to use awk to get any section with a given label name:
section=$(awk -v RS='' -v ORS='\n\n' "/($2)\s(\{([^{}]|(?R)|\n)*\})/" $1)
where $1 is the file name and $2 is the label name. It works if there is no empty line in the section, for example "label2", but fails for the others.
What's the correct regex I should use?

Here's one way to do what you want, assuming neither { nor } can occur within quoted strings and using GNU awk 4.* for a couple of extensions:
$ cat tst.awk
# Print the section whose label equals the variable `label` (pass it with
# -v label=...). Needs GNU awk: 3-argument match(), \< and \s, and RS="^$".
# RS="^$" can never match inside the input, so the whole file becomes one record.
BEGIN { RS="^$" }
{
# Work on a copy so the offsets recorded below stay valid for the untouched $0.
tmp = $0
# Each pass matches one "label {...};" or "label:{...};" section whose body
# holds no braces; a[1] is the whole section and a[2] is its label.
while ( match(tmp,/(\<([[:alnum:]_]+):?\s*{[^{}]+};)/,a) ) {
start[a[2]] = RSTART
lgth[a[2]] = RLENGTH
# Overwrite the matched section with the same number of blanks: character
# positions are preserved, and the enclosing section loses its inner braces
# so that a later pass can match it in turn.
tmp = substr(tmp,1,RSTART-1) sprintf("%*s",length(a[1]),"") substr(tmp,RSTART+RLENGTH)
}
}
# If the requested label was recorded, print that span of the original text.
label in start { print substr($0,start[label],lgth[label]) }
.
$ awk -v label='label2' -f tst.awk file
label2 {
item2_1: "value2_1";
label2_1:{
item2_1_1: "value2_1_1";
...
};
};
$ awk -v label='label1_1' -f tst.awk file
label1_1 {
item1_1_1: "value1_1_1";
label1_1_2:{ item1_1_2_1: "value1_1_2_1";};
item1_1_3: "value1_1_3";
};
$ awk -v label='label1_1_2' -f tst.awk file
label1_1_2:{ item1_1_2_1: "value1_1_2_1";};
You can call awk as either awk -f scriptfile inputfile or awk 'script' inputfile so to use the above awk script inline instead of stored in a file is just:
awk '
BEGIN { RS="^$" }
{
tmp = $0
while ( match(tmp,/(\<([[:alnum:]_]+):?\s*{[^{}]+};)/,a) ) {
start[a[2]] = RSTART
lgth[a[2]] = RLENGTH
tmp = substr(tmp,1,RSTART-1) sprintf("%*s",length(a[1]),"") substr(tmp,RSTART+RLENGTH)
}
}
label in start { print substr($0,start[label],lgth[label]) }
' file

Related

How do I detect embedded field names and reorder fields using awk?

I have the following data:
"b":1.14105,"a":1.14106,"x":48,"t":1594771200000
"a":1.141,"b":1.14099,"x":48,"t":1594771206000
...
I am trying to display data in a given order and only for three fields. As the fields order is not guaranteed, I need to read the "tag" for each comma separated column for each line.
I have tried to solve this task using awk:
awk -F',' '
{
for(i=1; i<=$NF; i++) {
if(index($i,"\"a\":")!=0) a=$i;
if(index($i,"\"b\":")!=0) b=$i;
if(index($i,"\"t\":")!=0) t=$i;
}
printf("%s,%s,%s\n",a,b,t);
}
'
But I get:
,,
,,
...
In the above data sample, I would expect:
"a":1.14106,"b":1.14105,"t":1594771200000
"a":1.141,"b":1.14099,"t":1594771206000
...
Note: I am using the awk shipped with FreeBSD
$ cat tst.awk
# Print only the "a", "b" and "t" columns of each line, in that order,
# whatever order the tagged "name":value columns arrive in.
BEGIN {
    FS = "[,:]"     # split on both separators, so fields alternate tag, value
    OFS = ","
}
{
    # Empty the lookup table for every record (portable idiom). Without this,
    # a tag missing from the current line would print the value remembered
    # from an earlier line.
    split("", f)
    for (i = 1; i < NF; i += 2) {
        f[$i] = $(i+1)      # the key keeps its surrounding double quotes
    }
    print p("a"), p("b"), p("t")
}

# p(tag): return the column for `tag` formatted as "tag":value.
# `t` is a local variable (declared as an extra parameter), not an argument.
function p(tag, t) {
    t = "\"" tag "\""
    return t ":" f[t]
}
.
$ awk -f tst.awk file
"a":1.14106,"b":1.14105,"t":1594771200000
"a":1.141,"b":1.14099,"t":1594771206000
With awk and an array:
awk -F '[:,]' '{for(i=1; i<=NF; i=i+2){a[$i]=$(i+1)}; print "\"a\":" a["\"a\""] ",\"b\":" a["\"b\""] ",\"t\":" a["\"t\""]}' file
or
awk -F '[":,]' '{for(i=2; i<=NF; i=i+4){a[$i]=$(i+2)}; print "\"a\":" a["a"] ",\"b\":" a["b"] ",\"t\":" a["t"]}' file
Output:
"a":1.14106,"b":1.14105,"t":1594771200000
"a":1.141,"b":1.14099,"t":1594771206000
A similar awk command, where you can specify the fields and their order:
$ awk -F[:,] -v fields='"a","b","t"' 'BEGIN{n=split(fields,f)}
{for(i=1;i<NF;i+=2) map[$i]=$(i+1);
for(i=1;i<=n;i++) printf "%s", f[i]":"map[f[i]] (i==n?ORS:",")}' file
"a":1.14106,"b":1.14105,"t":1594771200000
"a":1.141,"b":1.14099,"t":1594771206000

Replace variable length string with characters to matching original string length

I am using a regular expression:
>\.*<
to match certain parts of field3 but I can't figure out how to replace with a number of characters that would preserve the original string length.
Input:
field1 field2 >>>>>.>............>>>.........<<<.......>>>>.......<<<<.<.<<<<<.
Expected output:
field1 field2 >>>>>.>............>>LLLLLLLLLLL<<.......>>>LLLLLLLLL<<<.<.<<<<<.
My poor failing attempt:
awk 'match($3, />\.*</){split($3, sst, "");for(i=RSTART;i<=RLENGTH;i++){sst[i]="L"};joined=sep="";for(x=1; x in sst;x++){joined=joined sep sst[x];sep=""};printf("%s\n", joined)}' hg19-matRNA.tsv > test2.tsv
Any help would be greatly appreciated!
With GNU awk for the 3rd arg to match() and gensub():
$ cat tst.awk
# GNU awk: replace every ">", optional dots, "<" run in field 3 with the same
# number of "L" characters.
{
# The leading (.*) is greedy, so each pass handles the last remaining run;
# the loop stops once no such run is left in $3.
while ( match($3,/(.*)(>\.*<)(.*)/,a) ) {
# a[2] is the run itself; gensub turns every character of it into "L".
$3 = a[1] gensub(/./,"L","g",a[2]) a[3]
}
print
}
$ awk -f tst.awk file
field1 field2 >>>>>.>............>>LLLLLLLLLLL<<.......>>>LLLLLLLLL<<<.<.<<<<<.
With any awk:
$ cat tst.awk
# Any awk: replace every ">", optional dots, "<" run in field 3 with the same
# number of "L" characters.
{
# match() sets RSTART/RLENGTH for the leftmost run; repeat until none is left.
while ( match($3,/>\.*</) ) {
tgt = substr($3,RSTART,RLENGTH)
# Turn every character of the run, including the ">" and "<", into "L".
gsub(/./,"L",tgt)
# Splice the replacement back between the text before and after the run.
$3 = substr($3,1,RSTART-1) tgt substr($3,RSTART+RLENGTH)
}
print
}
$ awk -f tst.awk file
field1 field2 >>>>>.>............>>LLLLLLLLLLL<<.......>>>LLLLLLLLL<<<.<.<<<<<.
Another GNU awk solution: you could use patsplit as well, like this:
$ cat tst.awk
# GNU awk (patsplit): replace every ">", dots, "<" run in field 3 with the
# same number of "L" characters, then print the record.
{
    # Start from an empty string for every record; otherwise the text rebuilt
    # for earlier lines is prepended to field 3 of each later line.
    s = ""
    # a[1..n] receive the matched runs, seps[0..n] the text around them.
    patsplit($3, a, ">\\.+<", seps)
    l = (length(a) > length(seps) ? length(a) : length(seps))
    for (i = 0; i < l; i++) {
        if (i in a) gsub(/./, "L", a[i])
        # Re-assemble in order: run i (if any) followed by the text after it.
        s = s ((i in a) ? a[i] seps[i] : seps[i])
    }
    $3 = s
}1
$ awk -f tst.awk file
field1 field2 >>>>>.>............>>LLLLLLLLLLL<<.......>>>LLLLLLLLL<<<.<.<<<<<.

how to replace everything except string of interest

file.txt
fruits:banana,apple,grape,limon,orange,tomate,
fruits:apple,limon,
fruits:banana,grape,limon,
fruits:orange,tomate,grape,
fruits:banana,
fruits:apple,
fruits:banana,apple,
I need to replace everything other than "banana" with FRUIT, and get output like this:
fruits:banana,FRUIT,FRUIT,FRUIT,FRUIT,FRUIT,
fruits:FRUIT,FRUIT,
fruits:banana,FRUIT,FRUIT,
fruits:FRUIT,FRUIT,FRUIT,
fruits:banana,
fruits:FRUIT,
fruits:banana,FRUIT,
I tried using awk, but I can only replace the fields of specific strings.
For example, I can replace every "apple" with FRUIT2, or every "apple" with FRUIT2 and every "tomate" or "orange" with FRUIT3:
awk -F":" '{ gsub(/apple/,"FRUIT2",$2); print }' OFS="," file.tx
or
awk -F":" '{ gsub(/apple/,"FRUIT2",$2);;gsub(/tomate|orange/,"FRUIT3",$2); print }' OFS="," file.txt |sed "s/./:/7"
fruits:banana,FRUIT2,grape,limon,FRUIT3,FRUIT3,
fruits:FRUIT2,limon,
fruits:banana,grape,limon,
fruits:FRUIT3,FRUIT3,grape,
fruits:banana,
fruits:FRUIT2,
fruits:banana,FRUIT2
but what I really need is to also replace everything else with some other string, e.g. FRUIT4.
How to generate output like this?
fruits:FRUIT4,FRUIT2,FRUIT4,FRUIT4,FRUIT3,FRUIT3,
fruits:FRUIT2,FRUIT4,
fruits:FRUIT4,FRUIT4,FRUIT4,
fruits:FRUIT3,FRUIT3,FRUIT4,
fruits:FRUIT4,
fruits:FRUIT2,
fruits:FRUIT4,FRUIT2
This awk should work:
awk -F, -v OFS=, '{
for (i=1; i<=NF; i++)
if ($i !~ /(^|:)banana$/)
sub(/[^:]+$/, "FRUIT", $i)
} 1' file
Output:
fruits:banana,FRUIT,FRUIT,FRUIT,FRUIT,FRUIT,
fruits:FRUIT,FRUIT,
fruits:banana,FRUIT,FRUIT,
fruits:FRUIT,FRUIT,FRUIT,
fruits:banana,
fruits:FRUIT,
fruits:banana,FRUIT,
To make the process automated, you can do
awk -F '[:,]' -v OFS=, '
{
for (i=2; i<=NF; i++)
if ($i)
if (seen[$i])
$i = seen[$i]
else
$i = seen[$i] = "FRUIT" ++n
sub(OFS, ":")
print
}
END {
print "map:"
for (key in seen)
print key "\t" seen[key]
}
' file
fruits:FRUIT1,FRUIT2,FRUIT3,FRUIT4,FRUIT5,FRUIT6,
fruits:FRUIT2,FRUIT4,
fruits:FRUIT1,FRUIT3,FRUIT4,
fruits:FRUIT5,FRUIT6,FRUIT3,
fruits:FRUIT1,
fruits:FRUIT2,
fruits:FRUIT1,FRUIT2,
map:
orange FRUIT5
tomate FRUIT6
apple FRUIT2
limon FRUIT4
banana FRUIT1
grape FRUIT3
If you'd like some flexibility in being able to specify your mapping of old to new names on the command line:
$ cat tst.awk
# Rename list items according to the variable `map` (pass it with -v map=...),
# a comma-separated list of old,new pairs. The old name "*" is the fallback
# used for every item that has no mapping of its own.
BEGIN {
FS="[:,]"; OFS=","
# split() uses FS, so t[1],t[2] / t[3],t[4] / ... are the old,new pairs.
split(map,t)
for (i=1; i in t; i+=2) {
m[t[i]] = t[i+1]
}
}
{
# $1 is the part before the colon; it is printed unchanged.
printf "%s:", $1
for (i=2;i<=NF;i++) {
# NOTE: the empty field that follows a trailing comma is not in m, so when a
# "*" mapping exists it is replaced too (visible as the extra last item in
# the sample output).
if ($i in m ) { $i = m[$i] }
else if ("*" in m) { $i = m["*"] }
printf "%s%s", $i, (i<NF?OFS:ORS)
}
}
.
$ awk -v map='apple,FRUIT2,tomate,FRUIT3,*,FRUIT4' -f tst.awk file
fruits:FRUIT4,FRUIT2,FRUIT4,FRUIT4,FRUIT4,FRUIT3,FRUIT4
fruits:FRUIT2,FRUIT4,FRUIT4
fruits:FRUIT4,FRUIT4,FRUIT4,FRUIT4
fruits:FRUIT4,FRUIT3,FRUIT4,FRUIT4
fruits:FRUIT4,FRUIT4
fruits:FRUIT2,FRUIT4
fruits:FRUIT4,FRUIT2,FRUIT4
$ awk -v map='apple,BAZINGA,*,VEGGIE' -f tst.awk file
fruits:VEGGIE,BAZINGA,VEGGIE,VEGGIE,VEGGIE,VEGGIE,VEGGIE
fruits:BAZINGA,VEGGIE,VEGGIE
fruits:VEGGIE,VEGGIE,VEGGIE,VEGGIE
fruits:VEGGIE,VEGGIE,VEGGIE,VEGGIE
fruits:VEGGIE,VEGGIE
fruits:BAZINGA,VEGGIE
fruits:VEGGIE,BAZINGA,VEGGIE
$ awk -v map='apple,FRUIT2,tomate,FRUIT3' -f tst.awk file
fruits:banana,FRUIT2,grape,limon,orange,FRUIT3,
fruits:FRUIT2,limon,
fruits:banana,grape,limon,
fruits:orange,FRUIT3,grape,
fruits:banana,
fruits:FRUIT2,
fruits:banana,FRUIT2,

filtering some text from line using sed linux

I have a following content in the file:
NAME=ALARMCARDSLOT137 TYPE=2 CLASS=116 SYSPORT=2629 STATE=U ALARM=M APPL=" " CRMPLINK=CHASSIS131 DYNDATA="GL:1,15 ADMN:1 OPER:2 USAG:2 STBY:0 AVAL:0 PROC:0 UKNN:0 INH:0 ALM:20063;1406718801,"
I just want to filter out the NAME, SYSPORT and ALM fields using sed.
Try the sed command below to filter out the NAME, SYSPORT and ALM fields:
$ sed 's/.*\(NAME=[^ ]*\).*\(SYSPORT=[^ ]*\).*\(ALM:[^;]*\).*/\1 \2 \3/g' file
NAME=ALARMCARDSLOT137 SYSPORT=2629 ALM:20063
Why not use grep?
grep -oE 'NAME=\S*|SYSPORT=\S*|ALM:[^;]*'
test with your text:
kent$ echo 'NAME=ALARMCARDSLOT137 TYPE=2 CLASS=116 SYSPORT=2629 STATE=U ALARM=M APPL=" " CRMPLINK=CHASSIS131 DYNDATA="GL:1,15 ADMN:1 OPER:2 USAG:2 STBY:0 AVAL:0 PROC:0 UKNN:0 INH:0 ALM:20063;1406718801,"'|grep -oE 'NAME=\S*|SYSPORT=\S*|ALM:[^;]*'
NAME=ALARMCARDSLOT137
SYSPORT=2629
ALM:20063
Here is another awk
awk -F" |;" -v RS=" " '/NAME|SYSPORT|ALM/ {print $1}'
NAME=ALARMCARDSLOT137
SYSPORT=2629
ALM:20063
Whenever there are name=value pairs in input files, I find it best to first create an array mapping the names to the values and then operating on the array using the names of the fields you care about. For example:
$ cat tst.awk
# Build lookup tables from a record of NAME=value pairs, then print selected
# entries. Double-quoted values may contain blanks and NAME:value sub-pairs.

# bldN2Varrs(): fill name2value[] (NAME -> value) and subName2value[]
# (sub-name -> sub-value) from the current record. It takes no arguments;
# every name in the parameter list is a local variable.
function bldN2Varrs( i, fldarr, fldnr, subarr, subnr, tmp ) {
# With FS="\"" the even-numbered fields are the quoted strings; swap their
# blanks for RS so the blank-split below keeps each quoted value in one piece.
for (i=2;i<=NF;i+=2) { gsub(/ /,RS,$i) }
split($0,fldarr,/[[:blank:]]+/)
for (fldnr in fldarr) {
split(fldarr[fldnr],tmp,/=/)
# Put the blanks back and strip the surrounding double quotes.
gsub(RS," ",tmp[2])
gsub(/^"|"$/,"",tmp[2])
name2value[tmp[1]] = tmp[2]
# Index the blank-separated NAME:value sub-pairs by sub-name alone; this
# table is shared by all fields.
split(tmp[2],subarr,/ /)
for (subnr in subarr) {
split(subarr[subnr],tmp,/:/)
subName2value[tmp[1]] = tmp[2]
}
}
}
# prt(fld, subfld): print "fld/subfld=value" from subName2value[] when subfld
# is given, otherwise "fld=value" from name2value[].
function prt( fld, subfld ) {
if (subfld) print fld "/" subfld "=" subName2value[subfld]
else print fld "=" name2value[fld]
}
BEGIN { FS=OFS="\"" }
{
bldN2Varrs()
prt("NAME")
prt("SYSPORT")
prt("DYNDATA","ALM")
}
.
$ awk -f tst.awk file
NAME=ALARMCARDSLOT137
SYSPORT=2629
DYNDATA/ALM=20063;1406718801,
If 20063;1406718801, isn't the desired value for the ALM field and you just want some subsection of that, simply tweak the array construction function to suit whatever your criteria are.

Bash command to match n line

I have an index HTML file with file/dir listing. It is just a usual filebrowser like :
...content here...
<td>20120011/</td>
<td>20120111/</td>
<td>20120211/</td>
<td>20120411/</td>
...content here...
I don't understand how to extract the 2nd line from the bottom.
1) I downloaded HTML with curl
content=$(curl -sL "http://path-to-html")
2) then used
dir=$(echo $content | sed '/.*href="\([0-9]*\/\)".*/!d;s//\1/;q')
which gives me the last match : 20120411.
But how to get the previous one ?
I don't know the total count of items.
This awk program will print the penultimate line:
# Quote the expansion: unquoted, the shell joins all lines into one and awk never sees a penultimate line.
printf '%s\n' "${content}" | awk '{ pen = ult; ult = $0 } END { print pen }'
This will print the penultimate matching line:
# Quote the expansion so awk receives the page line by line; only lines with an 8-digit directory link are remembered.
printf '%s\n' "${content}" | awk '/href="([0-9]{8}\/)"/ { pen = ult; ult = $0 } END { print pen }'
If you just want to extract the first capture group:
# Quote the expansion so awk receives the page line by line; a[1] is the captured directory name (GNU awk 3-argument match).
printf '%s\n' "${content}" | awk 'match($0, /href="([0-9]{8}\/)"/, a) { pen = ult; ult = a[1] } END { print pen }'
Putting it all together:
bash-4.2$ dir=$(curl -sL http://www.arteetmarte.no/tmp/index.html |
awk 'match($0, /href="([0-9]{8}\/)"/, a) {
pen = ult
ult = a[1]
}
END {
print pen
}
')
bash-4.2$ echo ${dir}
20130918/
Tested with: GNU Awk 4.1.0, API: 1.0
Maybe a bit easier with awk:
dir=$(echo "$content"|awk '/href=/{x=p;p=$0}END{sub(/.*">/,"",x);sub(/<.*/, "",x); print x}')
# Collect every linked directory as "-NAME-" in the hold space; on the last
# line, print the entry that is followed by exactly one more entry.
# "$content" is quoted so sed sees the page line by line, and nothing but the
# result is printed so that dir holds only the directory name.
dir=$(printf '%s\n' "$content" | sed -n '/href="\([0-9]\{1,\}\/\)"/ {s|.*href="\([0-9]\{1,\}/\)".*|-\1-|;H;}
$ {x;s|.*-\([0-9]\{1,\}/\)-\(\n-[0-9]\{1,\}/-\)\{1\}$|\1|p;}')
The 1 in \{1\}$ specifies how many lines must be skipped from the end.