Search text for matching strings to print line AND matching string - regex

I am using awk to search a large text for matching strings. My goal is to print the matching string, line number, and matching line. I haven't been able to achieve the first part (ie. printing the matching string).
Currently I have:
awk '/string1/string2/string3/{ print NR, $0 }' file_to_search.txt
This produces the line number and matching line, but not the matching string.
Any help is appreciated.

Your question isnt clear but it sounds like this might be what you want:
awk 'match($0,/regexp/){ print substr($0,RSTART,RLENGTH), NR, $0 }' file_to_search.txt

With grep, you can get the line number and the matching string or the complete line (as I know )
grep -wnoF "string1
string2
string3" infile
With awk, you can get what you look for
awk '
function findmatch(s, i) {
for (i=1;i<=NF;i++)
{if ($i == s)
{print "find string = "s,"on line number = "FNR,"Complete line = "$0}};
}
s1{findmatch(s1)}
s2{findmatch(s2)}
s3{findmatch(s3)}
' s1='string1' s2='string2' s3='string3' infile

Related

Match strings in two files using awk and regexp

I have two files.
File 1 includes various types of SeriesDescriptions
"SeriesDescription": "Type_*"
"SeriesDescription": "OtherType_*"
...
File 2 contains information with only one SeriesDescription
"Name":"Joe"
"Age":"18"
"SeriesDescription":"Type_(Joe_text)"
...
I want to
compare the two files and find the lines that match for "SeriesDescription" and
print the line number of the matched text from File 1.
Expected Output:
"SeriesDescription": "Type_*" 24 11 (the correct line numbers in my files)
"SeriesDescription" will always be found on line 11 of File 2. I am having trouble matching given the * and have also tried changing it to .* without luck.
Code I have tried:
grep -nf File1.txt File2.txt
Successfully matches, but I want the line number from File1
awk 'FNR==NR{l[$1]=NR; next}; $1 in l{print $0, l[$1], FNR}' File2.txt File1.txt
This finds a match and prints the line number from both files, however, this is matching on the first column and prints the last line from File 1 as the match (since every line has the same column 1 for File 1).
awk 'FNR==NR{l[$2]=$3;l[$2]=NR; next}; $2 in l{print $0, l[$2], FNR}' File2.txt File1.txt
Does not produce a match.
I have also tried various settings of FS=":" without luck. I am not sure if the trouble is coming from the regex or the use of "" in the files or something else. Any help would be greatly appreciated!
With your shown samples, please try following. Written and tested in GNU awk, should work in any awk.
awk '
{ val="" }
match($0,/^[^_]*_/){
val=substr($0,RSTART,RLENGTH)
gsub(/[[:space:]]+/,"",val)
}
FNR==NR{
if(val){
arr[val]=$0 OFS FNR
}
next
}
(val in arr){
print arr[val] OFS FNR
}
' SeriesDescriptions file2
With your shown samples output will be:
"SeriesDescription": "Type_*" 1 3
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
{ val="" } ##Nullifying val here.
match($0,/^[^_]*_/){ ##Using match to match value till 1st occurrence of _ here.
val=substr($0,RSTART,RLENGTH) ##Creating val which has sub string of above matched regex.
gsub(/[[:space:]]+/,"",val) ##Globally substituting spaces with NULL in val here.
}
FNR==NR{ ##This will execute when first file is being read.
if(val){ ##If val is NOT NULL.
arr[val]=$0 OFS FNR ##Create arr with index of val, which has value of current line OFS and FNR in it.
}
next ##next will skip all further statements from here.
}
(val in arr){ ##Checking if val is present in arr then do following.
print arr[val] OFS FNR ##Printing arr value with OFS, FNR value.
}
' SeriesDescriptions file2 ##Mentioning Input_file name here.
Bonus solution: If above is working fine for you AND you have this match only once in your file2 then you can exit from program to make it quick, in that case have above code in following way.
awk '
{ val="" }
match($0,/^[^_]*_/){
val=substr($0,RSTART,RLENGTH)
gsub(/[[:space:]]+/,"",val)
}
FNR==NR{
if(val){
arr[val]=$0 OFS FNR
}
next
}
(val in arr){
print arr[val] OFS FNR
exit
}
' SeriesDescriptions file2

Removing multiple delimiters between outside delimiters on each line

Using awk or sed in a bash script, I need to remove comma separated delimiters that are located between an inner and outer delimiter. The problem is that wrong values ends up in the wrong columns, where only 3 columns are desired.
For example, I want to turn this:
2020/11/04,Test Account,569.00
2020/11/05,Test,Account,250.00
2020/11/05,More,Test,Accounts,225.00
Into this:
2020/11/04,Test Account,569.00
2020/11/05,Test Account,250.00
2020/11/05,More Test Accounts,225.00
I've tried to use a few things, testing regex:
But I cannot find a solution to only select the commas in order to remove.
awk -F, '{ printf "%s,",$1;for (i=2;i<=NF-2;i++) { printf "%s ",$i };printf "%s,%s\n",$(NF-1),$NF }' file
Using awk, print the first comma delimited field and then loop through the rest of the field up to the last but 2 field printing the field followed by a space. Then for the last 2 fields print the last but one field, a comma and then the last field.
With GNU awk for the 3rd arg to match():
$ awk -v OFS=, '{
match($0,/([^,]*),(.*),([^,]*)/,a)
gsub(/,/," ",a[2])
print a[1], a[2], a[3]
}' file
2020/11/04,Test Account,569.00
2020/11/05,Test Account,250.00
2020/11/05,More Test Accounts,225.00
or with any awk:
$ awk '
BEGIN { FS=OFS="," }
{
n = split($0,a)
gsub(/^[^,]*,|,[^,]*$/,"")
gsub(/,/," ")
print a[1], $0, a[n]
}
' file
2020/11/04,Test Account,569.00
2020/11/05,Test Account,250.00
2020/11/05,More Test Accounts,225.00
Use this Perl one-liner:
perl -F',' -lane 'print join ",", $F[0], "#F[1 .. ($#F-1)]", $F[-1];' in.csv
The Perl one-liner uses these command line flags:
-e : Tells Perl to look for code in-line, instead of in a file.
-n : Loop over the input one line at a time, assigning it to $_ by default.
-l : Strip the input line separator ("\n" on *NIX by default) before executing the code in-line, and append it when printing.
-a : Split $_ into array #F on whitespace or on the regex specified in -F option.
-F',' : Split into #F on comma, rather than on whitespace.
$F[0] : first element of the array #F (= first comma-delimited value).
$F[-1] : last element of #F.
#F[1 .. ($#F-1)] : elements of #F between the second from the start and the second from the end, inclusive.
"#F[1 .. ($#F-1)]" : the above elements, joined on blanks into a string.
join ",", ... : join the LIST "..." on a comma, and return the resulting string.
SEE ALSO:
perldoc perlrun: how to execute the Perl interpreter: command line switches
perl -pe 's{,\K.*(?=,)}{$& =~ y/,/ /r}e' file
sed -e ':a' -e 's/\(,[^,]*\),\([^,]*,\)/\1 \2/; t a' file
awk '{$1=$1","; $NF=","$NF; gsub(/ *, */,","); print}' FS=, file
awk '{for (i=2; i<=NF; ++i) $i=(i>2 && i<NF ? " " : ",") $i} 1' FS=, OFS= file
awk doesn't support look arounds, we could have it by using match function of awk; using that could you please try following, written and tested with shown samples in GNU awk.
awk '
match($0,/,.*,/){
val=substr($0,RSTART+1,RLENGTH-2)
gsub(/,/," ",val)
print substr($0,1,RSTART) val substr($0,RSTART+RLENGTH-1)
}
' Input_file
Yet another perl
$ perl -pe 's/(?:^[^,]*,|,[^,]*$)(*SKIP)(*F)|,/ /g' ip.txt
2020/11/04,Test Account,569.00
2020/11/05,Test Account,250.00
2020/11/05,More Test Accounts,225.00
(?:^[^,]*,|,[^,]*$) matches first/last field along with the comma character
(*SKIP)(*F) this would prevent modification of preceding regexp
|, provide , as alternate regexp to be matched for modification
With sed (assuming \n is supported by the implementation, otherwise, you'll have to find a character that cannot be present in the input)
sed -E 's/,/\n/; s/,([^,]*)$/\n\1/; y/,/ /; y/\n/,/'
s/,/\n/; s/,([^,]*)$/\n\1/ replace first and last comma with newline character
y/,/ / replace all comma with space
y/\n/,/ change newlines back to comma
A similar answer to Timur's, in awk
awk '
BEGIN { FS = OFS = "," }
function join(start, stop, sep, str, i) {
str = $start
for (i = start + 1; i <= stop; i++) {
str = str sep $i
}
return str
}
{ print $1, join(2, NF-1, " "), $NF }
' file.csv
It's a shame awk doesn't ship with a join function builtin

How to extract url paths recursively

i want to list all endpoints in a list of url like
https://test123.com/endpoint1/endpoint2/endpoint3
https://test456.com/endpoint1/endpoint2/endpoint3
https://test789.com/endpoint1/endpoint2/endpoint3
into output like
https://test123.com/
https://test123.com/endpoint1/
https://test123.com/endpoint1/endpoint2/
https://test123.com/endpoint1/endpoint2/endpoint3
https://test456.com/
https://test456.com/endpoint1/
https://test456.com/endpoint1/endpoint2/
https://test456.com/endpoint1/endpoint2/endpoint3
And so on, listing all endpoints recursively so i would do something with each endpoint.
I tried to use this but it print it separately.
awk '$1=$1' FS="/" OFS="\n"
thanks
Could you please try following, written and tested with shown samples.
awk '
match($0,/http[s]?:\/\/[^/]*\//){
first=substr($0,RSTART,RLENGTH)
val=substr($0,RSTART+RLENGTH)
num=split(val,array,"/")
print first
for(i=1;i<=num;i++){
value=(value?value "/":"")array[i]
print first value
}
val=first=value=""
}' Input_file
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
match($0,/http[s]?:\/\/[^/]*\//){ ##Using match function which matches http OR https :// then till first occurrence of /
first=substr($0,RSTART,RLENGTH) ##Creating first with sub-string which starts from RSTART till RLENGTH value of current line.
val=substr($0,RSTART+RLENGTH) ##Creating val which has rest of line out of match function in 3rd line of code.
num=split(val,array,"/") ##Splitting val into array with delimiter / here.
print first ##Printing first here.
for(i=1;i<=num;i++){ ##Starting for loop till value of num from i=1 here.
value=(value?value "/":"")array[i] ##Creating value which has array[i] and keep adding in its previous value to it.
print first value ##Printing first and value here.
}
val=first=value="" ##Nullify variables val, first and value here.
}
' Input_file ##Mentioning Input_file name here.
With two loops:
awk '{
x=$1 OFS $2 OFS $3 # x contains prefix https://
for(i=3; i<=NF; i++) { # NF is number of last element
printf("%s", x) # print prefix
for(j=4; j<=i; j++){
printf("%s%s", OFS, $j) # print / and single element
}
print ""
}
}' FS='/' OFS='/' file
Output:
https://test123.com
https://test123.com/endpoint1
https://test123.com/endpoint1/endpoint2
https://test123.com/endpoint1/endpoint2/endpoint3
https://test456.com
https://test456.com/endpoint1
https://test456.com/endpoint1/endpoint2
https://test456.com/endpoint1/endpoint2/endpoint3
https://test789.com
https://test789.com/endpoint1
https://test789.com/endpoint1/endpoint2
https://test789.com/endpoint1/endpoint2/endpoint3
See: 8 Powerful Awk Built-in Variables – FS, OFS, RS, ORS, NR, NF, FILENAME, FNR
$ awk -F'/' '{ep=$1 FS FS; for (i=3;i<NF;i++) print ep=ep $i FS; print ep $NF}' file
https://test123.com/
https://test123.com/endpoint1/
https://test123.com/endpoint1/endpoint2/
https://test123.com/endpoint1/endpoint2/endpoint3
https://test456.com/
https://test456.com/endpoint1/
https://test456.com/endpoint1/endpoint2/
https://test456.com/endpoint1/endpoint2/endpoint3
https://test789.com/
https://test789.com/endpoint1/
https://test789.com/endpoint1/endpoint2/
https://test789.com/endpoint1/endpoint2/endpoint3
A solution using perl.
perl -F/ -le 'print; while (3 < #F) { pop #F; print join("/", #F, "") }' input_file
Gives the following for your sample input.
https://test123.com/endpoint1/endpoint2/endpoint3
https://test123.com/endpoint1/endpoint2/
https://test123.com/endpoint1/
https://test123.com/
https://test456.com/endpoint1/endpoint2/endpoint3
https://test456.com/endpoint1/endpoint2/
https://test456.com/endpoint1/
https://test456.com/
https://test789.com/endpoint1/endpoint2/endpoint3
https://test789.com/endpoint1/endpoint2/
https://test789.com/endpoint1/
https://test789.com/
See https://perldoc.perl.org/perlrun.html#Command-Switches look for -Fpattern.

Dynamically generated regex for gsub not working

I have an input CSV file:
1,5,1
1,6,2
1,5,3
1,7,4
1,5,5
1,6,6
1,6,7
I need to create a string out of this as follows:
;5,1,3,5;6,2,6,7;7,4
So each character, except the first which is the value of the field $2, in the substring in between the ; denotes the row number of middle field; for example ;5,1,3,5 means that 5 is at row number 1,3,5.
I've been trying to use awk with gsub, trying to create the string MYSTR dynamically.
The regex inside the gsub is not working. I need a regex that will match ;$3 (the value of $3, which can be a two digit number) and replace it with ;$3,RowNO, if the pattern is not matched then add ;$3 at the end of the string.
This is what I have so far:
awk -F',' '{
print NR, $3;
noofchars=gsub(/;$3/,";"$3","NR,MYSTR);
print noofchars;
if ( noofchars == 1 )
;
else
MYSTR=MYSTR";"$3","NR;
print NR, $3;
print MYSTR;
}
END{print MYSTR;}' $1
The regex doesn't work because $3 isn't interpreted as the field #3 value but is seen as the anchor $ (that matches the end of the line) and a literal 3.
You can do it without gsub:
awk -F, '{a[$2]=a[$2]","NR}END{for (i in a){printf(";%d%s",i,a[i])}}'
Input
$ cat file
1,5,1
1,6,2
1,5,3
1,7,4
1,5,5
1,6,6
1,6,7
Output
$ awk -F, '{gsub(/[ ]+/,"",$3);a[$2] = ($2 in a ? a[$2]:$2) FS $3 }END{for(i in a)printf("%s%s",";",a[i]); print ""}' file
;5,1,3,5;6,2,6,7;7,4
Better Readable version
awk -F, '
{
gsub(/[ ]+/,"",$3); # suppress space char in third field
a[$2] = ($2 in a ? a[$2]:$2) FS $3 # array a where index being field2 and value will be field3, if index exists before append string with existing value
}
END{
for(i in a) # loop through array a and print values
printf("%s%s",";",a[i]);
print ""
}
' file
#vsshekhar: Try following too: It will provide you values in the correct same order which Input_file ($2) are coming.
awk -F, '{A[++i]=$2;B[A[i]]=B[A[i]]?B[A[i]] "," FNR:FNR} END{for(j=1;j<=i;j++){if(B[A[j]]){printf(";%s,%s",A[j],B[A[j]]);delete B[A[j]]}};print ""}' Input_file
Adding a non-one liner form of solution too now.
awk -F, '{
A[++i]=$2;
B[A[i]]=B[A[i]]?B[A[i]] "," FNR:FNR
}
END{
for(j=1;j<=i;j++){
if(B[A[j]]){
printf(";%s,%s",A[j],B[A[j]]);
delete B[A[j]]
}
};
print ""
}
' Input_file

Find specific words and replace with capitals

I need to use Unix and create an awk script. The first part of the script is to find the words "Ant" "Ass" and "Ape" in a text file and replace them with the same word but capitalized.
Do I use gsub to find each occurrence? If i do:
{gsub(/Ass/, "ASS"); print}
{gsub(/Ape/, "APE"); print}
{gsub(/Ant/, 'ANT"); print}
it just prints every line of the file 3 or 4 times... how can I search and replace these three words and then print out only the modified line?
The second part of the program is to track the number of lines with matches to Ass, Ape, or Ant and the number of substitutions made.
Thanks for your help!
Do all the substitutions in a single clause:
{subs += gsub(/Ass/, "ASS"); subs += gsub(/Ape/, "APE"); subs += gsub(/Ant/, "ANT"); print; }
END { print "Total substitutions:", subs; }
sed 's/Ant/ANT/g; s/Ass/ASS/g; s/Ape/APE/'
Another way:
awk '
BEGIN {IGNORECASE=1}
{
s = 0
while (match(substr($0, s),/ass|ape|ant/) > 0) {
c=substr($0,s + RSTART - 1,RLENGTH)
sub(c,toupper(c))
s += RSTART + RLENGTH
}
print
}' input