Find patterns in strings and concatenate unique parts in R - regex

Conciser having 2 strings s1 and s2. How is it possible to concatenate only those parts in these strings, which are unique, and leave non unique parts as is. Let / be the separator between concatenated parts. The desired result looks like this:
s1 <- "very big house"
s2 <- "very small house"
some_function(s1,s2)
"very big/small house" #// desired result.
Usually leading and tailing charters are the same for both strings and the differences appear only in the middle.
More examples of desired result:
# a
s1 <- "1b"; s2 <- "2b"
"1/2b" # <- new string
# b
s1 <- "a_1_b"; s2 <- "a_2_b"
"a_1/2_b" # <- new string
# c
s1 <- "a"; s2 <- "b"
"a/b" # <- new string

Here's a solution:
pasteMergePrefixAndSuffix <- function(vs1,vs2,sep=' ') {
## cycle string vectors to same length
vsl <- max(length(vs1),length(vs2));
vs1 <- rep(vs1,len=vsl);
vs2 <- rep(vs2,len=vsl);
## precompute character splits
ss1 <- strsplit(vs1,'');
ss2 <- strsplit(vs2,'');
## iterate over each pair of strings
sapply(seq_along(vs1),function(si) {
s1 <- vs1[si];
s2 <- vs2[si];
s1l <- length(ss1[[si]]);
s2l <- length(ss2[[si]]);
slmin <- min(s1l,s2l);
## handle trivial case of exact equality
if (s1==s2) return(s1);
## get prefix and suffix lengths
if (slmin==0L) { ## empty string cannot result in a prefix or suffix
pl <- sl <- 0L;
} else {
eq <- ss1[[si]][seq_len(slmin)]==ss2[[si]][seq_len(slmin)];
pl <- if (all(eq)) slmin else if (eq[1L]==T) which(!eq)[1L]-1L else 0L;
eq <- rev(ss1[[si]])[seq_len(slmin)]==rev(ss2[[si]])[seq_len(slmin)];
sl <- if (all(eq)) slmin else if (eq[1L]==T) which(!eq)[1L]-1L else 0L;
}; ## end if
## paste together prefix, sep-pasted middles, and suffix
m1 <- substr(s1,pl+1L,s1l-sl);
m2 <- substr(s2,pl+1L,s2l-sl);
paste0(substr(s1,1L,pl),paste(sep=sep,m1,m2),substr(s1,s1l-sl+1L,s1l));
});
}; ## end pasteMergePrefixAndSuffix()
Demo:
pasteMergePrefixAndSuffix('a','b');
## [1] "a b"
pasteMergePrefixAndSuffix('a','b','/');
## [1] "a/b"
s1 <- 'very big house'; s2 <- 'very small house'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "very big/small house"
s1 <- '1b'; s2 <- '2b'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "1/2b"
s1 <- 'a_1_b'; s2 <- 'a_2_b'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "a_1/2_b"
s1 <- 'ab'; s2 <- 'ab'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "ab"
s1 <- 'xab'; s2 <- 'ab'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "x/ab"
s1 <- 'ab'; s2 <- 'abx'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "ab/x"
s1 <- 'abx'; s2 <- 'ab'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "abx/"
s1 <- 'ab'; s2 <- 'xab'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "/xab"
s1 <- ''; s2 <- 'x'; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "/x"
s1 <- 'x'; s2 <- ''; pasteMergePrefixAndSuffix(s1,s2,'/');
## [1] "x/"

Related

removing consecutive duplicates in strings R

I'd like to collapse two strings s1 = "word1 word2 word3" and s2 = "word2 word3 word4" but removing the extra (future) consecutive overlap/duplicate ("word2 word3"). That is, I should obtain s = "word1 word2 word3 word4" rather than s = "word1 word2 word3 word2 word3 word4".
More simply, it should also work for single-word overlaps: s1 = "word1 word2" and s2 = "word2 word3" should give me s = word1 word2 word3" rather than s = "word1 word2 word2 word3".
I am using wordnumber for illustration purposes but of course it should work for any word...
Use unique on the result, that should remove all the duplicates.
And perhaps also use sort?
EDIT: Sorry, my first answer did miss the point completely. Here's a revised solution based on the stringr-package, that I think should work. The idea is to first split the strings into vectors, then compare the vectors and check if an overlap is present - finally join the vectors based on whether or not an overlap was detected.
s1 = "word1 word2 word3"
s2 = "word2 word3 word4"
library(stringr)
.s1_splitted <- str_split(
string = s1,
pattern = "\ +")[[1]]
.s2_splitted <- str_split(
string = s2,
pattern = "\ +")[[1]]
.matches12 <- charmatch(
x = .s1_splitted,
table = .s2_splitted)
If the last number is different from NA, and shorter than the
length of .s1_splitted, then check if the end of the vector
looks like it ought to do.
.last_element <- tail(.matches12, n = 1)
if (! is.na(.last_element)) {
if (.last_element <= length(.s1_splitted)) {
.overlap <- identical(
x = 1:.last_element,
y = tail(x = .matches12,
n = .last_element))
}
} else
.overlap <- FALSE
Join the components, based on overlap.
if (.overlap) {
.joined <- c(
head(x = .s1_splitted,
n = - .last_element),
.s2_splitted)
} else
.joined <- c(.s1_splitted,
.s2_splitted)
Convert back to a string
.result <- paste(.joined, collapse = " ")
This was surprisingly difficult, but I believe I have a solution:
sjoin <- function(s1,s2) {
ss1 <- strsplit(s1,'\\s+')[[1L]];
ss2 <- strsplit(s2,'\\s+')[[1L]];
if (length(ss1)==0L) return(s2);
if (length(ss2)==0L) return(s1);
n <- 0L; for (i in seq(min(length(ss1),length(ss2)),1L))
if (all(ss1[seq(to=length(ss1),len=i)]==ss2[seq(1L,len=i)])) {
n <- i;
break;
}; ## end if
paste(collapse=' ',c(ss1,if (n==0L) ss2 else ss2[-1:-n]));
}; ## end sjoin()
sjoin('1 2 3','2 3 4');
## [1] "1 2 3 4"
sjoin('1 2 3 x','2 3 4');
## [1] "1 2 3 x 2 3 4"
sjoin('1 2 3','x 2 3 4');
## [1] "1 2 3 x 2 3 4"
sjoin('','')
## [1] ""
sjoin('a','');
## [1] "a"
sjoin('','a');
## [1] "a"
sjoin('a','a')
## [1] "a"
sjoin('a b c','a b c');
## [1] "a b c"
sjoin('a b c','c');
## [1] "a b c"
sjoin('a b c','c d');
## [1] "a b c d"
sjoin('b','b c d');
## [1] "b c d"
sjoin('a b','b c d');
## [1] "a b c d"

Insert a character at multiple positions in a string at once

Let us say I have a string
"ABCDEFGHI56dfsdfd"
What I want to do is insert a space character at multiple positions at once.
For eg. I want to insert space character at randomly chosen two positions say 4 and 8.
So the output should be
"ABCD EFGH I56dfsdfd"
What is the most effective way of doing this? Given the string can have any type of characters in it (not just the alphabets).
Here's a solution based on regular expressions:
vec <- "ABCDEFGHI56dfsdfd"
# sample two random positions
pos <- sample(nchar(vec), 2)
# [1] 6 4
# generate regex pattern
pat <- paste0("(?=.{", nchar(vec) - pos, "}$)", collapse = "|")
# [1] "(?=.{11}$)|(?=.{13}$)"
# insert spaces at (after) positions
gsub(pat, " ", vec, perl = TRUE)
# [1] "ABCD EF GHI56dfsdfd"
This approach is based on positive lookaheads, e.g., (?=.{11}$). In this example, a space is inserted at 11 characters before the end of the string ($).
A bit more brute-force-y than Sven's:
randomSpaces <- function(txt) {
pos <- sort(sample(nchar(txt), 2))
paste(substr(txt, 1, pos[1]), " ",
substr(txt, pos[1]+1, pos[2]), " ",
substr(txt, pos[2]+1, nchar(txt)), collapse="", sep="")
}
for (i in 1:10) print(randomSpaces("ABCDEFGHI56dfsdfd"))
## [1] "ABCDEFG HI56 dfsdfd"
## [1] "ABC DEFGHI5 6dfsdfd"
## [1] "AB CDEFGHI56dfsd fd"
## [1] "ABCDEFGHI 5 6dfsdfd"
## [1] "ABCDEF GHI56dfsdf d"
## [1] "ABC DEFGHI56dfsdf d"
## [1] "ABCD EFGHI56dfsd fd"
## [1] "ABCDEFGHI56d fsdfd "
## [1] "AB CDEFGH I56dfsdfd"
## [1] "A BCDE FGHI56dfsdfd"
Based on the accepted answer, here's a function that simplifies this approach:
##insert pattern in string at position
substrins <- function(ins, x, ..., pos=NULL, offset=0){
stopifnot(is.numeric(pos),
is.numeric(offset),
!is.null(pos))
offset <- offset[1]
pat <- paste0("(?=.{", nchar(x) - pos - (offset-1), "}$)", collapse = "|")
gsub(pattern = pat, replacement = ins, x = x, ..., perl = TRUE)
}
# insert space at position 10
substrins(" ", "ABCDEFGHI56dfsdfd", pos = 10)
##[1] "ABCDEFGHI 56dfsdfd"
# insert pattern before position 10 (i.e. at position 9)
substrins(" ", "ABCDEFGHI56dfsdfd", pos = 10, offset=-1)
##[1] "ABCDEFGH I56dfsdfd"
# insert pattern after position 10 (i.e. at position 11)
substrins(" ", "ABCDEFGHI56dfsdfd", pos = 10, offset=1)
##[1] "ABCDEFGHI5 6dfsdfd"
Now to do what the OP wanted:
# insert space at position 4 and 8
substrins(" ", "ABCDEFGHI56dfsdfd", pos = c(4,8))
##[1] "ABC DEFG HI56dfsdfd"
# insert space after position 4 and 8 (as per OP's desired output)
substrins(" ", "ABCDEFGHI56dfsdfd", pos = c(4,8), offset=1)
##[1] "ABCD EFGH I56dfsdfd"
To replicate the other, more brute-force-y answer one would do:
set.seed(123)
x <- "ABCDEFGHI56dfsdfd"
for (i in 1:10) print(substrins(" ", x, pos = sample(nchar(x), 2)))
##[1] "ABCD EFGHI56d fsdfd"
##[1] "ABCDEF GHI56dfs dfd"
##[1] " ABCDEFGHI56dfsd fd"
##[1] "ABCDEFGH I56dfs dfd"
##[1] "ABCDEFG HI 56dfsdfd"
##[1] "ABCDEFG HI56dfsdf d"
##[1] "ABCDEFGHI 56 dfsdfd"
##[1] "A BCDEFGHI56dfs dfd"
##[1] " ABCD EFGHI56dfsdfd"
##[1] "ABCDE FGHI56dfsd fd"

How do Capture Groups Beyond \9 get Referenced in R?

Is it possible in R to capture groups >9 in a regular expression?
sub("(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)", "\\1 & \\9",
"abc-02-03-04-05-06-07-08-09")
gives
[1] "abc & 09"
which is expected result, but
sub("(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)", "\\1 & \\10",
"abc-02-03-04-05-06-07-08-09-10")
[1] "abc & abc0"
fails, as the expected result would have been
[1] "abc & 10"
I need this for a function like the following, which works fine for up to 9 formats but no more:
x <- as.Date(c("2005-09-02", "2012-04-08"))
fmt <- "dddd, d.m.yy"
fmt <- gsub(pattern = "dddd", replacement = "\\\\1", x = fmt)
fmt <- gsub(pattern = "ddd", replacement = "\\\\2", x = fmt)
fmt <- gsub(pattern = "dd", replacement = "\\\\3", x = fmt)
fmt <- gsub(pattern = "d", replacement = "\\\\4", x = fmt)
fmt <- gsub(pattern = "mmmm", replacement = "\\\\5", x = fmt)
fmt <- gsub(pattern = "mmm", replacement = "\\\\6", x = fmt)
fmt <- gsub(pattern = "mm", replacement = "\\\\7", x = fmt)
fmt <- gsub(pattern = "m", replacement = "\\\\8", x = fmt)
fmt <- gsub(pattern = "yyyy", replacement = "\\\\9", x = fmt)
fmt <- gsub(pattern = "yy", replacement = "\\\\10", x = fmt)
fmt <- gsub(pattern = "y", replacement = "\\\\11", x = fmt)
fmt
sub("(.+)-(.+)-(.+)-0?(.+)-(.+)-(.+)-(.+)-0?(.+)-(.+)-(.+)-0?(.+)", fmt,
format(x, "%A-%a-%d-%d-%B-%b-%m-%m-%Y-%y-%y"))
Its important to note that the limit is nine backreferences; you get unlimited captures. By using str_match from stringr (or, more clunkily, regmatches from base R), you can always restructure your code to avoid having to use backreferences.
library(stringr)
(matches <- str_match(
"abc-02-03-04-05-06-07-08-09-10",
"(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)-(.+)")
)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] "abc-02-03-04-05-06-07-08-09-10" "abc" "02" "03" "04" "05" "06" "07" "08" "09" "10"
paste(matches[, 2], matches[, 11], sep = " & ")
## [1] "abc & 10"
As Dason indicates, you're better off splitting the string and taking the desired elements.
elements <- c(1,10)
paste(strsplit("abc-02-03-04-05-06-07-08-09-10", '-')[[1]][elements], collapse=' & ')
## [1] "abc & 10"
This can be vectorized with sapply, if needed:
sapply(strsplit("abc-02-03-04-05-06-07-08-09-10", '-'), function(x) paste(x[elements], collapse=' & '))

regex grab from beginning to n occurrence of character

I'm really putting time into learning regex and I'm playing with different toy scenarios. One setup I can't get to work is to grab from the beginning of a string to n occurrence of a character where n > 1.
Here I can grab from the beginning of the string to the first underscore but I can't generalize this to the second or third underscore.
x <- c("a_b_c_d", "1_2_3_4", "<_?_._:")
gsub("_.*$", "", x)
Here's what I'm trying to achieve with regex. (`sub`/`gsub`):
## > sapply(lapply(strsplit(x, "_"), "[", 1:2), paste, collapse="_")
## [1] "a_b" "1_2" "<_?"
#or
## > sapply(lapply(strsplit(x, "_"), "[", 1:3), paste, collapse="_")
## [1] "a_b_c" "1_2_3" "<_?_."
Related post: regex from first character to the end of the string
Here's a start. To make this safe for general use, you'll need it to properly escape regular expressions' special characters:
x <- c("a_b_c_d", "1_2_3_4", "<_?_._:", "", "abcd", "____abcd")
matchToNth <- function(char, n) {
others <- paste0("[^", char, "]*") ## matches "[^_]*" if char is "_"
mainPat <- paste0(c(rep(c(others, char), n-1), others), collapse="")
paste0("(^", mainPat, ")", "(.*$)")
}
gsub(matchToNth("_", 2), "\\1", x)
# [1] "a_b" "1_2" "<_?" "" "abcd" "_"
gsub(matchToNth("_", 3), "\\1", x)
# [1] "a_b_c" "1_2_3" "<_?_." "" "abcd" "__"
How about:
gsub('^(.+_.+?).*$', '\\1', x)
# [1] "a_b" "1_2" "<_?"
Alternatively you can use {} to indicate the number of repeats...
sub('((.+_){1}.+?).*$', '\\1', x) # {0} will give "a", {1} - "a_b", {2} - "a_b_c" and so on
So you don't have to repeat yourself if you wanted to match the nth one...
second underscore in perl style regex:
/^(.?_.?_)/
and third:
/^(.*?_.*?_.*?_)/
Maybe something like this
x
## [1] "a_b_c_d" "1_2_3_4" "<_?_._:"
gsub("(.*)_", "\\1", regmatches(x, regexpr("([^_]*_){1}", x)))
## [1] "a" "1" "<"
gsub("(.*)_", "\\1", regmatches(x, regexpr("([^_]*_){2}", x)))
## [1] "a_b" "1_2" "<_?"
gsub("(.*)_", "\\1", regmatches(x, regexpr("([^_]*_){3}", x)))
## [1] "a_b_c" "1_2_3" "<_?_."
Using Justin's approach this was what I devised:
beg2char <- function(text, char = " ", noc = 1, include = FALSE) {
inc <- ifelse(include, char, "?")
specchar <- c(".", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?")
if(char %in% specchar) {
char <- paste0("\\", char)
}
ins <- paste(rep(paste0(char, ".+"), noc - 1), collapse="")
rep <- paste0("^(.+", ins, inc, ").*$")
gsub(rep, "\\1", text)
}
x <- c("a_b_c_d", "1_2_3_4", "<_?_._:")
beg2char(x, "_", 1)
beg2char(x, "_", 2)
beg2char(x, "_", 3)
beg2char(x, "_", 4)
beg2char(x, "_", 3, include=TRUE)

R : How to search for a regex in a vector over elements outwardly?

Is it possible in R to search for a regex in a vector as if all the elements are a collapsed single element? If we collapse all the elements into one to do this, it becomes impossible to put them back to their element-wise form after the search.
here is a vector.
vector<-c("I", "met", "a", "cow")
now, the search word is "meta" (elements 2 and 3 collapsed).
Let's say my task is to merge the two elements across which the search string lies.
So what I expect is this:
vector = "I", "meta", "cow"
Is it possible to do this? Please help.
If you'd like something that matches "meta" but not "taco", this will do the trick:
myFun <- function(vector, word) {
D <- "UnLiKeLyStRiNg"
## Construct a string on which you'll perform regex-search
xx <- paste0(paste0(D, vector, collapse=""), D)
## Construct the regex pattern
start <- paste0("(?<=", D, ")")
mid <- paste0(strsplit(word, "")[[1]], collapse=paste0("(", D, ")?"))
end <- paste0("(?=", D, ")")
pat <- paste0(start, mid, end)
## Use it
strsplit(gsub(pat, word, xx, perl=TRUE), D)[[1]][-1]
}
vector <- c("I", "met", "a", "cow")
myFun(vector, "meta")
# [1] "I" "meta" "cow"
myFun(vector, "taco")
# [1] "I" "met" "a" "cow"
myFun(vector, "Imet")
# [1] "Imet" "a" "cow"
myFun(vector, "Ime")
# [1] "I" "met" "a" "cow"
If only complete elements should merged, you could try this approach:
mergeRegExpr <- function(x, pattern) {
str <- paste(x, sep="", collapse="")
## find starting position of each word
wordStart <- head(cumsum(c(1, nchar(x))), -1)
## look for pattern
rx <- regexpr(pattern=pattern, text=str, fixed=TRUE)
## pos of matching pattern == rx+nchar(pattern)-1
rxEnd <- rx+attr(rx, "match.length")-1
## which vector elements doesn't match pattern
sel <- wordStart < rx | wordStart > rxEnd
## insert merged elements
return(append(x[sel], paste(x[!sel], collapse=""), rx-1))
}
vector <- c("I", "met", "a", "cow")
mergeRegExpr(vector, "meta")
# "I" "meta" "cow"
mergeRegExpr(vector, "acow")
# "I" "met" "acow"
mergeRegExpr(vector, "Imeta")
# "Imeta" "cow"
## partial matching doesn't work
mergeRegExpr(vector, "taco")
# "I" "metacow"
Building on Carl Witthoft's comment, my solution was not with regex, but with basic matching:
# A slightly longer vector
v = c("I", "met", "a", "cow", "today",
"You", "met", "a", "cow", "today")
# Create the combinations of each pair
temp1 = sapply(1:(length(v)-1),
function(x) paste0(v[x], v[x+1]))
# Grab the index of the desired search term
temp2 = which(temp1 %in% "meta")
# The following also works.
# Don't know what's faster/better.
# temp2 = grep("meta", temp1)
# Do some manual substitution and deletion
v[temp2] <- "meta"
v <- v[-(temp2+1)]
I don't think this is an ideal situation at all though.