Related
I'm trying to make the following call to outer() substantially faster. Parallelizing via foreach is still prohibitively slow, so I'd like to attempt calling this in C++ using Rcpp but would love to hear any faster alternative.
Given a matrix mat and a list of matrix colnames col.list I am summarizing mat as such.
mycall <- function(mat, col.list) {
outer(
rownames(mat),
col.list,
Vectorize(function(x,y) {
mean(mat[x,y])
})
)
}
For instance:
set.seed(123)
mat <- matrix(rnorm(100),nrow=10)
rownames(mat) <- letters[1:10]
colnames(mat) <- LETTERS[1:10]
mat
A B C D E F G H I J
a -0.56047565 1.2240818 -1.0678237 0.42646422 -0.69470698 0.25331851 0.37963948 -0.4910312 0.005764186 0.9935039
b -0.23017749 0.3598138 -0.2179749 -0.29507148 -0.20791728 -0.02854676 -0.50232345 -2.3091689 0.385280401 0.5483970
c 1.55870831 0.4007715 -1.0260044 0.89512566 -1.26539635 -0.04287046 -0.33320738 1.0057385 -0.370660032 0.2387317
d 0.07050839 0.1106827 -0.7288912 0.87813349 2.16895597 1.36860228 -1.01857538 -0.7092008 0.644376549 -0.6279061
e 0.12928774 -0.5558411 -0.6250393 0.82158108 1.20796200 -0.22577099 -1.07179123 -0.6880086 -0.220486562 1.3606524
f 1.71506499 1.7869131 -1.6866933 0.68864025 -1.12310858 1.51647060 0.30352864 1.0255714 0.331781964 -0.6002596
g 0.46091621 0.4978505 0.8377870 0.55391765 -0.40288484 -1.54875280 0.44820978 -0.2847730 1.096839013 2.1873330
h -1.26506123 -1.9666172 0.1533731 -0.06191171 -0.46665535 0.58461375 0.05300423 -1.2207177 0.435181491 1.5326106
i -0.68685285 0.7013559 -1.1381369 -0.30596266 0.77996512 0.12385424 0.92226747 0.1813035 -0.325931586 -0.2357004
j -0.44566197 -0.4727914 1.2538149 -0.38047100 -0.08336907 0.21594157 2.05008469 -0.1388914 1.148807618 -1.0264209
col.list <- replicate(5, sample(colnames(mat),sample(10,1)), simplify = F)
col.list
[[1]]
[1] "I" "H" "F" "C"
[[2]]
[1] "H" "C" "E" "D"
[[3]]
[1] "F" "A" "B" "C"
[[4]]
[1] "I" "G" "H" "F"
[[5]]
[1] "B" "F" "A" "D" "J"
mycall(mat, col.list)
[,1] [,2] [,3] [,4] [,5]
[1,] -0.32494304 -0.45677441 -0.03772476 0.03692275 0.46737855
[2,] -0.54260254 -0.75753314 -0.02922133 -0.61368967 0.07088301
[3,] -0.10844910 -0.09763415 0.22265121 0.06475016 0.61009334
[4,] 0.14372171 0.40224937 0.20522554 0.07130067 0.36000416
[5,] -0.43982636 0.17912380 -0.31934091 -0.55151435 0.30598183
[6,] 0.29678266 -0.27389757 0.83293885 0.79433814 1.02136588
[7,] 0.02527506 0.17601171 0.06195023 -0.07211925 0.43025291
[8,] -0.01188734 -0.39897791 -0.62342288 -0.03697956 -0.23527315
[9,] -0.28972770 -0.12070775 -0.24994491 0.22537340 -0.08066115
[10,] 0.61991819 0.16277087 0.13782578 0.81898563 -0.42188074
You could try:
sapply(col.list, function(v) rowMeans(mat[, v]))
I suspect the reason your solution is slow is Vectorize: it's a nice way to transform a scalar function into a vectorized function, but it has a huge cost: since it's based on mapply, it will call the function on each element, one by one. That is, one call to mean for each entry. If the outer result is large, that's going to be very costly. Instead, with the solution above, the code is at least vectorized in one direction, thanks to rowMeans.
I am attempting to write a regular expression that replaces each element in this matrix with only the two numbers after the first colon before and after the comma. There is also "./.:.:.:.:." which I would like to change to "0,0".
head(data)
Offspring-95_CAATCG Offspring-96_AAACGG Offspring-97_ACTCTT
[1,] "./.:1,7:8:18:262,0,18" "0/1:18,4:21:56:56,0,591" "0/0:27,0:27:78:0,78,723"
[2,] "0/0:49,0:49:99:0,147,1891" "0/0:107,0:107:99:0,319,4185" "1/1:0,22:22:66:902,66,0"
[3,] "0/0:42,0:42:99:0,126,1324" "./.:.:.:.:." "0/1:35,88:117:99:3152,0,718"
I have tried:
try <- gsub("\\:[0-9]*\\,[0-9]*\\:", \\1, data)
The desired output is:
Offspring-95_CAATCG Offspring-96_AAACGG Offspring-97_ACTCTT
[1,] "1,7" "18,4" "27,0"
[2,] "49,0" "107,0" "0,22"
[3,] "42,0" "0,0" "35,88"
Thanks,
This could be done by
sub('[^:]+:([^:]+).*', '\\1', data)
# Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Visualization
[^:]+:([^:]+).*
Debuggex Demo
Or using regmatches from base R
data[] <- regmatches(data, regexpr('(?<=:)[0-9]+,[0-9]+', data, perl=TRUE))
Visualization
(?<=:)[0-9]+,[0-9]+
Debuggex Demo
The above regex can be used with stringr or stringi (for big dataset)
library(stringr)
`dim<-`(str_extract(data, perl('(?<=:)[0-9]+,[0-9]+')), dim(data))
# [,1] [,2] [,3]
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Or
library(stringi)
`dim<-`(stri_extract(data, regex='(?<=:)[0-9]+,[0-9]+'), dim(data))
# [,1] [,2] [,3]
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Update
data1[] <- sub('[^:]+:([^:]+).*', '\\1', data1)
data1[!grepl(',', data1)] <- '0,0'
data1
# Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "42,0" "0,0" "35,88"
data
data <- structure(c("./.:1,7:8:18:262,0,18", "0/0:49,0:49:99:0,147,1891",
"0/1:9,4:13:99:129,0,334", "0/1:18,4:21:56:56,0,591",
"0/0:107,0:107:99:0,319,4185",
"0/1:33,13:44:99:317,0,1150", "0/0:27,0:27:78:0,78,723",
"1/1:0,22:22:66:902,66,0", "0/0:13,0:13:39:0,39,528"), .Dim = c(3L, 3L),
.Dimnames = list(NULL, c("Offspring.95_CAATCG", "Offspring.96_AAACGG",
"Offspring.97_ACTCTT")))
data1 <- structure(c("./.:1,7:8:18:262,0,18", "0/0:49,0:49:99:0,147,1891",
"0/0:42,0:42:99:0,126,1324", "0/1:18,4:21:56:56,0,591",
"0/0:107,0:107:99:0,319,4185",
"./.:.:.:.:.", "0/0:27,0:27:78:0,78,723", "1/1:0,22:22:66:902,66,0",
"0/1:35,88:117:99:3152,0,718"), .Dim = c(3L, 3L), .Dimnames = list(
NULL, c("Offspring.95_CAATCG", "Offspring.96_AAACGG", "Offspring.97_ACTCTT"
)))
Not regex subbing but probably pretty darn quick.
apply(data, 2, function(x) sapply(strsplit(x, ":"), "[[", 2))
## Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
## [1,] "1,7" "18,4" "27,0"
## [2,] "49,0" "107,0" "0,22"
## [3,] "9,4" "33,13" "13,0"
Try this:
out<-list()
for(i in seq(ncol(data)))
out[[i]]<-gsub('[^:]*:([0-9]+,[0-9]+).*','\\1',data[,i])
out<-as.data.frame(out)
dimnames(out)<-dimnames(data)
out
I've got several character vectors that I want to paste together. The problem is that some of the character vectors are pretty sparse. So, when I paste them, I get NA's and extra separators. How can I efficiently remove the NA's and extra separators while still joining the vectors?
I've got something like:
n1 = c("goats", "goats", "spatula", NA, "rectitude", "boink")
n2 = c("forever", NA, "...yes", NA, NA, NA)
cbind(paste(n1,n2, sep=", "))
which gives me:
[1,] "goats, forever"
[2,] "goats, NA"
[3,] "spatula, ...yes"
[4,] "NA, NA"
[5,] "rectitude, NA"
[6,] "boink, NA"
but I want:
[1,] "goats, forever"
[2,] "goats"
[3,] "spatula, ...yes"
[4,] <NA>
[5,] "rectitude"
[6,] "boink"
There are clearly inefficient and tedious ways of doing this with a lot of regular expressions and string splitting. But anything quick/simple?
Not a lot of regex, just 1 line and 1 more to replace NA
n1 <- c("goats", "goats", "spatula", NA, "rectitude", "boink")
n2 <- c("forever", NA, "...yes", NA, NA, NA)
n3 <- cbind(paste(n1,n2, sep=", "))
n3 <- gsub("(, )?NA", "", n3)
n3[n3==""] <- NA
Code (no regex or string splitting):
vec <- apply(cbind(n1,n2),1,function(x)
ifelse(all(is.na(x)), NA, paste(na.omit(x),collapse=", ")) )
Result:
> vec # as a vector
[1] "goats, forever" "goats" "spatula, ...yes" NA "rectitude" "boink"
> cbind(vec) # as a matrix
vec
[1,] "goats, forever"
[2,] "goats"
[3,] "spatula, ...yes"
[4,] NA
[5,] "rectitude"
[6,] "boink"
Here's an option using the qdap package (though the other options seem better to me as they use base R):
library(qdap)
gsub(" ", ", ", blank2NA(Trim(gsub("NA", "", paste(n1, n2)))))
## [1] "goats, forever" "goats" "spatula, ...yes" NA
## [5] "rectitude" "boink"
Or...
## gsub(" ", ", ", blank2NA(gsub("NA| NA", "", paste(n1, n2))))
I have a question about lists in R. I have a list within 16 list containing a list with variables like this:
x
[[1]]
A 1 3
B 4 2
[[2]]
C 23 4
D 9 22
E 4 54
The A,B,C and D are rownames in the lists. Now I want to create a file that paste only the rownames in a dataframe. Each row in the dataframe contains 1 list in the total list.
A B
C D E
Can anyone help me with this? I thought maybe someting like do.call(rbind, rownames(x))
EDIT! 05-08-2011
Is there a way to save the rownames list by list? So in the end there are no NA's in the data and the data is unequal?
Thank you all!
Making an assumption about the nature of x, if we use:
x <- list(matrix(c(1,4,3,2), ncol = 2,
dimnames = list(c("A","B"), NULL)),
matrix(c(23,9,4,4,22,54), ncol = 2,
dimnames = list(c("C","D","E"), NULL)))
which gives:
> x
[[1]]
[,1] [,2]
A 1 3
B 4 2
[[2]]
[,1] [,2]
C 23 4
D 9 22
E 4 54
Then
> lapply(x, rownames)
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
seems the only plausible answer. Unless we pad the ("A","B") vector with something, we can't use a matrix or a data frame because the component lengths do not match. Hence one of the reasons the do.call() idea fails:
> do.call(rbind, rownames(x))
Error in do.call(rbind, rownames(x)) : second argument must be a list
> do.call(rbind, lapply(x, rownames))
[,1] [,2] [,3]
[1,] "A" "B" "A"
[2,] "C" "D" "E"
Warning message:
In function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 1)
To pad the result with NA and get a data frame, we could do:
out <- lapply(x, rownames)
foo <- function(x, max, repl = NA) {
if(length(x) == max)
out <- x
else {
out <- rep(repl, max)
out[seq_along(x)] <- x
}
out
}
out <- lapply(out, foo, max = max(sapply(out, length)))
(out <- do.call(rbind, out))
The last line gives:
> (out <- do.call(rbind, out))
[,1] [,2] [,3]
[1,] "A" "B" NA
[2,] "C" "D" "E"
If you want that nicely printed, then
> print(format(out), quote = FALSE)
[,1] [,2] [,3]
[1,] A B NA
[2,] C D E
is an option inside R.
This should do it:
lapply(x, function(curdfr){paste(rownames(curdfr))})
This results in a vector with each element the space-separated rownames of the elements of the list.
Your sample data:
x <- list(
matrix(c(1,4,3,2), nrow = 2, dimnames = list(LETTERS[1:2])),
matrix(c(23,9,4,4,22,54), nrow = 3, dimnames = list(LETTERS[3:5]))
)
What you want:
unlist(lapply(x, rownames))
Or, if you are keen on do.call, then this is equivalent:
do.call(c, lapply(x, rownames))
In R, is it possible to extract group capture from a regular expression match? As far as I can tell, none of grep, grepl, regexpr, gregexpr, sub, or gsub return the group captures.
I need to extract key-value pairs from strings that are encoded thus:
\((.*?) :: (0\.[0-9]+)\)
I can always just do multiple full-match greps, or do some outside (non-R) processing, but I was hoping I can do it all within R. Is there's a function or a package that provides such a function to do this?
str_match(), from the stringr package, will do this. It returns a character matrix with one column for each group in the match (and one for the whole match):
> s = c("(sometext :: 0.1231313213)", "(moretext :: 0.111222)")
> str_match(s, "\\((.*?) :: (0\\.[0-9]+)\\)")
[,1] [,2] [,3]
[1,] "(sometext :: 0.1231313213)" "sometext" "0.1231313213"
[2,] "(moretext :: 0.111222)" "moretext" "0.111222"
gsub does this, from your example:
gsub("\\((.*?) :: (0\\.[0-9]+)\\)","\\1 \\2", "(sometext :: 0.1231313213)")
[1] "sometext 0.1231313213"
you need to double escape the \s in the quotes then they work for the regex.
Hope this helps.
Try regmatches() and regexec():
regmatches("(sometext :: 0.1231313213)",regexec("\\((.*?) :: (0\\.[0-9]+)\\)","(sometext :: 0.1231313213)"))
[[1]]
[1] "(sometext :: 0.1231313213)" "sometext" "0.1231313213"
gsub() can do this and return only the capture group:
However, in order for this to work, you must explicitly select elements outside your capture group as mentioned in the gsub() help.
(...) elements of character vectors 'x' which are not substituted will be returned unchanged.
So if your text to be selected lies in the middle of some string, adding .* before and after the capture group should allow you to only return it.
gsub(".*\\((.*?) :: (0\\.[0-9]+)\\).*","\\1 \\2", "(sometext :: 0.1231313213)")
[1] "sometext 0.1231313213"
Solution with strcapture from the utils:
x <- c("key1 :: 0.01",
"key2 :: 0.02")
strcapture(pattern = "(.*) :: (0\\.[0-9]+)",
x = x,
proto = list(key = character(), value = double()))
#> key value
#> 1 key1 0.01
#> 2 key2 0.02
This is how I ended up working around this problem. I used two separate regexes to match the first and second capture groups and run two gregexpr calls, then pull out the matched substrings:
regex.string <- "(?<=\\().*?(?= :: )"
regex.number <- "(?<= :: )\\d\\.\\d+"
match.string <- gregexpr(regex.string, str, perl=T)[[1]]
match.number <- gregexpr(regex.number, str, perl=T)[[1]]
strings <- mapply(function (start, len) substr(str, start, start+len-1),
match.string,
attr(match.string, "match.length"))
numbers <- mapply(function (start, len) as.numeric(substr(str, start, start+len-1)),
match.number,
attr(match.number, "match.length"))
I like perl compatible regular expressions. Probably someone else does too...
Here is a function that does perl compatible regular expressions and matches the functionality of functions in other languages that I am used to:
regexpr_perl <- function(expr, str) {
match <- regexpr(expr, str, perl=T)
matches <- character(0)
if (attr(match, 'match.length') >= 0) {
capture_start <- attr(match, 'capture.start')
capture_length <- attr(match, 'capture.length')
total_matches <- 1 + length(capture_start)
matches <- character(total_matches)
matches[1] <- substr(str, match, match + attr(match, 'match.length') - 1)
if (length(capture_start) > 1) {
for (i in 1:length(capture_start)) {
matches[i + 1] <- substr(str, capture_start[[i]], capture_start[[i]] + capture_length[[i]] - 1)
}
}
}
matches
}
As suggested in the stringr package, this can be achieved using either str_match() or str_extract().
Adapted from the manual:
library(stringr)
strings <- c(" 219 733 8965", "329-293-8753 ", "banana",
"239 923 8115 and 842 566 4692",
"Work: 579-499-7527", "$1000",
"Home: 543.355.3679")
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
Extracting and combining our groups:
str_extract_all(strings, phone, simplify=T)
# [,1] [,2]
# [1,] "219 733 8965" ""
# [2,] "329-293-8753" ""
# [3,] "" ""
# [4,] "239 923 8115" "842 566 4692"
# [5,] "579-499-7527" ""
# [6,] "" ""
# [7,] "543.355.3679" ""
Indicating groups with an output matrix (we're interested in columns 2+):
str_match_all(strings, phone)
# [[1]]
# [,1] [,2] [,3] [,4]
# [1,] "219 733 8965" "219" "733" "8965"
#
# [[2]]
# [,1] [,2] [,3] [,4]
# [1,] "329-293-8753" "329" "293" "8753"
#
# [[3]]
# [,1] [,2] [,3] [,4]
#
# [[4]]
# [,1] [,2] [,3] [,4]
# [1,] "239 923 8115" "239" "923" "8115"
# [2,] "842 566 4692" "842" "566" "4692"
#
# [[5]]
# [,1] [,2] [,3] [,4]
# [1,] "579-499-7527" "579" "499" "7527"
#
# [[6]]
# [,1] [,2] [,3] [,4]
#
# [[7]]
# [,1] [,2] [,3] [,4]
# [1,] "543.355.3679" "543" "355" "3679"
This can be done using the package unglue, taking the example from the selected answer:
# install.packages("unglue")
library(unglue)
s <- c("(sometext :: 0.1231313213)", "(moretext :: 0.111222)")
unglue_data(s, "({x} :: {y})")
#> x y
#> 1 sometext 0.1231313213
#> 2 moretext 0.111222
Or starting from a data frame
df <- data.frame(col = s)
unglue_unnest(df, col, "({x} :: {y})",remove = FALSE)
#> col x y
#> 1 (sometext :: 0.1231313213) sometext 0.1231313213
#> 2 (moretext :: 0.111222) moretext 0.111222
you can get the raw regex from the unglue pattern, optionally with named capture :
unglue_regex("({x} :: {y})")
#> ({x} :: {y})
#> "^\\((.*?) :: (.*?)\\)$"
unglue_regex("({x} :: {y})",named_capture = TRUE)
#> ({x} :: {y})
#> "^\\((?<x>.*?) :: (?<y>.*?)\\)$"
More info : https://github.com/moodymudskipper/unglue/blob/master/README.md