R: convert text duration ("..d ..h ..m ..s") into seconds - regex

I'm trying to convert the following durations into seconds:
x <- "1005d 16h 09m 57s"
x1 <- "16h 09m 57s"
x2 <- "06h 09m 57s"
x3 <- "09m 57s"
x4 <- "57s"
I've modified the answer from Jthorpe in the post "Convert factor of format Hh Mm Ss to time duration".
days <- as.numeric(gsub('^([0-9]+)d.*$', '\\1', x))
hours <- as.numeric(gsub('^.*([0-9][0-9])h.*$', '\\1', x))
minutes <- as.numeric(gsub('^.*([0-9][0-9])m.*$', '\\1', x))
seconds <- as.numeric(gsub('^.*([0-9][0-9])s.*$', '\\1', x))
duration_seconds <- seconds + 60*minutes + 60*60*hours + 24*60*60*days
However, this only works for x, not for x1-x4. Now, I know I could probably work around the issue with if/else logic, but is there a better way?
Thanks in advance.

We can replace the whitespace (\\s+) with + using gsub, then replace 'd', 'h', 'm' and 's' with the corresponding multipliers using gsubfn, and finally loop through the output and evaluate each string.
library(gsubfn)
v2 <- gsubfn("[a-z]", list(d = "*24*60*60", h = "*60*60", m = "*60",
                           s = "*1"), gsub("\\s+", "+", v1))
unname(sapply(v2, function(x) eval(parse(text=x))))
#[1] 86890197 58197 22197 597 57
data
v1 <- c(x, x1, x2, x3, x4)
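If you'd rather avoid eval(parse(...)), here is a sketch of an alternative of my own (to_seconds is a hypothetical helper, not part of the answer above): extract the number in front of each unit letter with a lookahead and sum the products.
to_seconds <- function(x) {
  unit_secs <- c(d = 24 * 60 * 60, h = 60 * 60, m = 60, s = 1)
  vapply(x, function(one) {
    total <- 0
    for (u in names(unit_secs)) {
      # match a number immediately followed by the unit letter
      hit <- regmatches(one, regexpr(paste0("[0-9]+(?=", u, ")"), one, perl = TRUE))
      if (length(hit) == 1) total <- total + as.numeric(hit) * unit_secs[[u]]
    }
    total
  }, numeric(1), USE.NAMES = FALSE)
}
to_seconds(v1)
# [1] 86890197    58197    22197      597       57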

Use:
ifelse(is.na(your_exp), 0, your_exp)
so that whenever your expression evaluates to NA it becomes 0, and otherwise the value is kept. Note that ifelse() needs all three arguments (test, yes, no); with only two it fails because 'no' is missing.
E.g.:
days <- as.numeric(gsub('^([0-9]+)d.*$', '\\1', x1))
days <- ifelse(is.na(days), 0, days)
hours <- as.numeric(gsub('^.*([0-9][0-9])h.*$', '\\1', x1))
hours <- ifelse(is.na(hours), 0, hours)
minutes <- as.numeric(gsub('^.*([0-9][0-9])m.*$', '\\1', x1))
minutes <- ifelse(is.na(minutes), 0, minutes)
seconds <- as.numeric(gsub('^.*([0-9][0-9])s.*$', '\\1', x1))
seconds <- ifelse(is.na(seconds), 0, seconds)
Output (after duration_seconds <- seconds + 60*minutes + 60*60*hours + 24*60*60*days):
> duration_seconds
[1] 58197
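The four extractions share one shape, so the NA-to-zero idea can also be wrapped in a small helper (a sketch; the helper name and the anchored day pattern are my own):
extract_or_zero <- function(x, pattern) {
  # gsub() returns the input unchanged when the pattern does not match,
  # so as.numeric() yields NA there; map those NAs to 0
  value <- suppressWarnings(as.numeric(gsub(pattern, '\\1', x)))
  ifelse(is.na(value), 0, value)
}
extract_or_zero(x1, '^([0-9]+)d.*$') * 24 * 60 * 60 +
  extract_or_zero(x1, '^.*([0-9][0-9])h.*$') * 60 * 60 +
  extract_or_zero(x1, '^.*([0-9][0-9])m.*$') * 60 +
  extract_or_zero(x1, '^.*([0-9][0-9])s.*$')
# [1] 58197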

Splitting string columns FAST in R

I have a data frame with 107 columns and 745000 rows (much bigger than in my example).
The case is that I have character columns which I want to separate, because they seem to contain a type-like suffix at the end of each sequence.
I want to separate these type-ending parts into new columns.
I have made my own solution, but it seems to be far too slow for iterating through all 745000 rows 53 times.
So I embed my solution in the following code, with some arbitrary data:
set.seed(1)
code_1 <- paste0(round(runif(5000, 100000, 999999)), "_", round(runif(1000, 1, 15)))
code_2 <- sample(c(paste0(round(runif(10, 100000, 999999)), "_", round(runif(10, 1, 15))), NA), 5000, replace = TRUE)
code_3 <- sample(c(paste0(round(runif(3, 100000, 999999)), "_", round(runif(3, 1, 15))), NA), 5000, replace = TRUE)
code_4 <- sample(c(paste0(round(runif(1, 100000, 999999)), "_", round(runif(1, 1, 15))), NA), 5000, replace = TRUE)
code_type_1 <- rep(NA, 5000)
code_type_2 <- rep(NA, 5000)
code_type_3 <- rep(NA, 5000)
code_type_4 <- rep(NA, 5000)
df <- data.frame(cbind(code_1,
                       code_2,
                       code_3,
                       code_4,
                       code_type_1,
                       code_type_2,
                       code_type_3,
                       code_type_4),
                 stringsAsFactors = FALSE)
df_new <- data.frame(code_1 = character(),
                     code_2 = character(),
                     code_3 = character(),
                     code_4 = character(),
                     code_type_1 = character(),
                     code_type_2 = character(),
                     code_type_3 = character(),
                     code_type_4 = character(),
                     stringsAsFactors = FALSE)
for (i in 1:4) {
  i_t <- i + 4
  temp <- strsplit(df[, c(i)], "[_]")
  for (j in 1:nrow(df)) {
    df_new[c(j), c(i)] <- unlist(temp[j])[1]
    df_new[c(j), c(i_t)] <- ifelse(is.na(unlist(temp[j])[1]), NA, unlist(temp[j])[2])
  }
  print(i)
}
for (i in 1:8) {
  df_new[, c(i)] <- factor(df_new[, c(i)])
}
Do anyone have some ideas how to speed things up here?
First we pre-allocate the results data.frame to the desired final length. This is very important; see The R Inferno, Circle 2. Then we vectorize the inner loop. We also use fixed = TRUE and avoid the regex in strsplit.
system.time({
  df_new1 <- data.frame(code_1 = character(nrow(df)),
                        code_2 = character(nrow(df)),
                        code_3 = character(nrow(df)),
                        code_4 = character(nrow(df)),
                        code_type_1 = character(nrow(df)),
                        code_type_2 = character(nrow(df)),
                        code_type_3 = character(nrow(df)),
                        code_type_4 = character(nrow(df)),
                        stringsAsFactors = FALSE)
  for (i in 1:4) {
    i_t <- i + 4
    temp <- do.call(rbind, strsplit(df[, c(i)], "_", fixed = TRUE))
    df_new1[, i] <- temp[, 1]
    df_new1[, i_t] <- ifelse(is.na(temp[, 1]), NA, temp[, 2])
  }
  df_new1[] <- lapply(df_new1, factor)
})
#    user  system elapsed
#   0.029   0.000   0.029
all.equal(df_new, df_new1)
#[1] TRUE
Of course, there are ways to make this even faster, but this is close to your original approach and should be sufficient.
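If you need still more speed, one option (a sketch of my own, not part of the answer above) is data.table::tstrsplit, which splits each column in a single vectorized call and handles the NA entries for free:
library(data.table)
dt <- as.data.table(df[, 1:4])
for (col in c("code_1", "code_2", "code_3", "code_4")) {
  # tstrsplit() returns the split pieces as a list of two vectors,
  # assigned here to the code_* and code_type_* columns in one step
  dt[, c(col, sub("code_", "code_type_", col)) :=
       tstrsplit(get(col), "_", fixed = TRUE)]
}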
Here's another way, using gsub inside a custom function in combination with purrr::dmap(), which works like lapply but returns a data.frame instead of a list.
library(purrr)
# Define function which gets rid of everything after and including "_"
replace01 <- function(df, ptrn = "_.*")
  dmap(df[, 1:4], gsub, pattern = ptrn, replacement = "")
# Because "pattern" is an argument, we can change it to get the 2nd part, then cbind()
test <- cbind(replace01(df),
              replace01(df, ptrn = ".*_"))
Note that the output here has character columns; you can always convert them to factor if you like.
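For instance (a one-line sketch, reusing the lapply idiom from the first answer):
test[] <- lapply(test, factor)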
Another possibility:
library(stringi)
setNames(do.call(rbind.data.frame, lapply(1:nrow(df), function(i) {
  x <- stri_split_fixed(df[i, 1:4], "_", 2, simplify = TRUE)
  y <- c(x[, 1], x[, 2])
  y[y == ""] <- NA
  y
})), colnames(df)) -> df_new
or
setNames(do.call(rbind.data.frame, lapply(1:nrow(df), function(i) {
  x <- stri_split_fixed(df[i, 1:4], "_", 2, simplify = TRUE)
  c(x[, 1], x[, 2])
})), colnames(df)) -> df_new
df_new[df_new == ""] <- NA
df_new
which is marginally faster:
Unit: milliseconds
     expr      min       lq     mean   median       uq      max neval cld
 na_after 669.8357 718.1301 724.8803 723.5521 732.9998 790.1405    10  a
 na_inner 719.3362 738.1569 766.4267 762.1594 791.6198 825.0269    10   b

Inserting character in between words - R Language

I have a dataset in which one column has values in the format [A-Z][A-Z][0-1][0-9][0-1][0-1][0-1][0-9][0-9], i.e. AC1200019.
Now I want to convert this to the format [A-Z][A-Z]-[0-1][0-9]-[0-1][0-1][0-1]-[0-9][0-9], i.e. AC-12-000-19.
Try this pattern:
([A-Z][A-Z])([0-1][0-9])([0-1][0-1][0-1])([0-9][0-9])
and replace with $1-$2-$3-$4 or \\1-\\2-\\3-\\4. See the demo:
https://regex101.com/r/uK9cD8/5
Try
gsub('^([A-Z]{2})([0-1][0-9])([0-1]{3})([0-9]{2})', '\\1-\\2-\\3-\\4', str1)
#[1] "AC-12-000-19"
data
str1 <- 'AC1200019'
Assuming the entire column has the same number of characters, here is a simple version.
library(stringr)
x <- data.frame(X1 = c("AC1510018", "AC1200019", "BT1801007"))
paste(str_sub(x$X1, 1, 2), str_sub(x$X1, 3, 4),
      str_sub(x$X1, 5, 7), str_sub(x$X1, 8, 9), sep = "-")
I like the dplyr suite, so here is a version using dplyr and tidyr:
library(dplyr)
library(tidyr)
x %>%
  separate(X1, into = c("X2", "X3", "X4", "X5"), sep = c(2, 4, 7)) %>%
  unite("X1", X2, X3, X4, X5, sep = "-")
or
x %>%
  transmute(X2 = paste(str_sub(X1, 1, 2), str_sub(X1, 3, 4),
                       str_sub(X1, 5, 7), str_sub(X1, 8, 9), sep = "-"))
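For completeness, a base-R sketch of the same fixed-width idea, with no extra packages (assuming, again, that every value is exactly 9 characters):
sub("^(.{2})(.{2})(.{3})(.{2})$", "\\1-\\2-\\3-\\4", x$X1)
# [1] "AC-15-100-18" "AC-12-000-19" "BT-18-010-07"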

Subset all 3 digit numbers and collapse them with a separator in a data frame. R

I'm formatting a data set so each entry has the adegenet format for codominant markers, such as:
Loci1
###/###
208/210
200/204
198/208
where the # represents any digit (the number is an allele size in base pairs). My data has some homozygous entries (3-digit integers with no separator) that have the form of:
Loci1
###
208
198
I intend to paste the 3-digit string to itself with sep = '/' to produce the first format. I've tried to use grep to subset these homozygous entries by finding everything that is not ###/### and negating the match, such as:
a <- grep('\\b\\d{3}?[/]\\d{3}', score$Loci1, value = T) # subset all ###/###
score[!(a %in% 1:nrow(score$Loci1)), ] # works, but only on vectors...
After the subset I could paste. The problem arises when I apply this to a data frame: grep seems to treat the data frame as a list (which, in part, it is) and returns columns that have a match.
So, in short: how can I go from ### to ###/### in a data frame?
Self-contained example of the data:
score2 <- NULL
set.seed(9)
Loci1 <- NULL
Loci2 <- NULL
Loci3 <- NULL
for (i in 1:5) Loci1 <- append(Loci1, paste(sample(seq(from = 230, to=330, by=3), 2, replace = F), collapse = '/'))
for (i in 1:5) Loci2 <- append(Loci2, paste(sample(seq(from = 230, to=330, by=3), 2, replace = F), collapse = '/'))
for (i in 1:5) Loci3 <- append(Loci3, paste(sample(seq(from = 230, to=330, by=3), 2, replace = F), collapse = '/'))
score2 <- data.frame(Loci1, Loci2, Loci3, stringsAsFactors = F)
score2[2,3] <- strsplit(score2[2,3], split = '/')[1]
score2[5,2] <- strsplit(score2[3,3], split = '/')[1]
score2[1,1] <- strsplit(score2[1,1], split = '/')[1]
score2[c(1, 4),c(2,3)] <- NA
score2
You could just replace the 3 digit items with the separator and a copy:
sub("^(...)$", "\\1/\\1", Loci1)
Use lapply with an anonymous function:
data.frame(lapply(score2, function(x) sub("^(...)$", "\\1/\\1", x)))
Loci1 Loci2 Loci3
1 251/251 <NA> <NA>
2 251/329 320/257 260/260
3 275/242 278/329 281/320
4 269/266 <NA> <NA>
5 296/326 281/281 326/314
(Not sure what the "paste-part" was supposed to refer to, but I think this was the intent of your question)
If the numeric values could have a varying number of digits, then use a pattern argument like "^([0-9]{1,9})$".
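For example (a quick illustration, not part of the answer above):
sub("^([0-9]{1,9})$", "\\1/\\1", c("208", "54", "1024"))
# [1] "208/208"    "54/54"      "1024/1024"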
An option using grep/paste,
m1 <- as.matrix(score2)
indx <- grep('^...$', m1)
m1[indx] <- paste(m1[indx], m1[indx], sep="/")
as.data.frame(m1)
# Loci1 Loci2 Loci3
#1 251/251 <NA> <NA>
#2 251/329 320/257 260/260
#3 275/242 278/329 281/320
#4 269/266 <NA> <NA>
#5 296/326 281/281 326/314
Or without converting to matrix, this can be done using lapply
score2[] <- lapply(score2, function(x) ifelse(grepl('^...$', x),
                                              paste(x, x, sep = "/"), x))

R: Fast string split on first delimiter occurence

I have a file with ~ 40 million rows that I need to split based on the first comma delimiter.
The following using the stringr function str_split_fixed works well but is very slow.
library(data.table)
library(stringr)
df1 <- data.frame(id = 1:1000, letter1 = rep(letters[sample(1:25,1000, replace = T)], 40))
df1$combCol1 <- paste(df1$id, ',',df1$letter1, sep = '')
df1$combCol2 <- paste(df1$combCol1, ',', df1$combCol1, sep = '')
st1 <- str_split_fixed(df1$combCol2, ',', 2)
Any suggestions for a faster way to do this?
Update
The stri_split_fixed function in more recent versions of "stringi" has a simplify argument that can be set to TRUE to return a matrix. Thus, the updated solution would be:
stri_split_fixed(df1$combCol2, ",", 2, simplify = TRUE)
Original answer (with updated benchmarks)
If you are comfortable with the "stringr" syntax and don't want to veer too far from it, but you also want to benefit from a speed boost, try the "stringi" package instead:
library(stringr)
library(stringi)
system.time(temp1 <- str_split_fixed(df1$combCol2, ',', 2))
# user system elapsed
# 3.25 0.00 3.25
system.time(temp2a <- do.call(rbind, stri_split_fixed(df1$combCol2, ",", 2)))
# user system elapsed
# 0.04 0.00 0.05
system.time(temp2b <- stri_split_fixed(df1$combCol2, ",", 2, simplify = TRUE))
# user system elapsed
# 0.01 0.00 0.01
Most of the "stringr" functions have "stringi" parallels, but as can be seen from this example, the "stringi" output requires one extra step of binding the data to create the output as a matrix instead of as a list.
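A toy example illustrates the two output shapes (a small sketch):
stri_split_fixed(c("1,a,a", "2,b,b"), ",", 2)
# [[1]]
# [1] "1"   "a,a"
#
# [[2]]
# [1] "2"   "b,b"
stri_split_fixed(c("1,a,a", "2,b,b"), ",", 2, simplify = TRUE)
#      [,1] [,2]
# [1,] "1"  "a,a"
# [2,] "2"  "b,b"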
Here's how it compares with @RichardScriven's suggestion in the comments:
fun1a <- function() do.call(rbind, stri_split_fixed(df1$combCol2, ",", 2))
fun1b <- function() stri_split_fixed(df1$combCol2, ",", 2, simplify = TRUE)
fun2 <- function() {
  do.call(rbind, regmatches(df1$combCol2, regexpr(",", df1$combCol2),
                            invert = TRUE))
}
library(microbenchmark)
microbenchmark(fun1a(), fun1b(), fun2(), times = 10)
# Unit: milliseconds
#     expr       min        lq      mean    median        uq       max neval
#  fun1a()  42.72647  46.35848  59.56948  51.94796  69.29920  98.46330    10
#  fun1b()  17.55183  18.59337  20.09049  18.84907  22.09419  26.85343    10
#   fun2() 370.82055 404.23115 434.62582 439.54923 476.02889 480.97912    10

R: What's the easiest way to print out pairs of values from a data.frame?

I have a data.frame:
df<-data.frame(a=c("x","x","y","y"),b=c(1,2,3,4))
> df
a b
1 x 1
2 x 2
3 y 3
4 y 4
What's the easiest way to print out each pair of values as a list of strings like this:
"x1", "x2", "y1", "y2"
apply(df, 1, paste, collapse="")
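which for the example data frame returns:
# [1] "x1" "x2" "y3" "y4"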
with(df, paste(a, b, sep=""))
And this should be faster than apply.
About timing
For 10000 rows we get:
df <- data.frame(
  a = sample(c("x", "y"), 10000, replace = TRUE),
  b = sample(1L:4L, 10000, replace = TRUE)
)
N = 100
mean(replicate(N, system.time( with(df, paste(a, b, sep="")) )["elapsed"]), trim=0.05)
# 0.005778
mean(replicate(N, system.time( apply(df, 1, paste, collapse="") )["elapsed"]), trim=0.05)
# 0.09611
So the increase in speed becomes visible at a few thousand rows.
That's because Shane's solution calls paste separately for each row, i.e. there are nrow(df) calls to paste, while my solution makes a single call.
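The same comparison can also be written with the microbenchmark package (a sketch; exact numbers will vary by machine), consistent with the benchmarks used elsewhere on this page:
library(microbenchmark)
microbenchmark(
  apply_row  = apply(df, 1, paste, collapse = ""),
  vectorized = with(df, paste(a, b, sep = "")),
  times = 100
)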
Also, you can use the sqldf library:
library("sqldf")
df<-data.frame(a=c("x","x","y","y"),b=c(1,2,3,4))
result <- sqldf("SELECT a || cast(cast(b as integer) as text) as concat FROM df")
The double cast converts the numeric column b (which SQLite would otherwise render as "1.0") to an integer and then to text, so the concatenation yields "x1" rather than "x1.0". You will get the following result:
concat
1 x1
2 x2
3 y3
4 y4