I am trying to apply nested foreach loops to a list. With nested for loops my code works, but when I try to use foreach loops I do not get the full results (in a list), only a few values.
This is my nested for loops code:
library(sn)
library(mnormt)
library(mokken)
library(polycor)
library(foreach)
library(parallel)
library(doParallel) # provides registerDoParallel(), used below
data("DS14")
data<-DS14[,3:5] # for testing I only use 3 variables
source("C:/Users/.../code to apply function fit_skewnorm (Kolbe et al., 2021).R")
# Kolbe et al. for reference: https://doi.org/10.3390/psych3040037
# see Appendix B
allresults_skew <- replicate(ncol(data)-1, matrix(NA,ncol(data),9), simplify = FALSE)
for(p in 1:ncol(data)){
for(q in 2:ncol(data)){
if(q<=p){
next}
tryCatch({ # a function to continue with loop in case of errors
obsn = table(data[,p], data[,q])
ncats1 = nrow(obsn)
ncats2 = ncol(obsn)
ntot = sum(obsn)
obsp = obsn/ntot
proportions2 = matrix(colSums(obsp), 1, ncats2)
proportions1 = matrix(rowSums(obsp), ncats1 , 1)
premultiplier = matrix(0, ncats1, ncats1)
for(l in 1:ncats1)for(m in 1:l)premultiplier[l,m] = 1
postmultiplier = matrix(0, ncats2, ncats2)
for(l in 1:ncats2)for(m in l:ncats2)postmultiplier[l,m] = 1
cumulprops2 = proportions2 %*% postmultiplier
cumulprops1 = premultiplier %*% proportions1
nthresholds1 = ncats1 - 1
nthresholds2 = ncats2 - 1
thresholds1 = matrix(0, 1, nthresholds1)
for(l in 1:nthresholds1)thresholds1[l] = qnorm(cumulprops1[l])
thresholds2 = matrix(0, 1, nthresholds2)
for(l in 1:nthresholds2)thresholds2[l] = qnorm(cumulprops2[l])
pcorr = polycor::polychor(obsn)
results_fit = fit_skewnorm(c("th1" = thresholds1, "th2" = thresholds2, "corr" = pcorr, "alpha" = c(2 ,2)))
allresults_skew[[p]][q,1] <- p
allresults_skew[[p]][q,2] <- q
allresults_skew[[p]][q,3] <- results_fit[,1]
allresults_skew[[p]][q,4] <- results_fit[,2]
allresults_skew[[p]][q,5] <- results_fit[,3]
allresults_skew[[p]][q,6] <- results_fit[,4]
allresults_skew[[p]][q,7] <- results_fit[,5]
allresults_skew[[p]][q,8] <- results_fit[,6]
allresults_skew[[p]][q,9] <- results_fit[,7]
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) # part of tryCatch
}
}
Then, allresults_skew is:
[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] NA NA NA NA NA NA NA NA NA
[2,] 1 2 19.97874 13 0.095741675130554 0.2705112 0 1.4656923 0.7528304
[3,] 1 3 65.49704 13 0.000000005354567 0.8426818 0 0.2512463 2.2963329
[[2]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] NA NA NA NA NA NA NA NA NA
[2,] NA NA NA NA NA NA NA NA NA
[3,] 2 3 31.14632 13 0.003209404 0.2753952 0 0.7247398 0.5957852
My current nested foreach loops look like this:
allresults_skew <- replicate(ncol(data)-1, matrix(NA,ncol(data),9), simplify = FALSE)
no_cores <- detectCores(logical = TRUE)
cl <- makeCluster(no_cores-1)
registerDoParallel(cl)
getDoParWorkers()
foreach(i = 1:ncol(data),.combine = 'cbind') %:%
foreach(j = 2:ncol(data), .combine = 'rbind') %dopar% {
if(j<=i){
return(NA)}
tryCatch({ # a function to continue with loop in case of errors
#progress(i, ncol(data)-1)
obsn = table(data[,i], data[,j])
ncats1 = nrow(obsn)
ncats2 = ncol(obsn)
ntot = sum(obsn)
obsp = obsn/ntot
proportions2 = matrix(colSums(obsp), 1, ncats2)
proportions1 = matrix(rowSums(obsp), ncats1 , 1)
premultiplier = matrix(0, ncats1, ncats1)
for(l in 1:ncats1)for(m in 1:l)premultiplier[l,m] = 1
postmultiplier = matrix(0, ncats2, ncats2)
for(l in 1:ncats2)for(m in l:ncats2)postmultiplier[l,m] = 1
cumulprops2 = proportions2 %*% postmultiplier
cumulprops1 = premultiplier %*% proportions1
nthresholds1 = ncats1 - 1
nthresholds2 = ncats2 - 1
thresholds1 = matrix(0, 1, nthresholds1)
for(l in 1:nthresholds1)thresholds1[l] = qnorm(cumulprops1[l])
thresholds2 = matrix(0, 1, nthresholds2)
for(l in 1:nthresholds2)thresholds2[l] = qnorm(cumulprops2[l])
pcorr = polycor::polychor(obsn)
results_fit = fit_skewnorm(c("th1" = thresholds1, "th2" = thresholds2, "corr" = pcorr, "alpha" = c(2 ,2)))
allresults_skew[[i]][j,1] <- i
allresults_skew[[i]][j,2] <- j
allresults_skew[[i]][j,3] <- results_fit[,1]
allresults_skew[[i]][j,4] <- results_fit[,2]
allresults_skew[[i]][j,5] <- results_fit[,3]
allresults_skew[[i]][j,6] <- results_fit[,4]
allresults_skew[[i]][j,7] <- results_fit[,5]
allresults_skew[[i]][j,8] <- results_fit[,6]
allresults_skew[[i]][j,9] <- results_fit[,7]
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) # part of tryCatch
NULL
}
stopCluster(cl)
After these foreach loops are run, I get this matrix:
[,1] [,2] [,3]
result.1 0.7528304 NA NA
result.2 2.2963329 0.5957852 NA
And asking for allresults_skew gives me:
[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] NA NA NA NA NA NA NA NA NA
[2,] NA NA NA NA NA NA NA NA NA
[3,] NA NA NA NA NA NA NA NA NA
[[2]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] NA NA NA NA NA NA NA NA NA
[2,] NA NA NA NA NA NA NA NA NA
[3,] NA NA NA NA NA NA NA NA NA
So the nested foreach loops only return the values from the last column of results_fit, and only directly after the code has run; when I inspect allresults_skew afterwards, the matrices are still completely NA.
I would be grateful if anyone could help me with this and tell me what to change.
I am also uncertain whether to choose .combine = 'c', 'rbind', or 'cbind' in the two foreach calls, but I assume this does not explain why the code does not produce a list of 3 x 9 matrices.
I found a solution that works for me:
allresults_skew <- foreach(i = 1:ncol(data)) %:% foreach(j = 2:ncol(data)) %dopar% {
if(j<=i){
return(NA)}
tryCatch({ # a function to continue with loop in case of errors
#progress(i, ncol(data)-1)
obsn = table(data[,i], data[,j])
ncats1 = nrow(obsn)
ncats2 = ncol(obsn)
ntot = sum(obsn)
obsp = obsn/ntot
proportions2 = matrix(colSums(obsp), 1, ncats2)
proportions1 = matrix(rowSums(obsp), ncats1 , 1)
premultiplier = matrix(0, ncats1, ncats1)
for(l in 1:ncats1)for(m in 1:l)premultiplier[l,m] = 1
postmultiplier = matrix(0, ncats2, ncats2)
for(l in 1:ncats2)for(m in l:ncats2)postmultiplier[l,m] = 1
cumulprops2 = proportions2 %*% postmultiplier
cumulprops1 = premultiplier %*% proportions1
nthresholds1 = ncats1 - 1
nthresholds2 = ncats2 - 1
thresholds1 = matrix(0, 1, nthresholds1)
for(l in 1:nthresholds1)thresholds1[l] = qnorm(cumulprops1[l])
thresholds2 = matrix(0, 1, nthresholds2)
for(l in 1:nthresholds2)thresholds2[l] = qnorm(cumulprops2[l])
pcorr = polycor::polychor(obsn)
results_fit = fit_skewnorm(c("th1" = thresholds1, "th2" = thresholds2, "corr" = pcorr, "alpha" = c(2 ,2)))
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) # part of tryCatch
#NULL
}
stopCluster(cl)
Then, using
allresults_skew0 <- unlist(allresults_skew, recursive = FALSE)
allresults_skew0 <- Reduce(rbind,allresults_skew0)
gave me
chisq df p corr conv alpha1 alpha2
1 19.97874 13 0.095741675130554 0.2705112 0 1.4656923 0.7528304
2 65.49704 13 0.000000005354567 0.8426818 0 0.2512463 2.2963329
3 NA NA NA NA NA NA NA
4 31.14632 13 0.003209403883258 0.2753952 0 0.7247398 0.5957852
5 NA NA NA NA NA NA NA
6 NA NA NA NA NA NA NA
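For reference, the reason the first foreach attempt leaves allresults_skew full of NA is that each %dopar% worker runs in its own R process: assignments such as allresults_skew[[i]][j, ] <- ... only modify the worker's private copy, and the master only collects the value of the last expression in the loop body (here NULL). The working solution above therefore returns results_fit instead of assigning it. Below is a minimal sketch of the same return-and-combine idea with the pair indices included; for brevity it only computes the polychoric correlation, but the full fit_skewnorm() call would slot in the same way (the object and argument names here are illustrative):
library(foreach)
library(doParallel)

cl <- parallel::makeCluster(max(1, parallel::detectCores() - 1))
registerDoParallel(cl)

# Each iteration returns one row; foreach collects the returned values,
# so nothing needs to be written into a pre-allocated global list.
pair_results <- foreach(i = 1:(ncol(data) - 1), .combine = rbind) %:%
  foreach(j = (i + 1):ncol(data), .combine = rbind,
          .packages = "polycor") %dopar% {
    tryCatch(
      c(p = i, q = j, corr = polycor::polychor(table(data[, i], data[, j]))),
      error = function(e) c(p = i, q = j, corr = NA_real_)
    )
  }

stopCluster(cl)
pair_results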
I want to use a regex to capture substrings. I already have a working solution, but I wonder whether there is a faster one, since I am applying applyCaptureRegex to a vector with about 400,000 entries.
exampleData <- as.data.frame(c("[hg19:21:34809787-34809808:+]","[hg19:11:105851118-105851139:+]","[hg19:17:7482245-7482266:+]","[hg19:6:19839915-19839936:+]"))
captureRegex <- function(captRegEx,str){
sapply(regmatches(str,gregexpr(captRegEx,str))[[1]], function(m) regmatches(m,regexec(captRegEx,m)))
}
applyCaptureRegex <- function(mir,r){
mir <- unlist(apply(mir, 1, function(x) captureRegex(r,x[1])))
mir <- matrix(mir ,ncol=5, byrow = TRUE)
mir
}
Usage and results:
> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"
> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
[,1] [,2] [,3] [,4] [,5]
[1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
[3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
[4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
Thank you!
Why reinvent the wheel? There are several packages to choose from with functions that return a character matrix with one column for each capturing group in your pattern.
stri_match_all_regex — stringi
library(stringi)
x <- c('[hg19:21:34809787-34809808:+]', '[hg19:11:105851118-105851139:+]', '[hg19:17:7482245-7482266:+]', '[hg19:6:19839915-19839936:+]')
do.call(rbind, stri_match_all_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))
# [,1] [,2] [,3] [,4] [,5]
# [1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
# [3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
# [4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
str_match — stringr
library(stringr)
str_match(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
strapplyc — gsubfn
library(gsubfn)
strapplyc(x, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)
Below is a benchmark comparison of all of the solutions above.
x <- rep(c('[hg19:21:34809787-34809808:+]',
'[hg19:11:105851118-105851139:+]',
'[hg19:17:7482245-7482266:+]',
'[hg19:6:19839915-19839936:+]'), 1000)
applyCaptureRegex <- function(mir, r) {
do.call(rbind, lapply(mir, function(x) regmatches(x, regexec(r, x))[[1]]))
}
gsubfn <- function(x1) strapplyc(x1, '(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])', simplify = rbind)
regmtch <- function(x1) applyCaptureRegex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringr <- function(x1) str_match(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringi <- function(x1) do.call(rbind, stri_match_all_regex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))
require(microbenchmark)
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))
Result
Unit: milliseconds
expr min lq mean median uq max neval
gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091 100
regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460 100
stringr(x) 65.81644 70.28327 76.02298 75.43162 78.92567 116.18026 100
stringi(x) 15.88171 16.53047 17.52434 16.96127 17.76007 23.94449 100
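One further stringi option (my addition, not part of the benchmark above): when each string contains at most one match, stri_match_first_regex() returns the capture matrix directly, so the do.call(rbind, ...) step can be dropped:
library(stringi)
# One row per input element: the full match followed by each capture group
stri_match_first_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')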
I am attempting to write a regular expression that replaces each element in this matrix with only the two comma-separated numbers that follow the first colon. There are also "./.:.:.:.:." entries, which I would like to change to "0,0".
head(data)
Offspring-95_CAATCG Offspring-96_AAACGG Offspring-97_ACTCTT
[1,] "./.:1,7:8:18:262,0,18" "0/1:18,4:21:56:56,0,591" "0/0:27,0:27:78:0,78,723"
[2,] "0/0:49,0:49:99:0,147,1891" "0/0:107,0:107:99:0,319,4185" "1/1:0,22:22:66:902,66,0"
[3,] "0/0:42,0:42:99:0,126,1324" "./.:.:.:.:." "0/1:35,88:117:99:3152,0,718"
I have tried:
try <- gsub("\\:[0-9]*\\,[0-9]*\\:", "\\1", data)
The desired output is:
Offspring-95_CAATCG Offspring-96_AAACGG Offspring-97_ACTCTT
[1,] "1,7" "18,4" "27,0"
[2,] "49,0" "107,0" "0,22"
[3,] "42,0" "0,0" "35,88"
Thanks,
This could be done by
sub('[^:]+:([^:]+).*', '\\1', data)
# Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Or using regmatches from base R
data[] <- regmatches(data, regexpr('(?<=:)[0-9]+,[0-9]+', data, perl=TRUE))
The above regex can be used with stringr or stringi (useful for big datasets):
library(stringr)
`dim<-`(str_extract(data, perl('(?<=:)[0-9]+,[0-9]+')), dim(data))
# [,1] [,2] [,3]
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Or
library(stringi)
`dim<-`(stri_extract(data, regex='(?<=:)[0-9]+,[0-9]+'), dim(data))
# [,1] [,2] [,3]
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "9,4" "33,13" "13,0"
Update
data1[] <- sub('[^:]+:([^:]+).*', '\\1', data1)
data1[!grepl(',', data1)] <- '0,0'
data1
# Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
#[1,] "1,7" "18,4" "27,0"
#[2,] "49,0" "107,0" "0,22"
#[3,] "42,0" "0,0" "35,88"
data
data <- structure(c("./.:1,7:8:18:262,0,18", "0/0:49,0:49:99:0,147,1891",
"0/1:9,4:13:99:129,0,334", "0/1:18,4:21:56:56,0,591",
"0/0:107,0:107:99:0,319,4185",
"0/1:33,13:44:99:317,0,1150", "0/0:27,0:27:78:0,78,723",
"1/1:0,22:22:66:902,66,0", "0/0:13,0:13:39:0,39,528"), .Dim = c(3L, 3L),
.Dimnames = list(NULL, c("Offspring.95_CAATCG", "Offspring.96_AAACGG",
"Offspring.97_ACTCTT")))
data1 <- structure(c("./.:1,7:8:18:262,0,18", "0/0:49,0:49:99:0,147,1891",
"0/0:42,0:42:99:0,126,1324", "0/1:18,4:21:56:56,0,591",
"0/0:107,0:107:99:0,319,4185",
"./.:.:.:.:.", "0/0:27,0:27:78:0,78,723", "1/1:0,22:22:66:902,66,0",
"0/1:35,88:117:99:3152,0,718"), .Dim = c(3L, 3L), .Dimnames = list(
NULL, c("Offspring.95_CAATCG", "Offspring.96_AAACGG", "Offspring.97_ACTCTT"
)))
Not regex subbing but probably pretty darn quick.
apply(data, 2, function(x) sapply(strsplit(x, ":"), "[[", 2))
## Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
## [1,] "1,7" "18,4" "27,0"
## [2,] "49,0" "107,0" "0,22"
## [3,] "9,4" "33,13" "13,0"
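If the data also contain the "./.:.:.:.:." entries from the question (e.g. the data1 object defined in the answer above), the split approach returns "." for those cells, so one extra line gives the requested "0,0" recoding; a small sketch:
out <- apply(data1, 2, function(x) sapply(strsplit(x, ":"), `[[`, 2))
out[out == "."] <- "0,0"   # recode the missing genotype field as requested
out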
Try this:
out <- list()
for (i in seq(ncol(data))) {
  out[[i]] <- gsub('[^:]*:([0-9]+,[0-9]+).*', '\\1', data[, i])
}
out <- as.data.frame(out)
dimnames(out) <- dimnames(data)
out
I have a big Eurostat dataset loaded like this:
install.packages("SmarterPoland")
library(SmarterPoland)
GDP_raw <- getEurostatRCV(kod = "namq_gdp_c")
It has this structure:
s_adj unit indic_na geo time value
1 NSA EUR_HAB B11 AT 2014Q1 NA
2 NSA EUR_HAB B11 BE 2014Q1 200.0
3 NSA EUR_HAB B11 BG 2014Q1 -100.0
I want to use "time" as the first column and the other variables as rows. Doing it the other way around is easy with:
GDP_sorted <- cast(GDP_raw, geo + unit + s_adj + indic_na ~ time)
which returns:
geo unit s_adj indic_na 1955Q1 1955Q2 1955Q3 1955Q4
1 AT EUR_HAB NSA B11 NA NA NA NA
2 AT EUR_HAB NSA B111 NA NA NA NA
3 AT EUR_HAB NSA B112 NA NA NA NA
The problem is that here the columns are variables, so every quarter becomes its own variable, which doesn't make sense from a time-series perspective. I need some sort of transpose (a simple t() doesn't return the same data type). However, if I cast the other way around, it combines the different categories into one variable and creates:
time AT_EUR_HAB_NSA_B11 AT_EUR_HAB_NSA_B111 AT_EUR_HAB_NSA_B112
1 1955Q1 NA NA NA
2 1955Q2 NA NA NA
3 1955Q3 NA NA NA
This means I have 12405 variables, which makes subset() infeasible. I'd like something along the lines of:
time
s_adj NSA NSA NSA
geo AT AT AT
unit EUR_HAB EUR_HAB EUR_HAB
indic_na B11 B12 B13
1 1955Q1 NA NA NA
2 1955Q2 NA NA NA
3 1955Q3 NA NA NA
and so forth (this is a fictional example). So then I could use:
Demand <- subset(GDP_sorted, (indic_na == "P3_P5") & (s_adj == "SWDA") & (unit == "MIO_EUR"))
without having to specify all combinations of the 12405 variables.
Until someone provides a better answer, here is a workaround I'm using now:
start from the raw downloaded table:
GDP_raw <- read.table("/media/38A05C6AA05C311C/1_Documents/Dropbox/Masterarbeit/2_R/Data/GDP_raw.RData")
then subset your variable of interest:
Demand <- subset(GDP_raw, (indic_na == "P3_P5") & (s_adj == "SWDA") & (unit == "MIO_EUR"))
and then the only dimensions which remain are time and geo which you can cast simply as:
Demand_cast <- cast(Demand, time ~ geo)
This gives you a single table for your variable, of the form:
time AT BE BG
1955Q1 NA NA NA
1955Q2 NA NA NA
1955Q3 NA NA NA
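If you need several indicators, the same subset-then-cast workaround can be wrapped in a small helper (a sketch: the function name is mine, and it assumes GDP_raw is loaded and that cast() comes from the reshape package, as used above):
library(reshape)
# Keep one combination of the classification variables, then spread geo over the columns
extract_series <- function(df, indicator, adjustment, units) {
  kept <- subset(df, indic_na == indicator & s_adj == adjustment & unit == units)
  cast(kept, time ~ geo)
}
Demand_cast <- extract_series(GDP_raw, "P3_P5", "SWDA", "MIO_EUR")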
I've got several character vectors that I want to paste together. The problem is that some of the character vectors are pretty sparse. So, when I paste them, I get NA's and extra separators. How can I efficiently remove the NA's and extra separators while still joining the vectors?
I've got something like:
n1 = c("goats", "goats", "spatula", NA, "rectitude", "boink")
n2 = c("forever", NA, "...yes", NA, NA, NA)
cbind(paste(n1,n2, sep=", "))
which gives me:
[1,] "goats, forever"
[2,] "goats, NA"
[3,] "spatula, ...yes"
[4,] "NA, NA"
[5,] "rectitude, NA"
[6,] "boink, NA"
but I want:
[1,] "goats, forever"
[2,] "goats"
[3,] "spatula, ...yes"
[4,] <NA>
[5,] "rectitude"
[6,] "boink"
There are clearly inefficient and tedious ways of doing this with a lot of regular expressions and string splitting. But anything quick/simple?
Not a lot of regex: just one line, plus one more to put NA back where both parts were missing.
n1 <- c("goats", "goats", "spatula", NA, "rectitude", "boink")
n2 <- c("forever", NA, "...yes", NA, NA, NA)
n3 <- cbind(paste(n1,n2, sep=", "))
n3 <- gsub("(, )?NA", "", n3)
n3[n3==""] <- NA
Code (no regex or string splitting):
vec <- apply(cbind(n1,n2),1,function(x)
ifelse(all(is.na(x)), NA, paste(na.omit(x),collapse=", ")) )
Result:
> vec # as a vector
[1] "goats, forever" "goats" "spatula, ...yes" NA "rectitude" "boink"
> cbind(vec) # as a matrix
vec
[1,] "goats, forever"
[2,] "goats"
[3,] "spatula, ...yes"
[4,] NA
[5,] "rectitude"
[6,] "boink"
Here's an option using the qdap package (though the other options seem better to me as they use base R):
library(qdap)
gsub(" ", ", ", blank2NA(Trim(gsub("NA", "", paste(n1, n2)))))
## [1] "goats, forever" "goats" "spatula, ...yes" NA
## [5] "rectitude" "boink"
Or...
## gsub(" ", ", ", blank2NA(gsub("NA| NA", "", paste(n1, n2))))
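For very long vectors where row-wise apply() can be slow, a fully vectorized base-R variant of the same idea (my own sketch, limited to two input vectors) gives the same result:
n1 <- c("goats", "goats", "spatula", NA, "rectitude", "boink")
n2 <- c("forever", NA, "...yes", NA, NA, NA)
sep <- ifelse(is.na(n1) | is.na(n2), "", ", ")   # separator only when both parts exist
out <- paste0(ifelse(is.na(n1), "", n1), sep, ifelse(is.na(n2), "", n2))
out[out == ""] <- NA                             # both parts missing
cbind(out)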