converting some rows into columns in R - regex

I have a table with one column, and I want to extract every other row and insert it into a new column.
lets say my table: df
V1
elements-of-01-to-20
ACTCTGCGACHCHAHAATT
elements-of-21-to-30
ACTAGCTATTATCGATATT
elements-of-31-to-40
CCCTTATATTGGAGCTACT
my desired result:
V1 V2
elements-of-01-to-20 ACTCTGCGACHCHAHAATT
elements-of-21-to-30 ACTAGCTATTATCGATATT
elements-of-31-to-20 CCCTTATATTGGAGCTACT
elements-of-31-to-40 CCCTTATATTGGAGCTACT
edited:
thanks for all replies. my second question is what if my dataset has multiple sequences followed by specific term like elements-of:
V1 => result => V1 v2
elements-of-01-to-20 elements-of-01-to-20 ACTCTGCGACHCHAHAATTAGGGGATGCTGATTTAGTA
ACTCTGCGACHCHAHAATT elements-of-21-to-30 ACTAGCTATTATCGATATT
AGGGGATGCTGATTTAGTA
elements-of-21-to-30
ACTAGCTATTATCGATATT

If the pattern is the same as in the example
indx <- c(TRUE, FALSE)
data.frame(V1=df$V1[indx], V2=df$V1[!indx])
# V1 V2
#1 elements-of-01-to-20 ACTCTGCGACHCHAHAATT
#2 elements-of-21-to-30 ACTAGCTATTATCGATATT
#3 elements-of-31-to-40 CCCTTATATTGGAGCTACT
Update
Based on the updated dataset
library(data.table)
setDT(df)[,list(V1=V1[1], V2=paste(V1[-1], collapse='')),
by= list(indx=cumsum(grepl('^[^A-Z]', df$V1)))][, indx:=NULL][]
# V1 V2
#1: elements-of-01-to-20 ACTCTGCGACHCHAHAATTAGGGGATGCTGATTTAGTA
#2: elements-of-21-to-30 ACTAGCTATTATCGATATT
New data
df <- structure(list(V1 = c("elements-of-01-to-20", "ACTCTGCGACHCHAHAATT",
"AGGGGATGCTGATTTAGTA", "elements-of-21-to-30", "ACTAGCTATTATCGATATT"
)), .Names = "V1", class = "data.frame", row.names = c(NA, -5L))

If that is just a fasta file then look at the Biostrings package. You could do it this way too
# Pair each record name with its sequence: names are taken from the odd
# positions of df$V1 and sequences from the even positions that follow them.
# Assumes df$V1 strictly alternates name, sequence, name, sequence, ...
MySeq <- data.frame(
  "Name" = df$V1[seq(1, length(df$V1), by = 2)],
  "Seq" = df$V1[seq(2, length(df$V1), by = 2)],
  stringsAsFactors = FALSE
)

Here is another way using grepl:
# Dummy data: a single character column V1 holding header and sequence rows.
df <- read.table(header = TRUE, as.is = TRUE, text = " V1
elements-of-01-to-20
ACTCTGCGACHCHAHAATT
elements-of-21-to-30
ACTAGCTATTATCGATATT
elements-of-31-to-40
CCCTTATATTGGAGCTACT")
# Result: rows containing "elements" form the first column of the matrix,
# every remaining row forms the second column.
is_header <- grepl("elements", df$V1)
cbind(df[is_header, "V1"], df[!is_header, "V1"])
#output
# [,1] [,2]
# [1,] "elements-of-01-to-20" "ACTCTGCGACHCHAHAATT"
# [2,] "elements-of-21-to-30" "ACTAGCTATTATCGATATT"
# [3,] "elements-of-31-to-40" "CCCTTATATTGGAGCTACT"

Try (using traditional programming methods):
# Seed the result with one placeholder row of empty strings so that both
# columns exist as character columns; the placeholder is dropped at the end.
ndf = data.frame(V1="", V2="", stringsAsFactors=FALSE)
i=1
# Step through df two rows at a time: row i fills V1 and row i+1 fills V2
# of a new row appended to ndf.
while(i<nrow(df)){
ndf[(nrow(ndf)+1),]=c(df[i,1], df[(i+1),1])
i=i+2
}
# Show everything except the placeholder; the printed row names therefore
# start at 2 rather than 1.
ndf[-1,]
V1 V2
2 elements-of-01-to-20 ACTCTGCGACHCHAHAATT
3 elements-of-21-to-30 ACTAGCTATTATCGATATT
4 elements-of-31-to-40 CCCTTATATTGGAGCTACT

Related

R regular expression to split string column into multiple columns

I have a column as follows in a dataframe called PeakBoundaries:
chrom
chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509
I would like to separate out the columns so that the columns look like below in a dataframe:
chr chrStart chrEnd
chr11 69464719 69502928
chr7 55075808 55093954
chr8 128739772 128762863
chr3 169389459 169490555
etc.
I have tried a regular expression approach but am not getting anywhere in terms of getting the match to enter into a new column:
PeakBoundaries$chrOnly <- PeakBoundaries[grep("\\w+?=\\:"),PeakBoundaries$chrom]
I am met with the error:
Error in [.data.frame(PeakBoundaries, grep("\w+?=\:"), PeakBoundaries$chrom) :
undefined columns selected
Try this - no regex needed, just the strsplit function:
dat <- read.table(text="chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509", stringsAsFactors=FALSE)
dat[,2:4] <- matrix(unlist(strsplit(dat[,1],split = "\\:|\\-")), ncol=3, byrow=TRUE)
colnames(dat) <- c("chrom", "chr", "chrStart", "chrEnd")
# Convert last two columns from character to numeric:
dat$chrStart <- as.numeric(dat$chrStart)
dat$chrEnd <- as.numeric(dat$chrEnd)
Results
> dat
chrom chr chrStart chrEnd
1 chr11:69464719-69502928 chr11 69464719 69502928
2 chr7:55075808-55093954 chr7 55075808 55093954
3 chr8:128739772-128762863 chr8 128739772 128762863
4 chr3:169389459-169490555 chr3 169389459 169490555
5 chr17:37848534-37877201 chr17 37848534 37877201
6 chr19:30306758-30316875 chr19 30306758 30316875
7 chr1:150496857-150678056 chr1 150496857 150678056
8 chr12:69183279-69260755 chr12 69183279 69260755
9 chr11:77610143-77641464 chr11 77610143 77641464
10 chr8:38191804-38260814 chr8 38191804 38260814
11 chr12:58135797-58156509 chr12 58135797 58156509
Edit
You could do everything using only your existing dataframe. Replace dat[,1] with PeakBoundaries$chrom and dat[,2:4] with PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)] and you should have it!
Edit By OP
OK so I think there's something a bit odd with my dataset but I've sorted it with Dominic's help so that it is now:
PeakBoundaries <- as.data.frame(PeakBoundaries)
PeakBoundaries <- PeakBoundaries[-1,]
PeakBoundaries <- as.data.frame(PeakBoundaries)
PeakBoundaries$PeakBoundaries <-
as.character(PeakBoundaries$PeakBoundaries)
PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)] <-
matrix(unlist(strsplit(PeakBoundaries$PeakBoundaries,
split = "\\:|\\-")), ncol=3, byrow=TRUE)
A shorter version of Dominic's answer, making the insertion a one-liner:
# Build the sample data: one "chr:start-end" string per row. The text
# connection is closed explicitly so it does not linger until garbage
# collection.
con <- textConnection("chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509")
dat <- data.frame(chrom = readLines(con), stringsAsFactors = FALSE)
close(con)
# Split every string on ":" or "-" with base strsplit(), so no extra package
# has to be loaded, and stack the three pieces of each row into a matrix
# whose columns become the new data frame columns.
pieces <- strsplit(as.character(dat$chrom), "[:-]")
dat[, c("chr", "chrStart", "chrEnd")] <- do.call(rbind, pieces)
# The split yields character data; convert the coordinates to numeric.
dat$chrStart <- as.numeric(dat$chrStart)
dat$chrEnd <- as.numeric(dat$chrEnd)
We could try
library(tidyr)
extract(dat, chrom, into=c('chr', 'chrStart', 'chrEnd'),
'([^:]+):([^-]+)-(.*)', convert=TRUE)
# chr chrStart chrEnd
#1 chr11 69464719 69502928
#2 chr7 55075808 55093954
#3 chr8 128739772 128762863
#4 chr3 169389459 169490555
#5 chr17 37848534 37877201
#6 chr19 30306758 30316875
#7 chr1 150496857 150678056
#8 chr12 69183279 69260755
#9 chr11 77610143 77641464
#10 chr8 38191804 38260814
#11 chr12 58135797 58156509
Or a faster option using the devel version of data.table. We can install the v1.9.5 from here
library(data.table) # v1.9.5+
nm1 <- c('chr', 'chrStart', 'chrEnd')
res <- setDT(tstrsplit(dat$chrom, '[:-]', type.convert=TRUE))
setnames(res, nm1)
res
# chr chrStart chrEnd
# 1: chr11 69464719 69502928
# 2: chr7 55075808 55093954
# 3: chr8 128739772 128762863
# 4: chr3 169389459 169490555
# 5: chr17 37848534 37877201
# 6: chr19 30306758 30316875
# 7: chr1 150496857 150678056
# 8: chr12 69183279 69260755
# 9: chr11 77610143 77641464
#10: chr8 38191804 38260814
#11: chr12 58135797 58156509
Or
library(splitstackshape)
setnames(cSplit(dat, 'chrom', ':|-',fixed=FALSE,
type.convert=TRUE), nm1)[]
data
dat <- structure(list(chrom = structure(c(2L, 9L, 10L, 8L, 6L, 7L, 1L,
5L, 3L, 11L, 4L), .Label = c("chr1:150496857-150678056",
"chr11:69464719-69502928",
"chr11:77610143-77641464", "chr12:58135797-58156509",
"chr12:69183279-69260755",
"chr17:37848534-37877201", "chr19:30306758-30316875",
"chr3:169389459-169490555",
"chr7:55075808-55093954", "chr8:128739772-128762863",
"chr8:38191804-38260814"
), class = "factor")), .Names = "chrom", row.names = c(NA, -11L
), class = "data.frame")

R match between two comma-separated strings

I am trying to find an elegant way to find matches between the two following character columns in a data frame. The complicated part is that either string can contain a comma-separated list, and if a member of one list is a match for any member of the other list, then that whole entry would be considered a match. I'm not sure how well I've explained this, so here's sample data and output:
Alt1:
AT
A
G
CGTCC,AT
CGC
Alt2:
AA
A
GG
AT,GGT
CG
Expected Match per row:
Row 1 = none
Row 2 = A
Row 3 = none
Row 4 = AT
Row 5 = none
Non-working solutions:
First attempt: merge entire data frames by desired columns, then match up the alt columns shown above:
match1 = data.frame(merge(vcf.df, ref.df, by=c("chr", "start", "end", "ref")))
matches = unique(match1[unlist(sapply(match1$Alt1 grep, match1$Alt2, fixed=TRUE)),])
Second method, using the findOverlaps feature from VariantAnnotation/GRanges:
findoverlaps(ranges(vcf1), ranges(vcf2))
Any suggestions would be greatly appreciated! Thank you!
Solution
Thanks to @Marat Talipov's answer below, the following solution works to compare two comma-separated strings:
> ##read in edited kaviar vcf and human ref
> ref <- readVcfAsVRanges("ref.vcf.gz", humie_ref)
Warning message:
In .vcf_usertag(map, tag, ...) :
ScanVcfParam ‘geno’ fields not present: ‘AD’
> ##rename chromosomes to match with vcf files
> ref <- renameSeqlevels(ref, c("1"="chr1"))
> ##################################
> ## Gather VCF files to process ##
> ##################################
> ##data frame *.vcf.gz files in directory path
> ##pattern is a regular expression, not a shell glob: dots are escaped and
> ##the match is anchored to the end of the file name
> vcf_path <- data.frame(path=list.files(vcf_dir, pattern="\\.vcf\\.gz$", full.names=TRUE))
> ##read in everything but sample data for speediness
> vcf_param = ScanVcfParam(samples=NA)
> vcf <- readVcfAsVRanges("test.vcf.gz", humie_ref, param=vcf_param)
> #################
> ## Match SNP's ##
> #################
> ##create data frames of info to match on
> vcf.df = data.frame(chr =as.character(seqnames(vcf)), start = start(vcf), end = end(vcf), ref = as.character(ref(vcf)),
+ alt=alt(vcf), stringsAsFactors=FALSE)
> ref.df = data.frame(chr =as.character(seqnames(ref)), start = start(ref), end = end(ref),
+ ref = as.character(ref(ref)), alt=alt(ref), stringsAsFactors=FALSE)
>
> ##merge based on all positional fields except vcf
> col_match = data.frame(merge(vcf.df, ref.df, by=c("chr", "start", "end", "ref")))
> library(stringi)
> ##split each alt column by comma and bind together
> M1 <- stri_list2matrix(sapply(col_match$alt.x,strsplit,','))
> M2 <- stri_list2matrix(sapply(col_match$alt.y,strsplit,','))
> M <- rbind(M1,M2)
> ##compare results
> result <- apply(M,2,function(z) unique(na.omit(z[duplicated(z)])))
> ##add results column to col_match df for checking/subsetting
> col_match$match = result
> head(col_match)
chr start end ref alt.x alt.y match
1 chr1 39998059 39998059 A G G G
2 chr1 39998059 39998059 A G G G
3 chr1 39998084 39998084 C A A A
4 chr1 39998084 39998084 C A A A
5 chr1 39998085 39998085 G A A A
6 chr1 39998085 39998085 G A A A
In the case that input lists are of equal length and you'd like to compare list elements in the pairwise manner, you could use this solution:
library(stringi)
M1 <- stri_list2matrix(sapply(Alt1,strsplit,','))
M2 <- stri_list2matrix(sapply(Alt2,strsplit,','))
M <- rbind(M1,M2)
result <- apply(M,2,function(z) unique(na.omit(z[duplicated(z)])))
Sample input:
Alt1 <- list('AT','A','G','CGTCC,AT','CGC','GG,CC')
Alt2 <- list('AA','A','GG','AT,GGT','CG','GG,CC')
Output:
# [[1]]
# character(0)
#
# [[2]]
# [1] "A"
#
# [[3]]
# character(0)
#
# [[4]]
# [1] "AT"
#
# [[5]]
# character(0)
#
# [[6]]
# [1] "GG" "CC"
Sticking with the stringi package, you could do something like this, using the Alt1 and Alt2 data from Marat's answer.
library(stringi)
# Return the comma-separated items that two strings have in common.
#
# x, y: single character strings, each holding a comma-separated list.
# Returns a character vector of the items present in both lists, or
# NA_character_ when the lists share nothing.
f <- function(x, y) {
  # simplify = FALSE keeps one character vector per input string. A
  # simplified matrix pads the shorter list with "", and those pads would be
  # reported as duplicates whenever the lists differ in length by two or more.
  parts <- stri_split_fixed(c(x, y), ",", simplify = FALSE)
  # intersect() compares across the two lists only, so an item repeated
  # inside a single string is not mistaken for a match.
  shared <- intersect(parts[[1]], parts[[2]])
  if (length(shared) > 0) shared else NA_character_
}
Map(f, Alt1, Alt2)
# [[1]]
# [1] NA
#
# [[2]]
# [1] "A"
#
# [[3]]
# [1] NA
#
# [[4]]
# [1] "AT"
#
# [[5]]
# [1] NA
#
# [[6]]
# [1] "GG" "CC"
Or in base R, we can use scan() to separate the strings with commas.
# Return the items shared by two delimiter-separated strings (base R only).
#
# x, y: single character strings, each holding a delimited list of items.
# sep:  the single-character delimiter passed to scan(); defaults to ",".
# Returns a character vector of the items that occur in both x and y, or
# character(0) when there are none.
g <- function(x, y, sep = ",") {
  split_items <- function(s) {
    scan(text = s, what = "", sep = sep, quiet = TRUE)
  }
  # Split each string on its own: pooling both into one vector and calling
  # duplicated() would also report an item repeated inside a single string.
  intersect(split_items(x), split_items(y))
}
Map(g, Alt1, Alt2)
you could do something like this:
Alt1 <- list("AT", "A", "G", c("CGTCC", "AT"), "CGC")
Alt2 <- list("AA", "A", "GG", c("AT", "GGT"), "CG")
# Make sure each list element is already a character vector of alternatives
# (comma-separated entries split up front), as in the two lists above.

# Build the label for one row: the alternatives shared by `a` and `b`, or
# "none". A scalar if/else is used instead of ifelse() because ifelse() is
# vectorised over its test and would emit one label per alternative for rows
# that hold several alternatives.
describe_row <- function(i, a, b) {
  shared <- intersect(a, b)
  if (length(shared) > 0) {
    paste("Row", i, "=", paste(shared, collapse = ","))
  } else {
    paste("Row", i, "= none")
  }
}

# Walk both lists in parallel; seq_along() stays correct for empty input.
matchlist <- Map(describe_row, seq_along(Alt1), Alt1, Alt2)
print(matchlist)

In R: tidyr split and swing value into column name using regex

I'm trying to get accustomed to the tidyr package, and am struggling with the problem of having a variable which is a concatenation of several variables. In the minimal example below, I would like to split variable v2 into its constituent variables v3 and v4 and then swing these so I end up with the four variables v1-v4.
require(plyr)
require(dplyr)
require(stringr)
require(tidyr)
data <-
data.frame(
v1=c(1,2),
v2=c("v3 cheese; v4 200", "v3 ham; v4 150")) %>%
tbl_df()
If I split v2 into a new temp I get only v3:
mutate(data,
temp=unlist(sapply(str_split(data$v2, pattern=";"), "[", 1)))
v1 v2 temp
1 1 v3 cheese; v4 200 v3 cheese
2 2 v3 ham; v4 150 v3 ham
My problems are:
1) How do I split and swing v3 AND v4 up as column names using tidyr?
2) In my real data I do not know the variable names (or there are too
many of them), but they have the structure "var value", and I
would like to use some regex to automatically identify and swing
them as in 1)
Got inspired by this SO answer but could not get it to work though with regex code for variable names.
UPDATE:
My output would be something like (v2 could be skipped as its now redundant with v3 and v4):
v1 v2 v3 v4
1 1 v3 cheese; v4 200 cheese 200
2 2 v3 ham; v4 150 ham 150
Split the data by ";", convert the split output to a long form, split the data again by " " (but in a wide form this time) and spread the values out to the wide form you desire.
Here it is using "dplyr" + "tidyr" + "stringi":
library(dplyr)
library(tidyr)
library(stringi)
data %>%
mutate(v2 = stri_split_fixed(as.character(v2), ";")) %>%
unnest(v2) %>%
mutate(v2 = stri_trim_both(v2)) %>%
separate(v2, into = c("var", "val")) %>%
spread(var, val)
# Source: local data frame [2 x 3]
#
# v1 v3 v4
# 1 1 cheese 200
# 2 2 ham 150
Alternatively, using cSplit from my "splitstackshape" package (which doesn't presently work with tbl_dfs)
library(dplyr)
library(tidyr)
library(splitstackshape)
as.data.frame(data) %>%
cSplit("v2", ";", "long") %>%
cSplit("v2", " ") %>%
spread(v2_1, v2_2)
# v1 v3 v4
# 1: 1 cheese 200
# 2: 2 ham 150

split string for specific column

I have a file like this:
V1 V2
1 1-500891 CGCGACCTCAGATCAGACGTGGCGACCCGCTGAA
2 2-280976 AGGTTCCGGATAAGTAAGAGCC
3 3-223181 TCTTAACCCGGACCAGAAACTA
I would like to split (and swap) the V1 column resulting in the following output
Sequence Count
CGCGACCTCAGATCAGACGTGGCGACCCGCTGAA 500891
AGGTTCCGGATAAGTAAGAGCC 280976
TCTTAACCCGGACCAGAAACTA 223181
I have tried this, but it did not work:
df_split <- strsplit(as.character(df), split="-", fixed=T)
You can try sub to remove the part of the string up till -.
df$V1 <- sub('.*-', '', df$V1)
df
# V1 V2
#1 500891 CGCGACCTCAGATCAGACGTGGCGACCCGCTGAA
#2 280976 AGGTTCCGGATAAGTAAGAGCC
#3 223181 TCTTAACCCGGACCAGAAACTA
You applied the strsplit on the whole dataset instead of specific column ("V1"). Here, is a possible option for you to consider
df$V1 <- sapply(strsplit(as.character(df$V1),
split="-", fixed=TRUE),`[`,2)
df$V1
#[1] "500891" "280976" "223181"
Or an option using tidyr
library(tidyr)
extract(df, 'V1', 'Count', '.*-(.*)')
# Count V2
#1 500891 CGCGACCTCAGATCAGACGTGGCGACCCGCTGAA
#2 280976 AGGTTCCGGATAAGTAAGAGCC
#3 223181 TCTTAACCCGGACCAGAAACTA

R: What's the easiest way to print out pairs of values from a data.frame?

I have a data.frame:
df<-data.frame(a=c("x","x","y","y"),b=c(1,2,3,4))
> df
a b
1 x 1
2 x 2
3 y 3
4 y 4
What's the easiest way to print out each pair of values as a list of strings like this:
"x1", "x2", "y1", "y2"
apply(df, 1, paste, collapse="")
with(df, paste(a, b, sep=""))
And this should be faster than apply.
About timing
For 10000 rows we get:
df <- data.frame(
a = sample(c("x","y"), 10000, replace=TRUE),
b = sample(1L:4L, 10000, replace=TRUE)
)
N = 100
mean(replicate(N, system.time( with(df, paste(a, b, sep="")) )["elapsed"]), trim=0.05)
# 0.005778
mean(replicate(N, system.time( apply(df, 1, paste, collapse="") )["elapsed"]), trim=0.05)
# 0.09611
So the increase in speed is already visible for a few thousand rows.
This is because Shane's solution calls paste for each row separately, so there are nrow(df) calls of paste, whereas my solution makes a single call.
Also, you can use sqldf library:
library("sqldf")
df<-data.frame(a=c("x","x","y","y"),b=c(1,2,3,4))
result <- sqldf("SELECT a || cast(cast(b as integer) as text) as concat FROM df")
You will get the following result:
concat
1 x1
2 x2
3 y3
4 y4