R regular expression to split string column into multiple columns - regex

I have a column as follows in a dataframe called PeakBoundaries:
chrom
chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509
I would like to separate out the columns so that the columns look like below in a dataframe:
chr chrStart chrEnd
chr11 69464719 69502928
chr7 55075808 55093954
chr8 128739772 128762863
chr3 169389459 169490555
etc.
I have tried a regular expression approach but am not getting anywhere in terms of getting the match to enter into a new column:
PeakBoundaries$chrOnly <- PeakBoundaries[grep("\\w+?=\\:"),PeakBoundaries$chrom]
I am met with the error:
Error in [.data.frame(PeakBoundaries, grep("\w+?=\:"), PeakBoundaries$chrom) :
undefined columns selected

Try this - no regex needed, just the strsplit function:
# Example input: one "chr:start-end" region string per row (read as column V1).
dat <- read.table(text="chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509", stringsAsFactors=FALSE)
# Split every region string on ':' or '-' and bind the three pieces of each
# row into an n x 3 character matrix, stored as columns 2 to 4.
dat[, 2:4] <- do.call(rbind, strsplit(dat[[1]], "\\:|\\-"))
names(dat) <- c("chrom", "chr", "chrStart", "chrEnd")
# The split produces text, so turn both coordinate columns into numbers.
dat[c("chrStart", "chrEnd")] <- lapply(dat[c("chrStart", "chrEnd")], as.numeric)
Results
> dat
chrom chr chrStart chrEnd
1 chr11:69464719-69502928 chr11 69464719 69502928
2 chr7:55075808-55093954 chr7 55075808 55093954
3 chr8:128739772-128762863 chr8 128739772 128762863
4 chr3:169389459-169490555 chr3 169389459 169490555
5 chr17:37848534-37877201 chr17 37848534 37877201
6 chr19:30306758-30316875 chr19 30306758 30316875
7 chr1:150496857-150678056 chr1 150496857 150678056
8 chr12:69183279-69260755 chr12 69183279 69260755
9 chr11:77610143-77641464 chr11 77610143 77641464
10 chr8:38191804-38260814 chr8 38191804 38260814
11 chr12:58135797-58156509 chr12 58135797 58156509
Edit
You could do everything using only your existing dataframe. Replace dat[,1] with PeakBoundaries$chrom and dat[,2:4] with PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)] and you should have it!
Edit By OP
OK, so I think there's something a bit odd with my dataset, but I've sorted it with Dominic's help so that it is now:
# Coerce the input to a data frame before subsetting.
PeakBoundaries <- as.data.frame(PeakBoundaries)
# Drop the first row (presumably a stray header row -- TODO confirm against the data).
PeakBoundaries <- PeakBoundaries[-1,]
# Re-wrap as a data frame. NOTE(review): the lines below expect the resulting
# column to be called "PeakBoundaries", which is what as.data.frame() yields
# when the previous step returned a bare vector -- confirm for your data.
PeakBoundaries <- as.data.frame(PeakBoundaries)
# strsplit() needs character input, so convert the column first.
PeakBoundaries$PeakBoundaries <-
as.character(PeakBoundaries$PeakBoundaries)
# Append three new columns holding the pieces obtained by splitting on ':' or '-'.
PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)] <-
matrix(unlist(strsplit(PeakBoundaries$PeakBoundaries,
split = "\\:|\\-")), ncol=3, byrow=TRUE)

A shorter version of Dominic's answer, making the insertion a one-liner:
# Build the example data frame from inline text. The connection is kept in a
# variable so it can be closed explicitly instead of being left open.
con <- textConnection("chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509")
dat <- data.frame(chrom = readLines(con), stringsAsFactors = FALSE)
close(con)
# Split each "chr:start-end" string on ':' or '-' with base strsplit() (no
# extra package required) and bind the pieces into an n x 3 character matrix.
dat[, c("chr", "chrStart", "chrEnd")] <- do.call(rbind, strsplit(dat$chrom, "[:-]"))
# The split produces text; convert the coordinates to numbers.
dat$chrStart <- as.numeric(dat$chrStart)
dat$chrEnd <- as.numeric(dat$chrEnd)

We could try
library(tidyr)
# Capture three regex groups from 'chrom' -- the text before ':', the text
# between ':' and '-', and the remainder -- into the three new columns.
# convert=TRUE asks tidyr to convert the new columns to suitable types.
extract(dat, chrom, into=c('chr', 'chrStart', 'chrEnd'),
'([^:]+):([^-]+)-(.*)', convert=TRUE)
# chr chrStart chrEnd
#1 chr11 69464719 69502928
#2 chr7 55075808 55093954
#3 chr8 128739772 128762863
#4 chr3 169389459 169490555
#5 chr17 37848534 37877201
#6 chr19 30306758 30316875
#7 chr1 150496857 150678056
#8 chr12 69183279 69260755
#9 chr11 77610143 77641464
#10 chr8 38191804 38260814
#11 chr12 58135797 58156509
Or a faster option using the devel version of data.table. We can install the v1.9.5 from here
library(data.table) # v1.9.5+
# Names for the three output columns.
nm1 <- c('chr', 'chrStart', 'chrEnd')
# Split 'chrom' on ':' or '-' and turn the resulting list of pieces into a
# data.table; type.convert=TRUE requests type conversion of the pieces.
res <- setDT(tstrsplit(dat$chrom, '[:-]', type.convert=TRUE))
# Rename the columns of 'res' to nm1.
setnames(res, nm1)
res
# chr chrStart chrEnd
# 1: chr11 69464719 69502928
# 2: chr7 55075808 55093954
# 3: chr8 128739772 128762863
# 4: chr3 169389459 169490555
# 5: chr17 37848534 37877201
# 6: chr19 30306758 30316875
# 7: chr1 150496857 150678056
# 8: chr12 69183279 69260755
# 9: chr11 77610143 77641464
#10: chr8 38191804 38260814
#11: chr12 58135797 58156509
Or
library(splitstackshape)
# Split 'chrom' on the regular expression ':|-' (fixed=FALSE means the
# separator is not taken literally), then rename the result's columns to nm1
# (the name vector defined in the data.table example); the trailing [] prints it.
setnames(cSplit(dat, 'chrom', ':|-',fixed=FALSE,
type.convert=TRUE), nm1)[]
data
# Example input: a one-column data frame with 11 rows whose 'chrom' column is
# a factor of "chr:start-end" region strings.
dat <- structure(list(chrom = structure(c(2L, 9L, 10L, 8L, 6L, 7L, 1L,
5L, 3L, 11L, 4L), .Label = c("chr1:150496857-150678056",
"chr11:69464719-69502928",
"chr11:77610143-77641464", "chr12:58135797-58156509",
"chr12:69183279-69260755",
"chr17:37848534-37877201", "chr19:30306758-30316875",
"chr3:169389459-169490555",
"chr7:55075808-55093954", "chr8:128739772-128762863",
"chr8:38191804-38260814"
), class = "factor")), .Names = "chrom", row.names = c(NA, -11L
), class = "data.frame")

Related

R - extract all strings matching pattern and create relational table

I am looking for a shorter and more pretty solution (possibly in tidyverse) to the following problem. I have a data.frame "data":
id string
1 A 1.001 xxx 123.123
2 B 23,45 lorem ipsum
3 C donald trump
4 D ssss 134, 1,45
What I wanted to do is to extract all numbers (no matter if the delimiter is "." or "," -> in this case I assume that string "134, 1,45" can be extracted into two numbers: 134 and 1.45) and create a data.frame "output" looking similar to this:
id string
1 A 1.001
2 A 123.123
3 B 23.45
4 C <NA>
5 D 134
6 D 1.45
I managed to do this (code below), but the solution looks pretty ugly to me and is also not very efficient (two for-loops). Could someone suggest a better way to do this (preferably using dplyr)?
# Example data: an id column and a free-text column that may contain numbers
# written with either '.' or ',' as the decimal separator.
data <- data.frame(id = c("A", "B", "C", "D"),
string = c("1.001 xxx 123.123",
"23,45 lorem ipsum",
"donald trump",
"ssss 134, 1,45"),
stringsAsFactors = FALSE)
# Pre-size the output: count the gregexpr() results over all rows. A row
# without any match still contributes one element (gregexpr's no-match
# value), which is what reserves the NA row for that id.
len <- length(unlist(sapply(data$string, function(x) gregexpr("[0-9]+[,|.]?[0-9]*", x))))
output <- data.frame(id = rep(NA, len), string = rep(NA, len))
# Main loop: 'start' is the number of output rows already filled.
start = 0
for(i in 1:dim(data)[1]){
# Number of output rows this input row occupies (1 when nothing matches).
tmp_len <- length(unlist(gregexpr("[0-9]+[,|.]?[0-9]*", data$string[i])))
for(j in (start+1):(start+tmp_len)){
output[j,1] <- data$id[i]
# Pick the (j - start)-th match of row i; indexing past the available
# matches yields NA.
output[j,2] <- regmatches(data$string[i], gregexpr("[0-9]+[,|.]?[0-9]*", data$string[i]))[[1]][j-start]
}
start = start + tmp_len
}
# Normalise the matches: use '.' as the decimal separator, drop a trailing
# '.' (left over from a trailing ',' in the text), and convert to numeric.
output$string <- gsub(",", ".", output$string)
output$string <- as.numeric(ifelse(substring(output$string, nchar(output$string), nchar(output$string)) == ".",
substring(output$string, 1, nchar(output$string) - 1),
output$string))
output
1) Base R This uses relatively simple regular expressions and no packages.
In the first 2 lines of code replace any comma followed by a space with a
space and then replace all remaining commas with a dot. After these two lines s will be: c("1.001 xxx 123.123", "23.45 lorem ipsum", "donald trump", "ssss 134 1.45")
In the next 4 lines of code trim whitespace from beginning and end of each string field and split the string field on whitespace producing a
list. grep out those elements consisting only of digits and dots. (The regular expression ^[0-9.]*$ matches the start of the string followed by zero or more digits or dots followed by the end of the string, so only elements made up entirely of those characters are matched.) Replace any zero length components with NA. Finally add data$id as the names. After these 4 lines are run the list L will be list(A = c("1.001", "123.123"), B = "23.45", C = NA, D = c("134", "1.45")) .
In the last line of code convert the list L to a data frame with the appropriate names.
# Normalise separators: ", " becomes a space, any remaining "," becomes ".".
s <- gsub(",", ".", gsub(", ", " ", data$string))
# Split each trimmed string on whitespace and keep only the pieces that
# consist solely of digits and dots.
L <- lapply(strsplit(trimws(s), "\\s+"), function(words) {
  grep("^[0-9.]*$", words, value = TRUE)
})
# Rows without any numeric piece get a single NA so they survive stack().
L[lengths(L) == 0] <- NA
L <- setNames(L, data$id)
# Long format: one row per (id, number) pair.
long <- stack(L)
data.frame(id = long$ind, string = long$values)
giving:
id string
1 A 1.001
2 A 123.123
3 B 23.45
4 C <NA>
5 D 134
6 D 1.45
2) magrittr This variation of (1) writes it as a magrittr pipeline.
library(magrittr)
# Pipeline version of the base R approach: normalise the separators and trim
# the strings, split them on whitespace into a list named by id, keep only
# the pieces made of digits and dots, replace empty results with NA, stack
# the list into long form and rename the columns to id/string.
data %>%
transform(string = gsub(", ", " ", string)) %>%
transform(string = gsub(",", ".", string)) %>%
transform(string = trimws(string)) %>%
with(setNames(strsplit(string, "\\s+"), id)) %>%
lapply(grep, pattern = "^[0-9.]*$", value = TRUE) %>%
replace(lengths(.) == 0, NA) %>%
stack() %>%
with(data.frame(id = ind, string = values))
3) dplyr/tidyr This is an alternate pipeline solution using dplyr and tidyr. unnest converts to long form, id is made factor so that we can later use complete to recover id's that are removed by subsequent filtering, the filter removes junk rows and complete inserts NA rows for each id that would otherwise not appear.
library(dplyr)
library(tidyr)
# Normalise the separators, split every string into words (a list column),
# unnest to one word per row, keep only the words made of digits and dots,
# and finally use complete() to re-insert an NA row for every id that lost
# all of its rows in the filter (this is why id is turned into a factor).
data %>%
  mutate(string = gsub(", ", " ", string)) %>%
  mutate(string = gsub(",", ".", string)) %>%
  mutate(string = trimws(string)) %>%
  mutate(string = strsplit(string, "\\s+")) %>%
  unnest(string) %>%
  mutate(id = factor(id)) %>%
  filter(grepl("^[0-9.]*$", string)) %>%
  complete(id)
4) data.table
library(data.table)
# Chained data.table version: the first four steps rewrite 'string' with :=
# (normalise separators, trim, split on whitespace into a list column named
# by id); the last two steps aggregate by id, keeping only the pieces made of
# digits and dots and substituting NA_character_ when an id has none.
# NOTE(review): the result of the last two steps is not assigned, so the
# trailing `DT` appears to show the table as left by the := steps rather than
# the final long table -- confirm before relying on it.
DT <- as.data.table(data)
DT[, string := gsub(", ", " ", string)][,
string := gsub(",", ".", string)][,
string := trimws(string)][,
string := setNames(strsplit(string, "\\s+"), id)][,
list(string = list(grep("^[0-9.]*$", unlist(string), value = TRUE))), by = id][,
list(string = if (length(unlist(string))) unlist(string) else NA_character_), by = id]
DT
Update Removed assumption that junk words do not have digit or dot. Also added (2), (3) and (4) and some improvements.
We can replace the , in between the numbers with . (using gsub), extract the numbers with str_extract_all (from stringr into a list), replace the list elements that have length equal to 0 with NA, set the names of the list with 'id' column, stack to convert the list to data.frame and rename the columns.
library(stringr)
# Inside-out: turn a comma that sits between two digits into a decimal point,
# pull every run of digits/dots out of each string, use NA where nothing was
# found (otherwise convert to numeric), name the list by id, stack() it into
# long form, then swap the two columns and rename them to id/string.
setNames(stack(setNames(lapply(str_extract_all(gsub("(?<=[0-9]),(?=[0-9])", ".",
data$string, perl = TRUE), "[0-9.]+"), function(x)
if(length(x)==0) NA else as.numeric(x)), data$id))[2:1], c("id", "string"))
# id string
#1 A 1.001
#2 A 123.123
#3 B 23.45
#4 C NA
#5 D 134
#6 D 1.45
Same idea as Gabor's. I had hoped to use R's built-in parsing of strings (type.convert, used in read.table) rather than writing custom regex substitutions:
# Split each string on single spaces and name the resulting list by id.
sp = setNames(strsplit(data$string, " "), data$id)
# For every id: keep only the tokens whose last character is a digit, '.' or
# ',' (the regex tests the final character only). If nothing is left return
# NA, otherwise convert each token with type.convert(), passing the '.'/','
# characters found in that token as its decimal separator.
spc = lapply(sp, function(x) {
x = x[grep("[^0-9.,]$", x, invert=TRUE)]
if (!length(x))
NA_real_
else
mapply(type.convert, x, dec=gsub("[^.,]", "", x), USE.NAMES=FALSE)
})
# stack() returns values/ind; reverse the column order and reuse data's names.
setNames(rev(stack(spc)), names(data))
id string
1 A 1.001
2 A 123.123
3 B 23.45
4 C <NA>
5 D 134
6 D 1.45
Unfortunately, type.convert is not robust enough to consider both decimal delimiters at once, so we need this mapply malarkey instead of type.convert(x, dec = "[.,]").

Find partial occurences in data frame based on a vector

I've got a dataframe a and a vector b (derived from another data frame). Now I want to find all occurrences of the values of vector b in a.
Unfortunately, however, the values in vector b are sometimes missing a leading character.
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
> a
# IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#3 NH1497 119265685 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
b <- c("65461341", "1234567", "ZH0080", "TP5146")
My expected output would be this:
a
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
Removing the duplicates first solves one problem; however, I still need a way to select all rows containing the values from vector b, even when a value in b is missing its leading character:
a <- a[!duplicated(a$GSN_IDENTITY_CODE),]
Unfortunately I cannot use %in% because it will bring in duplicates and miss out on the first line because it does not accept regex':
> a[a$GSN_IDENTITY_CODE %in% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
Using data.table's %like% would work only for the first string in vector b
library(data.table)
> setDT(a)
> a[a$GSN_IDENTITY_CODE %like% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1: 65461341 120561144 Z
Warning message:
In grepl(pattern, vector) :
argument 'pattern' has length > 1 and only the first element will be used
Is there a function in R that supports my needs here?
#Frank's attempt yields the following error:
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
b <- c("65461341", "1234567", "ZH0080", "TP5146")
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in `[.data.frame`(a, .(b), on = "GSN_IDENTITY_CODE", nomatch = FALSE, :
unused arguments (on = "GSN_IDENTITY_CODE", nomatch = FALSE, mult = "first")
> setDT(a)
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, :
x.'GSN_IDENTITY_CODE' is a character column being joined to i.'NA' which is type 'NULL'. Character columns must join to factor or character columns.
You can do something like this for close matches if the extra character might occur anywhere in the string:
library(stringdist)
library(purrr)
# For every code in 'a', compute its string distance (method "lv") to each
# element of b and keep the smallest one.
a$closest_match <- map(a$GSN_IDENTITY_CODE, ~stringdist(., b, method = "lv")) %>%
map_dbl(min)
# Keep the rows whose closest element of b is at distance 0 or 1.
a[a$closest_match < 2, ]
If the extra character is always at the beginning, I would do something like this:
library(stringr)
# Copy of the code with one leading digit removed (codes that do not start
# with a digit are left unchanged).
a$stripped_code <- str_replace(a$GSN_IDENTITY_CODE,"^\\d", "")
# Keep a row when either the full code or the stripped code appears in b.
a$keep <- a$GSN_IDENTITY_CODE %in% b | a$stripped_code %in% b
a[a$keep, ]

R match between two comma-separated strings

I am trying to find an elegant way to find matches between the two following character columns in a data frame. The complicated part is that either string can contain a comma-separated list, and if a member of one list is a match for any member of the other list, then that whole entry would be considered a match. I'm not sure how well I've explained this, so here's sample data and output:
Alt1:
AT
A
G
CGTCC,AT
CGC
Alt2:
AA
A
GG
AT,GGT
CG
Expected Match per row:
Row 1 = none
Row 2 = A
Row 3 = none
Row 4 = AT
Row 5 = none
Non-working solutions:
First attempt: merge entire data frames by desired columns, then match up the alt columns shown above:
match1 = data.frame(merge(vcf.df, ref.df, by=c("chr", "start", "end", "ref")))
matches = unique(match1[unlist(sapply(match1$Alt1 grep, match1$Alt2, fixed=TRUE)),])
Second method, using the findOverlaps feature from VariantAnnotation/GRanges:
findoverlaps(ranges(vcf1), ranges(vcf2))
Any suggestions would be greatly appreciated! Thank you!
Solution
Thanks to #Marat Talipov's answer below, the following solution works to compare two comma-separated strings:
> ##read in edited kaviar vcf and human ref
> ref <- readVcfAsVRanges("ref.vcf.gz", humie_ref)
Warning message:
In .vcf_usertag(map, tag, ...) :
ScanVcfParam ‘geno’ fields not present: ‘AD’
> ##rename chromosomes to match with vcf files
> ref <- renameSeqlevels(ref, c("1"="chr1"))
> ##################################
> ## Gather VCF files to process ##
> ##################################
> ##data frame *.vcf.gz files in directory path
> vcf_path <- data.frame(path=list.files(vcf_dir, pattern="*.vcf.gz$", full=TRUE))
> ##read in everything but sample data for speediness
> vcf_param = ScanVcfParam(samples=NA)
> vcf <- readVcfAsVRanges("test.vcf.gz", humie_ref, param=vcf_param)
> #################
> ## Match SNP's ##
> #################
> ##create data frames of info to match on
> vcf.df = data.frame(chr =as.character(seqnames(vcf)), start = start(vcf), end = end(vcf), ref = as.character(ref(vcf)),
+ alt=alt(vcf), stringsAsFactors=FALSE)
> ref.df = data.frame(chr =as.character(seqnames(ref)), start = start(ref), end = end(ref),
+ ref = as.character(ref(ref)), alt=alt(ref), stringsAsFactors=FALSE)
>
> ##merge based on all positional fields except vcf
> col_match = data.frame(merge(vcf.df, ref.df, by=c("chr", "start", "end", "ref")))
> library(stringi)
> ##split each alt column by comma and bind together
> M1 <- stri_list2matrix(sapply(col_match$alt.x,strsplit,','))
> M2 <- stri_list2matrix(sapply(col_match$alt.y,strsplit,','))
> M <- rbind(M1,M2)
> ##compare results
> result <- apply(M,2,function(z) unique(na.omit(z[duplicated(z)])))
> ##add results column to col_match df for checking/subsetting
> col_match$match = result
> head(col_match)
chr start end ref alt.x alt.y match
1 chr1 39998059 39998059 A G G G
2 chr1 39998059 39998059 A G G G
3 chr1 39998084 39998084 C A A A
4 chr1 39998084 39998084 C A A A
5 chr1 39998085 39998085 G A A A
6 chr1 39998085 39998085 G A A A
In the case that input lists are of equal length and you'd like to compare list elements in the pairwise manner, you could use this solution:
library(stringi)
# Split every entry on ',' and arrange the pieces as a matrix (one column per
# entry; shorter entries are padded).
M1 <- stri_list2matrix(sapply(Alt1,strsplit,','))
M2 <- stri_list2matrix(sapply(Alt2,strsplit,','))
# Stack both matrices so each column holds the pieces of one Alt1/Alt2 pair.
M <- rbind(M1,M2)
# Per column: the values that occur more than once, without NA padding.
result <- apply(M,2,function(z) unique(na.omit(z[duplicated(z)])))
Sample input:
Alt1 <- list('AT','A','G','CGTCC,AT','CGC','GG,CC')
Alt2 <- list('AA','A','GG','AT,GGT','CG','GG,CC')
Output:
# [[1]]
# character(0)
#
# [[2]]
# [1] "A"
#
# [[3]]
# character(0)
#
# [[4]]
# [1] "AT"
#
# [[5]]
# character(0)
#
# [[6]]
# [1] "GG" "CC"
Sticking with the stringi package, you could do something like this, using the Alt1 and Alt2 data from Marat's answer.
library(stringi)
# Return the comma-separated members that strings x and y have in common, or
# NA_character_ when they share none.
f <- function(x, y) {
  # Split each side on its own: a simplified (matrix) split pads the shorter
  # side with "" and those padding cells would be reported as matches.
  x_parts <- unlist(stri_split_fixed(x, ","))
  y_parts <- unlist(stri_split_fixed(y, ","))
  # intersect() only reports members present on BOTH sides, so a member that
  # is merely repeated within one string is not treated as a match.
  common <- intersect(x_parts, y_parts)
  if (length(common) > 0) common else NA_character_
}
Map(f, Alt1, Alt2)
# [[1]]
# [1] NA
#
# [[2]]
# [1] "A"
#
# [[3]]
# [1] NA
#
# [[4]]
# [1] "AT"
#
# [[5]]
# [1] NA
#
# [[6]]
# [1] "GG" "CC"
Or in base R, we can use scan() to separate the strings with commas.
# Return the members shared by two delimiter-separated strings.
#   x, y: strings holding `sep`-separated members.
#   sep:  the single-character delimiter (default ",").
# Returns a character vector of the common members (character(0) if none).
g <- function(x, y, sep = ",") {
  x_parts <- scan(text = x, what = "", sep = sep, quiet = TRUE)
  y_parts <- scan(text = y, what = "", sep = sep, quiet = TRUE)
  # Compare the two sides with intersect() instead of looking for duplicates
  # in the pooled pieces, so a member repeated inside one string only
  # (e.g. "A,A" vs "B") is not reported as a match.
  intersect(x_parts, y_parts)
}
Map(g, Alt1, Alt2)
you could do something like this:
Alt1 <- list('AT','A','G',c('CGTCC','AT'),'CGC')
Alt2 <- list('AA','A','GG',c('AT','GGT'),'CG')
# make sure you change the lists within the lists into vectors
# One result string per row, preallocated.
matchlist <- vector("list", length(Alt1))
for (i in seq_along(Alt1)) {
  # Members present in both entries of row i.
  common <- intersect(Alt1[[i]], Alt2[[i]])
  # Scalar if/else (not vectorised ifelse) so every row yields exactly one
  # string, even when an entry holds several members.
  matchlist[[i]] <- if (length(common) > 0) {
    paste("Row", i, "=", paste(common, collapse = ","), sep = " ")
  } else {
    paste("Row", i, "= none", sep = " ")
  }
}
print(matchlist)

converting some rows into columns in R

I have a table with one column, and I want to take every second row (the element that follows each header row) and put it into a new column.
lets say my table: df
V1
elements-of-01-to-20
ACTCTGCGACHCHAHAATT
elements-of-21-to-30
ACTAGCTATTATCGATATT
elements-of-31-to-40
CCCTTATATTGGAGCTACT
my desired result:
V1 V2
elements-of-01-to-20 ACTCTGCGACHCHAHAATT
elements-of-21-to-30 ACTAGCTATTATCGATATT
elements-of-31-to-40 CCCTTATATTGGAGCTACT
edited:
thanks for all replies. my second question is what if my dataset has multiple sequences followed by specific term like elements-of:
V1 => result => V1 v2
elements-of-01-to-20 elements-of-01-to-20 ACTCTGCGACHCHAHAATTAGGGGATGCTGATTTAGTA
ACTCTGCGACHCHAHAATT elements-of-21-to-30 ACTAGCTATTATCGATATT
AGGGGATGCTGATTTAGTA
elements-of-21-to-30
ACTAGCTATTATCGATATT
If the pattern is the same as in the example
# Two-element logical mask; when used as an index it is recycled along V1, so
# it selects rows 1, 3, 5, ... and its negation selects rows 2, 4, 6, ...
indx <- c(TRUE, FALSE)
# Pair each odd row (V1) with the even row that follows it (V2).
data.frame(V1=df$V1[indx], V2=df$V1[!indx])
# V1 V2
#1 elements-of-01-to-20 ACTCTGCGACHCHAHAATT
#2 elements-of-21-to-30 ACTAGCTATTATCGATATT
#3 elements-of-31-to-40 CCCTTATATTGGAGCTACT
Update
Based on the updated dataset
library(data.table)
# Start a new group at every row whose first character is not an uppercase
# letter (cumsum over the grepl result gives the group id). Within a group
# the first row becomes V1 and all remaining rows are pasted together into
# V2; the helper column 'indx' is then removed and the result printed.
setDT(df)[,list(V1=V1[1], V2=paste(V1[-1], collapse='')),
by= list(indx=cumsum(grepl('^[^A-Z]', df$V1)))][, indx:=NULL][]
# V1 V2
#1: elements-of-01-to-20 ACTCTGCGACHCHAHAATTAGGGGATGCTGATTTAGTA
#2: elements-of-21-to-30 ACTAGCTATTATCGATATT
New data
# Updated example: a one-column data frame (5 rows) in which the first header
# row is followed by two sequence rows and the second by one.
df <- structure(list(V1 = c("elements-of-01-to-20", "ACTCTGCGACHCHAHAATT",
"AGGGGATGCTGATTTAGTA", "elements-of-21-to-30", "ACTAGCTATTATCGATATT"
)), .Names = "V1", class = "data.frame", row.names = c(NA, -5L))
If that is just a fasta file then look at the Biostrings package. You could do it this way too
# Pair every odd row of df$V1 (taken as the name) with the even row that
# follows it (taken as the sequence).
MySeq <- data.frame("Name" = df$V1[seq(1, length(df$V1), by = 2)],
                    "Seq" = df$V1[seq(2, length(df$V1), by = 2)],
                    stringsAsFactors = FALSE)
Here is another way using grepl:
# Example input: header rows alternating with sequence rows in a single
# column named V1.
df <- read.table(text=" V1
elements-of-01-to-20
ACTCTGCGACHCHAHAATT
elements-of-21-to-30
ACTAGCTATTATCGATATT
elements-of-31-to-40
CCCTTATATTGGAGCTACT",
header=TRUE, as.is=TRUE)
# Flag the header rows once, then put headers and the remaining rows side by
# side as a two-column character matrix.
is_label <- grepl("elements", df$V1)
cbind(df$V1[is_label], df$V1[!is_label])
#output
# [,1] [,2]
# [1,] "elements-of-01-to-20" "ACTCTGCGACHCHAHAATT"
# [2,] "elements-of-21-to-30" "ACTAGCTATTATCGATATT"
# [3,] "elements-of-31-to-40" "CCCTTATATTGGAGCTACT"
Try (using traditional programming methods):
# Start with a single placeholder row of empty strings; it is dropped at the end.
ndf = data.frame(V1="", V2="", stringsAsFactors=FALSE)
# Walk through df two rows at a time: row i supplies V1, row i+1 supplies V2.
i=1
while(i<nrow(df)){
# Append the pair as a new last row of ndf.
ndf[(nrow(ndf)+1),]=c(df[i,1], df[(i+1),1])
i=i+2
}
# Show the result without the placeholder row (so row names start at 2).
ndf[-1,]
V1 V2
2 elements-of-01-to-20 ACTCTGCGACHCHAHAATT
3 elements-of-21-to-30 ACTAGCTATTATCGATATT
4 elements-of-31-to-40 CCCTTATATTGGAGCTACT

Create new column in dataframe based on partial string matching other column

I have a dataframe with 2 columns GL and GLDESC and want to add a 3rd column called KIND based on some data that is inside of column GLDESC.
The dataframe is as follows:
GL GLDESC
1 515100 Payroll-Indir Salary Labor
2 515900 Payroll-Indir Compensated Absences
3 532300 Bulk Gas
4 539991 Area Charge In
5 551000 Repairs & Maint-Spare Parts
6 551100 Supplies-Operating
7 551300 Consumables
For each row of the data table:
If GLDESC contains the word Payroll anywhere in the string then I want KIND to be Payroll
If GLDESC contains the word Gas anywhere in the string then I want KIND to be Materials
In all other cases I want KIND to be Other
I looked for similar examples on stackoverflow but could not find any, also looked in R for dummies on switch, grep, apply and regular expressions to try and match only part of the GLDESC column and then fill the KIND column with the kind of account but was unable to make it work.
Since you have only two conditions, you can use a nested ifelse:
#random data; it wasn't easy to copy-paste yours
DF <- data.frame(GL = sample(10), GLDESC = paste(sample(letters, 10),
  c("gas", "payroll12", "GaSer", "asdf", "qweaa", "PayROll-12",
    "asdfg", "GAS--2", "fghfgh", "qweee"), sample(letters, 10), sep = " "))
# Label each row by the first pattern found in GLDESC (case-insensitive):
# "gas" -> Materials, otherwise "payroll" -> Payroll, otherwise Other.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
DF$KIND <- ifelse(grepl("gas", DF$GLDESC, ignore.case = TRUE), "Materials",
  ifelse(grepl("payroll", DF$GLDESC, ignore.case = TRUE), "Payroll", "Other"))
DF
# GL GLDESC KIND
#1 8 e gas l Materials
#2 1 c payroll12 y Payroll
#3 10 m GaSer v Materials
#4 6 t asdf n Other
#5 2 w qweaa t Other
#6 4 r PayROll-12 q Payroll
#7 9 n asdfg a Other
#8 5 d GAS--2 w Materials
#9 7 s fghfgh e Other
#10 3 g qweee k Other
EDIT 10/3/2016 (..after receiving more attention than expected)
A possible solution to deal with more patterns could be to iterate over all patterns and, whenever there is match, progressively reduce the amount of comparisons:
# Label each element of x with the replacement of the first pattern it matches.
#   x:            vector to search.
#   patterns:     regular expressions, tried in order.
#   replacements: labels, one per pattern (defaults to the patterns themselves).
#   fill:         label for elements that match no pattern.
#   ...:          forwarded to grepl() (e.g. ignore.case = TRUE).
# Returns a vector of length(x); elements already labelled by an earlier
# pattern are not tested against later ones.
ff <- function(x, patterns, replacements = patterns, fill = NA, ...) {
  stopifnot(length(patterns) == length(replacements))
  result <- rep_len(as.character(fill), length(x))
  unlabelled <- rep(TRUE, length(x))
  for (k in seq_along(patterns)) {
    candidates <- which(unlabelled)
    hit <- grepl(patterns[[k]], x[candidates], ...)
    matched <- candidates[hit]
    result[matched] <- replacements[[k]]
    unlabelled[matched] <- FALSE
  }
  result
}
ff(DF$GLDESC, c("gas", "payroll"), c("Materials", "Payroll"), "Other", ignore.case = TRUE)
# [1] "Materials" "Payroll" "Materials" "Other" "Other" "Payroll" "Other" "Materials" "Other" "Other"
ff(c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
c("pat1a|pat1b", "pat2", "pat3"),
c("1", "2", "3"), fill = "empty")
#[1] "1" "1" "3" "empty"
ff(c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
c("pat2", "pat1a|pat1b", "pat3"),
c("2", "1", "3"), fill = "empty")
#[1] "2" "1" "3" "empty"
I personally like matching by index. You can loop grep over your new labels, in order to get the indices of your partial matches, then use this with a lookup table to simply reassign the values.
If you wanna create new labels, use a named vector.
DF <- data.frame(GL = sample(10), GLDESC = paste(sample(letters, 10),
  c(
    "gas", "payroll12", "GaSer", "asdf", "qweaa", "PayROll-12",
    "asdfg", "GAS--2", "fghfgh", "qweee"
  ), sample(letters, 10),
  sep = " "
))
# Lookup table: for every label, the row indices whose GLDESC matches its
# pattern. lapply() always returns a list, so stack() keeps working even when
# both patterns happen to match the same number of rows (sapply() would
# simplify that case to a matrix).
lu <- stack(lapply(c(Material = "gas", Payroll = "payroll"), grep, x = DF$GLDESC, ignore.case = TRUE))
# Default every row to "Other" first, then overwrite the matched rows. This
# also labels everything correctly when no row matches at all, where negative
# indexing with an empty index vector would select nothing.
DF$KIND <- "Other"
DF$KIND[lu$values] <- as.character(lu$ind)
DF
#> GL GLDESC KIND
#> 1 6 x gas f Material
#> 2 3 t payroll12 q Payroll
#> 3 5 a GaSer h Material
#> 4 4 s asdf x Other
#> 5 1 m qweaa y Other
#> 6 10 y PayROll-12 r Payroll
#> 7 7 g asdfg a Other
#> 8 2 k GAS--2 i Material
#> 9 9 e fghfgh j Other
#> 10 8 l qweee p Other
Created on 2021-11-13 by the reprex package (v2.0.1)