Extracting text strings using data.table in R - regex

I have a data.table similar to the one as follows
Data
library(data.table)
DT <- structure(list(N = 1:6, VN = c("v1", "v3", "v6", "v7a", "v18",
"v23"), T1 = c("bigby (wolf)", "white", "red (rose)", "piggy (straw)",
"(curse) beast", "prince"), T2 = c("jack (bean)", "snow (dwarves)",
"beard (blue)", "bhageera (jungle) mowgli (book)", "beauty",
"glass (slipper)"), T3 = c("hk (34)", "VL (r45)", "tg (h5)",
"tt (HG) (45)", "gh", "vlp"), Val = c(36, 25, 0.84, 12, 78, 258
)), .Names = c("N", "VN", "T1", "T2", "T3", "Val"), class = "data.frame", row.names = c(NA,
-6L))
setDT(DT)
DT
N VN T1 T2 T3 Val
1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00
2: 2 v3 white snow (dwarves) VL (r45) 25.00
3: 3 v6 red (rose) beard (blue) tg (h5) 0.84
4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00
5: 5 v18 (curse) beast beauty gh 78.00
6: 6 v23 prince glass (slipper) vlp 258.00
I want to extract all the strings within parentheses from columns T1 and T2 to a new column C.
I can do it to single rows as follows.
Rowwise calculations
setDF(DT)
dtf <- c("T1", "T2")
paste(unique(unlist(regmatches(DT[4,dtf], gregexpr("(?=\\().*?(?<=\\))", DT[4,dtf], perl=T)))), collapse=" ")
[1] "(straw) (jungle) (book)"
paste(unique(unlist(regmatches(DT[3,dtf], gregexpr("(?=\\().*?(?<=\\))", DT[3,dtf], perl=T)))), collapse=" ")
[1] "(rose) (blue)"
I am not able to get similar results using data.table.
Try with data.table
setDT(DT)
DT[, C := paste(unique(unlist(regmatches(get(dtf), gregexpr("(?=\\().*?(?<=\\))", get(dtf), perl=T)))), collapse=" ")]
How to use data.table to get the desired result?
Desired result
out <- structure(list(N = 1:6, VN = c("v1", "v3", "v6", "v7a", "v18",
"v23"), T1 = c("bigby (wolf)", "white", "red (rose)", "piggy (straw)",
"(curse) beast", "prince"), T2 = c("jack (bean)", "snow (dwarves)",
"beard (blue)", "bhageera (jungle) mowgli (book)", "beauty",
"glass (slipper)"), T3 = c("hk (34)", "VL (r45)", "tg (h5)",
"tt (HG) (45)", "gh", "vlp"), Val = c(36, 25, 0.84, 12, 78, 258
), C = c("(wolf) (bean)", "(dwarves)", "(rose) (blue)", "(straw) (jungle) (book)",
"(curse)", "(slipper)")), .Names = c("N", "VN", "T1", "T2", "T3",
"Val", "C"), class = "data.frame", row.names = c(NA, -6L))
out
N VN T1 T2 T3 Val C
1 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
2 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
3 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
4 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
5 5 v18 (curse) beast beauty gh 78.00 (curse)
6 6 v23 prince glass (slipper) vlp 258.00 (slipper)

You can use by and .SDcols to do this.
# Convert DT to a data.table by reference.
setDT(DT)
# Columns that are scanned for parenthesised text.
dtf <- c("T1", "T2")
# The regex matches one "(...)" group, parentheses included; `.*?` is lazy so
# every group in a string is returned separately.
# Grouping by N (which identifies each row in this data) makes .SD hold the
# T1/T2 values of a single row, so the matches are collected, de-duplicated
# and pasted together per row.
DT[,
   C := paste(
     unique(unlist(regmatches(
       .SD,
       gregexpr("(?=\\().*?(?<=\\))", .SD, perl = TRUE)
     ))),
     collapse = " "
   ),
   by = N,
   .SDcols = dtf]
DT
## N VN T1 T2 T3 Val C
## 1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
## 2: 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
## 3: 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
## 4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
## 5: 5 v18 (curse) beast beauty gh 78.00 (curse)
## 6: 6 v23 prince glass (slipper) vlp 258.00 (slipper)

Related

Output result not changing when I choose input values in dropdown menu

I have a problem with my Shiny app: the plot does not change when I choose different input values in the app. In my example I want to choose a different "miRNA", which should result in a different survival plot.
Here is my app:
library(dplyr)
require(survminer)
library(tidyverse)
require(reshape2)
library(shiny)
library(tidyr)
require(survival)
example data:
df.miRNA.cpm <- structure(list(`86` = c(5.57979757386892, 17.0240095264258, 4.28380151026145,
13.0457611762755, 12.5531123449841), `175` = c(5.21619202802748,
15.2849097474841, 2.46719979911461, 10.879496005461, 9.66416497290915
), `217` = c(5.42796072966512, 17.1413407297933, 5.15230233060323,
12.2646127361351, 12.1031024927547), `394` = c(-1.1390337316217,
15.1021660424984, 4.63168157763046, 11.1299079134792, 9.55572588729967
), `444` = c(5.06134249676025, 14.5442494311861, -0.399445049232868,
7.45775961504073, 9.92629675808998)), row.names = c("hsa_let_7a_3p",
"hsa_let_7a_5p", "hsa_let_7b_3p", "hsa_let_7b_5p", "hsa_let_7c_5p"
), class = "data.frame")
df.miRNA.cpm$miRNA <- rownames(df.miRNA.cpm)
ss.survival.shiny.miRNA.miRNA <- structure(list(ID = c("86", "175", "217", "394", "444"), TimeDiff = c(71.0416666666667,
601.958333333333, 1130, 1393, 117.041666666667), Status = c(1L,
1L, 0L, 0L, 1L)), row.names = c(NA, 5L), class = "data.frame")
Join the two example data frames:
data_prep.miRNA <- df.miRNA.cpm %>%
tidyr::pivot_longer(-miRNA, names_to = "ID") %>%
left_join(ss.survival.shiny.miRNA.miRNA)
Example of the joined data:
> data_prep.miRNA
# A tibble: 153,033 x 5
miRNA ID value TimeDiff Status
<chr> <chr> <dbl> <dbl> <int>
1 hsa_let_7a_3p 86 5.58 71.0 1
2 hsa_let_7a_3p 175 5.22 602. 1
3 hsa_let_7a_3p 217 5.43 1130 0
4 hsa_let_7a_3p 394 -1.14 1393 0
5 hsa_let_7a_3p 444 5.06 117. 1
6 hsa_let_7a_3p 618 4.37 1508 0
7 hsa_let_7a_3p 640 2.46 1409 0
8 hsa_let_7a_3p 829 0.435 919. 0
9 hsa_let_7a_3p 851 -1.36 976. 0
10 hsa_let_7a_3p 998 3.87 1196. 0
# … with 153,023 more rows
Shiny:
ui.miRNA <- fluidPage(
selectInput("MicroRNA", "miRNA", choices = unique(data_prep.miRNA$miRNA)),
plotOutput("myplot"))
server <- function(input, output, session) {
data_selected <- reactive({
filter(data_prep.miRNA, miRNA %in% input$MicroRNA)
})
output$myplot <- renderPlot({
survfit(Surv(TimeDiff, Status) ~ cut(value, quantile(value, probs = c(0, .4, .8)), include.lowest=TRUE),data = data_selected())
ggsurvplot(fitSurv)
})
}
shinyApp(ui.miRNA, server)
You have to assign the output of survfit to fitSurv:
output$myplot <- renderPlot({
  # Read the reactive once so the fit and the plot are based on the same rows.
  plot_data <- data_selected()
  # survfit() only returns the fitted object; it has to be stored so that
  # ggsurvplot() has something to draw.
  fitSurv <- survfit(
    Surv(TimeDiff, Status) ~ cut(value, quantile(value, probs = c(0, .4, .8)), include.lowest = TRUE),
    data = plot_data
  )
  # Hand the data over explicitly: without it ggsurvplot() has to re-evaluate
  # the `data` expression stored in the fit's call, which can fail when the
  # fit was created inside a function such as a Shiny server.
  ggsurvplot(fitSurv, data = plot_data)
})

C++ or Rcpp: comparison of two vectors without loop

I am a novice in C++ and Rcpp, and I am wondering how to compare each element of two different vectors without loop at one time.
My goal is to change the elements of v1 by referencing other vectors.
Current code is
v1 = {6,7,8,9,10}
v2 = {2,4,6,8,10}
v3 = {a,b,a,b,c}
v4 = {0,0,0,0,0}
v5 = {a,b,c}
v6 = {1,2,3}
for (i in 1:5){
if (v1[i] > v2[i]){
for (j in 1:3){
if (v5[j] == v3[i]){
v4[i] = v2[i] + v6[j]
if (v1[i] > v4[i]){
v1[i] = v4[i]
}
}
}
}
}
The result should be
v1 = {3,6,7,9,10}
In fact, v1, v2, v3, v4 and v5, v6 are the different dataframe in R. Each element of v1 is compared to v2. If an element i in v1 is larger than i element in v2, the element of v1 becomes a sum of i element of v1 and element of v6 by corresponding v3 & v5. Then the newly estimated value v4[i] is compared to v1[i].
I have a large number of cases in v1~v5 and v5~v6. In this case, using a loop takes a long time. Is it possible to compare the different vectors without a loop? Or how can I estimate and reference the other vector's elements?
I do not see the need to use Rcpp or C++ here. The way I understand your requirements, you are trying to manipulate two sets of equal length vectors. For a "set of equal length" vectors one normally uses a data.frame or one of its extensions. Here I am using base R, data.table and dplyr with tibble. See for yourself which syntax you prefer. Generally speaking, data.table will most likely be faster for large data sets.
Setup data:
v1 <- c(6,7,8,9,10)
v2 <- c(2,4,6,8,10)
v3 <- c("a","b","a","b","c")
v5 <- c("a","b","c")
v6 <- c(1,2,3)
Base R:
# One row per observation, plus the lookup table that maps each label (v5)
# to its increment (v6).
df1 <- data.frame(v1, v2, v3)
df2 <- data.frame(v5, v6)
# Join on the label column; the key columns are named differently in the two
# tables, so both by.x and by.y are given explicitly.
df1 <- merge(df1, df2, by.x = "v3", by.y = "v5")
# Candidate replacement value for v1.
df1$v4 <- df1$v2 + df1$v6
# Take v4 only where v1 exceeds both v2 and v4; otherwise keep v1.
df1$v1 <- ifelse(df1$v1 > df1$v2 & df1$v1 > df1$v4, df1$v4, df1$v1)
df1
#> v3 v1 v2 v6 v4
#> 1 a 3 2 1 3
#> 2 a 7 6 1 7
#> 3 b 6 4 2 6
#> 4 b 9 8 2 10
#> 5 c 10 10 3 13
data.table:
library(data.table)
# Build both tables keyed on their label column (v3 and v5 respectively).
dt1 <- data.table(v1, v2, v3, key = "v3")
dt2 <- data.table(v5, v6, key = "v5")
# Join dt2 onto dt1 and add v4 = v2 + v6 to dt1 by reference.
dt1[dt2, v4 := v2 + v6]
# Overwrite v1 with v4 only in rows where v1 exceeds both v2 and v4.
dt1[v1 > v2 & v1 > v4, v1 := v4]
dt1
#> v1 v2 v3 v4
#> 1: 3 2 a 3
#> 2: 7 6 a 7
#> 3: 6 4 b 6
#> 4: 9 8 b 10
#> 5: 10 10 c 13
dplyr:
suppressPackageStartupMessages(library(dplyr))
t1 <- tibble(v1, v2, v3)
t2 <- tibble(v5, v6)
# Attach the increment v6 to each row of t1 via its label, derive v4, and
# replace v1 by v4 wherever v1 exceeds both v2 and v4.
t1 %>%
  inner_join(t2, by = c("v3" = "v5")) %>%
  mutate(
    v4 = v2 + v6,
    v1 = case_when(
      v1 > v2 & v1 > v4 ~ v4,
      TRUE ~ v1
    )
  )
#> # A tibble: 5 x 5
#> v1 v2 v3 v6 v4
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 3 2 a 1 3
#> 2 6 4 b 2 6
#> 3 7 6 a 1 7
#> 4 9 8 b 2 10
#> 5 10 10 c 3 13
Created on 2019-04-19 by the reprex package (v0.2.1)
The general idea is always the same:
join the two tables on the character column
create new column v4 as sum of v2 and v6
update v1 to the value of v4 where v1 > v2 and v1 > v4
Note that base R and data.table do not preserve the order, so it would make more sense to put the output into an additional column.

Remove string in parenthesis and add that as a new column

Possible duplicate Here
I have a data frame of two columns. I want to remove the string in parenthesis and add that as a new column. Data frame is displayed below.
structure(list(ID = 1:12, Gene.Name = structure(c(3L, 11L, 9L,
5L, 1L, 8L, 2L, 4L, 6L, 12L, 10L, 7L), .Label = c(" ATP synt, H+ tran, O subunit (oligomycin sensitivity conferring protein) (ATP5O), mRNA",
" heterogeneous nuclear ribonucleoprotein F (HNRPF), mRNA", " NADH (ubiquinone) 1 alpha subcomplex, 4 (9kD, MLRQ) (NDUFA4), mRNA",
" ribosomal protein L34 (RPL34), transcript variant 1, mRNA",
" ribosomal protein S11 (RPS11), mRNA", "ATP synthase, H+ tran, mitochondrial F0, subunit c (subunit 9) isoform 3 (ATP5G3), mRNA",
"clone MGC:10120 IMAGE:3900723, mRNA, complete cds", "cytidine monophosphate N-acetylneuraminic acid synthetase (CMAS), mRNA",
"farnesyl-diphosphate farnesyltransferase 1 (FDFT1), mRNA", "homeobox protein from AL590526 (LOC84528), mRNA",
"mitochondrial S33 (MRPS33), transcript variant 1, nuclear gene, mRNA",
"ribosomal protein S15a (RPS15A), mRNA"), class = "factor")), .Names = c("ID",
"Gene.Name"), row.names = c(NA, -12L), class = "data.frame")
if the string in parenthesis is not found, then leave that row empty. Here i have two cases
1) Get all the string in parenthesis and add as a new column separated by ,
2) Last string in parenthesis and add as new column
I tried something like df$Symbol <- sapply(df, function(x) sub("\\).*", "", sub(".*\\(", "", x))) but does not give the appropriate output
Case 1 output
ID Gene.Name Symbol
1 NADH (ubiquinone) 1 alpha subcomplex, 4 (9kD, MLRQ) (NDUFA4), mRNA ubiquinone, (9kD, MLRQ),NDUFA4
2 mitochondrial S33 (MRPS33), transcript variant 1, nuclear gene, mRNA MRPS33
3 farnesyl-diphosphate farnesyltransferase 1 (FDFT1), mRNA FDFT1
4 ribosomal protein S11 (RPS11), mRNA RPS11
5 ATP synt, H+ tran, O subunit (oligomycin sensitivity conferring protein) (ATP5O), mRNA oligomycin sensitivity conferring protein,ATP5O
6 cytidine monophosphate N-acetylneuraminic acid synthetase (CMAS), mRNA CMAS
7 heterogeneous nuclear ribonucleoprotein F (HNRPF), mRNA HNRPF
8 ribosomal protein L34 (RPL34), transcript variant 1, mRNA RPL34
9 ATP synthase, H+ tran, mitochondrial F0, subunit c (subunit 9) isoform 3 (ATP5G3), mRNA subunit 9,ATP5G3
10 ribosomal protein S15a (RPS15A), mRNA RPS15A
11 homeobox protein from AL590526 (LOC84528), mRNA LOC84528
12 clone MGC:10120 IMAGE:3900723, mRNA, complete cds NA
Case 2 output
ID Gene.Name Symbol
1 NADH (ubiquinone) 1 alpha subcomplex, 4 (9kD, MLRQ) (NDUFA4), mRNA NDUFA4
2 mitochondrial S33 (MRPS33), transcript variant 1, nuclear gene, mRNA MRPS33
3 farnesyl-diphosphate farnesyltransferase 1 (FDFT1), mRNA FDFT1
4 ribosomal protein S11 (RPS11), mRNA RPS11
5 ATP synt, H+ tran, O subunit (oligomycin sensitivity conferring protein) (ATP5O), mRNA ATP5O
6 cytidine monophosphate N-acetylneuraminic acid synthetase (CMAS), mRNA CMAS
7 heterogeneous nuclear ribonucleoprotein F (HNRPF), mRNA HNRPF
8 ribosomal protein L34 (RPL34), transcript variant 1, mRNA RPL34
9 ATP synthase, H+ tran, mitochondrial F0, subunit c (subunit 9) isoform 3 (ATP5G3), mRNA ATP5G3
10 ribosomal protein S15a (RPS15A), mRNA RPS15A
11 homeobox protein from AL590526 (LOC84528), mRNA LOC84528
12 clone MGC:10120 IMAGE:3900723, mRNA, complete cds <NA>
An option using sub to get the words inside the round brackets at the end of the string.
# Capture the text inside the last "(...)" group of each gene name. The greedy
# leading `.*` pushes the match to the final opening parenthesis; the trailing
# `[^(]*$` also accepts strings that end directly after the ")".
last_paren_rx <- ".*\\(([^)]+)\\)[^(]*$"
Symbol <- sub(last_paren_rx, "\\1", df1[, 2])
# sub() returns its input unchanged when nothing matches, so rows without a
# parenthesised group are set to NA explicitly.
df1$Symbol <- ifelse(grepl(last_paren_rx, df1[, 2]), Symbol, NA_character_)
df1$Symbol
#[1] "NDUFA4" "MRPS33" "FDFT1" "RPS11" "ATP5O" "CMAS"
#[7] "HNRPF" "RPL34" "ATP5G3" "RPS15A" "LOC84528" NA
Update
For the first case, ie. extract all characters within the round brackets and paste them together using ,, one option is rm_round from qdapRegex. The output of rm_round is a list. So we use lapply/sapply to loop through the list. Strings that have , inside are separated with grep and we paste the round brackets, and then paste the strings together with collapse=', '. A convenient wrapper function is toString.
library(qdapRegex)
# Extract every round-bracket group per gene name and join the groups of each
# row into one comma-separated string.
df1$allSymbol <- sapply(rm_round(df1[,2],extract=TRUE), function(x) {
# Groups that themselves contain a comma are wrapped in parentheses again so
# they stay recognisable inside the comma-separated result.
indx <- grep(',', x)
x[indx] <-paste0("(", x[indx], ")")
toString(x)})
# Rows that ended up as the literal string 'NA' are turned into real NA values.
is.na(df1$allSymbol) <- df1$allSymbol=='NA'
df1[3:4]
# allSymbol Symbol
#1 ubiquinone, (9kD, MLRQ), NDUFA4 NDUFA4
#2 MRPS33 MRPS33
#3 FDFT1 FDFT1
#4 RPS11 RPS11
#5 oligomycin sensitivity conferring protein, ATP5O ATP5O
#6 CMAS CMAS
#7 HNRPF HNRPF
#8 RPL34 RPL34
#9 subunit 9, ATP5G3 ATP5G3
#10 RPS15A RPS15A
#11 LOC84528 LOC84528
#12 <NA> <NA>
I think I took the easy way out, but if you can get away with it, only match the things in the parentheses that look like a gene symbol, ie, only capital letters and digits
# Example data: 12 gene descriptions, stored as a factor column.
dd <- structure(
  list(
    ID = 1:12,
    Gene.Name = structure(
      c(3L, 11L, 9L, 5L, 1L, 8L, 2L, 4L, 6L, 12L, 10L, 7L),
      .Label = c(
        " ATP synt, H+ tran, O subunit (oligomycin sensitivity conferring protein) (ATP5O), mRNA",
        " heterogeneous nuclear ribonucleoprotein F (HNRPF), mRNA",
        " NADH (ubiquinone) 1 alpha subcomplex, 4 (9kD, MLRQ) (NDUFA4), mRNA",
        " ribosomal protein L34 (RPL34), transcript variant 1, mRNA",
        " ribosomal protein S11 (RPS11), mRNA",
        "ATP synthase, H+ tran, mitochondrial F0, subunit c (subunit 9) isoform 3 (ATP5G3), mRNA",
        "clone MGC:10120 IMAGE:3900723, mRNA, complete cds",
        "cytidine monophosphate N-acetylneuraminic acid synthetase (CMAS), mRNA",
        "farnesyl-diphosphate farnesyltransferase 1 (FDFT1), mRNA",
        "homeobox protein from AL590526 (LOC84528), mRNA",
        "mitochondrial S33 (MRPS33), transcript variant 1, nuclear gene, mRNA",
        "ribosomal protein S15a (RPS15A), mRNA"
      ),
      class = "factor"
    )
  ),
  .Names = c("ID", "Gene.Name"),
  row.names = c(NA, -12L),
  class = "data.frame"
)
# Work on plain character strings rather than on the factor.
dd$Gene.Name <- as.character(dd$Gene.Name)
## case 1
# Every "(...)" group per row; the lookbehind/lookahead keep the parentheses
# themselves out of the match.
mm <- gregexpr('(?<=\\()(.*?)(?=\\))', dd$Gene.Name, perl = TRUE)
mm <- regmatches(dd$Gene.Name, mm)
# Join the groups of each row with ", "; rows without any group become NA.
dd <- cbind(dd, case1 = vapply(mm, function(x) {
  if (length(x) > 0L) paste(x, collapse = ', ') else NA_character_
}, character(1L)))
dd[, c(1,3)]
# ID case1
# 1 1 ubiquinone, 9kD, MLRQ, NDUFA4
# 2 2 MRPS33
# 3 3 FDFT1
# 4 4 RPS11
# 5 5 oligomycin sensitivity conferring protein, ATP5O
# 6 6 CMAS
# 7 7 HNRPF
# 8 8 RPL34
# 9 9 subunit 9, ATP5G3
# 10 10 RPS15A
# 11 11 LOC84528
# 12 12 <NA>
## case 2
# Only groups made up solely of capital letters and digits, i.e. things that
# look like a gene symbol.
mm <- gregexpr('(?<=\\()([A-Z0-9]+)(?=\\))', dd$Gene.Name, perl = TRUE)
mm <- regmatches(dd$Gene.Name, mm)
# Keep the last such group of each row (the task asks for the last string in
# parentheses); rows without one become NA.
dd <- cbind(dd, case2 = vapply(mm, function(x) {
  if (length(x) > 0L) x[length(x)] else NA_character_
}, character(1L)))
dd[, c(1,4)]
# ID case2
# 1 1 NDUFA4
# 2 2 MRPS33
# 3 3 FDFT1
# 4 4 RPS11
# 5 5 ATP5O
# 6 6 CMAS
# 7 7 HNRPF
# 8 8 RPL34
# 9 9 ATP5G3
# 10 10 RPS15A
# 11 11 LOC84528
# 12 12 <NA>

Splitting string into unknown number of new dataframe columns

I have a dataframe with a character column that contains email metadata in the form of multiple strings delimited by a newline character \n:
person myString
1 John To name5#email.com by sender6 on 01-12-2014\n
2 Jane To name#email.com,name4#email.com by sender1 on 01-22-2014\nTo name3#email.com by sender2 on 02-03-2014\nTo email5#domain.com by sender1 on 06-21-2014\n
3 Tim To name2#email.com by sender2 on 05-11-2014\nTo name#email.com by sender2 on 06-03-2015\n
I want to split the different substrings of myString into different columns so that it looks like this:
person email1 email2 email3
1 John To name5#email.com by sender6 on 01-12-2014 <NA> <NA>
2 Jane To name#email.com,name4#email.com by sender1 on 01-22-2014 To name3#email.com by sender2 on 02-03-2014 To email5#domain.com by sender1 on 06-21-2014
3 Tim To name2#email.com by sender2 on 05-11-2014 To name#email.com by sender2 on 06-03-2015 <NA>
My current approach uses separate from the tidyr package:
library(dplyr)
library(tidyr)
res1 <- df %>%
separate(col = myString, into = paste(rep("email", 3), 1:3), sep = "\\n", extra = "drop")
res1[res1 == ""] <- NA
But with this approach, I have to manually specify that there are three columns to extract.
I'm looking to improve this process with either or both of:
A way to automate that calculation of the maximum number of occurrences of the delimiting character (i.e., how many new variables are needed)
Other approaches to splitting into an unknown number of columns
And if there's a good solution that returns data in a long form, instead of wide, that would be great too.
Sample data:
df <- structure(list(person = c("John", "Jane", "Tim"), myString = c("To name5#email.com by sender6 on 01-12-2014\n",
"To name#email.com,name4#email.com by sender1 on 01-22-2014\nTo name3#email.com by sender2 on 02-03-2014\nTo email5#domain.com by sender1 on 06-21-2014\n",
"To name2#email.com by sender2 on 05-11-2014\nTo name#email.com by sender2 on 06-03-2015\n"
)), .Names = c("person", "myString"), row.names = c(NA, -3L), class = "data.frame")
I would suggest cSplit from my "splitstackshape" package:
library(splitstackshape)
cSplit(df, "myString", "\n")
# person myString_1
# 1: John To name5#email.com by sender6 on 01-12-2014
# 2: Jane To name#email.com,name4#email.com by sender1 on 01-22-2014
# 3: Tim To name2#email.com by sender2 on 05-11-2014
# myString_2
# 1: NA
# 2: To name3#email.com by sender2 on 02-03-2014
# 3: To name#email.com by sender2 on 06-03-2015
# myString_3
# 1: NA
# 2: To email5#domain.com by sender1 on 06-21-2014
# 3: NA
You can also try stri_split_fixed from the "stringi" package with the argument simplify = TRUE (though with your sample data, this adds an extra empty column at the end). The approach would be something like:
library(stringi)
data.frame(person = df$person,
stri_split_fixed(df$myString, "\n",
simplify = TRUE))
Seems hacky, but here ya go...
Use strsplit to split the char vector. Get the max length, use that for your columns.
# Sample data: one newline-delimited log of e-mail metadata per person.
df <- data.frame(
  person = c("John", "Jane", "Tim"),
  myString = c(
    "To name5#email.com by sender6 on 01-12-2014\n",
    "To name#email.com,name4#email.com by sender1 on 01-22-2014\nTo name3#email.com by sender2 on 02-03-2014\nTo email5#domain.com by sender1 on 06-21-2014\n",
    "To name2#email.com by sender2 on 05-11-2014\nTo name#email.com by sender2 on 06-03-2015\n"
  ),
  stringsAsFactors = FALSE
)
# Split every log into its individual entries (one list element per person).
a <- strsplit(df$myString, "\n", fixed = TRUE)
# The longest log decides how many new columns are needed; the extra 0L keeps
# max() well-defined when df has no rows.
max_len <- max(0L, lengths(a))
# Add one column per position; people with fewer entries get NA there.
# seq_len() makes the loop a no-op when there is nothing to add.
for (i in seq_len(max_len)) {
  df[[paste0("email", i)]] <- vapply(a, `[`, character(1L), i)
}
Here is an efficient route to a long form:
# Split every log into its individual entries.
a <- strsplit(df$myString, "\n")
# Number of entries per person (lengths() needs R >= 3.2).
lens <- lengths(a)
# Repeat each row of df once per entry, then attach the entries themselves.
longdf <- df[rep(seq_along(a), times = lens), ]
longdf$string <- unlist(a)
Note that stack() is often useful for these cases.
Can be simplified by using the IRanges Bioconductor package:
longdf <- df[togroup(a),]
longdf$string <- unlist(a)
Then, if really necessary, go to wide-form:
longdf$myString <- NULL
longdf$token <- sequence(lens)
widedf <- reshape(longdf, timevar="token", idvar="person", direction="wide")
This might suffice:
library(data.table)
dt = as.data.table(df) # or setDT to convert in place
dt[, strsplit(myString, split = "\n"), by = person]
# person V1
#1: John To name5#email.com by sender6 on 01-12-2014
#2: Jane To name#email.com,name4#email.com by sender1 on 01-22-2014
#3: Jane To name3#email.com by sender2 on 02-03-2014
#4: Jane To email5#domain.com by sender1 on 06-21-2014
#5: Tim To name2#email.com by sender2 on 05-11-2014
#6: Tim To name#email.com by sender2 on 06-03-2015
And then can trivially convert to wide format:
dcast(dt[, strsplit(myString, split = "\n"), by = person][, idx := 1:.N, by = person],
person ~ idx, value.var = 'V1')
# person 1 2 3
#1: Jane To name#email.com,name4#email.com by sender1 on 01-22-2014 To name3#email.com by sender2 on 02-03-2014 To email5#domain.com by sender1 on 06-21-2014
#2: John To name5#email.com by sender6 on 01-12-2014 NA NA
#3: Tim To name2#email.com by sender2 on 05-11-2014 To name#email.com by sender2 on 06-03-2015 NA
# (load reshape2 and use dcast.data.table instead of dcast if not using 1.9.5+)

Convert data frame column to float and perform operation in Pandas

I have a data frame that contains the following that are imported as strings
df3 = pd.DataFrame(data = {
'Column1':['10/1','9/5','7/4','12/3','18/7','14/2']})
I tried to convert the values to float and do the division, but the following did not work well.
for i, v in enumerate(df3.Column1):
df3['Column2'] = float(v[:-2]) / float(v[-1])
print df3.Column2
This is the output that I am trying to achieve
df3 = pd.DataFrame(data = {
'Column1':['10/1','9/5','7/4','12/3','18/7','14/2'],
'Column2':['10.0','1.8','1.75','4.0','2.57142857143','7.0']})
df3
The following would work, define a function to perform the casting to float and return this, the result of which should be assigned to your new column:
In [10]:
df3 = pd.DataFrame(data = {
'Column1':['10/1','9/5','7/4','12/3','18/7','14/2']})
def func(x):
    """Evaluate a fraction written as 'numerator/denominator'.

    Splits on the slash instead of slicing fixed character positions, so
    denominators with more than one digit (e.g. '3/12') are handled too.

    Raises ValueError if x is not two numbers separated by a single '/',
    and ZeroDivisionError if the denominator is zero.
    """
    numerator, denominator = x.split('/')
    return float(numerator) / float(denominator)
df3['Column2'] = df3['Column1'].apply(func)
df3
Out[10]:
Column1 Column2
0 10/1 10.000000
1 9/5 1.800000
2 7/4 1.750000
3 12/3 4.000000
4 18/7 2.571429
5 14/2 7.000000
if, and ONLY IF, you do not have input/data from an untrusted source, here's a shortcut:
In [46]: df3
Out[46]:
Column1
0 10/1
1 9/5
2 7/4
3 12/3
4 18/7
5 14/2
In [47]: df3.Column1.map(eval)
Out[47]:
0 10.000000
1 1.800000
2 1.750000
3 4.000000
4 2.571429
5 7.000000
Name: Column1, dtype: float64
But seriously...be careful with eval.