Related
I am trying to add geom_smooth(method = 'loess'), however this is not showing up in the plot. I believe it is something about the numeric values, that geom_smooth is not recognizing the input as numeric?
> head(CH12F3.miRNA_prep.miRNA)
miRNA variable value
1 mmu-let-7a-1-3p 0h 0.5098628
2 mmu-let-7a-5p 0h 0.4286451
3 mmu-let-7b-3p 0h 0.0000000
4 mmu-let-7b-5p 0h 1.4925830
5 mmu-let-7c-2-3p 0h 1.0715206
6 mmu-let-7c-5p 0h 1.3836720
server <- function(input, output, session) {
data_selected <- reactive({
filter(CH12F3.miRNA_prep.miRNA, miRNA %in% input$MicroRNA)
})
output$myplot <- renderPlot({
ggplot(data_selected(), aes_string("variable", "value", colour = "variable")) +
geom_point() + theme_classic(base_size = 12) +
labs(colour="Time Point",x="Time",y="Expression (cpm,log2)\nTreated/Control")+
theme(axis.text.x = element_text(angle = 45,hjust = 1)) + geom_smooth(method = 'loess')
} )
}
In your ggplot, try aes(as.numeric(variable), as.numeric(value), color=variable) instead of aes_string().
I wish to split strings into non-overlapping segments where the endpoints of a segment are numbers within a field of dots. I can do this using the code below. However, this code seems to be overly complex and involves nested for-loops. Is there a simpler way, ideally using regex in base R?
Here is an example and the desired.result.
my.data <- read.table(text = '
my.string cov1 cov2
11....... 1 A
1.1.2.1.1 2 B
1234..... 3 C
1...2...3 4 C
..3..4... 5 D
', header = TRUE, stringsAsFactors = FALSE)
desired.result <- read.table(text = '
my.string cov1 cov2
11....... 1 A
1.1...... 2 B
..1.2.... 2 B
....2.1.. 2 B
......1.1 2 B
12....... 3 C
.23...... 3 C
..34..... 3 C
1...2.... 4 C
....2...3 4 C
..3..4... 5 D
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
new.data <- data.frame(do.call(rbind, strsplit(my.data$my.string,'')), stringsAsFactors = FALSE)
n.segments <- rowSums(!(new.data[1:ncol(new.data)] == '.')) - 1
my.end.points <- do.call(rbind, gregexpr("[0-9]", my.data$my.string, perl=TRUE))
my.end.point.char <- do.call(rbind, strsplit(my.data$my.string, ""))
my.end.point.char <- t(apply(my.end.point.char, 1, as.numeric))
new.strings <- matrix('.', nrow = sum(n.segments), ncol = max(nchar(my.data$my.string)))
new.cov <- as.data.frame(matrix(NA, nrow = sum(n.segments), ncol = (ncol(my.data) - 1)))
m <- 1
for(i in 1:nrow(new.data)) {
for(j in 1:n.segments[i]) {
for(k in 1:ncol(new.strings)) {
new.strings[m, my.end.points[i, j ] ] <- my.end.point.char[i, my.end.points[i, j ]]
new.strings[m, my.end.points[i, (j+1)] ] <- my.end.point.char[i, my.end.points[i,(j+1)]]
new.cov[m,] <- my.data[i, c(2:ncol(my.data))]
}
m <- m + 1
}
}
my.result <- data.frame(my.string = apply(new.strings, 1, function(x) paste0(x, collapse = '')), stringsAsFactors = FALSE)
my.result <- data.frame(my.result, new.cov)
colnames(my.result) <- names(my.data)
all.equal(desired.result, my.result)
# [1] TRUE
w <- nchar(my.data$my.string[1L]);
dps <- character(w+1L); dps[1L] <- ''; for (i in seq_len(w)) dps[i+1L] <- paste0(dps[i],'.');
x <- Map(my.data$my.string,gregexpr('[^.]',my.data$my.string),f=function(s,g)
if (length(g)<3L) s else sapply(seq_len(length(g)-1L),function(gi)
paste0(dps[g[gi]],substr(s,g[gi],g[gi+1L]),dps[w-g[gi+1L]+1L])
)
);
res <- transform(my.data[rep(seq_len(nrow(my.data)),sapply(x,length)),],my.string=unlist(x));
res;
## my.string cov1 cov2
## 1 11....... 1 A
## 2 1.1...... 2 B
## 2.1 ..1.2.... 2 B
## 2.2 ....2.1.. 2 B
## 2.3 ......1.1 2 B
## 3 12....... 3 C
## 3.1 .23...... 3 C
## 3.2 ..34..... 3 C
## 4 1...2.... 4 C
## 4.1 ....2...3 4 C
## 5 ..3..4... 5 D
Note: You can replace the sapply(x,length) piece with lengths(x) if you have a recent enough version of R.
Benchmarking
library(microbenchmark);
bgoldst <- function(my.data) { w <- nchar(my.data$my.string[1L]); dps <- character(w+1L); dps[1L] <- ''; for (i in seq_len(w)) dps[i+1L] <- paste0(dps[i],'.'); x <- Map(my.data$my.string,gregexpr('[^.]',my.data$my.string),f=function(s,g) if (length(g)<3L) s else sapply(seq_len(length(g)-1L),function(gi) paste0(dps[g[gi]],substr(s,g[gi],g[gi+1L]),dps[w-g[gi+1L]+1L]))); transform(my.data[rep(seq_len(nrow(my.data)),sapply(x,length)),],my.string=unlist(x)); };
rawr <- function(my.data) { f <- function(x, m) { y <- gsub('.', '\\.', x); cs <- attr(m, "capture.start"); cl <- attr(m, "capture.length"); Vectorize(`substr<-`)(y, cs, cl + cs - 1, Vectorize(substr)(x, cs, cl + cs - 1)); }; m <- gregexpr('(?=([0-9][.]*[0-9]))', my.data$my.string, perl = TRUE); strs <- Map(f, my.data$my.string, m); tmp <- `rownames<-`(my.data[rep(1:nrow(my.data), sapply(strs,length)), ], NULL); tmp$my.string <- unlist(strs); tmp; };
carroll <- function(my.data) { strings <- sapply(my.data$my.string, function(x) { stri_match_all_regex(x, "(?=([0-9]{1}\\.*[0-9]{1}))")[[1]][,2]; }); strpos <- lapply(1:length(strings), function(x) { y <- {nchar(sub(perl=T,'^\\.*\\K.*','',my.data$my.string[x]))+c(0, cumsum(nchar(strings[[x]])-1))}; return(y[-length(y)]); }); w <- nchar(my.data$my.string[1L]); output.result <- data.frame(my.string = cbind(unlist(sapply(1:length(strings), function(y) { cbind(sapply(1:length(strings[[y]]), function(x) { leftstr <- paste0(paste0(rep(".", strpos[[y]][[x]]), collapse=""), strings[[y]][x]); rightstr <- paste0(rep(".", w-nchar(leftstr)), collapse=""); paste0(leftstr, rightstr, collapse=""); })); }))), my.data[unlist(sapply(1:length(strings), function(x) { rep(x, sapply(strings, length)[x]); })), c(2,3)], stringsAsFactors=FALSE); row.names(output.result) <- NULL; output.result; };
## OP's sample input
my.data <- read.table(text = '
my.string cov1 cov2
11....... 1 A
1.1.2.1.1 2 B
1234..... 3 C
1...2...3 4 C
..3..4... 5 D
', header = TRUE, stringsAsFactors = FALSE);
ex <- bgoldst(my.data);
all.equal(ex,rawr(my.data),check.attributes=F);
## [1] TRUE
all.equal(ex,carroll(my.data),check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(my.data),rawr(my.data),carroll(my.data));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(my.data) 422.094 451.816 483.5305 476.6195 503.775 801.421 100
## rawr(my.data) 1096.502 1160.863 1277.7457 1236.7720 1298.996 3092.785 100
## carroll(my.data) 1130.287 1176.900 1224.6911 1213.2515 1247.249 1525.437 100
## scale test
set.seed(1L);
NR <- 1e4; NS <- 30L; probDot <- 3/4;
x <- c('.',0:9); probs <- c(probDot,rep((1-probDot)/10,10L)); my.data <- data.frame(my.string=do.call(paste0,as.data.frame(replicate(NS,sample(x,NR,T,probs)))),cov1=sample(seq_len(NR)),cov2=sample(make.unique(rep(LETTERS,len=NR))),stringsAsFactors=F);
repeat { w <- which(sapply(gregexpr('[^.]',my.data$my.string),length)==1L); if (length(w)==0L) break; my.data$my.string[w] <- do.call(paste0,as.data.frame(replicate(NS,sample(x,length(w),T,probs)))); }; ## prevent single-digit strings, which rawr and carroll solutions don't support
ex <- bgoldst(my.data);
all.equal(ex,rawr(my.data),check.attributes=F);
## [1] TRUE
all.equal(ex,carroll(my.data),check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(my.data),rawr(my.data),carroll(my.data),times=1L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(my.data) 904.887 904.887 904.887 904.887 904.887 904.887 1
## rawr(my.data) 2736.462 2736.462 2736.462 2736.462 2736.462 2736.462 1
## carroll(my.data) 108575.001 108575.001 108575.001 108575.001 108575.001 108575.001 1
my.data <- read.table(text = '
my.string cov1 cov2
11....... 1 A
1.1.2.1.1 2 B
1234..... 3 C
1...2...3 4 C
..3..4... 5 D
', header = TRUE, stringsAsFactors = FALSE)
f <- function(x, m) {
if (nchar(gsub('.', '', x, fixed = TRUE)) < 2L) return(x)
y <- gsub('.', '\\.', x)
cs <- attr(m, "capture.start")
cl <- attr(m, "capture.length")
Vectorize(`substr<-`)(y, cs, cl + cs - 1, Vectorize(substr)(x, cs, cl + cs - 1))
}
m <- gregexpr('(?=([0-9][.]*[0-9]))', my.data$my.string, perl = TRUE)
strs <- Map(f, my.data$my.string, m)
tmp <- `rownames<-`(my.data[rep(1:nrow(my.data), lengths(strs)), ], NULL)
tmp$my.string <- unlist(strs)
# my.string cov1 cov2
# 1 11....... 1 A
# 2 1.1...... 2 B
# 3 ..1.2.... 2 B
# 4 ....2.1.. 2 B
# 5 ......1.1 2 B
# 6 12....... 3 C
# 7 .23...... 3 C
# 8 ..34..... 3 C
# 9 1...2.... 4 C
# 10 ....2...3 4 C
# 11 ..3..4... 5 D
identical(tmp, desired.result)
# [1] TRUE
Here's an option. Not clean, but neither is the problem.
library(stringi)
## isolate the strings, allowing overlap via positive lookaheads
strings <- sapply(my.data$my.string, function(x) {
stri_match_all_regex(x, "(?=([0-9]{1}\\.*[0-9]{1}))")[[1]][,2]
})
Identify the offsets at the start of each group.
## identify the . offsets
strpos <- lapply(1:length(strings), function(x) {
y <- {nchar(sub(perl=T,'^\\.*\\K.*','',my.data$my.string[x]))+c(0, cumsum(nchar(strings[[x]])-1))}
return(y[-length(y)])
})
Build up the data.frame with only 2 sapply loops.
## collate the results using sapply
w <- nchar(my.data$my.string[1L]);
output.result <- data.frame(
my.string = cbind(unlist(sapply(1:length(strings), function(y) {
cbind(sapply(1:length(strings[[y]]), function(x) {
leftstr <- paste0(paste0(rep(".", strpos[[y]][[x]]), collapse=""), strings[[y]][x])
rightstr <- paste0(rep(".", w-nchar(leftstr)), collapse="")
paste0(leftstr, rightstr, collapse="")
}))
}))),
my.data[unlist(sapply(1:length(strings), function(x) {
rep(x, sapply(strings, length)[x])
})), c(2,3)], stringsAsFactors=FALSE
)
row.names(output.result) <- NULL
output.result
my.string cov1 cov2
1 11....... 1 A
2 1.1...... 2 B
3 ..1.2.... 2 B
4 ....2.1.. 2 B
5 ......1.1 2 B
6 12....... 3 C
7 .23...... 3 C
8 ..34..... 3 C
9 1...2.... 4 C
10 ....2...3 4 C
11 ..3..4... 5 D
identical(desired.result, output.result)
[1] TRUE
I would like to read a matrix file something which looks like:
sample sample1 sample2 sample3
sample1 1 0.7 0.8
sample2 0.7 1 0.8
sample3 0.8 0.8 1
I would like to fetch all the pairs that have a value of > 0.8. E.g: sample1,sample3 0.8 sample2,sample3 0.8 etc in a large file .
When I use csv.reader, each line is turning in to a list and keeping track of row and column names makes program dodgy. I would like to know an elegant way of doing it like using numpy or pandas.
Desired output:
sample1,sample3 0.8
sample2,sample3 0.8
1 can be ignored because between same sample, it will be 1 always.
You can mask out the off upper-triangular values with np.triu:
In [11]: df
Out[11]:
sample1 sample2 sample3
sample
sample1 1.0 0.7 0.8
sample2 0.7 1.0 0.8
sample3 0.8 0.8 1.0
In [12]: np.triu(df, 1)
Out[12]:
array([[ 0. , 0.7, 0.8],
[ 0. , 0. , 0.8],
[ 0. , 0. , 0. ]])
In [13]: np.triu(df, 1) >= 0.8
Out[13]:
array([[False, False, True],
[False, False, True],
[False, False, False]], dtype=bool)
Then to extract the index/columns where it's True I think you have to use np.where*:
In [14]: np.where(np.triu(df, 1) >= 0.8)
Out[14]: (array([0, 1]), array([2, 2]))
This gives you an array of first index indices and then column indices (this is the least efficient part of this numpy version):
In [16]: index, cols = np.where(np.triu(df, 1) >= 0.8)
In [17]: [(df.index[i], df.columns[j], df.iloc[i, j]) for i, j in zip(index, cols)]
Out[17]:
[('sample1', 'sample3', 0.80000000000000004),
('sample2', 'sample3', 0.80000000000000004)]
As desired.
*I may be forgetting an easier way to get this last chunk (Edit: the below pandas code does it, but I think there may be another way too.)
You can use the same trick in pandas but with stack to get the index/columns natively:
In [21]: (np.triu(df, 1) >= 0.8) * df
Out[21]:
sample1 sample2 sample3
sample
sample1 0 0 0.8
sample2 0 0 0.8
sample3 0 0 0.0
In [22]: res = ((np.triu(df, 1) >= 0.8) * df).stack()
In [23]: res
Out[23]:
sample
sample1 sample1 0.0
sample2 0.0
sample3 0.8
sample2 sample1 0.0
sample2 0.0
sample3 0.8
sample3 sample1 0.0
sample2 0.0
sample3 0.0
dtype: float64
In [24]: res[res!=0]
Out[24]:
sample
sample1 sample3 0.8
sample2 sample3 0.8
dtype: float64
If you want to use Pandas, the following answer will help. I am assuming you will figure out how to read your matrix files into Pandas by yourself. I am also assuming that your columns and rows are labelled correctly. What you will end up with after you read your data is a DataFrame which will look a lot like the matrix you have at the top of your question. I am assuming that all row names are the DataFrame index. I am taking that you have read the data into a variable called df as my starting point.
Pandas is more efficient row-wise than column-wise. So, we do things row-wise, looping over the columns.
pairs = {}
for col in df.columns:
pairs[col] = df[(df[col] >= 0.8) & (df[col] < 1)].index.tolist()
# If row names are not an index, but a different column named 'names' run the following line, instead of the line above
# pairs[col] = df[(df[col] >= 0.8) & (df[col] < 1)]['names'].tolist()
Alternatively, you can use apply() to do this, because that too will loop over all columns. (Maybe in 0.17 it will release the GIL for faster results, I do not know because I have not tried it.)
pairs will now contain the column name as key and a list of the names of rows as values where the correlation is greater than 0.8, but less than 1.
If you also want to extract correlation values from the DataFrame, replace .tolist() by .to_dict(). .to_dict() will generate a dict such that index is key and value is value: {index -> value}. So, ultimately your pairs will look like {column -> {index -> value}}. It will also be guaranteed free of nan. Note that .to_dict() will only work if your index contains the row names that you want, else it will return the default index, which is just numbers.
Ps. If your file is huge, I would recommend reading it in chunks. In this case, the piece of code above will be repeated for each chunk. So it should be inside your loop that iterates over chunks. However, then you will have to be careful to append new data coming from the next chunk to pairs. The following links are for your reference:
Pandas I/O docs
Pandas read_csv() function
SO question on chunked read
You might also want to read reference 1 for other types of I/O supported by Pandas.
To read it in you need the skipinitialspace and index_col parameters:
a=pd.read_csv('yourfile.txt',sep=' ',skipinitialspace=True,index_col=0)
To get the values pair wise:
[[x,y,round(a[x][y],3)] for x in a.index for y in a.columns if x!=y and a[x][y]>=0.8][:2]
Gives:
[['sample1', 'sample3', 0.8],
['sample2', 'sample3', 0.8]]
Using scipy.sparse.coo_matrix, as it works with a "(row, col) data" format.
from scipy.sparse import coo_matrix
import numpy as np
M = np.matrix([[1.0, 0.7, 0.8], [0.7, 1.0, 0.8], [0.8, 0.8, 1.0]])
S = coo_matrix(M)
Here, S.row and S.col are arrays of row and column indices, S.data is the array of values at those indices. So you can filter by
idx = S.data >= 0.8
And for instance create a new matrix only with those elements:
S2 = coo_matrix((S.data[idx], (S.row[idx], S.col[idx])))
print S2
The output is
(0, 0) 1.0
(0, 2) 0.8
(1, 1) 1.0
(1, 2) 0.8
(2, 0) 0.8
(2, 1) 0.8
(2, 2) 1.0
Note (0,1) does not appear as the value is 0.7.
pandas' read_table can handle regular expressions in the sep parameter.
In [19]: !head file.txt
sample sample1 sample2 sample3
sample1 1 0.7 0.8
sample2 0.7 1 0.8
sample3 0.8 0.8 1
In [20]: df = pd.read_table('file.txt', sep='\s+')
In [21]: df
Out[21]:
sample sample1 sample2 sample3
0 sample1 1.0 0.7 0.8
1 sample2 0.7 1.0 0.8
2 sample3 0.8 0.8 1.0
From there, you can filter on all values >= 0.8.
In [23]: df[df >= 0.8]
Out[23]:
sample sample1 sample2 sample3
0 sample1 1.0 NaN 0.8
1 sample2 NaN 1.0 0.8
2 sample3 0.8 0.8 1.0
I'm using Python 2.7.8 |Anaconda 2.1.0. I'm wondering why the strange behavior below occurs
I create a pandas dataframe with two columns, then add a third column by summing the first two columns
x = pd.DataFrame(np.random.randn(5, 2), columns = ['a', 'b'])
x['c'] = x[['a', 'b']].sum(axis = 1) #or x['c'] = x['a'] + x['b']
Out[7]:
a b c
0 -1.644246 0.851602 -0.792644
1 -0.129092 0.237140 0.108049
2 0.623160 0.105494 0.728654
3 0.737803 -1.612189 -0.874386
4 0.340671 -0.113334 0.227337
All good so far. Now I want to set the values of column c to zero if they are negative
x[x['c']<0] = 0
Out[9]:
a b c
0 0.000000 0.000000 0.000000
1 -0.129092 0.237140 0.108049
2 0.623160 0.105494 0.728654
3 0.000000 0.000000 0.000000
4 0.340671 -0.113334 0.227337
This gives the desired result in column 'c', but for some reason columns 'a' and 'b' have been modified - i don't want this to happen. I was wondering why this is happening and how I can fix this behavior?
You have to specify you only want the 'c' column:
x.loc[x['c']<0, 'c'] = 0
When you just index with a boolean array/series, this will select full rows, as you can see in this example:
In [46]: x['c']<0
Out[46]:
0 True
1 False
2 False
3 True
4 False
Name: c, dtype: bool
In [47]: x[x['c']<0]
Out[47]:
a b c
0 -0.444493 -0.592318 -1.036811
3 -1.363727 -1.572558 -2.936285
Because you are setting to zero for all the columns. You should set it only for column c
x['c'][x['c']<0] = 0
Say i have a list mn like this
i<-c(w=5,n="oes")
p<-c(w=9,n="ty",j="ooe")
mn<-list(i,p,i,p,i,p,i)
Now I´d like to select the list elements with the shortest length (the i´s) and append "unknown" to the list before creating a dataframe. How can I do this?
EDIT: In the end I´d like the list to have every i element in mn as w=5,n="oes", and j="unknown" before mn including p is changed into a dataframe:
To find the lenght of each element in your list, use length wrapped in sapply:
len <- sapply(mn, length)
len
[1] 2 3 2 3 2 3 2
Now, to identify only those elements that have lengths equal to the shortest length:
which(len==min(len))
[1] 1 3 5 7
Use subsetting and as.data.frame to create your data.frame. But this data.frame will have somewhat random column names, so I rename the column names:
df <- as.data.frame(mn[which(len==min(len))])
names(df) <- seq_len(ncol(df))
df
1 2 3 4
w 5 5 5 5
n oes oes oes oes
You will have to clarify what you mean with "append unknown" to this data.frame.
Another possibility is:
all.names = unique( unlist( lapply( mn, names ) ) )
do.call( 'rbind', lapply( mn, function( r ) {
data.frame( sapply( all.names, function( v ) r[ v ], simplify=F ) )
} ) )
which gives:
w n j
w 5 oes <NA>
w1 9 ty ooe
w2 5 oes <NA>
w3 9 ty ooe
w4 5 oes <NA>
w5 9 ty ooe
w6 5 oes <NA>
But I get the feeling there's a much neater route to this solution...
edit
If you want unknown rather than <NA>, you can change the inner sapply to:
sapply( all.names, function( v ) if( is.na( r[v] ) ) 'unknown' else r[v], simplify=F )
There is not very elegant, but it might do the trick:
maxlength <- max(sapply(mn,length))
## make a new list, with the "missing" entries replaced with "unknown"
mn2 <- lapply(mn,function(x)c(x,rep('unknown',maxlength - length(x))))
## convert to a data.frame
mn3 <- data.frame(matrix(unlist(mn2),nrow = 3))
Which gives the following
> mn3
X1 X2 X3 X4 X5 X6 X7
1 5 9 5 9 5 9 5
2 oes ty oes ty oes ty oes
3 unknown ooe unknown ooe unknown ooe unknown
However it is better practice to use NA, rather than "unknown"