library(plyr) library(ggplot2) library(reshape) dir.high = "MAGIC.HIGHCOV/PSL/" dir.low = "MAGIC.LOWCOV/PSL/" #takes all psl files from directory and computes counts of blat hits (the hits are the number of targets) bacteria.counts <- function(dir){ files = list.files(dir, pattern = "psl") print(files) psl = lapply(paste(dir, files, sep = ""), function(x){tab = try(read.table(x, header = F, skip = 5, comment.char = "")); if (inherits(tab, 'try-error')){return(c())} else return(tab[,14])}) names = gsub('_', '.', files) names = substr(names, 1, gregexpr('.',names, fixed = T)[[1]][2]-1) names(psl) = names psl.counts = lapply(psl, function(x){t(as.matrix(table(x)))}) counts = do.call(rbind.fill.matrix, psl.counts) rownames(counts) = names counts[is.na(counts)] = 0 return(counts) } #counts = bacteria.counts(dir.low) #print(str(counts)) #print(counts[1:10, 1:10]) load("MAGIC.LOWCOV/bacteria.counts.Rdata") counts.low = counts counts.high = load("MAGIC.HIGHCOV/bacteria.counts.Rdata") counts.high = counts #print(counts.high[1:10,1:10]) lines = rownames(counts.high) lines = lines[-which(lines == "MAGIC.183")] bacteria = intersect(colnames(counts.high), colnames(counts.low)) #print(lines) #counts.low[1:10,1:10] counts.low = counts.low[lines, bacteria] counts.high = counts.high[lines, bacteria] counts.low[counts.low == 0] = 10e-5 counts.high[counts.low == 0] = 10e-5 pdf("bacteria.hits.lowvshigh.zoom.pdf") lapply(lines, function(x){ dfr <- data.frame(low.coverage = counts.low[x,], high.coverage = counts.high[x,]) dfr <- dfr[order(dfr[,1], decreasing = T),] g1 <- ggplot(dfr) + geom_point(aes(x = low.coverage, y = high.coverage)) + xlim(0,5) + labs(title = x) + ylim(0,50) print(g1) dfr <- data.frame(low.coverage = scale(counts.low[x,]), high.coverage = scale(counts.high[x,])) dfr <- dfr[order(dfr[,1], decreasing = T),] cor = round(cor(dfr[,1], dfr[,2]), 2) #g1 <- ggplot(dfr, aes(x = low.coverage, y = high.coverage)) + geom_point() + geom_abline() #print(g1 + labs(title = paste(x, ", r =", cor), x = "Low coverage counts", y = "High coverage counts")) #g2 <- g1 + scale_y_log10(breaks = c(0,1,2,5,10,20,50,100)) + scale_x_log10(breaks = c(0,1,2,5,10,20,50,100)) + labs(title = paste(x, ", r =", cor, ", logscale"), x = "Low coverage counts", y = "High coverage counts") #print(g2) }) dev.off()