source("happy.preCC.29062015.R") library(lme4) # MAGIC.R # (C) Richard Mott 2010; richard.mott@well.ox.ac.uk # Distributed under the GPL # This code contains functions to analyse QTLs in MAGIC recombinant inbred lines. # See http:/mus.well.ox.ac.uk/magic for instructions. prepare.database <- function( gdir="./", dir="./CONDENSED", mc.cores=5 ) { chrs = paste("chr", 1:5, sep="") prefix = ".MAGIC" haploid=TRUE generations=7 make.condensed.genome( chrs, prefix, gdir=gdir, dir=dir, thin=1, gen=generations, haploid=haploid, mc.cores=mc.cores) } write.scans.as.txt<- function(scans) { scan.files = paste( names(scans), ".scan.txt", sep="") qtl.files = paste( names(scans), ".qtls.txt", sep="") for( i in 1:length(scans)) { cat( "writing scans to ", scan.files[i], "\n") write.table(scans[[i]]$result,file=scan.files[i],row=FALSE,quote=F,sep="\t") cat( "writing qtls to ", qtl.files[i], "\n") write.table(scans[[i]]$qtls,file=qtl.files[i],row=FALSE,quote=F,sep="\t") } return(NULL) } load.phenotypes <- function( phenotype.file, phenotypes, covariates=NULL ) { cat( "reading phenotype file ", phenotype.file, "\n") phen.data=read.delim(phenotype.file) nam = names(phen.data) if ( is.null(phen.data$SUBJECT.NAME )) { cat( "ERROR phenotype file missing column SUBJECT.NAME\n") return(NULL) } else if ( ! is.factor(phen.data$SUBJECT.NAME )) { cat( "ERROR phenotype file column SUBJECT.NAME is not correct format\n") return(NULL) } use = c() for( n in nam ) { use = c( use, is.numeric(phen.data[[n]])) } if ( is.null(phenotypes) ) { phenotypes = nam[use] } else { nam = nam[use] use = match(phenotypes,nam,nomatch=0) phenotypes = nam[use] } if ( !is.null(covariates)) { use.c = covariates %in% names(phen.data) if ( sum( !use.c) > 0 ) { cat("ERROR missing covariates ", covariates[!use.c], "\n") return(NULL) } cat( "using covariates ", covariates, "\n") use.p = phenotypes %in% covariates phenotypes = phenotypes[!use.p] } if ( length(phenotypes) > 0 ) cat("Analysing numeric phenotypes ", phenotypes, "\n") else { cat("ERROR no numeric phenotypes to analyse\n") return(NULL) } return(list(phen.data=phen.data,phenotypes=phenotypes,covariates=covariates)) } scan.phenotypes <- function( phenotype.file, phenotypes=NULL, covariates=NULL, db=NULL, null.model = " ~ 1", dir="./CONDENSED", threshold=0.1, permute=1000, histogram.pdf="histogram.pdf", save.file="scans.RData", scan.plot.pdf= "scans.pdf", mc.cores=5) { if ( is.null(db)) db = load.condensed.database(dir) pd = load.phenotypes( phenotype.file, phenotypes, covariates ) phen.data=pd$phen.data phenotypes=pd$phenotypes if ( is.null(phen.data)) return(NULL) if ( ! is.null(histogram.pdf)) { cat("plotting histograms of phenotypes to ", histogram.pdf, "\n") plot.histograms( histogram.pdf, phen.data, phenotypes) } scans = lapply( phenotypes, function( phenotype, db, phen.data, null.model,permute, mc.cores) { null.model = paste( phenotype, null.model ) cat( "null model: ", null.model, "\n") cat( "scanning", null.model, "\n") scan=multicore.condensed.genome.scan.lm( db, phen.data, null.model, permute=permute, mc.cores=mc.cores, subject.name="SUBJECT.NAME") box.pdf = paste( phenotype, ".accession.estimates.pdf", sep="") scan$qtls = find.qtls( db=db, phenotype=phenotype, phen.data=phen.data, scan=scan, genomewide.threshold=threshold, box.pdf=box.pdf ) return(scan) }, db, phen.data, null.model, permute, mc.cores ) names(scans) = phenotypes if ( !is.null(save.file)) { cat( "saving scans to binary file ", save.file, "\n") save( scans, file=save.file) } if ( ! is.null(scan.plot.pdf)) { cat( "plotting scans to ", scan.plot.pdf, "\n") plot.scans( db, scans, scan.plot.pdf) } write.scans.as.txt(scans); lapply( phenotypes, write.gscandb.scans, scans, sh.file=NULL, population="MAGIC", genome.build="TAIR9" ) return(scans) } find.qtls <- function( db, phenotype, phen.data, scan, genomewide.threshold, n.sample=200, box.pdf="box.pdf" ) { chrs = unique( scan$result$chromosome) qtls = NULL for( chr in chrs ) { result.chr = scan$result[scan$result$chromosome==chr,] first.line = result.chr[1,] last.line = result.chr[nrow(result.chr),] result.chr = rbind(first.line,result.chr,last.line) island = result.chr$genomewide.permute.pval <= genomewide.threshold island[1] = FALSE island[length(island)]=FALSE island1 = c(island[2:length(island)],FALSE) start.qtl = which(!island & island1) end.qtl = which( island & !island1) len = length(start.qtl) if ( len > 0 ) { qtl.peak = c() for( i in 1:len) { qtl.peak = c(qtl.peak, start.qtl[i]-1+which.max(result.chr$logP[start.qtl[i]:end.qtl[i]])) } qtls.chr = data.frame( phenotype = rep(phenotype,len), chromosome=result.chr$chromosome[qtl.peak], island.from.bp=result.chr$from.bp[start.qtl], island.to.bp=result.chr$from.bp[end.qtl], peak.bp=result.chr$from.bp[qtl.peak], peak.SNP=result.chr$from.marker[qtl.peak], logP=result.chr$logP[qtl.peak], genomewide.pvalue=result.chr$genomewide.permute.pval[qtl.peak]) qtls = rbind(qtls,qtls.chr) } } snp = as.character(qtls$peak.SNP) snp.chr = as.character(qtls$chromosome) snp.bp = as.character(qtls$peak.bp) snp.idx = match(snp, scan$result$from.marker) if ( !is.null(qtls )) { est.plot = list() for( i in 1:nrow(qtls)) { d = db$additive$matrices[[snp.idx[i]]] y = phen.data[[phenotype]] names(y) = phen.data$SUBJECT.NAME model="linear" results = imputed.one.way.anova( db, y, d, n.sample=n.sample, model=model ) if ( ! is.null(box.pdf)) { est = results[,(ncol(results)-length(db$strains)+1):ncol(results)] est.plot[[i]] = data.frame( est=as.vector(t(est)), accession=rep(colnames(est), nrow(est))) } p = data.frame(imputed.parameter.distributions ( results, model=model, delta=0.01, fact=4 )) np = nrow(p) parameters = data.frame( Phenotype=rep(phenotype,np), SNP = rep(snp[i], np),chr=rep(snp.chr[i],np), bp=rep(as.character(snp.bp[i]),np), scan.logP = rep(as.character((qtls$logP[i])),np)) parameters = cbind(parameters,p) file=paste(phenotype, snp[i], "imputed.txt", sep=".") cat("writing parameter estimates to ",file,"\n") write.table( parameters, file=file, quote=F, sep="\t", row=FALSE) file=paste(phenotype, snp[i], "results.txt", sep=".") write.table( results, file=file, quote=F, sep="\t", row=FALSE) } if ( ! is.null(box.pdf)) { cat("plotting estimates to ", box.pdf, "\n") pdf(box.pdf) par(mfrow=c(3,1)) for( i in 1:nrow(qtls)) { boxplot(est ~ accession, data=est.plot[[i]], las=3,cex=0.6, main=paste(phenotype, snp[i], snp.chr[i], snp.bp[i], "logP", sprintf("%.2f", qtls$logP[i])) ) } dev.off() } return(qtls) } return(NULL) } imputed.one.way.anova <- function( db, phenotype, d, n.sample=100, model="linear" ) { marker = d$marker dmat = d$mat cc <- complete.cases(phenotype) phenotype <- phenotype[cc] use.subjects <- db$subjects[match(names(phenotype), db$subjects, nomatch=0)] p.subjects <- match(use.subjects,names(phenotype),nomatch=0) phenotype <- phenotype[p.subjects] g.subjects <- match(use.subjects,db$subjects) cat( marker, "\n" ) dmat <- dmat[g.subjects,] nc = ncol(dmat) mat = matrix( 0, nrow=nc, ncol=nc) diag(mat) <- rep(1,nc) X <- apply( dmat, 1, function(x, n.sample) { sample(nc,size=n.sample, prob=x,replace=TRUE) }, n.sample) if ( model=="linear") { # results = as.matrix(t(apply( X, 1, function( x, mat, phenotype ) { one.way.anova( mat[x,], phenotype) }, mat, phenotype ))) tmp = mclapply( 1:nrow(X), function( r, X, mat, phenotype ) { x=X[r,]; one.way.anova( mat[x,], phenotype) }, X, mat, phenotype, mc.cores=10 ) results= t(do.call( "cbind", tmp )) colnames(results) <- c( "imputed.logP", "sigma2.hat", "df", "N.df", "p", paste("N.", db$strains, sep=""), db$strains ) return(results) } else if ( model == "binary") { results = as.matrix(t(apply( X, 1, function( x, mat, phenotype ) { one.way.binary( mat[x,], phenotype) }, mat, phenotype ))) colnames(results) <- c( "logP", "pi.null", "df", "N.df", "p", paste("N.", db$strains, sep=""), db$strains ) return(results) } else { warning( "unknown model ", model, "\n") return (NULL) } } imputed.parameter.distributions <- function( results, model="linear", delta=0.01, fact=4 ) { mean.results = apply(results,2,mean, na.rm=TRUE) results = data.frame(results) p <- results$p[1] n <- results$N.df[1] N.strains = (ncol(results)-5)/2 p6 = p+6 p12 = p+12 N <- as.matrix(results[,seq(6,6+N.strains-1,1)]) N <- ifelse( N>0, N, NA) est <- results[,seq(6+N.strains,ncol(results),1)] logP = mean(results$imputed.logP) if ( model == "linear" ) { sigma2.hat <- results$sigma2.hat %o% rep(1,p) stderr <- sqrt(sigma2.hat/N) x.range = seq( -fact, +fact, delta ) all.results <- NULL # print(N) # print(est) all.ok = sum(is.na(N)) tmp = mclapply( 1:p, function(i, N, est, sigma2.hat, x.range) { ok = !is.na(N[,i]) Nok = sum(ok) if ( Nok > 0 ) { mu.mean = mean(est[ok,i]) sigma2.mean = mean(sigma2.hat [ok,i]) Ni <- N[ok,i] N.mean = mean(Ni) se = sqrt(sigma2.mean/N.mean) x.target = mu.mean +x.range *se s = (x.target %o% rep(1,Nok) - rep(1,length(x.target)) %o% est[ok,i])/(rep(1,length(x.target)) %o% sqrt(sigma2.hat[ok,i]/Ni) ) t.distribution <- pt( s, n ) avg.t.distribution = apply(t.distribution, 1, mean ) distribution <- data.frame( x=x.target, d=avg.t.distribution) # plot(distribution$x, distribution$d) density <- diff(distribution$d) x.density = distribution$x[1:length(density)] mean.i <- sum( x.density*density) var.i = sum( x.density*x.density*density) - mean.i*mean.i pct.50 = distribution$x[c(max(which( distribution$d < 0.25 )), min(which( distribution$d > 0.75 )))] pct.90 = distribution$x[c( max(which( distribution$d < 0.05 )), min(which( distribution$d > 0.95 )))] # cat( Nok, names(est)[i], mean.i, sqrt(var.i), pct.50, pct.90, "\n") return(c( logP, names(est)[i], mean.i, sqrt(var.i), pct.50, pct.90)) } else { return(c( logP, names(est)[i], NA, NA, NA, NA, NA, NA ))} } , N, est, sigma2.hat, x.range ) all.results = data.frame(t(do.call("cbind", tmp))) names(all.results) = c( "impute.logP", "Accession", "mean", "stderr", "lower.50", "upper.50", "lower.90", "upper.90") return(all.results) } else if ( model == "binary" ) { all.results <- NULL for( i in 1:p ) { ok = !is.na(N[,i]) Nok = sum(ok) if ( Nok > 0 ) { n.max = max(N[ok,i]) binom.dist = apply( cbind( est[ok,i], N[ok,i]), 1, function( x, n ) { d =rep(0, n); d[1:(x[2]+1)] = dbinom( 0:x[2], x[2], x[1]); return(d) }, n.max+1 ) total.dist = apply(binom.dist, 1, mean) print(total.dist) z=seq(0,n.max); cs = cumsum(total.dist); q05 = max(1,which(cs<=0.05))-1; q25 = max(1,which(cs<=0.25))-1; q50=max(1,which(cs<=0.50))-1; q75 =min(which(cs>=0.75))-1; q95=min(which(cs>=0.95))-1; print(cs) mu = sum(z*total.dist)/n.max se = sqrt(sum(z*z*total.dist)/(n.max*n.max)-mu*mu) # cat( names(est)[i], logP, mu, se, q25, q75, q05,q95 , "\n") all.results = rbind( all.results, c( logP, names(est)[i], mu, se, q25/n.max, q75/n.max, q05/n.max,q95/n.max )) } else { all.results = rbind( all.results, c( logP, names(est)[i], logP, NA, NA, NA, NA, NA, NA )) } } all.results = data.frame(all.results) names(all.results) = c( "impute.logP", "Accession", "mean", "stderr", "lower.50", "upper.50", "lower.90", "upper.90") return(all.results) } } one.way.anova <- function( X, y ) { # assumes design matrix X correponds to one-way anova - ie exactly one non-zero element per row, equal to 1, and assumes all y's are non-missing n = length(y) N <- apply(X, 2, sum ) # number of replicates of each class df = sum(N>0) S <- t(X) %*% y # sums of replicates beta.hat = ifelse( N>0 , S/N, NA ) # estimates y.hat = X %*% beta.hat y.resid = y - y.hat RSS = sum(y.resid*y.resid) sigma.hat = RSS/(n-df) yy = y-mean(y) TSS = sum(yy*yy) FSS = TSS-RSS F = 0 logP = NA if (df > 1) { F = (FSS/(df-1))/(RSS/(n-df)) logP = -pf( F, df-1, n-df, lower.tail =FALSE, log=TRUE)/log(10) } # a = anova(lm(y ~ X)) return( c( logP=logP, sigma.hat=sigma.hat, df=df-1, ndf=n-df, ncol=ncol(X), N=N, beta.hat=beta.hat ) ) } one.way.binary <- function( X, y ) { # assumes design matrix X correponds to one-way design - ie exatly one non-zero element per row, equal to 1, and assumes all y's are non-missing and either 0 or 1 or TRUE or FALSE # estimates are given as proportions pi n = length(y) N <- apply(X, 2, sum ) # number of replicates of each class df = sum(N>0) S <- t(X) %*% y # sums of replicates pi.hat = ifelse( N>0 , S/N, NA ) # estimates pi.null = sum(y)/n y.hat = X %*% pi.hat if ( df > 1 ) { log.LR = 2*sum( ifelse( y > 0, y*log(y.hat/pi.null), 0) + ifelse( 1-y>0, (1-y)*log((1-y.hat)/(1-pi.null)),0)) logP = -pchisq( log.LR, df-1, log=TRUE, lower.tail=FALSE )/log(10) } else{ logP = NA log.LR = 0 } return( c( logP=logP, pi.null=pi.null, df=df-1, ndf=n-df, ncol=ncol(X), N=N, pi.hat=pi.hat ) ) }