source("happy.preCC.29062015.R")
library(lme4)

# MAGIC.R
# (C) Richard Mott 2010; richard.mott@well.ox.ac.uk
# Distributed under the GPL
# This code contains functions to analyse QTLs in MAGIC recombinant inbred lines. 
# See http:/mus.well.ox.ac.uk/magic for instructions.

prepare.database <- function( gdir="./", dir="./CONDENSED",  mc.cores=5 ) {

  chrs = paste("chr", 1:5, sep="")
  prefix = ".MAGIC"
  haploid=TRUE
  generations=7

  make.condensed.genome(  chrs, prefix, gdir=gdir, dir=dir, thin=1, gen=generations, haploid=haploid, mc.cores=mc.cores)
}

write.scans.as.txt<- function(scans) {

  scan.files = paste( names(scans), ".scan.txt", sep="")
  qtl.files = paste( names(scans), ".qtls.txt", sep="")
  for( i in 1:length(scans)) {
    cat( "writing scans to ", scan.files[i], "\n")
    write.table(scans[[i]]$result,file=scan.files[i],row=FALSE,quote=F,sep="\t")
    cat( "writing qtls to ", qtl.files[i], "\n")
    write.table(scans[[i]]$qtls,file=qtl.files[i],row=FALSE,quote=F,sep="\t")
  }
  return(NULL)
}

load.phenotypes <- function( phenotype.file, phenotypes, covariates=NULL ) {

  cat( "reading phenotype file ", phenotype.file, "\n")
  phen.data=read.delim(phenotype.file)

  nam = names(phen.data)

  if ( is.null(phen.data$SUBJECT.NAME )) {
    cat( "ERROR phenotype file missing column SUBJECT.NAME\n")
    return(NULL)
  }
  else if ( ! is.factor(phen.data$SUBJECT.NAME )) {
    cat( "ERROR phenotype file column SUBJECT.NAME is not correct format\n")
    return(NULL)
  }

  use = c()
  for( n in nam ) { use = c( use, is.numeric(phen.data[[n]])) }

  if ( is.null(phenotypes) ) {
    phenotypes = nam[use]
  }
  else {
    nam = nam[use]
    use = match(phenotypes,nam,nomatch=0)
    phenotypes = nam[use]
  }

  if ( !is.null(covariates)) {
      use.c = covariates %in% names(phen.data)
      if ( sum( !use.c) > 0 ) {
          cat("ERROR missing covariates ", covariates[!use.c], "\n")
          return(NULL)
      }
      cat( "using covariates ", covariates, "\n")
      use.p = phenotypes %in% covariates
      phenotypes = phenotypes[!use.p]
  }
  if ( length(phenotypes) > 0 ) 
    cat("Analysing numeric phenotypes ", phenotypes, "\n")
  else {
    cat("ERROR no numeric phenotypes to analyse\n")
    return(NULL)
  }
  return(list(phen.data=phen.data,phenotypes=phenotypes,covariates=covariates))
}

scan.phenotypes <- function( phenotype.file, phenotypes=NULL, covariates=NULL, db=NULL, null.model = " ~ 1", dir="./CONDENSED", threshold=0.1, permute=1000, histogram.pdf="histogram.pdf", save.file="scans.RData", scan.plot.pdf= "scans.pdf", mc.cores=5) {

  if ( is.null(db)) db = load.condensed.database(dir)
  
  pd = load.phenotypes( phenotype.file, phenotypes, covariates ) 
  phen.data=pd$phen.data
  phenotypes=pd$phenotypes
  if ( is.null(phen.data)) return(NULL)
	
  if ( ! is.null(histogram.pdf)) {
    cat("plotting histograms of phenotypes to ", histogram.pdf, "\n")
    plot.histograms( histogram.pdf, phen.data, phenotypes)
  }


  scans = lapply( phenotypes, 
      function( phenotype, db, phen.data, null.model,permute, mc.cores) {
          null.model = paste( phenotype, null.model )
          cat( "null model: ", null.model, "\n")
          cat( "scanning", null.model, "\n")
          scan=multicore.condensed.genome.scan.lm( db, phen.data, null.model, permute=permute, mc.cores=mc.cores, subject.name="SUBJECT.NAME")
          box.pdf = paste( phenotype, ".accession.estimates.pdf", sep="")
          scan$qtls = find.qtls( db=db, phenotype=phenotype, phen.data=phen.data, scan=scan, genomewide.threshold=threshold, box.pdf=box.pdf )
          return(scan)
      },
      db, phen.data, null.model, permute, mc.cores )
  names(scans) = phenotypes

  if ( !is.null(save.file)) {
    cat( "saving scans to binary file ", save.file, "\n")
    save( scans, file=save.file)
  }

  if ( ! is.null(scan.plot.pdf)) {
    cat( "plotting scans to ", scan.plot.pdf, "\n")
    plot.scans( db, scans, scan.plot.pdf)
  }

  write.scans.as.txt(scans);	
  lapply( phenotypes, write.gscandb.scans, scans,  sh.file=NULL, population="MAGIC", genome.build="TAIR9" ) 

  return(scans) 
}

find.qtls <- function( db, phenotype, phen.data, scan, genomewide.threshold, n.sample=200, box.pdf="box.pdf" ) {

  chrs = unique( scan$result$chromosome)
  qtls = NULL
  for( chr in chrs ) {

    result.chr = scan$result[scan$result$chromosome==chr,]
    first.line = result.chr[1,]
    last.line = result.chr[nrow(result.chr),]
    result.chr = rbind(first.line,result.chr,last.line)

    island = result.chr$genomewide.permute.pval <= genomewide.threshold
    island[1] = FALSE
    island[length(island)]=FALSE

    island1 = c(island[2:length(island)],FALSE)
    start.qtl = which(!island &  island1)
    end.qtl = which( island & !island1)
    len = length(start.qtl)
    if ( len > 0 ) {
      qtl.peak = c()
      for( i in 1:len) {
        qtl.peak = c(qtl.peak, start.qtl[i]-1+which.max(result.chr$logP[start.qtl[i]:end.qtl[i]]))
      }
      qtls.chr = data.frame( phenotype = rep(phenotype,len), chromosome=result.chr$chromosome[qtl.peak], island.from.bp=result.chr$from.bp[start.qtl], island.to.bp=result.chr$from.bp[end.qtl], peak.bp=result.chr$from.bp[qtl.peak], peak.SNP=result.chr$from.marker[qtl.peak], logP=result.chr$logP[qtl.peak], genomewide.pvalue=result.chr$genomewide.permute.pval[qtl.peak])
      qtls = rbind(qtls,qtls.chr)
    }
  }
  snp = as.character(qtls$peak.SNP)
  snp.chr = as.character(qtls$chromosome)
  snp.bp = as.character(qtls$peak.bp)
  snp.idx = match(snp, scan$result$from.marker)
  if ( !is.null(qtls )) {
    est.plot = list()
    for( i in 1:nrow(qtls)) {

      d = db$additive$matrices[[snp.idx[i]]]
      y = phen.data[[phenotype]]
      names(y) = phen.data$SUBJECT.NAME
      model="linear"
      results = imputed.one.way.anova( db, y, d, n.sample=n.sample, model=model ) 
      if ( ! is.null(box.pdf)) {
        est = results[,(ncol(results)-length(db$strains)+1):ncol(results)]
        est.plot[[i]] = data.frame( est=as.vector(t(est)), accession=rep(colnames(est), nrow(est)))
      }
      p = data.frame(imputed.parameter.distributions ( results,  model=model, delta=0.01, fact=4 ))
      np = nrow(p)
      parameters = data.frame( Phenotype=rep(phenotype,np), SNP = rep(snp[i], np),chr=rep(snp.chr[i],np), bp=rep(as.character(snp.bp[i]),np), scan.logP = rep(as.character((qtls$logP[i])),np))

      parameters = cbind(parameters,p)
      file=paste(phenotype, snp[i], "imputed.txt", sep=".")
      cat("writing parameter estimates to ",file,"\n")
      write.table( parameters, file=file, quote=F, sep="\t", row=FALSE)
      file=paste(phenotype, snp[i], "results.txt", sep=".")
      write.table( results, file=file, quote=F, sep="\t", row=FALSE)
    }
    if ( ! is.null(box.pdf)) {
      cat("plotting estimates to ", box.pdf, "\n")
      pdf(box.pdf)
      par(mfrow=c(3,1))


      for( i in 1:nrow(qtls)) {
        boxplot(est ~ accession, data=est.plot[[i]], las=3,cex=0.6, main=paste(phenotype, snp[i], snp.chr[i], snp.bp[i], "logP", sprintf("%.2f", qtls$logP[i])) )
      }

      dev.off()
    }
    return(qtls)
  }
  return(NULL)
}


imputed.one.way.anova <- function( db, phenotype, d,  n.sample=100, model="linear" ) {

  marker = d$marker
  dmat = d$mat	
  cc <- complete.cases(phenotype)
  phenotype <- phenotype[cc]
  use.subjects <- db$subjects[match(names(phenotype), db$subjects, nomatch=0)]
  p.subjects <- match(use.subjects,names(phenotype),nomatch=0)
  phenotype <- phenotype[p.subjects]
  g.subjects <- match(use.subjects,db$subjects)

  cat( marker, "\n" )

  dmat <- dmat[g.subjects,]
  nc = ncol(dmat)
  mat = matrix( 0, nrow=nc, ncol=nc)
  diag(mat) <- rep(1,nc)
  X <- apply( dmat, 1, function(x, n.sample) { sample(nc,size=n.sample, prob=x,replace=TRUE) }, n.sample)
  if ( model=="linear") {
                                        #    results = as.matrix(t(apply( X, 1, function( x, mat, phenotype ) {  one.way.anova( mat[x,], phenotype) }, mat, phenotype )))
    tmp = mclapply( 1:nrow(X),  function( r, X, mat, phenotype ) {  x=X[r,]; one.way.anova( mat[x,], phenotype) }, X, mat, phenotype, mc.cores=10 )
    results= t(do.call( "cbind", tmp ))

    colnames(results) <- c( "imputed.logP", "sigma2.hat", "df", "N.df", "p",  paste("N.", db$strains, sep=""), db$strains )
    return(results)
  }
  else if ( model == "binary") {
    results = as.matrix(t(apply( X, 1, function( x, mat, phenotype ) {  one.way.binary( mat[x,], phenotype) }, mat, phenotype )))
    colnames(results) <- c( "logP", "pi.null", "df", "N.df", "p",  paste("N.", db$strains, sep=""), db$strains )
    return(results)
  }
  else {
    warning( "unknown model ", model, "\n")
    return (NULL)
  }
}


imputed.parameter.distributions <- function( results, model="linear", delta=0.01, fact=4 ) {
  mean.results = apply(results,2,mean, na.rm=TRUE)
  results = data.frame(results)
  p <- results$p[1]
  n <- results$N.df[1]
  N.strains = (ncol(results)-5)/2
  p6 = p+6
  p12 = p+12
  N <- as.matrix(results[,seq(6,6+N.strains-1,1)])
  N <- ifelse( N>0, N, NA)
  est <- results[,seq(6+N.strains,ncol(results),1)]
  logP = mean(results$imputed.logP)

  if ( model == "linear" ) {
    sigma2.hat <- results$sigma2.hat %o% rep(1,p)
    stderr <- sqrt(sigma2.hat/N)

    x.range = seq( -fact, +fact, delta )
    all.results <- NULL
                                        #    print(N)
                                        #    print(est)
    all.ok = sum(is.na(N))


    tmp = mclapply( 1:p, function(i, N, est, sigma2.hat, x.range) { 
      ok = !is.na(N[,i])
      Nok = sum(ok)

      if ( Nok > 0 ) {
        mu.mean = mean(est[ok,i]) 
        sigma2.mean = mean(sigma2.hat [ok,i])
        Ni <- N[ok,i]
        N.mean = mean(Ni)
        se = sqrt(sigma2.mean/N.mean)
        x.target = mu.mean +x.range *se
        s = (x.target %o% rep(1,Nok) - rep(1,length(x.target)) %o% est[ok,i])/(rep(1,length(x.target)) %o% sqrt(sigma2.hat[ok,i]/Ni) )

        t.distribution <- pt( s, n )
        avg.t.distribution = apply(t.distribution, 1, mean )
        distribution <- data.frame( x=x.target, d=avg.t.distribution)
                                        #      plot(distribution$x, distribution$d)

        density <- diff(distribution$d)
        x.density = distribution$x[1:length(density)]

        mean.i <- sum( x.density*density)
        var.i = sum( x.density*x.density*density) - mean.i*mean.i
        pct.50 =  distribution$x[c(max(which( distribution$d < 0.25 )), min(which( distribution$d > 0.75 )))]
        pct.90 =  distribution$x[c( max(which( distribution$d < 0.05 )), min(which( distribution$d > 0.95 )))]

                                        #        cat( Nok, names(est)[i], mean.i, sqrt(var.i), pct.50, pct.90, "\n")

        return(c( logP, names(est)[i],  mean.i, sqrt(var.i), pct.50, pct.90))
      }
      else { return(c( logP, names(est)[i],  NA, NA, NA, NA, NA, NA ))}
    }
      , N, est, sigma2.hat, x.range )


    all.results = data.frame(t(do.call("cbind", tmp)))
    names(all.results) = c( "impute.logP", "Accession",  "mean", "stderr", "lower.50", "upper.50", "lower.90", "upper.90")
    return(all.results)
  }  
  else if ( model == "binary" ) {
    all.results <- NULL
    for( i in 1:p ) {
      ok = !is.na(N[,i])
      Nok = sum(ok)
      if ( Nok > 0 ) {

        n.max = max(N[ok,i])
        binom.dist = apply( cbind( est[ok,i], N[ok,i]), 1, function( x, n ) { d =rep(0, n); d[1:(x[2]+1)] = dbinom( 0:x[2], x[2], x[1]); return(d) }, n.max+1   )
        total.dist = apply(binom.dist, 1, mean)
        print(total.dist)
        z=seq(0,n.max); cs = cumsum(total.dist); q05 = max(1,which(cs<=0.05))-1; q25 = max(1,which(cs<=0.25))-1; q50=max(1,which(cs<=0.50))-1; q75 =min(which(cs>=0.75))-1; q95=min(which(cs>=0.95))-1;
        print(cs)
        mu = sum(z*total.dist)/n.max
        se = sqrt(sum(z*z*total.dist)/(n.max*n.max)-mu*mu)
                                        #        cat( names(est)[i], logP, mu, se, q25, q75, q05,q95  , "\n")
        all.results = rbind( all.results, c( logP, names(est)[i],  mu, se, q25/n.max, q75/n.max, q05/n.max,q95/n.max  ))
      }
      else {
        all.results = rbind( all.results, c( logP, names(est)[i], logP, NA, NA, NA, NA, NA, NA ))
      }
    }
    all.results = data.frame(all.results)
    names(all.results) = c(  "impute.logP", "Accession", "mean", "stderr", "lower.50", "upper.50", "lower.90", "upper.90")
    return(all.results)
  }

}

one.way.anova <- function( X, y ) {

                                        # assumes design matrix X correponds to one-way anova - ie exactly one non-zero element per row, equal to 1, and assumes all y's are non-missing

  n = length(y)

  N <- apply(X, 2, sum ) # number of replicates of each class
  df = sum(N>0)
  S <- t(X) %*% y  # sums of replicates
  beta.hat = ifelse( N>0 , S/N, NA ) # estimates
  y.hat = X %*% beta.hat
  y.resid = y - y.hat
  RSS = sum(y.resid*y.resid)
  sigma.hat = RSS/(n-df)
  yy = y-mean(y)
  TSS = sum(yy*yy)
  FSS = TSS-RSS
  F = 0
  logP = NA
  if (df > 1) {
    F = (FSS/(df-1))/(RSS/(n-df))
    logP = -pf( F, df-1, n-df, lower.tail =FALSE, log=TRUE)/log(10)
  }
                                        #  a = anova(lm(y ~ X))

  return( c( logP=logP, sigma.hat=sigma.hat, df=df-1, ndf=n-df, ncol=ncol(X), N=N, beta.hat=beta.hat ) )
}

one.way.binary <- function( X, y ) {

                                        # assumes design matrix X correponds to one-way design - ie exatly one non-zero element per row, equal to 1, and assumes all y's are non-missing and either 0 or 1 or TRUE or FALSE
                                        # estimates are given as proportions pi

  n = length(y)

  N <- apply(X, 2, sum ) # number of replicates of each class
  df = sum(N>0)
  S <- t(X) %*% y  # sums of replicates
  pi.hat = ifelse( N>0 , S/N, NA ) # estimates
  pi.null = sum(y)/n
  y.hat = X %*% pi.hat

  if ( df > 1 ) {
    log.LR = 2*sum( ifelse( y > 0, y*log(y.hat/pi.null), 0) + ifelse( 1-y>0, (1-y)*log((1-y.hat)/(1-pi.null)),0))
    logP = -pchisq( log.LR, df-1, log=TRUE, lower.tail=FALSE )/log(10)
  }
  else{
    logP = NA
    log.LR = 0
  }
  return( c( logP=logP, pi.null=pi.null, df=df-1, ndf=n-df, ncol=ncol(X), N=N, pi.hat=pi.hat ) )
}