##################################################
# This is the script to process all pehnotypes and generate the residuals and different tables.
#
# The code can be edited to adjust for specific requirements.
#
# ########### Latest modifications (26Jun15) #############
#
# - Remove all Brain, Cardio.Echo, Telomeres and Mitochondria measues
# - Remove Neuro.DCX_corr as difficult to argue without functional backup
# - Remove EPM "Middle" measures
# - Remove the "Fibro" measure
# - Added a few SPPI measures (those requested by Chicago)
# - Limited PAS measures to 3: First5, Last10 and Total_Activity
# - Get Neophagia measure back
#
# ####### Below notes to previous version (13Feb15): ######
#
# Changes made in this latest version:
#	- Updated neurogenesis and ears measures (all done)
#	- design Neuro.Staining.batch as a factor
#   - I've changed the order of covariates in Serotonin measures to start with "Date.Year" (because the latest measures made in 2014 where higher than previous in 2011 and 2012)
#	- Updated brain anatomy measures (more done and DG length is complete)
#	- I don't remove outliers for the BMC Mode, Mean and Median measures because I want them to INCLUDE the outliers (osteopetrosis)
#	- Restrict list of measures to 249
#
# It turns out that it is better to use 1st quality data for some of the tests while others can do with all so I specify the dataset to be used
# for each test. In addition I include weight as a covariate for all measures except Body.Weight (obvious) and Muscles weight where I use tibia length.
# That should really be the last set of phenotypes for mapping.
#
# - "Ultimate" list of phenotypes for the main article
# - I've added "Body.Weight" as a covariate copying "Weight.Average"
# - I use Weight.Average as covariate except for weight measures (obvious) and Muscle weights where I use tibia length
# - Removed the following measures that didn't made much sense:
#		- "Moving" in EPM and OFT
#		- "_Corr" in Hypoxia. I now include Body.Weight as covariate so this corrected measure doesn't make sense anymore
# 		- "BW", for body weight, in Echo (cardio). Same as above: this corrected measure is redundant given the fact I introcude Body.Weight in the linear model
# 		- "Ratio" calculated in EPM: never pointed to any QTL so no need to add more measures
# 		- "_best" in Neuro: Nasrin is spending time checking if it is better to use only measures with all sections but for the tiume being I drop them
# - I've changed the order of covariates in Sleep measures to start with "Sleep.Light" (the animals from the first batches where light started 1 hour earlier or later)
# - I've created a "Tibia.Length" covariate (copying "Muscles.Tibia.mm") that I only use for the muscles weight measures
# - I now include new BMC measures (".N") with only "healthy" bones (~normal) exculding those with osteopetrosis
# - I will test FC with measures from Boxes 2 and 3 only (".Q"). I lose half of the measures but boxes 1 and 4 measures were very skewed. It might actually help
#
# ##### Below older notes: #####
# 
# - Added a new Brain.Anatomy measure: HippoCort_Ratio where I correct Hippocampus size by the cortical thickness
# - I exclude all qPCR measures (mine and Jacques)
# - I don't calculate any Principal Components (we won't include them in the paper)
# - Brain and Neuro measures: - I've altered the order of covariates so Experimenter and Protocol are tested first
#                             - I am now using IDs checked by Chris Jungerius: there were quite a lot of changes from before 
# - Mito and Telo measures: DNA concentration is tested first as covariate before the others
# - Removes data flagged as "bad" (Data.Quality=0)
# - I include the latest measures of MitDNA produced by Cai Na (but still only DNA extracted from tails)
# - I remove the 4 pairs of animals that have the same DNA (duplicates)
# - FC measures are first normalized (standardized) per box before calculating residuals (so outliers are removed in each box separately).
# - (PCAs are NOT included as covariates)
#
# Jérôme Nicod, 13 Feb 15
##################################################

setwd("/home/nicod/final_12Mar13/")
library(lme4)

normalize <- function(values){         # Jonathan's 'normalize' R function
      n <- length(values)+1;
      r <- rank(values)/n;
      qnorm(r)
      }

### Get data  ###
# First I import (copy) all the "final" phenotypes tables from each test that I've done recently
files =list.files(full.name=FALSE, pattern="_final_")
print(paste("number of files:",length(files)))
# [1] 31     # including Pca but I will remove them later
# Checking all tables:
for (f in files){
     table = read.delim(f)
     cat(f,dim(table),"\n")     # dim() will show me if any has duplicates (length>2117)
     }
	 
# This are the tests where I onl keep 1st quality data:
quality_data=c("Adrenals","Bioch","Fibro","Haem","WH","Brain","EPM","Neo","Serotonin")

# Then I create a unique table with all measures. I first have to change the names of the columns so it includes the test (FACS, Bioch, etc):
data = read.delim("Adrenals_final_2Apr14.txt")[,1:3]     # Get the first 3 columns (Anima_ID, Batch and Sex) for the 2117 mice
list=data.frame()
for (f in files){
     table = read.delim(f)
	 if (unlist(strsplit(f,"_"))[1] %in% quality_data){
		 table = table[table$Data.Quality==1,]	# Only keep 1st quality data
		 } else {
			 table = table[table$Data.Quality!=0,]	# Removes data flagged as "bad quality"
		 }
     drops <- c("Batch","Sex")
     table = table[,!(names(table) %in% drops)]    # Removes "Batch" and "Sex" columns if present
     test = unlist(strsplit(f,"_final_"))[1]     # select the string of characters before "_final_" -> name of the test (Adrenals, Bioch, etc)
     for (n in 1:length(names(table))){
          if (names(table)[n]!="Animal_ID"){
               names(table)[n]=paste(test,".",names(table)[n],sep="")
               }
          }
     data = merge(data,table,by="Animal_ID",all.x=T,all.y=F)      # was all.y=T but I can't with PCA because some of the IDs are edited (.bis)
     info = cbind(data.frame(test),data.frame(names(table)[names(table)!="Animal_ID"]))     # Generates a list of all column names
     list = rbind(list,info)
     }
names(list)=c("Test","Data")

# Here I deal with the body weight measures
# First I copy the Weight.Average in this new column ("Body.Weight") that I will use as covariate:
data$Body.Weight=data$Weight.Average
# Then I only keep the measured (not the calculated) and 1st quality weight measures for mapping:
weight_measures=c("Weight.Neo","Weight.Startle","Weight.Hypo","Weight.ECG","Weight.Diss")
for (w in weight_measures){
	w_source=paste(w,".source",sep="")
	cc=complete.cases(data[,c(w,w_source)])
	data[data[,w_source]=="c" & data$Weight.Data.Quality>1 & cc,w]=NA
}
# Finally I only keep 1st quality data for average weight and BMI:
for (i in c("Weight.BMI.body","Weight.BMI.tibia","Weight.Average")){
	data[data$Weight.Data.Quality>1 & complete.cases(data[,i]),i]=NA
}

# I also create a "Tibia.Length" covariate column by copying the Muscle.Tibia.mm measure:
data$Tibia.Length=data$Muscles.Tibia.mm
# This covariate is "skipped" as it is only used for the Muscles weight measures where I specify it

# write.table(list,"List_data_2Apr13.txt",sep="\t",quote=F,row.names=F)

# I change the format of Neuro.Stainin.batch so it's considered a factor (default is int):
data$Neuro.Staining.batch=as.factor(data$Neuro.Staining.batch)

# I change the format of the "Month" so it works with later calculations (delay for instance). 
for (i in names(data)[grep("Month",names(data))]){
     data[i]<-replace(data[i],data[i]==1,"Jan")
     data[i]<-replace(data[i],data[i]==2,"Feb")
     data[i]<-replace(data[i],data[i]==3,"Mar")
     data[i]<-replace(data[i],data[i]==4,"Apr")
     data[i]<-replace(data[i],data[i]==5,"May")
     data[i]<-replace(data[i],data[i]==6,"Jun")
     data[i]<-replace(data[i],data[i]==7,"Jul")
     data[i]<-replace(data[i],data[i]==8,"Aug")
     data[i]<-replace(data[i],data[i]==9,"Sep")
     data[i]<-replace(data[i],data[i]==10,"Oct")
     data[i]<-replace(data[i],data[i]==11,"Nov")
     data[i]<-replace(data[i],data[i]==12,"Dec")
     }

## Get list of measures and covariates ##
list_origin = read.delim("List_data_18Apr13.txt")           # Annotated list of all measures, covariates and other columns (info). It also contains 2 more rows with "Sex" and "Batch"
                                                                                       # This latest version(18Apr13) contains the last 2 Macrophages qPCR measures: Ccle and Vegaf
                                                                                       # Also contains PCA1-20 added on 15May13 but they are now all flagged as "skip"
## Remove qPCR tests from the table (RNA and Macrophages) 																					   
list_origin=list_origin[-which(list_origin$Test=="RNA"),]
list_origin=list_origin[-which(list_origin$Test=="Macrophages"),]
dim(list_origin)
# [1] 774   7
																					   

## To check if any of the columns from the _final_ tables are NOT present in the annotated table:
print("Number of columns absent in the annotated table:")
print(list$Data[is.na(match(list$Data,list_origin$Data))])
#### All the Macrophages and RNA that I've removed:
# [1] Macrophages.Il12         Macrophages.Tnfa         Macrophages.IL6         
# [4] Macrophages.IL10         Macrophages.Nos2         Macrophages.Nlrp3       
# [7] Macrophages.Tgfb1        Macrophages.Ccl2         Macrophages.Vegaf       
#[10] Macrophages.Data.Quality RNA.Alpl.Liver           RNA.Scarb1.Liver        
#[13] RNA.Rap1gap.Liver        RNA.Data.Quality        

## To remove PCAs from the list of covariates to test:
list_origin$Skip[grep("Pca.",list_origin$Data)]="skip"

## To remove "Batch" and all "Date" from the list of covariates to test:
# list_origin$Skip[list_origin$Data=="Batch"]="skip"
# list_origin$Skip[grep("Date",list_origin$Data)]="skip"

## To include Weight in the list of covariates
# list_origin$Group[list_origin$Data=="Weight.Average"]="c"
# list_origin$Test[list_origin$Data=="Weight.Average"]="General"
# And then I remore the other weights from the analysis:
# list_origin=list_origin[-which(list_origin$Test=="Weight"),]

list_clean=list_origin[list_origin$Skip!="skip",]                        # removes covariates I've decided to skip (comments, etc)

## Rank normalize the FC measures per box (after removing outiers):
fc_measures=list_clean$Data[list_clean$Group=="m"][grep("FC.",list_clean$Data[list_clean$Group=="m"])]
# But NOT the ".Q" values with only Boxes 2 and 3:
fc_measures=fc_measures[!fc_measures %in% fc_measures[grep(".Q",fc_measures)]]
fc_box=paste("Box",1:4,sep="")
for (b in fc_box){
	for (i in fc_measures){                                                     
	      x =data[data$FC.Box.Cue==b & !is.na(data[names(data)==i]),names(data)==i]
		  if(length(x)>0){
		      out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3     # outliers defined as measures outside 3 SD
		      x1<-replace(x,out,NA)							   # remove outliers
			  x2<-normalize(x1)								   # normalize
		      data[data$FC.Box.Cue==b & !is.na(data[names(data)==i]),names(data)==i]<-x2
		      }
	      }
}

## Table with covariates to test only ##
list_cov = list_clean[list_clean$Group=="c",]
list_cov=list_cov[-grep(".Task.Cage",list_cov$Data),]              # remove "Task.Cage" from the list of covariates to test (too strong effect)
print(paste("list of covariates:",dim(list_cov)))
# [1] 185   7      # would be 200 with PCA

## Check the .Hour measures ##
# I want hours in 24h format but some might be in 12h (AM/PM) format. Anything between 1 and 6 should be transformed to 13-18
list_hour = list[grep(".Hour",list$Data),]
print(dim(list_hour))
# [1] 14  2

for (l in list_hour$Data){
     print(l)
     print(table(data[,l]))
     }
# The only changes required are in Diss.CullTime.Hour: 18 mice have Hour = 1 to be change to 13:
data$Diss.CullTime.Hour[data$Diss.CullTime.Hour==1]=13 

## Check the .Minutes measures ##
# Here I check that all the .Minutes are between 0 and 59:
list_min = list[grep(".Minute",list$Data),]
dim(list_min)
# [1] 14  2

for (l in list_min$Data){
     cat(l," ",min(data[,l],na.rm=T)," ",max(data[,l],na.rm=T),"\n")
     #print(table(data[,l]))
     }
# And all are correct

print(paste("size of data before calculating covariates:",dim(data)))
# [1] 2117  727

## Calculate covariates ##

# Gives numerical value to batch order:
data$Diss.Project.Order = as.numeric(gsub("OBT","",data$Batch))          

# Calculate time sitting on ice to check if it  has any influence on Biochemistry
d.hour = data$Diss.CullTime.Hour
d.min = data$Diss.CullTime.Minute
d.time = strptime(paste(d.hour,d.min),"%H %M")
b.hour = data$Diss.BloodSplit.Time.Hour
b.min = data$Diss.BloodSplit.Time.Minute
b.time = strptime(paste(b.hour,b.min),"%H %M")
data$Bioch.BloodSplit.Time.On_Ice = as.numeric(difftime(b.time,d.time))               # surprisingly there are a few  values<0
data$Bioch.BloodSplit.Time.On_Ice[data$Bioch.BloodSplit.Time.On_Ice<0]=NA       # I remove the negative values that MUST be mistakes

# To determine which samples have been frozen before Biochemistry
diss.d = data$Diss.Date.Day
diss.m = data$Diss.Date.Month
diss.y = data$Diss.Date.Year
diss.s = as.Date(paste(diss.d,diss.m,diss.y),"%d %b %Y")              # Day of dissection
test="Bioch.Date"
d=data[,paste(test,".Day",sep="")]
m=data[,paste(test,".Month",sep="")]
y=data[,paste(test,".Year",sep="")]
s = as.Date(paste(d,m,y),"%d %b %Y")                             # Day the measures were made
diff = as.numeric(difftime(s,diss.s))                            # Difference between dates: difftime
diff[diff>0] = "Frozen"                                          # If the difference in time between diss. and bioch>0 it mens samples have been frozen
diff[diff==0] = "Fresh"                                          # If no difference, samples were processed Fresh
data$Bioch.Frozen = diff                                         # Batches 8 and 55 are Frozen, all others Fresh
      

cov_calc = list_cov[list_cov$Calculated=="calculated",]                         # lists covariates that need to be calculated
for (b in cov_calc$Data){                                                     
     if (grepl(b,pattern=".Delay")){                                     # Calculate the number of days between dissection and measuring.
          test = gsub(b,pattern=".Delay",replacement="")      # Gives back the name of the test/phenotype
          diss.d = data$Diss.Date.Day
          diss.m = data$Diss.Date.Month
          diss.y = data$Diss.Date.Year
          diss.s = strptime(paste(diss.d,diss.m,diss.y),"%d %b %Y")     
          d=data[,paste(test,".Date.Day",sep="")]
          m=data[,paste(test,".Date.Month",sep="")]
          y=data[,paste(test,".Date.Year",sep="")]
          s = strptime(paste(d,m,y),"%d %b %Y")                      # also possible: s = as.Date(paste(d,m,y),"%d %b %Y")
          data[,b] = as.numeric(difftime(s,diss.s))                  # Gives difference in days between dates
     }
     if (grepl(b,pattern=".Day_of_Year")){                                # I transform the date (Day/Month) into a continuous value to test its effect
          test = gsub(b,pattern=".Day_of_Year",replacement="")
          d=data[,paste(test,".Day",sep="")]
          m=data[,paste(test,".Month",sep="")]
          s = strptime(paste(d,m),"%d %b")$yday+1     # give the Julian date number: %b is for the month as a 3-letter. %m for the month as a 1-12
          data[,b] =sin(s*pi/366)                             # 1 < s < 366 and I want to transform to 0 < s < 1 with the sin() function 
     }                                                                      
     if (grepl(b,pattern=".Hour.Minute")){                                # Calculate a more precise version of the hour by adding the minutes                                           
          test = gsub(b,pattern=".Hour.Minute",replacement="")
          hour = data[,paste(test,".Hour",sep="")]
          minutes = data[,paste(test,".Minute",sep="")]
          precise.hour=hour+minutes/60                                          # Adds minutes to the hour
          data[,b] = precise.hour
     }
}


## Set defined covariates (in data table) as "factor" or "numeric"##
for (a in 1:length(list_cov[,2])){                        
     name = list_cov[a,2] 
     if (is.na(list_cov[a,6])){
          data[,as.character(name)]==as.factor(data[,as.character(name)])
     }
     if(list_cov[a,6] == "numeric"){
          data[,as.character(name)]==as.numeric(data[,as.character(name)])
     }
}    

print(paste("Size of data after calculating covariates:",dim(data)))
# [1] 2117  787

## List measures to test ##
# Do not include the measures that are to be treated as.factor
# Also some measures have been flagged with "skip" as we don't include them in the analysis anymore (26Jun15)
measures = list_clean$Data[list_clean$Group=="m" & list_clean$Type!="as.factor" & list_clean$Skip!="skip"]     
print(paste("Number of measures:",length(measures)))
# [1] 276

# Remove the "Moving" measures (EPM and OFT): not really relevant
measures=measures[!measures %in% measures[grep("Moving",measures)]]
# Remove the "Ratio" (distance) measures in EPM:
measures=measures[!measures %in% measures[grep("eRatio",measures)]]
# Remove the weight-corrected hypoxia measures:
measures=measures[!measures %in% measures[grep("_Corr",measures)]]
# Remove the weight-corrected cardiac (Echo) measures:
measures=measures[!measures %in% measures[grep(".BW",measures)]]
# Remove the "best" measures in Neuro (when all sections are present): Nasrin will check what is the best
measures=measures[!measures %in% measures[grep("_best",measures)]]
# Remove the "Echo" measures:
measures=measures[!measures %in% measures[grep("Cardio.Echo",measures)]]
# Remove the "EPM.Middle" measures:
measures=measures[!measures %in% measures[grep("EPM.Middle",measures)]]

print(paste("Number of measures:",length(measures)))
# [1] 250

# Only keep 249 measures:
#list_traits=read.delim("/home/nicod/List_249_quantitative_traits_28Jan15.txt",stringsAsFactors=F)[,1]
#measures=measures[measures %in% list_traits]
#print(paste("Number of measures:",length(measures)))

## Remove outliers (as in QC analysis) ##
for (i in measures){
	if (!i %in% c("BMC.Mode","BMC.Mean","BMC.Median")){		# I don't remove outliers in the BMC measures because they INCLUDE outliers (osteopetrosis)
        x =data[,names(data)==i]
        out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3     # outliers defined as measures outside 3 SD
        x1<-replace(x,out,NA)
        data[,names(data)==i]<-x1
	}                                                     
}

data_backup = data     # Backup of the table at that point in case the calculation of the residuals goes wrong

## Remove measures for mice suspected of being mixed-up ##

# First the mice identified with the wrong gender:
wrong_sex = read.delim("/home/nicod/Wrong_Gender_To_Exclude_27Feb13.txt")
length(wrong_sex[,1])
# [1] 8     # 8 mice with sex mix-up

# Exclude mice with duplicated DNA:
duplicated = read.table("/home/nicod/Duplicated_list_DNA_20Nov13.txt")
length(duplicated[,1])
# [1] 8     # 8 mice with duplicated DNA (actually one has also sex mix-up (59.0h)

# Exclude "unrelated" mice:
unrelated = read.table("/home/nicod/Unrelated_list_DNA_10Dec13.txt")
length(unrelated[,1])
# [1] 2     # only (isolated) 2 mice that look "unrelated" in the IBD plt. Outliers to me: remove

# Then the mice with poor correlation with the Affy data:
poor_affy = read.delim("/home/nicod/Poor_Affy_Correlation_17Apr13.txt",header=F)
length(poor_affy[,1])
# [1] 4     # 4 mice with poor Affy correlation

# Exclude those that had mistakes in weight measures:
bad_weight = read.delim("/home/nicod/Edited_Weight_to_exclude_17Apr13.txt",header=F)
length(bad_weight[,1])
# [1] 287

# Exclude those that had difference in weight measures between Scheduling and Exports/Anonymus:
diff_weight = read.delim("/home/nicod/Weight_changes_Scheduling_368mice_25Apr13.txt")
length(diff_weight[,1])
# [1] 368

# Exclude those that had high mean residuals between 20 phenotypes:
high_resid = read.delim("/home/nicod/High_residuals_2May13.txt",header=F)
length(high_resid[,1])
# [1] 202

exclude=c(as.character(wrong_sex[,1]),as.character(duplicated[,1]))     #,as.character(unrelated[,1]),as.character(poor_affy[,1]),as.character(high_resid[,1]),as.character(bad_weight[,1]),as.character(diff_weight[,1]))

set = data$Animal_ID %in% unique(exclude)     # I use "unique" in case some mice are excluded for more than 1 reason
print(paste("Number of mice removed from the analysis (mix-up):",length(set[set==TRUE])))
# [1] 15

for (i in measures){
     data[set,as.character(i)]=NA     # Replace measures made with mixed-up mice with NA
     }

## Select One Gender Only (males or females) ##

# data=data[data$Sex=="M",]
# dim(data)
# [1] 1065  711

# data=data[data$Sex=="F",]
# dim(data)
# [1] 1052  711

data_backup = data     # Backup of the table at that point in case something downstream goes wrong

## Test all covariates one after the other ##
list_of_significant=list()
results=NULL
models=NULL
listing=NULL
tissues_dependent <- c("WH","Weight","Micronucleus","Bioch","Haem","Serotonin","FACS","Adrenals","Mito")     # Lists the phenotypes that use tissues
for(i in 1:length(measures)){
    m = measures[i]
    phen = unlist(strsplit(as.character(m),"\\."))[1]               # this splits the "measure" in the strings separated by "." and select the first one
    if (phen %in% tissues_dependent){
          # If using tissues then I also test Diss covariates
          # By listing the covariates this way I will test the "General" first, then the "Tissue" and finally the "test-specific" covariates (in that order)
          covariates = c(as.vector(list_cov[,2][list_cov$Test=="General"]),as.vector(list_cov[,2][list_cov$Test=="Diss"]),as.vector(list_cov[,2][list_cov$Test==phen]))     

    } else {
          # If not from tissues I don't need to include the dissection covariates
          covariates = c(as.vector(list_cov[,2][list_cov$Test=="General"]),as.vector(list_cov[,2][list_cov$Test==phen]))    
    }
	
	### Change the order of covariates in Neurogenesis and Brain Anatomy tests:
	if (phen=="Brain") covariates = unique(c("Brain.Protocol","Brain.Experimenter", "Brain.Section_Number",covariates))	# Protocol and Experimenter have massive effect on brain measures so I use this line so tehy are tested first as covariates (before Sex, Batch, etc)
	if (length(grep("DCX",m)==1)) covariates = unique(c("Neuro.Nb_slices_DCX",covariates))	# Same here: I test number of slices first as covariates before the rest
	if (length(grep("Ki67",m)==1)) covariates = unique(c("Neuro.Nb_slices_Ki67",covariates))	# Same here: I test number of slices first as covariates before the rest
	if (phen=="Neuro") covariates = unique(c("Neuro.Staining.batch","Neuro.Experimenter",covariates))	# Same here: I test Experimenter first as covariates before the rest
	### Add DNA concentration as first covariate for Mito and Telo measures:
	if (phen=="Mito") covariates = unique(c("Mito.Extracted_DNA_conc",covariates))	
	if (phen=="Telo") covariates = unique(c("Mito.Extracted_DNA_conc",covariates))	
	### Test "Light" first for the Sleep measures:
	if (phen=="Sleep") covariates = unique(c("Sleep.Light",covariates))	
	### Test "Date.Year" first (year when the measure was done) for the Serotonin measures (because 2014 was done much later):
	if (phen=="Serotonin") covariates = unique(c("Serotonin.Date.Year",covariates))	
	### Remove "Weight.Avearge" for the weight measures (sort of obvious):
	if (phen=="Weight") covariates = covariates[covariates!="Body.Weight"]
	### For muscles measures: change body weight by tibia length:
	if (phen=="Muscles"){
		if (m!="Muscles.Tibia.mm") covariates = gsub ("Body.Weight","Tibia.Length",covariates) 		
		} 				
    group = NULL
    for(c in covariates){
            test = c(group,c)
            cc=complete.cases(data[,c(as.character(m),test)])
            if (length(unique(data[cc,c]))>1){          # select only the covariates here there is more than 1 value
                 test_1 = c(1,test)
                 group_1 = c(1,group)
                 if (length(cc[cc==TRUE])>0.8*length(data[,as.character(m)][!is.na(data[,as.character(m)])])){     # test covariates only if present in >70% measures
                      formula_0 =paste(m, "~", paste(group_1,collapse=" + "),sep="")
                      formula_1 =paste(m, "~", paste(test_1,collapse=" + "),sep="")
                      a=anova(lm(formula_0,data=data[cc,]),lm(formula_1,data=data[cc,]))
                      pvalue=a[2,6]
                      RSS=a[2,2]
                      TSS=a[1,2]
                      FSS=TSS-RSS
                      fraction_explained=FSS/TSS
                      percents=fraction_explained*100
                 if (!is.na(pvalue)){
                      if (pvalue<0.05) {                                             # Keeps only significant covariates
                          list_of_significant[[paste(m,c,sep='_')]]=a
                          results=rbind(results,c(as.character(m),c,pvalue,fraction_explained,percents))
                               if (percents>=1){                                    # Keeps only covariates which effect size is above 0.1%
                                   group = test
                               }
                      }    
                 }
                 }
           }
    }
    ################################
    # Normal option: residuals added to the data table
    ################################
	if (length(grep("Macrophages",m))==1) group = NULL		# I don't apply correction for qPCR data (Macrophages)
	if (length(grep("RNA",m))==1) group = NULL				# I don't apply correction for qPCR data (RNA)
    resid.name = paste(m,".resid",sep="")         
    best_formula = paste(m, "~", paste(c(1,group),collapse=" + "),sep="")     	# The best formula has to be rewritten here as formula_0 might
                                                                                                               # be erroneus (in case the last covariate has an effect)
    cc=complete.cases(data[,c(as.character(m),group)])                          # cc needs to be redefined in case the last covariate tested is rejected from the model
    if (length(grep("Batch",best_formula))==1){
    	best_formula=gsub("Batch","(1|Batch)",best_formula)						# Introduce Batch as random effect if present 
	}
    if (length(grep("Staining.batch",best_formula))==1){
    	best_formula=gsub("Neuro.Staining.batch","(1|Neuro.Staining.batch)",best_formula)						# Introduce Batch as random effect if present 
	}
	if (length(grep("[|]",best_formula))==1){
		data[cc,resid.name] = resid(lmer(best_formula,data=data[cc,]))          # calculate residuals of the best mixed model and add to table
		} else {
			data[cc,resid.name] = resid(lm(best_formula,data=data[cc,]))        # calculate residuals of the best linear model and add to table
		}
    models = c(models,best_formula)
    listing = rbind(listing,c(as.character(m),paste(group,collapse=",")))
    #################################

##############################################
# Alternative option: a new (small) table is generated for each residuals
##############################################
#best_formula = paste(m, "~", paste(c(1,group),collapse=" + "),sep="")
#models = c(models,best_formula)
#listing = rbind(listing,c(as.character(m),paste(group,collapse=",")))
#r=resid(lm(best_formula,data=data))
#ids=as.character(data$Animal_ID[as.numeric(names(r))])
#table=data.frame(cbind(ids,r))
#table=cbind("n/a",table)
#names(table)=c("Family","Individual","Phenotype")
#file_name=paste("/data/scratch/nicod/residual_tables_males_only_27May13/",m,".resid_pheno.txt",sep="")
#write.table(table,file=file_name,sep="\t",quote=F,row.names=F)
##############################################
}

write.table(models,"models_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F)
write.table(listing,"listing_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F)
write.table(results,"results_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F)

## Remove outliers on the residuals (one more time) ##
for (i in measures){                                                     
      y = as.character(paste(i,".resid",sep=""))
      x =data[,y]
      out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3     # outliers defined as measures outside 3 SD
      x1<-replace(x,out,NA)
      data[,y]<-x1
      }

dim(data)
# [1] 2117 1035

### I don't calculate PCAs
#tests=c("Hypoxia","Brain","Cardio","FC","Sleep","SPPI","Voc","Anxiety")
out=data
#for (t in tests){
#	 dt=data[,grep(".resid",names(data))]
#	 dt=dt[,-grep("Weight",names(dt))]       # Remove "weight" meaasures because the column names also contain "Hypoxia","Neo", etc
#     if(t=="Anxiety"){
#          dt=dt[,c(grep("EPM",names(dt)),grep("OFT",names(dt)),grep("Neo",names(dt)))]         
#          } else {
#               dt=dt[,grep(t,names(dt))]
#          }
#     pca=prcomp(dt[complete.cases(dt),],scale=TRUE)
#     important=length(summary(pca)$importance[3,][summary(pca)$importance[3,]<0.9])+1     # Number or PCAs explaining more than 90% of variance
#     x=data.frame(pca$x[,c(1:important)])
#     names(x)=paste(t,".",names(x),".resid",sep="")
#     #ids=data$Animal_ID[complete.cases(dt)]
#     x=cbind(data$Animal_ID[complete.cases(dt)],x)
#     names(x)[1]="Animal_ID"
#     out=merge(out,x,by="Animal_ID",all.x=T,all.y=F)
#}

## Save table (residuals) for QTL mapping ##
write.table(out,"Harwell_all_measures_resids_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F)