################################################## # This is the script to process all pehnotypes and generate the residuals and different tables. # # The code can be edited to adjust for specific requirements. # # ########### Latest modifications (26Jun15) ############# # # - Remove all Brain, Cardio.Echo, Telomeres and Mitochondria measues # - Remove Neuro.DCX_corr as difficult to argue without functional backup # - Remove EPM "Middle" measures # - Remove the "Fibro" measure # - Added a few SPPI measures (those requested by Chicago) # - Limited PAS measures to 3: First5, Last10 and Total_Activity # - Get Neophagia measure back # # ####### Below notes to previous version (13Feb15): ###### # # Changes made in this latest version: # - Updated neurogenesis and ears measures (all done) # - design Neuro.Staining.batch as a factor # - I've changed the order of covariates in Serotonin measures to start with "Date.Year" (because the latest measures made in 2014 where higher than previous in 2011 and 2012) # - Updated brain anatomy measures (more done and DG length is complete) # - I don't remove outliers for the BMC Mode, Mean and Median measures because I want them to INCLUDE the outliers (osteopetrosis) # - Restrict list of measures to 249 # # It turns out that it is better to use 1st quality data for some of the tests while others can do with all so I specify the dataset to be used # for each test. In addition I include weight as a covariate for all measures except Body.Weight (obvious) and Muscles weight where I use tibia length. # That should really be the last set of phenotypes for mapping. # # - "Ultimate" list of phenotypes for the main article # - I've added "Body.Weight" as a covariate copying "Weight.Average" # - I use Weight.Average as covariate except for weight measures (obvious) and Muscle weights where I use tibia length # - Removed the following measures that didn't made much sense: # - "Moving" in EPM and OFT # - "_Corr" in Hypoxia. I now include Body.Weight as covariate so this corrected measure doesn't make sense anymore # - "BW", for body weight, in Echo (cardio). Same as above: this corrected measure is redundant given the fact I introcude Body.Weight in the linear model # - "Ratio" calculated in EPM: never pointed to any QTL so no need to add more measures # - "_best" in Neuro: Nasrin is spending time checking if it is better to use only measures with all sections but for the tiume being I drop them # - I've changed the order of covariates in Sleep measures to start with "Sleep.Light" (the animals from the first batches where light started 1 hour earlier or later) # - I've created a "Tibia.Length" covariate (copying "Muscles.Tibia.mm") that I only use for the muscles weight measures # - I now include new BMC measures (".N") with only "healthy" bones (~normal) exculding those with osteopetrosis # - I will test FC with measures from Boxes 2 and 3 only (".Q"). I lose half of the measures but boxes 1 and 4 measures were very skewed. It might actually help # # ##### Below older notes: ##### # # - Added a new Brain.Anatomy measure: HippoCort_Ratio where I correct Hippocampus size by the cortical thickness # - I exclude all qPCR measures (mine and Jacques) # - I don't calculate any Principal Components (we won't include them in the paper) # - Brain and Neuro measures: - I've altered the order of covariates so Experimenter and Protocol are tested first # - I am now using IDs checked by Chris Jungerius: there were quite a lot of changes from before # - Mito and Telo measures: DNA concentration is tested first as covariate before the others # - Removes data flagged as "bad" (Data.Quality=0) # - I include the latest measures of MitDNA produced by Cai Na (but still only DNA extracted from tails) # - I remove the 4 pairs of animals that have the same DNA (duplicates) # - FC measures are first normalized (standardized) per box before calculating residuals (so outliers are removed in each box separately). # - (PCAs are NOT included as covariates) # # Jérôme Nicod, 13 Feb 15 ################################################## setwd("/home/nicod/final_12Mar13/") library(lme4) normalize <- function(values){ # Jonathan's 'normalize' R function n <- length(values)+1; r <- rank(values)/n; qnorm(r) } ### Get data ### # First I import (copy) all the "final" phenotypes tables from each test that I've done recently files =list.files(full.name=FALSE, pattern="_final_") print(paste("number of files:",length(files))) # [1] 31 # including Pca but I will remove them later # Checking all tables: for (f in files){ table = read.delim(f) cat(f,dim(table),"\n") # dim() will show me if any has duplicates (length>2117) } # This are the tests where I onl keep 1st quality data: quality_data=c("Adrenals","Bioch","Fibro","Haem","WH","Brain","EPM","Neo","Serotonin") # Then I create a unique table with all measures. I first have to change the names of the columns so it includes the test (FACS, Bioch, etc): data = read.delim("Adrenals_final_2Apr14.txt")[,1:3] # Get the first 3 columns (Anima_ID, Batch and Sex) for the 2117 mice list=data.frame() for (f in files){ table = read.delim(f) if (unlist(strsplit(f,"_"))[1] %in% quality_data){ table = table[table$Data.Quality==1,] # Only keep 1st quality data } else { table = table[table$Data.Quality!=0,] # Removes data flagged as "bad quality" } drops <- c("Batch","Sex") table = table[,!(names(table) %in% drops)] # Removes "Batch" and "Sex" columns if present test = unlist(strsplit(f,"_final_"))[1] # select the string of characters before "_final_" -> name of the test (Adrenals, Bioch, etc) for (n in 1:length(names(table))){ if (names(table)[n]!="Animal_ID"){ names(table)[n]=paste(test,".",names(table)[n],sep="") } } data = merge(data,table,by="Animal_ID",all.x=T,all.y=F) # was all.y=T but I can't with PCA because some of the IDs are edited (.bis) info = cbind(data.frame(test),data.frame(names(table)[names(table)!="Animal_ID"])) # Generates a list of all column names list = rbind(list,info) } names(list)=c("Test","Data") # Here I deal with the body weight measures # First I copy the Weight.Average in this new column ("Body.Weight") that I will use as covariate: data$Body.Weight=data$Weight.Average # Then I only keep the measured (not the calculated) and 1st quality weight measures for mapping: weight_measures=c("Weight.Neo","Weight.Startle","Weight.Hypo","Weight.ECG","Weight.Diss") for (w in weight_measures){ w_source=paste(w,".source",sep="") cc=complete.cases(data[,c(w,w_source)]) data[data[,w_source]=="c" & data$Weight.Data.Quality>1 & cc,w]=NA } # Finally I only keep 1st quality data for average weight and BMI: for (i in c("Weight.BMI.body","Weight.BMI.tibia","Weight.Average")){ data[data$Weight.Data.Quality>1 & complete.cases(data[,i]),i]=NA } # I also create a "Tibia.Length" covariate column by copying the Muscle.Tibia.mm measure: data$Tibia.Length=data$Muscles.Tibia.mm # This covariate is "skipped" as it is only used for the Muscles weight measures where I specify it # write.table(list,"List_data_2Apr13.txt",sep="\t",quote=F,row.names=F) # I change the format of Neuro.Stainin.batch so it's considered a factor (default is int): data$Neuro.Staining.batch=as.factor(data$Neuro.Staining.batch) # I change the format of the "Month" so it works with later calculations (delay for instance). for (i in names(data)[grep("Month",names(data))]){ data[i]<-replace(data[i],data[i]==1,"Jan") data[i]<-replace(data[i],data[i]==2,"Feb") data[i]<-replace(data[i],data[i]==3,"Mar") data[i]<-replace(data[i],data[i]==4,"Apr") data[i]<-replace(data[i],data[i]==5,"May") data[i]<-replace(data[i],data[i]==6,"Jun") data[i]<-replace(data[i],data[i]==7,"Jul") data[i]<-replace(data[i],data[i]==8,"Aug") data[i]<-replace(data[i],data[i]==9,"Sep") data[i]<-replace(data[i],data[i]==10,"Oct") data[i]<-replace(data[i],data[i]==11,"Nov") data[i]<-replace(data[i],data[i]==12,"Dec") } ## Get list of measures and covariates ## list_origin = read.delim("List_data_18Apr13.txt") # Annotated list of all measures, covariates and other columns (info). It also contains 2 more rows with "Sex" and "Batch" # This latest version(18Apr13) contains the last 2 Macrophages qPCR measures: Ccle and Vegaf # Also contains PCA1-20 added on 15May13 but they are now all flagged as "skip" ## Remove qPCR tests from the table (RNA and Macrophages) list_origin=list_origin[-which(list_origin$Test=="RNA"),] list_origin=list_origin[-which(list_origin$Test=="Macrophages"),] dim(list_origin) # [1] 774 7 ## To check if any of the columns from the _final_ tables are NOT present in the annotated table: print("Number of columns absent in the annotated table:") print(list$Data[is.na(match(list$Data,list_origin$Data))]) #### All the Macrophages and RNA that I've removed: # [1] Macrophages.Il12 Macrophages.Tnfa Macrophages.IL6 # [4] Macrophages.IL10 Macrophages.Nos2 Macrophages.Nlrp3 # [7] Macrophages.Tgfb1 Macrophages.Ccl2 Macrophages.Vegaf #[10] Macrophages.Data.Quality RNA.Alpl.Liver RNA.Scarb1.Liver #[13] RNA.Rap1gap.Liver RNA.Data.Quality ## To remove PCAs from the list of covariates to test: list_origin$Skip[grep("Pca.",list_origin$Data)]="skip" ## To remove "Batch" and all "Date" from the list of covariates to test: # list_origin$Skip[list_origin$Data=="Batch"]="skip" # list_origin$Skip[grep("Date",list_origin$Data)]="skip" ## To include Weight in the list of covariates # list_origin$Group[list_origin$Data=="Weight.Average"]="c" # list_origin$Test[list_origin$Data=="Weight.Average"]="General" # And then I remore the other weights from the analysis: # list_origin=list_origin[-which(list_origin$Test=="Weight"),] list_clean=list_origin[list_origin$Skip!="skip",] # removes covariates I've decided to skip (comments, etc) ## Rank normalize the FC measures per box (after removing outiers): fc_measures=list_clean$Data[list_clean$Group=="m"][grep("FC.",list_clean$Data[list_clean$Group=="m"])] # But NOT the ".Q" values with only Boxes 2 and 3: fc_measures=fc_measures[!fc_measures %in% fc_measures[grep(".Q",fc_measures)]] fc_box=paste("Box",1:4,sep="") for (b in fc_box){ for (i in fc_measures){ x =data[data$FC.Box.Cue==b & !is.na(data[names(data)==i]),names(data)==i] if(length(x)>0){ out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3 # outliers defined as measures outside 3 SD x1<-replace(x,out,NA) # remove outliers x2<-normalize(x1) # normalize data[data$FC.Box.Cue==b & !is.na(data[names(data)==i]),names(data)==i]<-x2 } } } ## Table with covariates to test only ## list_cov = list_clean[list_clean$Group=="c",] list_cov=list_cov[-grep(".Task.Cage",list_cov$Data),] # remove "Task.Cage" from the list of covariates to test (too strong effect) print(paste("list of covariates:",dim(list_cov))) # [1] 185 7 # would be 200 with PCA ## Check the .Hour measures ## # I want hours in 24h format but some might be in 12h (AM/PM) format. Anything between 1 and 6 should be transformed to 13-18 list_hour = list[grep(".Hour",list$Data),] print(dim(list_hour)) # [1] 14 2 for (l in list_hour$Data){ print(l) print(table(data[,l])) } # The only changes required are in Diss.CullTime.Hour: 18 mice have Hour = 1 to be change to 13: data$Diss.CullTime.Hour[data$Diss.CullTime.Hour==1]=13 ## Check the .Minutes measures ## # Here I check that all the .Minutes are between 0 and 59: list_min = list[grep(".Minute",list$Data),] dim(list_min) # [1] 14 2 for (l in list_min$Data){ cat(l," ",min(data[,l],na.rm=T)," ",max(data[,l],na.rm=T),"\n") #print(table(data[,l])) } # And all are correct print(paste("size of data before calculating covariates:",dim(data))) # [1] 2117 727 ## Calculate covariates ## # Gives numerical value to batch order: data$Diss.Project.Order = as.numeric(gsub("OBT","",data$Batch)) # Calculate time sitting on ice to check if it has any influence on Biochemistry d.hour = data$Diss.CullTime.Hour d.min = data$Diss.CullTime.Minute d.time = strptime(paste(d.hour,d.min),"%H %M") b.hour = data$Diss.BloodSplit.Time.Hour b.min = data$Diss.BloodSplit.Time.Minute b.time = strptime(paste(b.hour,b.min),"%H %M") data$Bioch.BloodSplit.Time.On_Ice = as.numeric(difftime(b.time,d.time)) # surprisingly there are a few values<0 data$Bioch.BloodSplit.Time.On_Ice[data$Bioch.BloodSplit.Time.On_Ice<0]=NA # I remove the negative values that MUST be mistakes # To determine which samples have been frozen before Biochemistry diss.d = data$Diss.Date.Day diss.m = data$Diss.Date.Month diss.y = data$Diss.Date.Year diss.s = as.Date(paste(diss.d,diss.m,diss.y),"%d %b %Y") # Day of dissection test="Bioch.Date" d=data[,paste(test,".Day",sep="")] m=data[,paste(test,".Month",sep="")] y=data[,paste(test,".Year",sep="")] s = as.Date(paste(d,m,y),"%d %b %Y") # Day the measures were made diff = as.numeric(difftime(s,diss.s)) # Difference between dates: difftime diff[diff>0] = "Frozen" # If the difference in time between diss. and bioch>0 it mens samples have been frozen diff[diff==0] = "Fresh" # If no difference, samples were processed Fresh data$Bioch.Frozen = diff # Batches 8 and 55 are Frozen, all others Fresh cov_calc = list_cov[list_cov$Calculated=="calculated",] # lists covariates that need to be calculated for (b in cov_calc$Data){ if (grepl(b,pattern=".Delay")){ # Calculate the number of days between dissection and measuring. test = gsub(b,pattern=".Delay",replacement="") # Gives back the name of the test/phenotype diss.d = data$Diss.Date.Day diss.m = data$Diss.Date.Month diss.y = data$Diss.Date.Year diss.s = strptime(paste(diss.d,diss.m,diss.y),"%d %b %Y") d=data[,paste(test,".Date.Day",sep="")] m=data[,paste(test,".Date.Month",sep="")] y=data[,paste(test,".Date.Year",sep="")] s = strptime(paste(d,m,y),"%d %b %Y") # also possible: s = as.Date(paste(d,m,y),"%d %b %Y") data[,b] = as.numeric(difftime(s,diss.s)) # Gives difference in days between dates } if (grepl(b,pattern=".Day_of_Year")){ # I transform the date (Day/Month) into a continuous value to test its effect test = gsub(b,pattern=".Day_of_Year",replacement="") d=data[,paste(test,".Day",sep="")] m=data[,paste(test,".Month",sep="")] s = strptime(paste(d,m),"%d %b")$yday+1 # give the Julian date number: %b is for the month as a 3-letter. %m for the month as a 1-12 data[,b] =sin(s*pi/366) # 1 < s < 366 and I want to transform to 0 < s < 1 with the sin() function } if (grepl(b,pattern=".Hour.Minute")){ # Calculate a more precise version of the hour by adding the minutes test = gsub(b,pattern=".Hour.Minute",replacement="") hour = data[,paste(test,".Hour",sep="")] minutes = data[,paste(test,".Minute",sep="")] precise.hour=hour+minutes/60 # Adds minutes to the hour data[,b] = precise.hour } } ## Set defined covariates (in data table) as "factor" or "numeric"## for (a in 1:length(list_cov[,2])){ name = list_cov[a,2] if (is.na(list_cov[a,6])){ data[,as.character(name)]==as.factor(data[,as.character(name)]) } if(list_cov[a,6] == "numeric"){ data[,as.character(name)]==as.numeric(data[,as.character(name)]) } } print(paste("Size of data after calculating covariates:",dim(data))) # [1] 2117 787 ## List measures to test ## # Do not include the measures that are to be treated as.factor # Also some measures have been flagged with "skip" as we don't include them in the analysis anymore (26Jun15) measures = list_clean$Data[list_clean$Group=="m" & list_clean$Type!="as.factor" & list_clean$Skip!="skip"] print(paste("Number of measures:",length(measures))) # [1] 276 # Remove the "Moving" measures (EPM and OFT): not really relevant measures=measures[!measures %in% measures[grep("Moving",measures)]] # Remove the "Ratio" (distance) measures in EPM: measures=measures[!measures %in% measures[grep("eRatio",measures)]] # Remove the weight-corrected hypoxia measures: measures=measures[!measures %in% measures[grep("_Corr",measures)]] # Remove the weight-corrected cardiac (Echo) measures: measures=measures[!measures %in% measures[grep(".BW",measures)]] # Remove the "best" measures in Neuro (when all sections are present): Nasrin will check what is the best measures=measures[!measures %in% measures[grep("_best",measures)]] # Remove the "Echo" measures: measures=measures[!measures %in% measures[grep("Cardio.Echo",measures)]] # Remove the "EPM.Middle" measures: measures=measures[!measures %in% measures[grep("EPM.Middle",measures)]] print(paste("Number of measures:",length(measures))) # [1] 250 # Only keep 249 measures: #list_traits=read.delim("/home/nicod/List_249_quantitative_traits_28Jan15.txt",stringsAsFactors=F)[,1] #measures=measures[measures %in% list_traits] #print(paste("Number of measures:",length(measures))) ## Remove outliers (as in QC analysis) ## for (i in measures){ if (!i %in% c("BMC.Mode","BMC.Mean","BMC.Median")){ # I don't remove outliers in the BMC measures because they INCLUDE outliers (osteopetrosis) x =data[,names(data)==i] out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3 # outliers defined as measures outside 3 SD x1<-replace(x,out,NA) data[,names(data)==i]<-x1 } } data_backup = data # Backup of the table at that point in case the calculation of the residuals goes wrong ## Remove measures for mice suspected of being mixed-up ## # First the mice identified with the wrong gender: wrong_sex = read.delim("/home/nicod/Wrong_Gender_To_Exclude_27Feb13.txt") length(wrong_sex[,1]) # [1] 8 # 8 mice with sex mix-up # Exclude mice with duplicated DNA: duplicated = read.table("/home/nicod/Duplicated_list_DNA_20Nov13.txt") length(duplicated[,1]) # [1] 8 # 8 mice with duplicated DNA (actually one has also sex mix-up (59.0h) # Exclude "unrelated" mice: unrelated = read.table("/home/nicod/Unrelated_list_DNA_10Dec13.txt") length(unrelated[,1]) # [1] 2 # only (isolated) 2 mice that look "unrelated" in the IBD plt. Outliers to me: remove # Then the mice with poor correlation with the Affy data: poor_affy = read.delim("/home/nicod/Poor_Affy_Correlation_17Apr13.txt",header=F) length(poor_affy[,1]) # [1] 4 # 4 mice with poor Affy correlation # Exclude those that had mistakes in weight measures: bad_weight = read.delim("/home/nicod/Edited_Weight_to_exclude_17Apr13.txt",header=F) length(bad_weight[,1]) # [1] 287 # Exclude those that had difference in weight measures between Scheduling and Exports/Anonymus: diff_weight = read.delim("/home/nicod/Weight_changes_Scheduling_368mice_25Apr13.txt") length(diff_weight[,1]) # [1] 368 # Exclude those that had high mean residuals between 20 phenotypes: high_resid = read.delim("/home/nicod/High_residuals_2May13.txt",header=F) length(high_resid[,1]) # [1] 202 exclude=c(as.character(wrong_sex[,1]),as.character(duplicated[,1])) #,as.character(unrelated[,1]),as.character(poor_affy[,1]),as.character(high_resid[,1]),as.character(bad_weight[,1]),as.character(diff_weight[,1])) set = data$Animal_ID %in% unique(exclude) # I use "unique" in case some mice are excluded for more than 1 reason print(paste("Number of mice removed from the analysis (mix-up):",length(set[set==TRUE]))) # [1] 15 for (i in measures){ data[set,as.character(i)]=NA # Replace measures made with mixed-up mice with NA } ## Select One Gender Only (males or females) ## # data=data[data$Sex=="M",] # dim(data) # [1] 1065 711 # data=data[data$Sex=="F",] # dim(data) # [1] 1052 711 data_backup = data # Backup of the table at that point in case something downstream goes wrong ## Test all covariates one after the other ## list_of_significant=list() results=NULL models=NULL listing=NULL tissues_dependent <- c("WH","Weight","Micronucleus","Bioch","Haem","Serotonin","FACS","Adrenals","Mito") # Lists the phenotypes that use tissues for(i in 1:length(measures)){ m = measures[i] phen = unlist(strsplit(as.character(m),"\\."))[1] # this splits the "measure" in the strings separated by "." and select the first one if (phen %in% tissues_dependent){ # If using tissues then I also test Diss covariates # By listing the covariates this way I will test the "General" first, then the "Tissue" and finally the "test-specific" covariates (in that order) covariates = c(as.vector(list_cov[,2][list_cov$Test=="General"]),as.vector(list_cov[,2][list_cov$Test=="Diss"]),as.vector(list_cov[,2][list_cov$Test==phen])) } else { # If not from tissues I don't need to include the dissection covariates covariates = c(as.vector(list_cov[,2][list_cov$Test=="General"]),as.vector(list_cov[,2][list_cov$Test==phen])) } ### Change the order of covariates in Neurogenesis and Brain Anatomy tests: if (phen=="Brain") covariates = unique(c("Brain.Protocol","Brain.Experimenter", "Brain.Section_Number",covariates)) # Protocol and Experimenter have massive effect on brain measures so I use this line so tehy are tested first as covariates (before Sex, Batch, etc) if (length(grep("DCX",m)==1)) covariates = unique(c("Neuro.Nb_slices_DCX",covariates)) # Same here: I test number of slices first as covariates before the rest if (length(grep("Ki67",m)==1)) covariates = unique(c("Neuro.Nb_slices_Ki67",covariates)) # Same here: I test number of slices first as covariates before the rest if (phen=="Neuro") covariates = unique(c("Neuro.Staining.batch","Neuro.Experimenter",covariates)) # Same here: I test Experimenter first as covariates before the rest ### Add DNA concentration as first covariate for Mito and Telo measures: if (phen=="Mito") covariates = unique(c("Mito.Extracted_DNA_conc",covariates)) if (phen=="Telo") covariates = unique(c("Mito.Extracted_DNA_conc",covariates)) ### Test "Light" first for the Sleep measures: if (phen=="Sleep") covariates = unique(c("Sleep.Light",covariates)) ### Test "Date.Year" first (year when the measure was done) for the Serotonin measures (because 2014 was done much later): if (phen=="Serotonin") covariates = unique(c("Serotonin.Date.Year",covariates)) ### Remove "Weight.Avearge" for the weight measures (sort of obvious): if (phen=="Weight") covariates = covariates[covariates!="Body.Weight"] ### For muscles measures: change body weight by tibia length: if (phen=="Muscles"){ if (m!="Muscles.Tibia.mm") covariates = gsub ("Body.Weight","Tibia.Length",covariates) } group = NULL for(c in covariates){ test = c(group,c) cc=complete.cases(data[,c(as.character(m),test)]) if (length(unique(data[cc,c]))>1){ # select only the covariates here there is more than 1 value test_1 = c(1,test) group_1 = c(1,group) if (length(cc[cc==TRUE])>0.8*length(data[,as.character(m)][!is.na(data[,as.character(m)])])){ # test covariates only if present in >70% measures formula_0 =paste(m, "~", paste(group_1,collapse=" + "),sep="") formula_1 =paste(m, "~", paste(test_1,collapse=" + "),sep="") a=anova(lm(formula_0,data=data[cc,]),lm(formula_1,data=data[cc,])) pvalue=a[2,6] RSS=a[2,2] TSS=a[1,2] FSS=TSS-RSS fraction_explained=FSS/TSS percents=fraction_explained*100 if (!is.na(pvalue)){ if (pvalue<0.05) { # Keeps only significant covariates list_of_significant[[paste(m,c,sep='_')]]=a results=rbind(results,c(as.character(m),c,pvalue,fraction_explained,percents)) if (percents>=1){ # Keeps only covariates which effect size is above 0.1% group = test } } } } } } ################################ # Normal option: residuals added to the data table ################################ if (length(grep("Macrophages",m))==1) group = NULL # I don't apply correction for qPCR data (Macrophages) if (length(grep("RNA",m))==1) group = NULL # I don't apply correction for qPCR data (RNA) resid.name = paste(m,".resid",sep="") best_formula = paste(m, "~", paste(c(1,group),collapse=" + "),sep="") # The best formula has to be rewritten here as formula_0 might # be erroneus (in case the last covariate has an effect) cc=complete.cases(data[,c(as.character(m),group)]) # cc needs to be redefined in case the last covariate tested is rejected from the model if (length(grep("Batch",best_formula))==1){ best_formula=gsub("Batch","(1|Batch)",best_formula) # Introduce Batch as random effect if present } if (length(grep("Staining.batch",best_formula))==1){ best_formula=gsub("Neuro.Staining.batch","(1|Neuro.Staining.batch)",best_formula) # Introduce Batch as random effect if present } if (length(grep("[|]",best_formula))==1){ data[cc,resid.name] = resid(lmer(best_formula,data=data[cc,])) # calculate residuals of the best mixed model and add to table } else { data[cc,resid.name] = resid(lm(best_formula,data=data[cc,])) # calculate residuals of the best linear model and add to table } models = c(models,best_formula) listing = rbind(listing,c(as.character(m),paste(group,collapse=","))) ################################# ############################################## # Alternative option: a new (small) table is generated for each residuals ############################################## #best_formula = paste(m, "~", paste(c(1,group),collapse=" + "),sep="") #models = c(models,best_formula) #listing = rbind(listing,c(as.character(m),paste(group,collapse=","))) #r=resid(lm(best_formula,data=data)) #ids=as.character(data$Animal_ID[as.numeric(names(r))]) #table=data.frame(cbind(ids,r)) #table=cbind("n/a",table) #names(table)=c("Family","Individual","Phenotype") #file_name=paste("/data/scratch/nicod/residual_tables_males_only_27May13/",m,".resid_pheno.txt",sep="") #write.table(table,file=file_name,sep="\t",quote=F,row.names=F) ############################################## } write.table(models,"models_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F) write.table(listing,"listing_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F) write.table(results,"results_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F) ## Remove outliers on the residuals (one more time) ## for (i in measures){ y = as.character(paste(i,".resid",sep="")) x =data[,y] out<- abs(x-mean(x,na.rm=T))/sd(x,na.rm=T)>3 # outliers defined as measures outside 3 SD x1<-replace(x,out,NA) data[,y]<-x1 } dim(data) # [1] 2117 1035 ### I don't calculate PCAs #tests=c("Hypoxia","Brain","Cardio","FC","Sleep","SPPI","Voc","Anxiety") out=data #for (t in tests){ # dt=data[,grep(".resid",names(data))] # dt=dt[,-grep("Weight",names(dt))] # Remove "weight" meaasures because the column names also contain "Hypoxia","Neo", etc # if(t=="Anxiety"){ # dt=dt[,c(grep("EPM",names(dt)),grep("OFT",names(dt)),grep("Neo",names(dt)))] # } else { # dt=dt[,grep(t,names(dt))] # } # pca=prcomp(dt[complete.cases(dt),],scale=TRUE) # important=length(summary(pca)$importance[3,][summary(pca)$importance[3,]<0.9])+1 # Number or PCAs explaining more than 90% of variance # x=data.frame(pca$x[,c(1:important)]) # names(x)=paste(t,".",names(x),".resid",sep="") # #ids=data$Animal_ID[complete.cases(dt)] # x=cbind(data$Animal_ID[complete.cases(dt)],x) # names(x)[1]="Animal_ID" # out=merge(out,x,by="Animal_ID",all.x=T,all.y=F) #} ## Save table (residuals) for QTL mapping ## write.table(out,"Harwell_all_measures_resids_For_Article_New_26Jun15.txt",sep="\t",quote=F,row.names=F)