#Install and load the required library library(glmnet) phenotype="HET_2" #Column name of the phenotype to be analysed id.column="line_name" #Column name with sample names (must match sample names of genotype data) phenfile<-read.table("DATA/PHENOTYPES/NDM_phenotypes.tsv", header=TRUE, stringsAsFactors=FALSE) pruned_traw_file<-read.table("DATA/MAGIC_IMPUTED_PRUNED/MAGIC_imputed.pruned.traw", header=TRUE, stringsAsFactors=FALSE) #imputed MAGIC RIL genotypes in traw format, after LD pruning with PLINK option --indep-pairwise 500 10 0.99 #Format datasets for genomic prediction #Extract phenotype data for the MAGIC lines only founderNames=c("Banco", "Bersee", "Brigadier", "Copain", "Cordiale", "Flamingo", "Gladiator", "Holdfast", "Kloka", "MarisFundin", "Robigus", "Slejpner", "Soissons", "Spark", "Steadfast", "Stetson") phenodata <- phenfile[-which(phenfile$line_name %in% founderNames),] genodata <- as.data.frame(t(pruned_traw_file[,-c(1:6)])) #impute genotype and phenotype data with the mean. NAtomean <- function(x){replace(x, is.na(x), mean(x, na.rm = TRUE))} imputed.phenodata<-replace(phenodata, TRUE, lapply(phenodata, NAtomean)) imputed.genodata<-data.matrix(replace(genodata, TRUE, lapply(genodata, NAtomean))) #match the sample names in phenodata and genodata rownames(imputed.genodata)<-sub("\\_.*", "", rownames(imputed.genodata)) imputed.genodata<-imputed.genodata[imputed.phenodata$line_name,] #Split data into training and test set foldid=sample(rep(seq(10),length=454)) subset<-sort(sample(1:504, 50, FALSE)) test.geno<-imputed.genodata[subset,] train.geno<-imputed.genodata[-subset,] test.pheno<-imputed.phenodata[subset,] train.pheno<-imputed.phenodata[-subset,] #Fit model with cross validation and make predictions #Fit ridge regression for HET_2 cv.fit.rr<-cv.glmnet(train.geno, train.pheno$HET_2, foldid=foldid, keep=TRUE, nfolds=10, alpha=0) #Extract coefficients coef.rr<-predict(cv.fit.rr, s = cv.fit.rr$lambda.min, type = "coefficients") fullfit.rr<- coef.rr[1] + (train.geno %*% coef.rr[-1]) pred.rr<- coef.rr[1] + (test.geno %*% coef.rr[-1]) #correlate predictions to test and training phenotype values corr.pred.rr<-cor(test.pheno$HET_2, pred.rr[,1]) corr.fullfit.rr<-cor(train.pheno$HET_2, fullfit.rr) #Fit Elastic net cv.fit.elnet<-cv.glmnet(train.geno, train.pheno$HET_2, foldid=foldid, keep=TRUE, nfolds=10, alpha=0.5) coef.elnet<-predict(cv.fit.elnet, s = cv.fit.elnet$lambda.min, type = "coefficients") fullfit.elnet<- coef.elnet[1] + (train.geno %*% coef.elnet[-1]) pred.elnet<- coef.elnet[1] + (test.geno %*% coef.elnet[-1]) corr.pred.elnet<-cor(test.pheno$HET_2, pred.elnet[,1]) corr.fullfit.elnet<-cor(train.pheno$HET_2, fullfit.elnet[,1]) #Fit LASSO cv.fit.lasso<-cv.glmnet(train.geno, train.pheno$HET_2, foldid=foldid, keep=TRUE, nfolds=10, alpha=1) coef.lasso<-predict(cv.fit.lasso, s = cv.fit.lasso$lambda.min, type = "coefficients") fullfit.lasso<- coef.lasso[1] + (train.geno %*% coef.lasso[-1]) pred.lasso<- coef.lasso[1] + (test.geno %*% coef.lasso[-1]) corr.pred.lasso<- cor(test.pheno$HET_2, pred.lasso[,1]) corr.fullfit.lasso<- cor(train.pheno$HET_2, fullfit.lasso[,1])