#--------------------------------------------------------- #' Assignment #' author: "Rita Castilho rcastil@ualg.pt" #' date: "AUGUST 2019" #--------------------------------------------------------- #--------------------------------------------------------- #--------------------------------------------------------- # 1.preparation #--------------------------------------------------------- #--------------------------------------------------------- # 1.1 install and load packages #--------------------------------------------------------- #[https://alexkychen.github.io/assignPOP/index.html] if(!require(assignPOP)){install.packages("assignPOP"); library(assignPOP)} if(!require(ggplot2)){install.packages("ggplot2"); library(ggplot2)} # set working directories # what is the name of the main root folder? root <- "~/Desktop/ices_course_R/" setwd(root) getwd() # where to retrieve data files from? # make a r.data folder on the root and place datafile there folder.data <- "~/Desktop/ices_course_R/r.data" filelist <- list.files(folder.data, full.names=F) filelist # where to place results directory folder.results <- "r.results" dir.create(paste(root,folder.results,sep="")) folder.results <- paste(root,folder.results,sep="") getwd() # load genepop datafile genepop <- read.Genepop(paste(folder.data,"cod_filtered.gen",sep="/"), pop.names=c("IcO","IcI","GrO","GrI"), haploid = FALSE) #--------------------------------------------------------- # save image (this procedure will be repeated throughout the script to avoid # re-runs in case of session abortion) save.image(paste(folder.results,file=paste(species,"assign.RData",sep="_"),sep="/")) #--------------------------------------------------------- # 3. evaluation of discriminatory power #--------------------------------------------------------- # train.inds = proportion of individuals (observations) from each # population to be used as training data. # train.loci = proportion of loci to be used as training data. # loci.sample = Locus sampling method, "fst" or "random". # if loci.sample="fst" (default) and train.loci=0.1, it means that top # 10 percent of high Fst loci will be sampled as training loci. # if loci.sample="random", then random 10 percent of loci will # be sampled as training loci. # model = which classifier to use for creating predictive models. # options include "lda", "svm", "naiveBayes", "tree", and "randomForest". # default is "svm"(support vector machine). assign.MC(genepop, train.inds=c(0.5, 0.7, 0.9), train.loci=c(0.1, 0.25, 0.5, 1), loci.sample="fst", # random iterations=2, #change 100 or 1000 pca.method="mixed", scaled=TRUE, model="svm", dir = paste(folder.results,"assign.MC.folder/",sep="/"), processors=4 ) save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData") #--------------------------------------------------------- # population assignment test using K-fold cross-validation #--------------------------------------------------------- # model = which classifier to use for creating predictive models. # options include "lda", "svm", "naiveBayes", "tree", and "randomForest". assign.kfold(genepop, k.fold=c(2, 3, 4), train.loci=c(0.1, 0.25, 0.5, 1), loci.sample="random", model="svm", dir = paste(folder.results,"assign.kfold.folder/",sep="/"), processors=3 ) save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData") #--------------------------------------------------------- # assignment accuracies of Monte-Carlo cross-validation results #--------------------------------------------------------- accuMC <- accuracy.MC(dir = paste(folder.results,"assign.MC.folder/",sep="/")) save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData") accuracy.plot(accuMC, pop = "all") ggplot1 <-accuracy.plot(accuMC, pop=c("all", "IcO","IcI","GrO","GrI")) + ylim(0, 1) + #Set y limit between 0 and 1 annotate("segment",x=0.4,xend=3.6,y=0.33,yend=0.33,colour="red",size=1) + #Add a red horizontal line at y = 0.33 (null assignment rate for 3 populations) ggtitle("Monte-Carlo cross-validation")+ #Add a plot title labs(subtitle = "Figure 1. Cod assignment accuracies estimated via Monte-Carlo cross-validation, with three levels of training individuals (50%, 70% and 90% of \nindividuals from each population, on x-axis) crossed by four levels of training loci (top 10%, 25% and 50% highest Fst loci and all loci in \ncolor-coded boxes) by 1000 resampling events.")+ scale_y_continuous(name="Proportion of cods correctly assigned")+ theme(plot.title = element_text(size=20, face="bold")) #Edit plot title text size ggplot1 ggsave(paste(folder.results,"/Monte-Carlo_cross-validation.png", sep=""),ggplot1,width=11, height=8.5) #--------------------------------------------------------- # assignment accuracies of K-fold cross-validation results #--------------------------------------------------------- accuKF <- accuracy.kfold(dir=paste(folder.results,"assign.kfold.folder/",sep="/")) #Use this function for K-fold cross-validation results ggplot2 <-accuracy.plot( accuKF, pop=c("all", "IcO","IcI","GrO","GrI")) + ylim(0, 1) + #Set y limit between 0 and 1 annotate("segment",x=0.4,xend=3.6,y=0.33,yend=0.33,colour="red",size=1) + #Add a red horizontal line at y = 0.33 (null assignment rate for 3 populations) ggtitle("K-fold cross-validation")+ #Add a plot title labs(subtitle = "Figure 2. Assignment accuracies estimated via K cross-validation") + scale_y_continuous(name="Proportion of cods correctly assigned")+ theme(plot.title = element_text(size=20, face="bold")) #Edit plot title text size ggplot2 ggsave(paste(folder.results,"/K-fold_cross-validation.png", sep=""),ggplot2,width=11, height=8.5) check.loci(dir = paste(folder.results,"assign.MC.folder/",sep="/"), top.loci = 20) #--------------------------------------------------------- # create a membership probability plot #--------------------------------------------------------- # estimatation of probabilities to understand how individuals # are assigned to the populations (similar to structure) membership.plot(dir = paste(folder.results,"assign.kfold.folder/",sep="/")) #--------------------------------------------------------- # print mean and standard deviation across assignment tests #--------------------------------------------------------- # default setting reads through all the files in the specified result folder assign.matrix( dir = paste(folder.results,"assign.kfold.folder/",sep="/")) # you also can specify certain results for printing the assignment matrix assign.matrix( dir = paste(folder.results,"assign.kfold.folder/",sep="/"), train.inds=c(0.7, 0.9), train.loci=c(0.5, 1)) #--------------------------------------------------------- # assign unknown individuals #--------------------------------------------------------- #import a GENEPOP file containing unknown individuals, #and ignore the argument `pop.names` Unknown <- read.Genepop(paste(folder.data,"unknown.gen",sep="/")) #--------------------------------------------------------- # perform assignment test #--------------------------------------------------------- # model = which classifier to use for creating predictive models. # options include "lda", "svm", "naiveBayes", "tree", and "randomForest". # Default is "svm"(support vector machine). assign.1 <- assign.X(x1=GenepopRd, x2=Unknown, dir=paste(folder.results,"assign.unknown.folder/",sep="/"), model="naiveBayes") #save plot pdf(file=paste(folder.results,"assign_unknown.pdf",sep="/"),width=11, height=5) plot(assign.1) dev.off() save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData") #--------------------------------------------------------- # end #--------------------------------------------------------- ######################################################################################## # For cookbook #library('rmarkdown') #rmarkdown::render("~/Desktop/ices_course_R/R.code/assign.R")