#---------------------------------------------------------
#' Assignment
#' author: "Rita Castilho rcastil@ualg.pt"
#' date: "AUGUST 2019"
#---------------------------------------------------------
#---------------------------------------------------------

#---------------------------------------------------------
# 1.preparation
#---------------------------------------------------------

#---------------------------------------------------------
# 1.1 install and load packages
#---------------------------------------------------------  

#[https://alexkychen.github.io/assignPOP/index.html]

if(!require(assignPOP)){install.packages("assignPOP"); library(assignPOP)}
if(!require(ggplot2)){install.packages("ggplot2"); library(ggplot2)}

# set working directories
# what is the name of the main root folder?
root <- "~/Desktop/ices_course_R/"
setwd(root)
getwd()

# where to retrieve data files from?
# make a r.data folder on the root and place datafile there
folder.data <- "~/Desktop/ices_course_R/r.data"
filelist <- list.files(folder.data, full.names=F)
filelist

# where to place results directory
folder.results <- "r.results"
dir.create(paste(root,folder.results,sep=""))
folder.results <- paste(root,folder.results,sep="")
getwd()

# load genepop datafile
genepop <- read.Genepop(paste(folder.data,"cod_filtered.gen",sep="/"), pop.names=c("IcO","IcI","GrO","GrI"), haploid = FALSE)

#---------------------------------------------------------
# save image (this procedure will be repeated throughout the script to avoid 
# re-runs in case of session abortion)
save.image(paste(folder.results,file=paste(species,"assign.RData",sep="_"),sep="/"))

#---------------------------------------------------------
# 3. evaluation of discriminatory power
#---------------------------------------------------------

# train.inds = proportion of individuals (observations) from each 
# population to be used as training data.
# train.loci = proportion of loci to be used as training data.
# loci.sample = Locus sampling method, "fst" or "random". 
# if loci.sample="fst" (default) and train.loci=0.1, it means that top 
# 10 percent of high Fst loci will be sampled as training loci. 
# if loci.sample="random", then random 10 percent of loci will 
# be sampled as training loci. 
# model = which classifier to use for creating predictive models. 
# options include "lda", "svm", "naiveBayes", "tree", and "randomForest". 
# default is "svm"(support vector machine).

assign.MC(genepop,
          train.inds=c(0.5, 0.7, 0.9), 
          train.loci=c(0.1, 0.25, 0.5, 1),
          loci.sample="fst", # random
          iterations=2, #change 100 or 1000
          pca.method="mixed", 
          scaled=TRUE, 
          model="svm", 
          dir = paste(folder.results,"assign.MC.folder/",sep="/"),
          processors=4
          )

save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData")

#---------------------------------------------------------
# population assignment test using K-fold cross-validation
#---------------------------------------------------------
# model = which classifier to use for creating predictive models. 
# options include "lda", "svm", "naiveBayes", "tree", and "randomForest".
assign.kfold(genepop,
             k.fold=c(2, 3, 4),
             train.loci=c(0.1, 0.25, 0.5, 1),
             loci.sample="random",
             model="svm",
             dir = paste(folder.results,"assign.kfold.folder/",sep="/"),
             processors=3
             )

save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData")

#---------------------------------------------------------
# assignment accuracies of Monte-Carlo cross-validation results
#---------------------------------------------------------

accuMC <- accuracy.MC(dir = paste(folder.results,"assign.MC.folder/",sep="/"))

save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData")

accuracy.plot(accuMC, pop = "all")
ggplot1 <-accuracy.plot(accuMC, 
              pop=c("all", "IcO","IcI","GrO","GrI")) +
              ylim(0, 1) + #Set y limit between 0 and 1
              annotate("segment",x=0.4,xend=3.6,y=0.33,yend=0.33,colour="red",size=1) + #Add a red horizontal line at y = 0.33 (null assignment rate for 3 populations)
              ggtitle("Monte-Carlo cross-validation")+ #Add a plot title
              labs(subtitle = "Figure 1. Cod assignment accuracies estimated via Monte-Carlo cross-validation, with three levels of training individuals (50%, 70% and 90% of \nindividuals from each population, on x-axis) crossed by four levels of training loci (top 10%, 25% and 50% highest Fst loci and all loci in \ncolor-coded boxes) by 1000 resampling events.")+
              scale_y_continuous(name="Proportion of cods correctly assigned")+         
              theme(plot.title = element_text(size=20, face="bold")) #Edit plot title text size

ggplot1
ggsave(paste(folder.results,"/Monte-Carlo_cross-validation.png", sep=""),ggplot1,width=11, height=8.5)

#---------------------------------------------------------
# assignment accuracies of K-fold cross-validation results
#---------------------------------------------------------
accuKF <- accuracy.kfold(dir=paste(folder.results,"assign.kfold.folder/",sep="/")) #Use this function for K-fold cross-validation results

ggplot2 <-accuracy.plot(
        accuKF, 
        pop=c("all", "IcO","IcI","GrO","GrI")) +
        ylim(0, 1) + #Set y limit between 0 and 1
        annotate("segment",x=0.4,xend=3.6,y=0.33,yend=0.33,colour="red",size=1) + #Add a red horizontal line at y = 0.33 (null assignment rate for 3 populations)
        ggtitle("K-fold cross-validation")+ #Add a plot title
        labs(subtitle = "Figure 2. Assignment accuracies estimated via K cross-validation") +
        scale_y_continuous(name="Proportion of cods correctly assigned")+
        theme(plot.title = element_text(size=20, face="bold")) #Edit plot title text size

ggplot2
ggsave(paste(folder.results,"/K-fold_cross-validation.png", sep=""),ggplot2,width=11, height=8.5)

check.loci(dir = paste(folder.results,"assign.MC.folder/",sep="/"), 
           top.loci = 20)

#---------------------------------------------------------
# create a membership probability plot
#---------------------------------------------------------
# estimatation of probabilities to understand how individuals 
# are assigned to the populations (similar to structure)
membership.plot(dir = paste(folder.results,"assign.kfold.folder/",sep="/"))

#---------------------------------------------------------
# print mean and standard deviation across assignment tests
#---------------------------------------------------------
# default setting reads through all the files in the specified result folder
assign.matrix( dir = paste(folder.results,"assign.kfold.folder/",sep="/"))

# you also can specify certain results for printing the assignment matrix
assign.matrix( dir = paste(folder.results,"assign.kfold.folder/",sep="/"), train.inds=c(0.7, 0.9), train.loci=c(0.5, 1))

#---------------------------------------------------------
# assign unknown individuals
#---------------------------------------------------------

#import a GENEPOP file containing unknown individuals, 
#and ignore the argument `pop.names` 
Unknown <- read.Genepop(paste(folder.data,"unknown.gen",sep="/"))

#---------------------------------------------------------
# perform assignment test
#---------------------------------------------------------

# model = which classifier to use for creating predictive models. 
# options include "lda", "svm", "naiveBayes", "tree", and "randomForest". 
# Default is "svm"(support vector machine).
assign.1 <- assign.X(x1=GenepopRd,
         x2=Unknown,
         dir=paste(folder.results,"assign.unknown.folder/",sep="/"),
         model="naiveBayes")

#save plot
pdf(file=paste(folder.results,"assign_unknown.pdf",sep="/"),width=11, height=5)
plot(assign.1)
dev.off()

save.image(file ="~/Desktop/ices_course_R/R.results/assign.RData")

#---------------------------------------------------------
# end
#---------------------------------------------------------

########################################################################################
# For cookbook
#library('rmarkdown')
#rmarkdown::render("~/Desktop/ices_course_R/R.code/assign.R")