#######################################
### R scripts for Classical MDS
### BEMO 2015
### R. Castilho || CCMAR || 2015-02-17

#######################################
### PREPARATION STEPS
# install packages
install.packages("clustsig")
library(clustsig)


#######################################
### LOAD DATA
# set working directory... change this to the directory in which the species' data is stored and where you want to save output
# setwd("D:/UTILIZADORES/user-en/Desktop/BEMO")
setwd("~/Desktop")
data.tab = read.table("data_andre.txt", header=TRUE, sep = "\t")
data.mat = as.matrix(data.tab[,1:13])

# The agglomeration method to be used:
# This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single", "complete", 
# "average" (= UPGMA), 
# "mcquitty" (= WPGMA), 
# "median" (= WPGMC) or 
# "centroid" (= UPGMC)

# The distance index to be used:
# "euclidean",
# "maximum",
# "manhattan",
# "canberra",
# "binary" 
# "minkowski", 
# "braycurtis" or "czekanowski" for Czekanowski Dissimilarity 
# (referred to as Bray-Curtis Dissimilarity in some fields, particularly marine biology), 
# "actual-braycurtis" for the true Bray-Curtis Dissimilarity where the data are 
# standardized before the dissimilarity is calculated. 
# This value can also be any function which returns a "dist" object.

BC=simprof(data.mat, num.expected=10, num.simulated=9, method.cluster="average", method.distance="braycurtis", alpha=0.05, sample.orientation="column", const=0, silent=FALSE, increment=100)
simprof.plot(BC, leafcolors=NA, plot=TRUE, fill=TRUE, leaflab="perpendicular", siglinetype=1)

# we can now obtain the number of groups which are found to be statistically significant
BC$numgroups
# obtain a list of length numgroups with each element containing the sample IDs (row/column numbers in
#the corresponding original data) that are in each significant cluster
BC$significantclusters
# p-values for testing whether the two groups in that row are statistically different. The null hypothesis is
# that there is no a priori group structure, so if p<0.05 means that there is structure
BC$pval
################
library(cluster)
# row = observation (sites), column = variable (species)
tdata.mat <- t(data.mat)
UPGMA <- agnes(tdata.mat, diss = inherits(data.mat, "dist"), metric = "euclidean",
      stand = FALSE, method = "average",
      keep.diss = FALSE, keep.data = FALSE, trace.lev = 0)

plot(UPGMA)

################
library(vegan)
# the default index in vegdist is Bray–Curtis
d <- vegdist(tdata.mat)
csin <- hclust(d, method="single")
plot(csin)
plot(csin, hang=-1)

####### Ward Hierarchical Clustering with Bootstrapped p values
# install.packages("pvclust")
library(pvclust)
# pvclust 
# method: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
# matrix: row=species; columns=sites
fit <- pvclust(data.mat, method.hclust="ward.D", method.dist="euclidean")
plot(fit) # dendogram with p values
# add rectangles around groups highly supported by the data
pvrect(fit, alpha=.95, pv="bp")
################