#######################################
### R scripts for Cluster Analysis
### MBE
### R. Castilho || CCMAR || 2025-02-113

#######################################
### PREPARATION STEPS


#######################################
### LOAD DATA
# set working directory... change this to the directory in which the species' data is stored and where you want to save output
# go to sessions tab, choose set working directory, choose directory
setwd("~/Desktop") # you need to change this!!

# you need to read data file, which is an excel file
# to do that you need to find a function 
# that will allow you to easily upload the original data
# directly into R. Further manipulations can be done inside R

# script line below needs to be completed to load the data into a new object "data.tab"
# use the function read.csv with the proper arguments
data.tab <- read.csv (:::)

  # check the size of the dataframe
  dim(data.tab)

# now delete the Lesspessian column
# find a way to get read only the columns you want
data.tab <- data.tab[1:828,::::]

#transform your dataframe into a matrix
data.mat <- as.matrix(data.tab)
# you are now ready to proceed!


################################################################
################################################################
# PART 1

# install packages
install.packages("clustsig")
library(clustsig)


# The agglomeration method to be used:
# This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single", "complete", 
# "average" (= UPGMA), 
# "mcquitty" (= WPGMA), 
# "median" (= WPGMC) or 
# "centroid" (= UPGMC)

# The distance index to be used:
# "euclidean",
# "maximum",
# "manhattan",
# "canberra",
# "binary" 
# "minkowski", 
# "braycurtis" or "czekanowski" for Czekanowski Dissimilarity 
# (referred to as Bray-Curtis Dissimilarity in some fields, particularly marine biology), 
# "actual-braycurtis" for the true Bray-Curtis Dissimilarity where the data are 
# standardized before the dissimilarity is calculated. 
# This value can also be any function which returns a "dist" object.

# check what simprof function does
# obtain the distance matrix
BCdist <- simprof(data.mat, num.expected=10, num.simulated=9, method.cluster="average", method.distance="braycurtis", alpha=0.05, sample.orientation="column", const=0, silent=FALSE, increment=100)
# having the distance matrix, now we need to obtain the cladogram 
simprof.plot(BC, leafcolors=NA, plot=TRUE, fill=TRUE, leaflab="perpendicular", siglinetype=1)

# we can now obtain the number of groups which are found to be statistically significant
BCdist$numgroups
# obtain a list of length numgroups with each element containing the sample IDs (row/column numbers in
#the corresponding original data) that are in each significant cluster
BCdist$significantclusters
# p-values for testing whether the two groups in that row are statistically different. The null hypothesis is
# that there is no a priori group structure, so if p<0.05 means that there is structure
BCdist$pval


################################################################
################################################################
# PART 2
# let's try a different clustering method
install.packages("cluster")
library(cluster)

# row = observation (sites), column = variable (species)
# tranpose matrix for this algorithm
tdata.mat <- t(data.mat)
# obtain the distance matrix
UPGMA <- agnes(tdata.mat, diss = inherits(data.mat, "dist"), metric = "euclidean",
               stand = FALSE, method = "average",
               keep.diss = FALSE, keep.data = FALSE, trace.lev = 0)
# plot the cladogram 
plot(UPGMA)


################################################################
################################################################
# PART 3
install.packages("vegan")
library(vegan)

# the default index in vegdist is BrayÃ¢â‚¬â€œCurtis
dist <- vegdist(tdata.mat)
# hierarchical cluster analysis
csin <- hclust(d, method="single")

# plot cluster
plot(csin)
plot(csin, hang=-1)

################################################################
################################################################
# PART 4
# Ward Hierarchical Clustering with Bootstrapped p values
install.packages("pvclust")
library(pvclust)


# method: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
# matrix: row=species; columns=sites
# obtain distance matrix
fit <- pvclust(data.mat, method.hclust="ward.D", method.dist="euclidean")

# plot cladogram
plot(fit) # dendogram with p values

# add rectangles around groups highly supported by the data
pvrect(fit, alpha=.95, pv="bp")
################################################

################################################
# END
################################################