####################################### ### R scripts for Cluster Analysis ### MBE ### R. Castilho || CCMAR || 2025-02-113 ####################################### ### PREPARATION STEPS ####################################### ### LOAD DATA # set working directory... change this to the directory in which the species' data is stored and where you want to save output # go to sessions tab, choose set working directory, choose directory setwd("~/Desktop") # you need to change this!! # you need to read data file, which is an excel file # to do that you need to find a function # that will allow you to easily upload the original data # directly into R. Further manipulations can be done inside R # script line below needs to be completed to load the data into a new object "data.tab" # use the function read.csv with the proper arguments data.tab <- read.csv (:::) # check the size of the dataframe dim(data.tab) # now delete the Lesspessian column # find a way to get read only the columns you want data.tab <- data.tab[1:828,::::] #transform your dataframe into a matrix data.mat <- as.matrix(data.tab) # you are now ready to proceed! ################################################################ ################################################################ # PART 1 # install packages install.packages("clustsig") library(clustsig) # The agglomeration method to be used: # This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single", "complete", # "average" (= UPGMA), # "mcquitty" (= WPGMA), # "median" (= WPGMC) or # "centroid" (= UPGMC) # The distance index to be used: # "euclidean", # "maximum", # "manhattan", # "canberra", # "binary" # "minkowski", # "braycurtis" or "czekanowski" for Czekanowski Dissimilarity # (referred to as Bray-Curtis Dissimilarity in some fields, particularly marine biology), # "actual-braycurtis" for the true Bray-Curtis Dissimilarity where the data are # standardized before the dissimilarity is calculated. # This value can also be any function which returns a "dist" object. # check what simprof function does # obtain the distance matrix BCdist <- simprof(data.mat, num.expected=10, num.simulated=9, method.cluster="average", method.distance="braycurtis", alpha=0.05, sample.orientation="column", const=0, silent=FALSE, increment=100) # having the distance matrix, now we need to obtain the cladogram simprof.plot(BC, leafcolors=NA, plot=TRUE, fill=TRUE, leaflab="perpendicular", siglinetype=1) # we can now obtain the number of groups which are found to be statistically significant BCdist$numgroups # obtain a list of length numgroups with each element containing the sample IDs (row/column numbers in #the corresponding original data) that are in each significant cluster BCdist$significantclusters # p-values for testing whether the two groups in that row are statistically different. The null hypothesis is # that there is no a priori group structure, so if p<0.05 means that there is structure BCdist$pval ################################################################ ################################################################ # PART 2 # let's try a different clustering method install.packages("cluster") library(cluster) # row = observation (sites), column = variable (species) # tranpose matrix for this algorithm tdata.mat <- t(data.mat) # obtain the distance matrix UPGMA <- agnes(tdata.mat, diss = inherits(data.mat, "dist"), metric = "euclidean", stand = FALSE, method = "average", keep.diss = FALSE, keep.data = FALSE, trace.lev = 0) # plot the cladogram plot(UPGMA) ################################################################ ################################################################ # PART 3 install.packages("vegan") library(vegan) # the default index in vegdist is Bray–Curtis dist <- vegdist(tdata.mat) # hierarchical cluster analysis csin <- hclust(d, method="single") # plot cluster plot(csin) plot(csin, hang=-1) ################################################################ ################################################################ # PART 4 # Ward Hierarchical Clustering with Bootstrapped p values install.packages("pvclust") library(pvclust) # method: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski" # matrix: row=species; columns=sites # obtain distance matrix fit <- pvclust(data.mat, method.hclust="ward.D", method.dist="euclidean") # plot cladogram plot(fit) # dendogram with p values # add rectangles around groups highly supported by the data pvrect(fit, alpha=.95, pv="bp") ################################################ ################################################ # END ################################################