library(ape);library(geiger);library(vegan)

#tree in newick format
tree<-read.tree("rockfish.phy")
#file with trait data for each species (rownames are species names, same as tree tip labels)
traits<-read.csv("rockfish_PC_traits.csv",row.names=1)
#file with community data: rownames are species, column names are different communities, species presence/absence coded as 1/0
comdata<-read.csv("community_presabs.csv",row.names=1)
#optional file with info on communities (e.g. depth and latitude)
comnotes<-read.csv("community_info.csv",row.names=1)

#make sure tree only has species in trait dataset
tree<-drop.tip(tree,which(tree$tip.label%in%rownames(traits)==FALSE))

#match rows in data frames to order of tree tip labels
traits<-traits[tree$tip.label,]
comdata<-comdata[tree$tip.label,]

#number of traits (assuming using all columns of the data frame)
ntrait<-dim(traits)[2]
#number of species per community
Nsp<-apply(comdata,2,sum)

#function to calculate metrics of community structure (phylogenetic and trait dispersion) and get a null distribution by randomly drawing species. pooltree and pooltrait are the tree and trait data only for species that are potentially community members (only differs among communities for null model 2)

calc.comstruct<-function(comdata,pooltree,pooltrait,nsims){
	com<-which(comdata==1);n<-length(com)
#patristic (branch length) distances between species
	treedist<-cophenetic(pooltree)
	comtreedist<-treedist[com,com];diag(comtreedist)<-NA
#mean nearest neighbor distance	(closest relative only)
	MNND<-mean(apply(comtreedist,1,min,na.rm=T))
#mean phylogenetic distance (between all species)
	MPD<-mean(comtreedist,na.rm=T)
#trait data
	pooltrait<-as.matrix(pooltrait);ntrait<-dim(pooltrait)[2]
	simrangetrait<-rangetrait<-simSRtrait<-SRtrait<-numeric(ntrait)
for(m in 1:ntrait){
#sort traits for calculating neighbor distances
	comtrait<-sort(pooltrait[com,m])
#range of trait values
	rangetrait[m]<-max(comtrait)-min(comtrait)
#neighbor distances (ND - should all be positive)
	traitdist<-diff(comtrait)
#sd(ND)/range as described in paper
	SRtrait[m]<-sd(traitdist)/(max(comtrait)-min(comtrait))
}
#initiate matrix for output of function
sims<-matrix(NA,nrow=(nsims+1),ncol=(2+2*ntrait))
	colnames(sims)<-c("MNND","MPD",paste("rangetrait",1:ntrait,sep=""),paste("SRtrait",1:ntrait,sep=""))
#first row is observed values: phylogenetic metrics, then range and sd(ND)/range for each trait
	sims[1,]<-c(MNND,MPD,rangetrait,SRtrait)
#simulate communities by sampling equal numbers of species as in the original community from whatever regional pool is being used
for(k in 2:(nsims+1)){
	simcom<-sample(1:length(comdata),n,replace=F)
#tree
	simtreedist<-treedist[simcom,simcom];diag(simtreedist)<-NA
	simMNND<-mean(apply(simtreedist,1,min,na.rm=T))
	simMPD<-mean(simtreedist,na.rm=T)
#traits	
for(m in 1:ntrait){
	simtrait<-sort(pooltrait[simcom,m])
	simrangetrait[m]<-max(simtrait)-min(simtrait)
	simtraitdist<-diff(simtrait)
	simSRtrait[m]<-sd(simtraitdist)/(max(simtrait)-min(simtrait))
	}
	sims[k,]<-c(simMNND,simMPD,simrangetrait,simSRtrait)
}
#the matrix of observed and simulated metrics is returned by the function
return(sims)
}

#run the function for each community, and calculate indices for each descriptor (standardized using the mean and standard deviation of the null values)
#this step may take a while
com_out<-list()
indices<-data.frame(matrix(NA,nrow=dim(comdata)[2],ncol=2+2*ntrait,dimnames=list(colnames(comdata),c("MNND","MPD",paste("rangetrait",1:ntrait,sep=""),paste("SRtrait",1: ntrait,sep="")))))
for(i in 1:dim(comdata)[2]){
#save the output matrix for each community as an element in a list
	com_out[[i]]<-data.frame(calc.comstruct(comdata[,i],pooltree=tree,pooltrait=traits[,1:ntrait],nsims=999))
#calculate the index for each metric (check that sign is correct - as set up, positive values indicate lower range or lower SD(nd)/range
for(j in 1:dim(indices)[2]){
	indices[i,j]<-(-1)*(com_out[[i]][1,j]-mean(com_out[[i]][-1,j]))/sd(com_out[[i]][-1,j])
	}
}

#save files - in my experience the list "com_out" is easiest to save as an R object, but this means it can only be accessed through R
#save(com_out,file="community_output.R")
#write.csv(indices,file="indices.csv")

#plot all indices against each other - useful to see if phylogenetic and trait inferences are comparable, as well as to ensure that sd(ND)/range is not strongly correlated with range, which might make inference about evenness misleading. It's also useful to go through "com_out" and compare the null distribution of range and evenness for the same trait to see if they're relatively independent. In my experience they are, but quantifying evenness tends to be the most complicated/contentious part, so it's good to be thorough.
plot(indices)

#meta-analyzing across the 30 communities. In the Ecology paper I just used one sample t-test against a mean index value of zero. Can use nonparametric tests if more appropriate.
for(i in 1:dim(indices)[2]){
	print(names(indices)[i])
	print(t.test(indices[,i]))
}

#if wanting to calculate p-values based on rank of observed values relative to null distribution (within each community). This setup has alpha = 0.05 for a 2-tailed test, so a p < 0.025 means a metric (range, evenness, or PD) significantly lower than the null, and p > 0.975 means significantly higher
p_matrix<-data.frame(matrix(NA,nrow=dim(comdata)[2],ncol=2+2*ntrait,dimnames=list(colnames(comdata),c("MNND","MPD",paste("rangetrait",1:ntrait,sep=""),paste("SRtrait",1: ntrait,sep="")))))
for(i in 1:dim(comdata)[2]){
	for(j in 1:dim(indices)[2]){
		p_matrix[i,j]<-sum(com_out[[i]][-1,j]>com_out[[i]][1,j])/length(com_out[[i]][,j])
	}
}

#a way to visualize the average null values of each statistic at various community sizes, then look at the observed values for each community. First, looking at the null and observed ranges for trait #3
null_ranges<-numeric(25)
for(i in 2:25){
	x<-0
	for(j in 1:100){
		x<-x+diff(range(traits[sample(1:dim(traits)[1],i),3]))
		}
	null_ranges[i]<-x/100
}		
		
plot(null_ranges)
for(i in 1:30){
	points(Nsp[i],diff(range(traits[which(comdata[,i]==1),3])),col="red")
}

#now, the null and observed SD(ND)/range for trait #2
null_even<-numeric(25)
for(i in 2:25){
	x<-0
	for(j in 1:100){
		y<-traits[sample(1:dim(traits)[1],i),2]
		x<-x+sd(diff(sort(y)))/diff(range(y))
		}
	null_even[i]<-x/100
}		

plot(null_even)
for(i in 1:30){
	points(Nsp[i],sd(diff(sort(traits[which(comdata[,i]==1),2])))/diff(range(traits[which(comdata[,i]==1),2])),col="red")
}