PcRandForest-PerfEval-PvsN.Rmd

---
title: "False Killer Whale Whistle Classification- Random Forest Analysis"
author: "Yvonne Barkley"
date: "March 21, 2018"
output:
  html_document: default
  pdf_document: default
  word_document: default
---

This script builds random forest classifiers from false killer whale whistle datasets (either the old or the new dataset).


Load the required libraries.
```{r warning=FALSE, message=FALSE}
library(sqldf)
library(randomForest)
library(partykit)
library(caret)
library(corrplot)
library(parallel); 
library(doParallel);
library(gtools)
library(devtools) 
library(dplyr)
#library(ROCR)
library(Hmisc)
library(tree)
```


If using the raw data file, run this section to modify the columns
```{r}
# 6/28/18
# Read the raw ROCCA export ('NA' and '999' were already replaced with '0' in the file).
ROCCAdata <- read.csv(file = 'C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/ROCCA Towed Data/BigRoccaFile_PM_TowedAll_RAW_20180717.csv')

# Alternative inputs kept for reference:
#ROCCAdata<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/decimation_test/PcMHI31_32_RoccaContourStats_RAWDEC.csv')

# Old dataset
#pcdata<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\Pc_MasterListEdit-1016labs.csv')

#pcdata<-droplevels(subset(pcdata, Recorder != 'PMRF'))


# Name the eight species columns (93-100) in a single assignment
# (this layout was used for BIG ROCCA with all recorders).
colnames(ROCCAdata)[93:100] <- c("Dc_Dd", "Gm", "Pc", "Sa", "Sb", "Sc", "Sl", "Tt")

# Column layout that was used for the towed-only data set:
# colnames(ROCCAdata)[92]
# colnames(ROCCAdata)[93]
# colnames(ROCCAdata)[94]
# colnames(ROCCAdata)[95]
# colnames(ROCCAdata)[97] <- "Dc_Dd"
# colnames(ROCCAdata)[98] <- "Gm"
# colnames(ROCCAdata)[99] <- "Pc"
# colnames(ROCCAdata)[100] <- "Sa"
# colnames(ROCCAdata)[101]<- "Sb"
# colnames(ROCCAdata)[102]<- "Sc"
# colnames(ROCCAdata)[103]<- "Sl"
# colnames(ROCCAdata)[104]<- "Tt"


# Re-examining whistles with outlying slope means:
# test<- filter(ROCCAdata, FREQPOSSLOPEMEAN  > 90000)
# test2<- filter(ROCCAdata, FREQNEGSLOPEMEAN <  -90000)

# Drop the columns that are not needed downstream (index set labelled "for towed only").
pcdata <- ROCCAdata[, -c(1:7, 11:13, 21, 22, 34:37, 68:92, 101)]

# Index set used for the decimation test:
#pcdata <- ROCCAdata[, -c(1:7, 11:13, 15:18, 25, 26, 38:41, 73:96, 105)]

# Store every factor column as character.
pcdata <- mutate_if(pcdata, is.factor, as.character)
# (the reverse conversion was once here; reason unknown)
#pcdata %>% mutate_if(is.character, as.factor) -> pcdata


#i <- sapply(pcdata, is.factor)
#pcdata[i] <- lapply(pcdata[i], as.character)

# The 'group' column was added to the file by hand.
# Derive 'population' from the group ID, then move it to the front.
pcdata$population <- NA
levels(pcdata$population) <- c(levels(pcdata$population), "pel", "mhi") #, "nwhi")
pcdata[pcdata$group %in% 31:34, "population"] <- "mhi"
#pcdata[pcdata$group %in% c(14:17), "population"] <- "nwhi"
pcdata[pcdata$group %in% 1:8, "population"] <- "pel"
pcdata$population <- as.factor(pcdata$population)
# Column 60 is the newly added 'population' column; it becomes column 1.
pcdata.pn <- pcdata[, c(60, 1:59)]

# Rename column 2
colnames(pcdata.pn)[2] <- "EncounterID"

# Save the edited table
write.csv(pcdata.pn, file = 'C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/ROCCA Towed Data/BigRoccaFile_PM_TowedAll_EDIT_20180717.csv', row.names = FALSE)

```

Correlation analysis: highly correlated variables are removed based on their Pearson correlation coefficient (r).
```{r}
##Correlation?#### 
# Purpose: drop highly correlated acoustic variables from pcdata.pn, write the
# remaining correlation matrix to CSV, and rebuild pcdata.pn from the first five
# columns plus the retained variables.
# Including correlated variables would mostly impact the important variables
##Calculate Pearson's correlation coefficient for all variables.
# NOTE(review): this comment originally gave a cutoff of |r| > 0.70, but the
# code below filters on |r| > 0.80 -- confirm which is intended.
pcsub<-pcdata.pn[,6:52]   # assumes columns 6:52 are all numeric -- TODO confirm

#pcsub<-pc_HarpFostex[,7:54]

corrplot.mixed(cor(pcsub, method= "pearson"), upper="number", lower="circle")
tmp <- cor(pcsub)
# Keep only the lower triangle so each correlated pair is counted once.
tmp[upper.tri(tmp)] <- 0
diag(tmp) <- 0
# A column is dropped when any entry below the diagonal in that column exceeds
# the cutoff, i.e. the earlier variable of each strongly correlated pair is removed.
pcsubcor <- pcsub[,!apply(tmp,2,function(x) any(x > 0.80 | x < -0.80))]

#remove FreqCOFM
pcsubcor <-pcsubcor[, -6]   # presumably column 6 is FreqCOFM at this point -- verify
#Adding beginning freq, range, freqquarter3. See Correlation_20180717.xlsx for more details. 
pcsubcor <-cbind(pcsubcor, pcdata.pn[, c(9,11)]) #add in FreqBeg & Range
# NOTE(review): pc_HarpFostex is not created anywhere in the visible file (the
# only assignment from it is commented out above). This line and the two
# pc_HarpFostex lines further down look like alternates to the pcdata.pn lines;
# running the chunk top to bottom adds FreqBeg & Range twice, or stops here if
# pc_HarpFostex does not exist in the session.
pcsubcor <-cbind(pcsubcor, pc_HarpFostex[, c(10,12)]) #add in FreqBeg & Range
tmp_new <- cor(pcsubcor)
tmp_new[upper.tri(tmp_new)] <- 0
diag(tmp_new) <- 0
corrPlot<-corrplot.mixed(cor(pcsubcor), upper="circle", lower="number") # for a visual representation of the correlation
write.csv(tmp_new, 'C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\results/20180717/Correlation_2_20180717.csv', row.names=F)  


#Adding first 5 cols plus a few freq variables that were correlated and removed altogether. 
  pcdata.pn <- cbind(pcdata.pn[, c(1:5)], pcsubcor)
  pc_HarpFostex <- cbind(pc_HarpFostex[, c(1:5)], pcsubcor)

 write.csv(pc_HarpFostex, 'C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\results/20180720/pcdata.HarpFostex_201807123.csv', row.names=F)   
  

#Example Correlation Code
# Self-contained demonstration on simulated data; it does not use the whistle data.
# NOTE: it overwrites the objects 'tmp', 'x', 'y', 'p', 'beta' and 'eps' in the session.
# https://stats.stackexchange.com/questions/50537/should-one-remove-highly-correlated-variables-before-doing-pca
n.cases <- 240               # Number of points.
n.vars <- 4                  # Number of mutually correlated variables.
set.seed(26)                 # Make these results reproducible.
eps <- rnorm(n.vars, 0, 1/4) # Make "1/4" smaller to *increase* the correlations.
x <- matrix(rnorm(n.cases * (n.vars+2)), nrow=n.cases)
beta <- rbind(c(1,rep(0, n.vars)), c(0,rep(1, n.vars)), cbind(rep(0,n.vars), diag(eps)))
y <- x%*%beta                # The variables.
cor(y)                       # Verify their correlations are as intended.
plot(data.frame(y))          # Show the scatterplot matrix.

# Perform PCA on the first 2, 3, 4, ..., n.vars+1 variables.
p <- lapply(2:dim(beta)[2], function(k) prcomp(y[, 1:k], scale=TRUE))

# Print summaries and display plots.
tmp <- lapply(p, summary)
par(mfrow=c(2,2))            # 2 x 2 plotting grid; not restored afterwards
tmp <- lapply(p, plot)

```


# Set up encounters for RF using new or old whistle data
```{r}
# Load the edited whistle table used by the random forest analysis.
# Earlier input files, kept for reference:
#all whistles for pel and mhi only
#pcdata.pn <- read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/ROCCA Towed Data/BigRoccaFile_PMN_TowedAll_EDIT_20180816.csv')

#pcdata.pn <- read.csv('C:\\Users\\yvers\\Documents\\CHP1\\BigRoccaFile_PMN_TowedAll_EDIT_20180816.csv')

# stringsAsFactors = TRUE is stated explicitly: it was the read.csv default in the
# R versions this script was written for (3.3.1 / 3.5.1) but became FALSE in R 4.0.0.
# The analysis needs 'population' to be a factor (randomForest classification,
# droplevels), so relying on the default silently breaks the script on newer R.
pcdata.pn <- read.csv('C:\\Users\\yvers\\Documents\\CHP 1\\data\\ROCCA Towed Data\\BigRoccaFile_PMN_TowedAll_EDIT_20181124.csv',
                      stringsAsFactors = TRUE)


#try using only top 5 important variables
#pcdata.pn <- pcdata.pn[, c(1:6, 8, 12:15)]

# Sampling design. The training data need the same number of encounters from
# each population, so groups are drawn per population from these ID pools.
PEL  <- 1:8      # group IDs assigned to the pelagic groups
NWHI <- 14:17    # group IDs assigned to the NWHI groups; #20 was absorbed into #19 (June 2017)
#MHI = c(31:34)    # designated groupID's assigned for each MHI group, #25 absorbed into #26 (June 2017) 
                 # Oct 7: more combining of NWHI and MHI occurred. Performed in Pc_DatManipulation.rmd

nrep    <- 4     # multiplier for the total number of groups
npel    <- 1     # number of pelagic schools to group together
nnwhi   <- 1     # number of northwest HI schools to group together
nmhi    <- 1     # number of main HI schools to group together
t       <- 0.6   # fraction of w used as the whistle-count threshold (t*w) for an encounter-level call
w       <- 150   # number of whistles randomly pulled from each group
ntest   <- 1     # number of groups randomly pulled per population for the test dataset
n_samps <- 100   # number of times the dataset is resampled


```


Meat of the Analysis

```{r message=FALSE, warning=FALSE}

# Start the timer for the whole analysis; the elapsed time is printed after the loop.
ptm<-proc.time() 


#### Start Main Analysis ####
# Accumulators filled inside the resampling loop. They are reset here so that
# re-running the chunk does not append to results from an earlier run.
results.pn = matrix(0, nrow = 2, ncol = 2)   # running sum of the 2 x 2 whistle confusion matrices
output_ModelFit.pn = NULL
realP.pn = c()   # per-iteration count from cell [1,1] of the confusion matrix
realN.pn = c()   # per-iteration count from cell [2,2]; the loop appends to this, so it
                 # must exist beforehand (it was previously never initialised, which
                 # made the first iteration fail with "object 'realN.pn' not found")
realM.pn = c()   # kept for backward compatibility; not filled by the loop below
#use this format of output_WhistleClass on desktop (R version 3.3.1)
output_WhistleClass.pn = NULL
#use this format of output_WhistleClass on laptop (R version 3.5.1)
# output_WhistleClass = data.frame(Doubles=double(),
#                  Ints=integer(),
#                  Factors=factor(),
#                  Logicals=logical(),
#                  Characters=character(),
#                  stringsAsFactors=FALSE)
#output_EncounterClass.pn=NULL
output_Results.pn=NULL
output_TrainSetAll.pn=NULL
output_TestSetAll.pn = NULL
output_Votes.pnAll=NULL
tunedAll.pn=NULL
output_varIMPtotal.pn = NULL
fit_error_rate.pn = c()
accuracies.pn = c()
#fit.pn <- vector("list", n_samps)
#conf.mat.pn = matrix(0, nrow = 2, ncol = 2)
# names.test=NULL
# names.train=NULL
#output_IndDWAll=NULL

#### OVERALL LOOP TO DERIVE MEANS AND STANDARD DEVS ####  
# One pass per resample: draw groups, sample w whistles per group, filter
# correlated variables, split into train/test, tune mtry/ntree in parallel,
# fit the forest, predict the test whistles, and append to the output tables.
for(rep in 1:n_samps){ 
 
  set.seed(rep) #YB placed the seed inside the loop bc it wasn't duplicating trees
  
  # Total groups included
  #This randomly samples nrep*npel pelagic groups and nrep*nnwhi NWHI groups
  N_pel = as.numeric(sample(PEL, nrep*npel))
  N_nwhi  = as.numeric(sample(NWHI, nrep*nnwhi))
  #N_nwhi  = as.numeric(sample(NWHI, nrep*nnwhi))
 
  
  # Test data 
  #Randomly sample ntest group(s) out of the selected groups for each population to be included in the test dataset later
  test_pel= as.integer(sample(N_pel, ntest))
  test_nwhi= as.integer(sample(N_nwhi, ntest))

  
  #Stacks the test group IDs (rbind gives a matrix with one row per population)
  N_test = rbind(test_pel, test_nwhi)
  
  all.pn = NULL
  for(i in c(N_pel,N_nwhi)){
    sub=droplevels(subset(pcdata.pn, pcdata.pn$group==i)) #selects all whistles from the randomly selected groups, drops empty levels too
    # NOTE(review): sampling is without replacement, so this errors when a group
    # has fewer than w rows -- confirm every group has at least w whistles.
    samp=sub[sample(nrow(sub), w),] #randomly samples w whistles from the group
    all.pn <-rbind(all.pn, samp)
    
    }
  #Correlation Step####
# assumes columns 7:53 are the numeric whistle measurements -- TODO confirm
# NOTE(review): the filter is computed on all.pn, i.e. on training and test
# whistles together, before the split below.
pcsub.pn<-all.pn[,7:53]
#corrplot.mixed(cor(pcsub, method= "pearson"), upper="number", lower="circle")
tmp.pn <- cor(pcsub.pn)
tmp.pn[upper.tri(tmp.pn)] <- 0
diag(tmp.pn) <- 0
# Drop a column when any below-diagonal entry in it exceeds |0.80|
pcsubcor.pn <- pcsub.pn[,!apply(tmp.pn,2,function(x) any(x > 0.80 | x < -0.80))]

# Rebuild the table: first six columns followed by the retained variables
all.pn <- cbind(all.pn[, c(1:6)], pcsubcor.pn)
  

# Whistles from the groups in N_test form the test set; the rest are training data
test.pn<-droplevels(subset(all.pn, group%in%N_test ))
train.pn<-droplevels(subset(all.pn, !(group%in%N_test)))


#### Tuning parameters ####
## Set up a dataframe where each row is a specific combination of mtry (# of variables selected at each split), n_tree (# of trees in  forest), possibly other stuff
mtry_vals = 5:8
ntree_vals = seq(500,5000,500) #from, to, by
tune_settings = expand.grid(mtry = mtry_vals, ntree = ntree_vals)#, nodesize = node_vals)
#tune_settings2 = expand.grid(mtry = mtry_vals, ntree = ntree_vals)

#### Run Optimization Sequence ####
## Initiate Cluster
# A new cluster is created on every repetition and stopped after the tuning
# loop. NOTE(review): if the foreach call errors, stopCluster() is not reached.

cl = makeCluster(detectCores())
registerDoParallel(cl, cores = detectCores())


#### Formula ####
# for the RF model with 'population' as the response and time-frequency measurements 
# (every column of all.pn from the 7th onward is used as a predictor)

formPc.pn <- as.formula(paste("population ~ ", paste(names(all.pn)[7:length(all.pn)],collapse="+")))


## TOP 5 UNCORRELATED VARIABLES
#formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pn)[7:11],collapse="+")))

## CORRELATED VARIABLES
#formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pn)[7:54],collapse="+"))) #Use variables to classify to population


##The foreach function is the loop function for parallel processing. 
##It uses %dopar% which partitions the loop among your cores, then combines each output using the .combine argument. 

x.pn = foreach(i = 1:nrow(tune_settings), .combine = rbind) %dopar% { 
    library(randomForest)
    # NOTE(review): 'test' is not one of randomForest's documented arguments
    # (those are xtest/ytest), so test.pn is presumably ignored here -- confirm.
    do_RF = randomForest(formPc.pn, data = train.pn, test = test.pn, ntree = tune_settings$ntree[i], mtry = tune_settings$mtry[i])#, nodesize = tune_settings$nodesize[i])
    # print() returns its argument, so the err.rate row at the final tree is the
    # value of this iteration that foreach collects.
    print(do_RF$err.rate[tune_settings$ntree[i],])
  }

#so, for each of the parameter combinations, i.e., rows of the tune_settings df, the RF is run using those 
#settings and the err.rate row at the last tree (OOB plus the per-class error rates) is returned. 
#when all the rows are finished, the results are combined using the rbind function into one matrix, stored as x.pn.

#NB: .combine can be anything, really, for example .combine = c for a vector, .combine = list for a list, etc.


#### Stop Cluster ####

stopCluster(cl)

## Combine settings and results ####

tune_settings.pn = cbind(tune_settings, x.pn)


## Optimal Settings: the settings combo
## that has the lowest OOB rate (first such row if there are ties)

tuned.pn<-tune_settings.pn[which.min(tune_settings.pn$OOB),]
tunedAll.pn <- rbind(tunedAll.pn, tuned.pn)

#### Run Random Forest ####
##Run Random Forest model using 'tuned' parameters: column 1 of tuned.pn is mtry, column 2 is ntree
# NOTE(review): 'confusion' and 'scale' do not appear among randomForest()'s
# documented arguments; presumably they have no effect here -- verify.
  fit.pn <- randomForest(formPc.pn, data=train.pn, ntree=tuned.pn[1,2], importance=TRUE, replace=T, mtry=tuned.pn[1,1], norm.votes=F, confusion=T, keep.inbag=T, proximity=T, keep.forest=T, scale = F) 

#Save error rates for each model (added 7/20/18)
# NOTE(review): this appends every element of the err.rate matrix (flattened by
# c()), not a single mean per model as the original comment stated.
fit_error_rate.pn = c(fit_error_rate.pn, fit.pn$err.rate)


#prediction for population of test data using RF model, 'fit', returns a list with aggregate and individual votes for each whistle  
  prediction.pn <-predict(fit.pn, newdata=test.pn, predict.all = F, type='response', nodes = T) 
  prediction.vote <-predict(fit.pn, newdata=test.pn, predict.all = T, type='vote')

   #shows T/F for individual whistles and whether they were classified correctly  
   test.pn$rightPred <- as.character(prediction.pn) == as.character(test.pn$population) 
  
   #proportion of test whistles classified correctly (each TRUE contributes 1/nrow)
   accuracy.pn <- sum((test.pn$rightPred)/nrow(test.pn)) 
   accuracies.pn <- c(accuracies.pn, accuracy.pn)
  ####RESULTS!!!#####  
  # rows = true population, columns = predicted population
  results.pntemp <- table(test.pn$population, prediction.pn)  # conf matrix of individual whistle counts
  
   
################### 12/10/18
   #Set a threshold for the number of whistles (t*w) that must be assigned to a population to classify the encounter
   # presumably row/column 1 is pelagic and row/column 2 is NWHI (factor level order) -- verify
true.pel = if(results.pntemp[1,1] >= t*w) 'Pelagic' else 'Ambiguous'
pel.nwhi = if(results.pntemp[1,2] >= t*w) 'NWHI' else 'Ambiguous'
#pel.mhi = if(results.temp[1,3] >= t*w) 'MHI' else 'Ambiguous'

true.nwhi = if(results.pntemp[2,2] >= t*w) 'NWHI' else 'Ambiguous'
nwhi.pel = if(results.pntemp[2,1] >= t*w) 'Pelagic' else 'Ambiguous'
#nwhi.mhi = if(results.temp[2,3] >= t*w) 'MHI' else 'Ambiguous'

# true.mhi = if(results.temp[3,3] >= t*w) 'MHI' else 'Ambiguous'
# mhi.pel = if(results.temp[3,1] >= t*w) 'Pelagic' else 'Ambiguous'
# mhi.nwhi = if(results.temp[3,2] >= t*w) 'NWHI' else 'Ambiguous'


# Diagonal counts (correctly classified whistles) for the two test encounters
true.enc = data.frame(rbind(results.pntemp[1,1], results.pntemp[2,2]))
#d = as.data.frame(unique(test$EncounterID))
#d = as.data.frame(d[c(1:3), ])
e.pn=c()
f=c()
#h=c()
e.pn=as.data.frame(rbind(true.pel, pel.nwhi))
f=as.data.frame(rbind(nwhi.pel, true.nwhi))
#h=as.data.frame(rbind(mhi.pel, mhi.nwhi, true.mhi))
# One row per test group: group ID, the two encounter-level labels, diagonal count
e.pn=cbind(as.data.frame(N_test), e.pn, f, true.enc)
colnames(e.pn) = c("TrueEnc", "Pelagic", "NWHI", "CorrectDW" ) #, "EncounterID")
   
   
   ##################
   
# ## Get percentages of whistles correctly and incorrectly classified   
#   #calculate % accuracies of diagonal by population from confusion matrix of percentages
# pel.pel<-round((results.pntemp[1,1])/(results.pntemp[1,1]+results.pntemp[1,2])*100,2) #pelagic
# nwhi.nwhi<-round((results.pntemp[2,2])/(results.pntemp[2,1]+results.pntemp[2,2])*100,2) #nwhi 
#   #calculate misclassification for pelagic
# pel.nwhi<-round((results.pntemp[1,2])/(results.pntemp[1,1]+results.pntemp[1,2])*100,2)
# nwhi.pel<-round((results.pntemp[2,1])/(results.pntemp[2,1]+results.pntemp[2,2])*100,2) 
#   
# 
# #Consolidate back into confusion matrix
# pel.row<-c(pel.pel, pel.nwhi)
# #nwhi.row<-c(nwhi.pel, nwhi.nwhi, nwhi.mhi)
# nwhi.row<-c(nwhi.pel, nwhi.nwhi)
# #class.title<-c('pelagic', 'nwhi')
# conf.mat.pn <- rbind(pel.row, nwhi.row) #(% of whistles)
# #conf.mat.pn = conf.mat.pn + conf.mat.pntemp
# #conf.mat.pn<-data.frame(rbind("Pelagic"=pel.row, "MHI"=mhi.row))
# 
# 
# colnames(conf.mat.pn)=c("Pelagic", "NWHI")
# #conf.mat.pn
# 
# true.pel = if(pel.pel>=60) 'Pelagic' else 'Ambiguous'
# #true.nwhi = if(nwhi.nwhi>50) 'NWHI' else 'Ambiguous'
# true.nwhi = if(nwhi.nwhi>=60) 'NWHI' else 'Ambiguous'
# true.enc.pn = data.frame(rbind(pel.pel, nwhi.nwhi))
# #d = as.data.frame(unique(test$EncounterID))
# #d = as.data.frame(d[c(1:3), ])
# e.pn=c()
# e.pn=as.data.frame(rbind(true.pel, true.nwhi))
# e.pn=cbind(e.pn, true.enc.pn)
# colnames(e.pn) = c("EncounterClass", "PctCorrect") #, "EncounterID")

#Summed results confusion matrix after each iteration
results.pn = results.pn + results.pntemp 

#For calculating means and standard dev for EACH ITERATION of confusion matrix
#For a single iteration, stores separate cells of class results for calculating mean and standard error
#YOU NEED THIS STEP 4/26/18, TRUST ME!
realP.pn <- c(realP.pn,results.pntemp[1,1])
#fakeP <- c(fakeP,results.pntemp[1,2])
# NOTE(review): realN.pn is read here before being assigned, so it has to exist
# before the loop starts -- verify it is initialised with the other accumulators.
realN.pn <- c(realN.pn,results.pntemp[2,2])
#fakeM <- c(fakeM,results.pntemp[2,1])
        
#### OUTPUTS ####
#1.
#Save each RF model's confusion matrix and the mean of its err.rate matrix
output_ModelFit.pn <- rbind(output_ModelFit.pn, as.table(fit.pn$confusion), mean(fit.pn$err.rate))

#2.
#Most comprehensive information about overall predictions, proportional classification of trees, and tree predictions for each whistle
# NOTE(review): g.pn is built but not stored; the line that appended it to
# output_WhistleClass.pn is commented out below.
g.pn=cbind(rep, "EncounterID"= test.pn$EncounterID, "EncounterCount"=test.pn$EncounterCount, "Prediction"=test.pn$rightPred, "PredictedPop"=prediction.pn,as.data.frame(cbind(prediction.vote$aggregate, prediction.vote$individual)))

#output_WhistleClass.pn <- smartbind(output_WhistleClass.pn, g.pn, fill = NA, sep=':')

#3.
#table keeping track of confusion matrices for all iterations
output_Results.pn<-rbind(output_Results.pn, rep, results.pntemp, accuracy.pn=signif(accuracy.pn, digits = 3)) 

#4.
#Output of encounters used in each training set  
output_TrainSet.pn <- data.frame(rep, "EncounterID" = sort(unique(train.pn$EncounterID))) #, "encounter" = sort(unique(train$group)))
output_TrainSetAll.pn <-rbind(output_TrainSetAll.pn, output_TrainSet.pn)

#5.
#Output shows encounters in each test set, the encounter-level labels, and the count of correctly classified whistles
output_TestSet.pn <- data.frame(rep, "EncounterID" = sort(unique(test.pn$EncounterID)), e.pn) #, "encounter" = sort(unique(train$group)))
output_TestSetAll.pn <-rbind(output_TestSetAll.pn, output_TestSet.pn)

#6.
  # Important Variables, sorted by increasing MeanDecreaseGini, with a "REP" row marking the iteration
  varIMPtemp.pn <- as.data.frame(importance(fit.pn))
  varIMP.pn <- varIMPtemp.pn[order(varIMPtemp.pn$MeanDecreaseGini), , drop =F]
  output_varIMPtotal.pn <- rbind(output_varIMPtotal.pn, "REP"=rep, varIMP.pn)


}

#7.
# Overall percentage of whistles classified correctly per population, taken from
# the confusion matrix summed over all iterations (diagonal cell / row total).
correctP.pn <- round(
  (results.pn[1, 1] / (results.pn[1, 1] + results.pn[1, 2]) * 100),
  2
)
correctN.pn <- round(
  (results.pn[2, 2] / (results.pn[2, 1] + results.pn[2, 2]) * 100),
  2
)

# Summed confusion matrix with the two percentages appended as a final row
output_CorrectClass.pn <- rbind(results.pn, cbind(correctP.pn, correctN.pn))


# Elapsed time since ptm was recorded
proc.time() - ptm

```

#### Calculate means and standard errors
```{r}
# Standard error of the mean: sample standard deviation divided by sqrt(n).
se <- function(x) {
  n_obs <- length(x)
  sd(x) / sqrt(n_obs)
}

# Mean and standard error of the per-iteration correct-classification counts
meanRealP.pn <- mean(realP.pn)
seRealP.pn   <- se(realP.pn)
meanRealN.pn <- mean(realN.pn)
seRealN.pn   <- se(realN.pn)


date <- '20181212'

# Assemble a 2 x 2 summary: row 1 = means, row 2 = standard errors
PNMmeans.pn <- cbind("PEL" = meanRealP.pn, "NWHI" = meanRealN.pn) #means
PNMse.pn    <- cbind("PEL" = seRealP.pn, "NWHI" = seRealN.pn) #standard errors
PNMtotal.pn <- rbind("Means" = PNMmeans.pn, "Stand Err" = PNMse.pn)

# Append the run date, w, n_samps and the summary to the text log
sink("C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018\\ConfMatrixPN_means_20181210.txt", append = TRUE)
date
w
n_samps
PNMtotal.pn
sink()


```

# Save to desktop OneDrive
```{r}

# Append every result table to its CSV in the 20181209_PN_100x results folder.
# All files are opened in append mode, so repeated runs accumulate in the same files.

# Overall confusion matrix with percentage-correct row
write.table(
  output_CorrectClass.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_ConfusionMatrix_PN_20181209.csv",
  append = TRUE, row.names = TRUE, col.names = TRUE, sep = ","
)

# Per-iteration confusion matrices and accuracies
write.table(
  output_Results.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TotalResults_PN_20181209.csv",
  append = TRUE, row.names = TRUE, col.names = TRUE, sep = ","
)

# Per-whistle classification table
write.table(
  output_WhistleClass.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TotalWhistleClass_PN_20181209.csv",
  append = TRUE, row.names = FALSE, col.names = TRUE, sep = ","
)

# Encounters used in each training set
write.table(
  output_TrainSetAll.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TrainSets_PN_20181209.csv",
  append = TRUE, col.names = TRUE, sep = ","
)

# Encounters in each test set with their encounter-level labels
write.table(
  output_TestSetAll.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TestResults_PN_20181209.csv",
  append = TRUE, col.names = TRUE, sep = ","
)

# Variable importance per iteration
write.table(
  output_varIMPtotal.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_ImportantVars_PN_20181209.csv",
  append = TRUE, col.names = TRUE, sep = ","
)

# Tuned mtry/ntree per iteration
write.table(
  tunedAll.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TunedParameters_PN_20181209.csv",
  append = TRUE, col.names = TRUE, sep = ","
)

# Model confusion matrices and mean error rates (written without a header row)
write.table(
  output_ModelFit.pn,
  file = "C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018/20181209_PN_100x/PcResults_TotalModels_PN_20181209.csv",
  append = TRUE, col.names = FALSE, sep = ","
)

```

# For laptop directories

```{r}
# Laptop variant of the means / standard-error report.
# Standard error of the mean: sample standard deviation divided by sqrt(n).
se <- function(x) sd(x)/sqrt(length(x))

# The main analysis accumulates its per-iteration counts in realP.pn and
# realN.pn; this chunk previously read realP / realM, which that analysis never
# creates, so it failed (or reported stale values) when run after it.
# The result names (meanRealP, meanRealM, ...) are left unchanged.
meanRealP <- mean(realP.pn)
meanRealM <- mean(realN.pn)
seRealP <- se(realP.pn)
seRealM <- se(realN.pn)

 
date='20180822'
#Combine means and std errors into matrix
# NOTE(review): the log file name still says "PM" / 20180822 although the
# columns below are labelled PEL and NWHI -- confirm the intended output file.
sink("C:\\Users\\yvers\\OneDrive\\PHD\\CHP1-FKW\\data\\results\\2018\\ConfMatrixPM_means_20180822.txt", append=T)
PNMmeans.pn <- cbind("PEL"= c(meanRealP), "NWHI"=c(meanRealM)) #means
PNMse.pn <- cbind("PEL"= c(seRealP), "NWHI"=c(seRealM)) #standard errors
PNMtotal.pn <- rbind("Means" = PNMmeans.pn, "Stand Err" = PNMse.pn)

# Values printed while the sink is open are appended to the log file
date
w
n_samps
PNMtotal.pn
sink()


```
### Write NEW WHISTLE DATA outputs to file
```{r}

# Append the result tables to the CSV files under results/20180818/PM.

#Save overall confusion matrix
# The object built by the analysis is output_CorrectClass.pn; the unsuffixed
# name used here before does not exist, unlike every other table in this chunk.
write.table(output_CorrectClass.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM///PcResults_ConfusionMatrix_PM_20180818.csv", append = T, row.names = T, col.names=T, sep = ",")

# Per-iteration confusion matrices and accuracies
write.table(output_Results.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM//PcResults_TotalResults_PM_20180818.csv", append = T, row.names = T, col.names=T, sep = ",")

# Per-whistle classification table
write.table(output_WhistleClass.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM//PcResults_TotalWhistleClass_PM_20180818.csv", append = T, row.names = F, col.names=T, sep = ",")

# Encounters used in each training set
write.table(output_TrainSetAll.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM///Top5ImpVar_PcResults_TrainSets_newPvM_NoPel8.csv", append = T, col.names=T, sep = ",")

# Encounters in each test set with their encounter-level labels
write.table(output_TestSetAll.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM//Top5ImpVar_PcResults_TestResults_newPvM_NoPel8.csv", append = T, col.names=T, sep = ",")

# Variable importance per iteration
write.table(output_varIMPtotal.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM//Top5ImpVar_PcResults_ImportantVars_newPvM_NoPel8.csv", append = T, col.names=T, sep = ",")

# Tuned mtry/ntree per iteration
write.table(tunedAll.pn, "C://Users//Yvonne//Documents//PHD//CHP1-FKW//data//results//20180818//PM//Top5ImpVar_PcResults_TunedParameters_newPvM_NoPel8.csv", append = T, col.names=T, sep = ",")

```


```{r}
#OLD CODE, maybe need?

####Mean & StDev ####
#After all forests are fit, take the cumulative confusion matrix and run some stats

#Calculates overall results for each iteration, summing up all individual whistle classifications
# pel.diag<-round((results.pn[1,1])/(results.pn[1,1]+results.pn[1,2])*100,2) #pelagic
# mhi.diag<-round((results.pn[2,2])/(results.pn[2,1]+results.pn[2,2])*100,2) #mhi
# #calculate misclassification for pelagic and mhi
# pel.mis<-round((results.pn[1,2])/(results.pn[1,1]+results.pn[1,2])*100,2)
# mhi.mis<-round((results.pn[2,1])/(results.pn[2,1]+results.pn[2,2])*100,2)
# 
# pel.tot<-c(pel.diag, pel.mis)
# mhi.tot<-c(mhi.mis, mhi.diag)
# Conf.Mat.pn <- rbind(pel.tot, mhi.tot)
# colnames(Conf.Mat.pn)=c("Pelagic", "MHI")

#Takes mean and standard dev of cumulative whistles correctly and incorrectly classified


# meanRealP <- mean(realP)
# meanFakeP <- mean(fakeP)
# meanRealM <- mean(realM)
# meanFakeM <- mean(fakeM)
# seRealP <- se(realP)
# seFakeP <- se(fakeP)
# seRealM <- se(realM)
# seFakeM <- se(fakeM)
# sdRealP <- sd(realP)
# sdFakeP <- sd(fakeP)
# sdRealM <- sd(realM)
# sdFakeM <- sd(fakeM)

# date='20180809'
# #Combine means and std errors into matrix
# sink("C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\results\\20180809\\ConfMatrix_means_20180809.txt", append=T)
# PMmeans <- rbind("PEL"= c(meanRealP, meanFakeP), "MHI"=c(meanRealM, meanFakeM))
# colnames(PMmeans) <- c("PEL", "MHI")
# date
# w
# n_samps
# PMmeans
# sink()
# 
# sink("ConfMatrix_stddevs_20180720.txt", append=T)
# PMsds <- rbind("PEL"= c(sdRealP, sdFakeP), "MHI"=c(sdRealM, sdFakeM))
# colnames(PMsds) <- c("PEL", "MHI")
# date
# w
# n_samps
# PMsds
# sink()

```


## Testing HARP data with new whistles
```{r warning=FALSE, message=FALSE}

# Training data: the full ROCCA file restricted to the 'mhi' and 'pel' populations.
# stringsAsFactors = TRUE is stated explicitly because it stopped being the
# read.csv default in R 4.0.0; the random forest needs 'population' as a factor
# and droplevels() only acts on factor columns.
train.pn<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA FILE.csv',
                   stringsAsFactors = TRUE)
train.pn <- droplevels(filter(train.pn, population == 'mhi' | population == 'pel'))

# Test data: HARP recordings only
test.pn<-read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA harp only.csv',
                  stringsAsFactors = TRUE)

# Need even number of encounters for each population represented in the training data. 
PEL = 1:8                # designated groupID's assigned for each pelagic group 
NWHI  = 14:17   # designated groupID's assigned for each NWHI group, #20 was absorbed into #19 (June 2017)     
MHI = c(26:30)    # designated groupID's assigned for each MHI group, #25 absorbed into #26 (June 2017) 
                              # Oct 7: more combining of NWHI and MHI occurred. Performed in Pc_DatManipulation.rmd
#This is set up to be flexible to change later if needed
# NOTE: these assignments overwrite the values set for the main analysis above.
nrep  = 8           # multiplier for total number of groups
npel  = 1           # number of pelagic schools to group together
nnwhi = 1           # number of northwest HI schools to group together
nmhi  = 1           # number of main HI schools to group together
w = 45              # Number of whistles randomly pulled from each group 
ntest = 1           # number of groups to randomly pull for test dataset
n_samps = 2        #Zack: number of times you want to resample from your dataset


```


# Classify HARP data only

Meat of the analysis
```{r message=FALSE, warning=FALSE}

# Start the timer for the HARP-only run
ptm <- proc.time()


#### Start ####
# Reset the accumulators that the HARP loop appends to.
results.pn <- matrix(0, nrow = 1, ncol = 2)   # running sum of the 1 x 2 result table

# Per-iteration cell counts
realP <- NULL
fakeP <- NULL
realM <- NULL
fakeM <- NULL

# Output tables
output_WhistleClass.pn   <- NULL
output_EncounterClass.pn <- NULL
output_RawResults.pn     <- NULL
output_TrainSet.pnAll    <- NULL
output_TestSet.pnAll     <- NULL
output_Votes.pnAll       <- NULL
tuned.pnAll              <- NULL
varIMPtotal.pn           <- NULL


#fit.pn <- vector("list", n_samps)
#conf.mat.pn = matrix(0, nrow = 2, ncol = 2)
# names.test=NULL
# names.train=NULL
#output_IndDWAll=NULL

#### OVERALL LOOP TO DERIVE MEANS AND STANDARD DEVS ####  
 for(rep in 1:n_samps){ 
#  
#   set.seed(rep) #YB placed the seed inside the loop bc it wasn't duplicating trees
#   
#   # Total groups included
#   #This randomly samples 'nrep' groups from each population
#   N_pel = as.numeric(sample(PEL, nrep*npel))
#   N_mhi  = as.numeric(sample(MHI, nrep*nmhi))
#   #N_nwhi  = as.numeric(sample(NWHI, nrep*nnwhi))
#  
#   
#   # Test data 
#   #Randomly sample 1 group out of the selected groups for each population to be included in the test dataset later
#   test_pel= as.integer(sample(N_pel, ntest))
#   test_mhi= as.integer(sample(N_mhi, ntest))
#   #test_nwhi= as.integer(sample(N_nwhi, ntest))
#   
#   #Combines test groups into single vector
#   N_test = rbind(test_pel, test_mhi)
#   
#   all = NULL
#   for(i in c(N_pel,N_mhi)){
#     sub=droplevels(subset(pcdata.pn, pcdata.pn$group==i)) #selects all whistles from the randomly selected groups, drops empty levels too
#     samp=sub[sample(nrow(sub), w),] #randomly samples w whistles from the 50 or more whistles
#     all <-rbind(all, samp)
#     
#     }
#   
#   test.pn<-droplevels(subset(all, group%in%N_test ))
#   train.pn<-droplevels(subset(all, !(group%in%N_test)))
  #print(unique(train$EncounterID))
  #print(unique(test$EncounterID))
  
  # testp <- subset(test, test$population == 'pelagic')
  # testn <- subset(test, test$population == 'nwhi')

# ADDING } to test making the training and test data
#}
  
  
#ptm<-proc.time() 


#### Tuning parameters ####
## Set up a dataframe where each row is a specific combination of mtry (# of variables selected at each split), n_tree (# of trees in  forest), possibly other stuff
mtry_vals = 5:10 
ntree_vals = seq(501,3001,50) #from, to, by
#node_vals=1:3

tune_settings = expand.grid(mtry = mtry_vals, ntree = ntree_vals)#, nodesize = node_vals)
#tune_settings2 = expand.grid(mtry = mtry_vals, ntree = ntree_vals)

#### Run Optimization Sequence ####
## Initiate Cluster

cl = makeCluster(detectCores())
registerDoParallel(cl, cores = detectCores())


#### Formula ####
# for the RF model with 'population' as the response and time-frequency measurements 

## UNCORRELATED VARIABLES
#formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pn)[5:30],collapse="+")))

## CORRELATED VARIABLES
formPc <- as.formula(paste("population ~ ", paste(names(pcdata.pn)[7:54],collapse="+"))) #Use variables to classify to population


##The foreach function is the loop function for parallel processing. 
##It uses %dopar% which partitions the loop among your cores, then combines each output using the .combine argument. 

x = foreach(i = 1:nrow(tune_settings), .combine = rbind) %dopar% { 
    library(randomForest)
    do_RF = randomForest(formPc, data = train.pn, test = test.pn, ntree = tune_settings$ntree[i], mtry = tune_settings$mtry[i])#, nodesize = tune_settings$nodesize[i])
    print(do_RF$err.rate[tune_settings$ntree[i],])
  }

#so, for each of the parameter combinations, i.e., rows of the tune_settings df, the RF is run using those 
#settings (line 188) and the error rates for the 3 pop's and the oob are printed (line 189). 
#when all the rows are finished, the results are combined using the rbind function into one matrix, stored as x.

#NB: .combine can be anything, really, for example .combine = c for a vector, .combine = list for a list, etc.


#### Stop Cluster ####

stopCluster(cl)

## Combine settings and results ####

tune_settings = cbind(tune_settings, x)


## Optimal Settings: the settings combo
## that has the lowest OOB rate

tuned.pn<-tune_settings[which.min(tune_settings$OOB),]
tuned.pnAll <- rbind(tuned.pnAll, tuned.pn)

#### Run Random Forest ####
##Run Random Forest model using 'tuned' parameters for mtry and ntree
  fit.pn <- randomForest(formPc, data=train.pn, ntree=tuned.pn[1,2], importance=TRUE, replace=T, mtry=tuned.pn[1,1], norm.votes=F, confusion=T, keep.inbag=T, proximity=T, keep.forest=T, scale = F) 

#prediction for population of test data using RF model, 'fit', returns a list with aggregate and individual votes for each whistle  
  prediction.pn <-predict(fit.pn, newdata=test.pn, predict.all = F, type='response', nodes = T) 
  prediction.vote <-predict(fit.pn, newdata=test.pn, predict.all = T, type='vote')

   #shows T/F for individual whistles and whether they were classified correctly  
   test.pn$rightPred <- as.character(prediction.pn) == as.character(test.pn$population) 
  
   #sums all 'TRUE' and divides by total
   accuracy.pn <- sum((test.pn$rightPred)/nrow(test.pn)) 
  
#### Accuracy for each EncounterID in test set####  
   encID = unique(test.pn$EncounterID)
   accID.pn = NULL
   encCLASS = NULL
   for (id in encID) {
     subID <- subset(test.pn, EncounterID == id)
     accID.pnTemp <- sum((subID$rightPred)/nrow(subID)) 
     encCLASStemp = if(accID.pnTemp>=.60) 'MHI' else 'Ambiguous'
     encCLASS = rbind.data.frame(encCLASS, encCLASStemp)
     accID.pn <- rbind.data.frame(accID.pn, accID.pnTemp)
     
   }
   accID.pnALL <- as.data.frame(cbind(rep, encID, accID.pn,  encCLASS))
   colnames(accID.pnALL) = c("Rep", "EncounterID", "Accuracy", "Class")
   
  #### RESULTS!!! ####
  # Confusion matrix of individual whistle classifications for this iteration:
  # rows = true population of the test whistles, columns = predicted population
  results.pntemp <- table(test.pn$population, prediction.pn)

  # Percentage of row 1 that falls in column 1 (taken as the MHI accuracy)
#pel.pel<-round((results.pntemp[1,1])/(results.pntemp[1,1]+results.pntemp[1,2])*100,2) #pelagic
mhi.mhi <- round(
  results.pntemp[1, 1] / (results.pntemp[1, 1] + results.pntemp[1, 2]) * 100,
  2
) # mhi
  # misclassification percentages (currently disabled)
# pel.mhi<-round((results.pntemp[1,2])/(results.pntemp[1,1]+results.pntemp[1,2])*100,2)
# mhi.pel<-round((results.pntemp[2,1])/(results.pntemp[2,1]+results.pntemp[2,2])*100,2) 


  # Variable importance, sorted by increasing MeanDecreaseGini, appended to the
  # running table together with a row holding the iteration number
  varIMPtemp.pn <- as.data.frame(importance(fit.pn))
  varIMP.pn <- varIMPtemp.pn[order(varIMPtemp.pn$MeanDecreaseGini), , drop = FALSE]
  varIMPtotal.pn <- rbind(varIMPtotal.pn, "REP" = rep, varIMP.pn)

  
# Store the individual confusion-matrix cells of this iteration so that their
# mean and spread across iterations can be computed after the loop.

# Standard error of the mean
se <- function(x) sd(x) / sqrt(length(x))
# realP <- c(realP,results.pntemp[1,1])
# fakeP <- c(fakeP,results.pntemp[1,2])
# [1, 1] is the cell mhi.mhi treats as the correctly classified count and
# [1, 2] is the other cell of that row, so the two vectors must not read the
# same cell.
realM <- c(realM, results.pntemp[1, 1])
fakeM <- c(fakeM, results.pntemp[1, 2])


#Consolidate back into confusion matrix
# pel.row<-c(pel.pel, pel.mhi)
# #nwhi.row<-c(nwhi.pel, nwhi.nwhi, nwhi.mhi)
# mhi.row<-c(mhi.pel, mhi.mhi)
# #class.title<-c('pelagic', 'mhi')
# conf.mat.pn <- rbind(pel.row, mhi.row)
# #conf.mat.pn = conf.mat.pn + conf.mat.pntemp
# #conf.mat.pn<-data.frame(rbind("Pelagic"=pel.row, "MHI"=mhi.row))
# 
# 
# colnames(conf.mat.pn)=c("Pelagic", "MHI")
#conf.mat.pn

#true.pel = if(pel.pel>60) 'Pelagic' else 'Ambiguous'
#true.nwhi = if(nwhi.nwhi>50) 'NWHI' else 'Ambiguous'
# Label for this iteration's test set: 'MHI' when more than 60 (percent) of the
# whistles were counted as correct, otherwise 'Ambiguous'.
# NOTE(review): the cut-off here is a strict > 60, while the per-encounter rule
# in this loop uses >= .60 -- confirm the boundary case is meant to differ.
true.mhi <- if (mhi.mhi > 60) 'MHI' else 'Ambiguous'
true.enc <- data.frame(mhi.mhi)
#d = as.data.frame(unique(test$EncounterID))
#d = as.data.frame(d[c(1:3), ])

# One-row summary: the label next to the percentage it was derived from
e.pn <- cbind(as.data.frame(true.mhi), true.enc)
colnames(e.pn) <- c("EncounterClass", "% Correct") #, "EncounterID")

#Summed results confusion matrix after each iteration
# Adds this iteration's whistle-level confusion table to the running total.
results.pn = results.pn + results.pntemp 


#### OUTPUTS ####
# BIG dataframe with RF results for each encounter, including the % of votes, individ tree votes
# One row per test whistle: iteration number, encounter ID and count, whether the
# prediction was right, the predicted population, followed by the aggregate and
# individual parts of prediction.vote.
g.pn=cbind(rep, "EncounterID"= test.pn$EncounterID, "EncounterCount"=test.pn$EncounterCount, "Prediction"=test.pn$rightPred, "PredictedPop"=prediction.pn, as.data.frame(cbind(prediction.vote$aggregate, prediction.vote$individual))) 

# smartbind (gtools) matches columns by name, so iterations whose g.pn has a
# different set of columns can still be stacked.
output_WhistleClass.pn <- smartbind(output_WhistleClass.pn, g.pn)

# One summary row (label and % correct) per iteration
output_EncounterClass.pn <- rbind(output_EncounterClass.pn, e.pn)

# NOTE(review): results.pn was updated just above, so the table appended here is
# the cumulative sum up to this iteration, not this iteration's own table
# (results.pntemp) -- confirm which one is intended.
output_RawResults.pn<-rbind(output_RawResults.pn, rep, results.pn, accuracy.pn=signif(accuracy.pn, digits = 3)) #rough table keeping track of confusion matrices for all iterations
# NOTE(review): the code that builds conf.mat.pn earlier in this loop is
# commented out, so this line depends on a conf.mat.pn defined somewhere else --
# confirm it exists and is up to date.
output_ConfMat.pn<-rbind(output_ConfMat.pn, rep, conf.mat.pn, accuracy.pn=signif(accuracy.pn, digits = 3))
#df to show encounters used in each training set  
# Sorted unique EncounterIDs of the training data, tagged with the iteration number
output_TrainSet.pn <- data.frame(rep, "EncounterID" = sort(unique(train.pn$EncounterID))) #, "encounter" = sort(unique(train$group)))
output_TrainSet.pnAll <-rbind(output_TrainSet.pnAll, output_TrainSet.pn)

#df to show encounters used in each test set
# Same for the test data, with the e.pn summary columns attached to each row
output_TestSet.pn <- data.frame(rep, "EncounterID" = sort(unique(test.pn$EncounterID)), e.pn) #, "encounter" = sort(unique(train$group)))
output_TestSet.pnAll <-rbind(output_TestSet.pnAll, output_TestSet.pn)

#df to show the prediction of each individual whistle per encounter (THIS IS DOCUMENTED IN 'g')
# output_IndDW<-data.frame(rep, "EncounterID"=test$EncounterID, "PredictedPop"=prediction, "EncounterCount"=test$EncounterCount) 
# output_IndDWAll <- rbind(output_IndDWAll, output_IndDW)  #consolidates all individual whistle results for each iteration

}


# Overall results across all iterations: row percentages of the summed
# whistle-level confusion matrix (results.pn).
# Diagonal cells = correctly classified
pel.diag <- round(results.pn[1, 1] / (results.pn[1, 1] + results.pn[1, 2]) * 100, 2) # pelagic
mhi.diag <- round(results.pn[2, 2] / (results.pn[2, 1] + results.pn[2, 2]) * 100, 2) # mhi
# Off-diagonal cells = misclassified, for pelagic and mhi
pel.mis <- round(results.pn[1, 2] / (results.pn[1, 1] + results.pn[1, 2]) * 100, 2)
mhi.mis <- round(results.pn[2, 1] / (results.pn[2, 1] + results.pn[2, 2]) * 100, 2)

pel.tot <- c(pel.diag, pel.mis)
mhi.tot <- c(mhi.mis, mhi.diag)
Conf.Mat.pn <- rbind(pel.tot, mhi.tot)
colnames(Conf.Mat.pn) <- c("Pelagic", "MHI")


# Mean and standard deviation of the per-iteration confusion-matrix cells.
# NOTE(review): the lines that append to realP / fakeP inside the loop are
# commented out, so the pelagic values below reflect whatever realP / fakeP
# held before the loop -- confirm before reporting them.
meanRealP <- mean(realP)
meanFakeP <- mean(fakeP)
meanRealM <- mean(realM)
meanFakeM <- mean(fakeM)
# seRealP <- se(realP)
# seFakeP <- se(fakeP)
# seRealM <- se(realM)
# seFakeM <- se(fakeM)
sdRealP <- sd(realP)
sdFakeP <- sd(fakeP)
sdRealM <- sd(realM)
sdFakeM <- sd(fakeM)

# Combine the means and the standard deviations into 2 x 2 matrices
PMmeans <- rbind("PEL" = c(meanRealP, meanFakeP), "MHI" = c(meanFakeM, meanRealM))
colnames(PMmeans) <- c("PEL", "MHI")

PMsds <- rbind("PEL" = c(sdRealP, sdFakeP), "MHI" = c(sdFakeM, sdRealM))
colnames(PMsds) <- c("PEL", "MHI")

# Append w, n_samps and each matrix to its text file. capture.output() prints
# the values exactly as auto-printing would and always restores normal output,
# even if printing fails, so the console is never left diverted to a file.
capture.output(w, n_samps, PMmeans,
               file = "ConfMatrix_means_20180409.txt", append = TRUE)
capture.output(w, n_samps, PMsds,
               file = "ConfMatrix_stddevs_20180409.txt", append = TRUE)


# Elapsed time since ptm was recorded
proc.time() - ptm

```


```{r}

  # Randomly draw the encounter groups for all repetitions:
  # nrep * npel groups from PEL and nrep * nmhi groups from MHI.
  # N_pel is only used by the disabled pelagic call below, but it is still drawn
  # so the sequence of random numbers stays the same.
  N_pel <- as.numeric(sample(PEL, nrep * npel))
  N_mhi <- as.numeric(sample(MHI, nrep * nmhi))


# Randomly draw `n` whistles (rows) from every group listed in `groups`.
#
# groups: group identifiers, processed in the order given
# data:   data frame with a `group` column
# n:      number of rows to draw (without replacement) from each group
#
# Returns the draws stacked into one data frame, or NULL when `groups` is empty.
# Stops with an explicit message when a group holds fewer than `n` rows.
sample_whistles_by_group <- function(groups, data, n) {
  draws <- list()
  for (g in groups) {
    # all whistles of this group, with unused factor levels dropped
    grp <- droplevels(data[which(data$group == g), , drop = FALSE])
    if (nrow(grp) < n) {
      stop("group ", g, " has only ", nrow(grp), " whistles; cannot sample ", n,
           call. = FALSE)
    }
    draws[[length(draws) + 1L]] <- grp[sample.int(nrow(grp), n), ]
  }
  # bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, draws)
}


#Subsetting 50dw from each encounter since some have >50 dw.

# w whistles from each randomly selected MHI group
pw_pm <- sample_whistles_by_group(c(N_mhi), pcdata, w)
# pw_pm <- sample_whistles_by_group(c(N_pel), pcdata, w)


# All pelagic whistles, and w whistles from every group in MHI
P <- subset(pcdata, pcdata$population=='pelagic')
M <- sample_whistles_by_group(c(MHI), pcdata, w)


```

Combine the old and new whistle datasets (array data only).
```{r warning=FALSE, message=FALSE}
# reshape2 is attached here; nothing in this chunk calls it directly
library(reshape2)

# Read the old and the new whistle tables
pcdata_old <- read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA OLD raw.csv')
pcdata_new <- read.csv('C:\\Users\\Yvonne\\Documents\\PHD\\CHP1-FKW\\data\\2017- 100 New Whistles/BIG ROCCA FILE.csv')

# Remove columns 55 through 62 from the new table, then stack it under the old one
pcdata_new <- pcdata_new[, -(55:62)]
pcdata_all <- rbind(pcdata_old, pcdata_new)

# Keep rows whose population is not 'nwhi' and whose CruiseID is not 'harp',
# and drop the factor levels that are no longer used
pcdata_all <- droplevels(
  subset(
    pcdata_all,
    pcdata_all$population != 'nwhi' & pcdata_all$CruiseID != 'harp'
  )
)

```