From ffa00ac2ddcdc4eaac609cf3505b3848b990b615 Mon Sep 17 00:00:00 2001
From: ttriche <tim.triche@gmail.com>
Date: Tue, 10 Nov 2015 10:56:56 -0800
Subject: [PATCH 1/3] make setup less annoying

---
 assignment5/setup.r | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/assignment5/setup.r b/assignment5/setup.r
index 434177b5..94089b1a 100644
--- a/assignment5/setup.r
+++ b/assignment5/setup.r
@@ -1,7 +1,8 @@
 
-install.packages("caret")
-install.packages("rpart")
-install.packages("tree")
-install.packages("randomForest")
-install.packages("e1071")
-install.packages("ggplot2")
+required_packages <- c("caret","rpart","tree","randomForest","e1071","ggplot2")
+
+# Install whatever is missing, then attach every package for the calling script.
+for (package in required_packages) {
+  if (!requireNamespace(package, quietly=TRUE)) install.packages(package)
+  library(package, character.only=TRUE)  # fresh installs must be attached too
+}

From 7db81dc0f28c577734550eedf9640e0dc3246913 Mon Sep 17 00:00:00 2001
From: ttriche <tim.triche@gmail.com>
Date: Tue, 10 Nov 2015 12:30:11 -0800
Subject: [PATCH 2/3] code for assignment 1

---
 assignment5/seaflow.r | 75 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 assignment5/seaflow.r

diff --git a/assignment5/seaflow.r b/assignment5/seaflow.r
new file mode 100644
index 00000000..08ef6b65
--- /dev/null
+++ b/assignment5/seaflow.r
@@ -0,0 +1,75 @@
+source("setup.r")
+seaflow <- read.csv(file="seaflow_21min.csv")
+
+# q2: how many rows carry each pop label
+table(seaflow[["pop"]])
+
+# q3: third quartile of fsc_small
+# assuming the question actually should be 3rd QUARTILE, not quantile(x, 0.03)
+# quantile(seaflow$fsc_small, 0.75) != the summary() figure printed below (!)
+summary(seaflow[["fsc_small"]])["3rd Qu."]
+
+# q4: a random half of the rows trains; the remaining rows are held out
+trainset <- sample(seq_len(nrow(seaflow)), size=round(0.5 * nrow(seaflow)))
+testset <- which(!(seq_len(nrow(seaflow)) %in% trainset))
+training <- seaflow[trainset, ]
+mean(training[["time"]])
+
+# q5: plot pe against chl_small; colour, shape and size are all mapped to pop
+qplot(pe, chl_small, data=seaflow, color=pop, shape=pop, size=pop)
+
+# q6: fit a classification tree (method="class") for pop on the training half
+fol <- formula(pop ~ fsc_small + fsc_perp + fsc_big + pe + chl_big + chl_small) 
+model <- rpart(fol, method="class", data=training)
+model
+
+# q7: re-prints the same fitted tree
+model
+
+# q8: re-prints the same fitted tree
+model
+
+# q9: share of held-out rows (testset) whose pop the tree predicts correctly
+testing <- seaflow[testset,] 
+predictions <- predict(model, testing, type="class")
+correct <- sum(predictions == testing$pop) / nrow(testing)
+print(correct) 
+
+# q10: random forest, scored like q9; pop is converted to a factor beforehand
+training$pop <- as.factor(training$pop)
+rfmodel <- randomForest(fol, data=training)
+rfpredictions <- predict(rfmodel, testing)
+rfcorrect <- sum(rfpredictions == testing$pop) / nrow(testing)
+print(rfcorrect)
+
+# q11: variable importance of the forest, as reported by importance()
+importance(rfmodel)
+
+# q12: SVM fit with svm()'s default settings, scored like q9
+svmmodel <- svm(fol, data=training)
+svmpredictions <- predict(svmmodel, testing)
+svmcorrect <- sum(svmpredictions == testing$pop) / nrow(testing)
+print(svmcorrect)
+
+# q13: confusion tables on the test rows (rows = predicted, columns = true pop)
+confmat <- function(pred) table(pred, true=testing$pop)
+confmat(svmpredictions)
+confmat(rfpredictions)
+confmat(predictions)
+
+# q14: exclude file_id 208, then re-split, refit and re-score the SVM
+newseaflow <- seaflow[seaflow$file_id != 208, ]
+newtrain <- sample(seq_len(nrow(newseaflow)), round(nrow(newseaflow) / 2))
+newtest <- setdiff(seq_len(nrow(newseaflow)), newtrain)
+newtraining <- newseaflow[newtrain, ]
+newtraining$pop <- as.factor(newtraining$pop)  # factor response, as in q10
+if (any(newtraining$file_id == 208)) stop("You botched your new training set.")
+newsvmmodel <- svm(fol, data=newtraining)
+newtesting <- newseaflow[newtest, ]
+if (any(newtesting$file_id == 208)) stop("You botched your new test set.")
+newsvmpredictions <- predict(newsvmmodel, newtesting)
+newsvmcorrect <- sum(newsvmpredictions == newtesting$pop) / nrow(newtesting)
+print(newsvmcorrect)
+
+# q15 
+for (v in names(seaflow)) print(paste(v, length(table(seaflow[[v]])))) # fsc_big

From 92ba40a1ef0a260685c3993fa01ed30c4759ce89 Mon Sep 17 00:00:00 2001
From: ttriche <tim.triche@gmail.com>
Date: Tue, 10 Nov 2015 12:44:16 -0800
Subject: [PATCH 3/3] add simulation for mean(training$time)

---
 assignment5/seaflow.r | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/assignment5/seaflow.r b/assignment5/seaflow.r
index 08ef6b65..ed92561b 100644
--- a/assignment5/seaflow.r
+++ b/assignment5/seaflow.r
@@ -73,3 +73,10 @@ print(newsvmcorrect)
 
 # q15 
 for (v in names(seaflow)) print(paste(v, length(table(seaflow[[v]])))) # fsc_big
+
+# extra: density of mean(training$time) over 1000 random half-size draws
+getTrainingMeans <- function(x, n=1000) {
+  half_mean <- function(i) mean(x[sample(seq_along(x), round(length(x) / 2))])
+  vapply(seq_len(n), half_mean, numeric(1))  # seq_len: n=0 gives zero draws
+}
+plot(density(getTrainingMeans(seaflow$time, 1000)), main="mean(training$time)")