From ffa00ac2ddcdc4eaac609cf3505b3848b990b615 Mon Sep 17 00:00:00 2001
From: ttriche
Date: Tue, 10 Nov 2015 10:56:56 -0800
Subject: [PATCH 1/3] make setup less annoying

---
 assignment5/setup.r | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/assignment5/setup.r b/assignment5/setup.r
index 434177b5..94089b1a 100644
--- a/assignment5/setup.r
+++ b/assignment5/setup.r
@@ -1,7 +1,11 @@
-install.packages("caret")
-install.packages("rpart")
-install.packages("tree")
-install.packages("randomForest")
-install.packages("e1071")
-install.packages("ggplot2")
+# Packages the assignment scripts rely on.
+required_packages <- c("caret","rpart","tree","randomForest","e1071","ggplot2")
+
+# Attach each package, installing it first when it is not yet available.
+for (package in required_packages) {
+  if (!require(package, character.only=TRUE)) {
+    install.packages(package)
+    library(package, character.only=TRUE)
+  }
+}
 

From 7db81dc0f28c577734550eedf9640e0dc3246913 Mon Sep 17 00:00:00 2001
From: ttriche
Date: Tue, 10 Nov 2015 12:30:11 -0800
Subject: [PATCH 2/3] code for assignment 1

---
 assignment5/seaflow.r | 75 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 assignment5/seaflow.r

diff --git a/assignment5/seaflow.r b/assignment5/seaflow.r
new file mode 100644
index 00000000..08ef6b65
--- /dev/null
+++ b/assignment5/seaflow.r
@@ -0,0 +1,75 @@
+source("setup.r")
+seaflow <- read.csv("seaflow_21min.csv")
+
+# q2: number of rows per value of pop
+table(seaflow$pop)
+
+# q3
+# assuming the question actually should be 3rd QUARTILE, not quantile(x, 0.03)
+# quantile(seaflow$fsc_small, 0.75) != summary(seaflow$fsc_small)[5] (!)
+summary(seaflow$fsc_small)[5]
+
+# q4: random 50/50 split of the row indices into training and test sets
+trainset <- sample(seq_len(nrow(seaflow)), round(nrow(seaflow) / 2))
+testset <- setdiff(seq_len(nrow(seaflow)), trainset)
+training <- seaflow[trainset, ]
+mean(training$time)
+
+# q5: scatter plot of pe against chl_small, styled by pop
+qplot(pe, chl_small, data=seaflow, color=pop, shape=pop, size=pop)
+
+# q6: classification tree (rpart) predicting pop from six predictor columns
+fol <- formula(pop ~ fsc_small + fsc_perp + fsc_big + pe + chl_big + chl_small)
+model <- rpart(fol, method="class", data=training)
+model
+
+# q7: answered from the printed tree
+model
+
+# q8: answered from the printed tree
+model
+
+# q9: fraction of test rows the tree classifies correctly
+testing <- seaflow[testset,]
+predictions <- predict(model, testing, type="class")
+correct <- sum(predictions == testing$pop) / nrow(testing)
+print(correct)
+
+# q10: random forest on the same formula; pop is converted to a factor first
+training$pop <- as.factor(training$pop)
+rfmodel <- randomForest(fol, data=training)
+rfpredictions <- predict(rfmodel, testing)
+rfcorrect <- sum(rfpredictions == testing$pop) / nrow(testing)
+print(rfcorrect)
+
+# q11: variable importance reported by the random forest
+importance(rfmodel)
+
+# q12: support vector machine on the same formula
+svmmodel <- svm(fol, data=training)
+svmpredictions <- predict(svmmodel, testing)
+svmcorrect <- sum(svmpredictions == testing$pop) / nrow(testing)
+print(svmcorrect)
+
+# q13: confusion matrices (predicted vs. true test labels) for each model
+confmat <- function(pred) table(pred, true=testing$pop)
+confmat(svmpredictions)
+confmat(rfpredictions)
+confmat(predictions)
+
+# q14: repeat the SVM after dropping every row with file_id 208
+newseaflow <- seaflow[seaflow$file_id != 208, ]
+newtrain <- sample(seq_len(nrow(newseaflow)), round(nrow(newseaflow) / 2))
+newtest <- setdiff(seq_len(nrow(newseaflow)), newtrain)
+newtraining <- newseaflow[newtrain, ]
+newtraining$pop <- as.factor(newtraining$pop)
+if (any(newtraining$file_id == 208)) stop("You botched your new training set.")
+newsvmmodel <- svm(fol, data=newtraining)
+newtesting <- newseaflow[newtest, ]
+if (any(newtesting$file_id == 208)) stop("You botched your new test set.")
+newsvmpredictions <- predict(newsvmmodel, newtesting)
+newsvmcorrect <- sum(newsvmpredictions == newtesting$pop) / nrow(newtesting)
+print(newsvmcorrect)
+
+# q15
+for (v in names(seaflow)) print(paste(v, length(table(seaflow[[v]])))) # fsc_big

From 92ba40a1ef0a260685c3993fa01ed30c4759ce89 Mon Sep 17 00:00:00 2001
From: ttriche
Date: Tue, 10 Nov 2015 12:44:16 -0800
Subject: [PATCH 3/3] add simulation for mean(training$time)

---
 assignment5/seaflow.r | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/assignment5/seaflow.r b/assignment5/seaflow.r
index 08ef6b65..ed92561b 100644
--- a/assignment5/seaflow.r
+++ b/assignment5/seaflow.r
@@ -73,3 +73,15 @@ print(newsvmcorrect)
 
 # q15
 for (v in names(seaflow)) print(paste(v, length(table(seaflow[[v]])))) # fsc_big
+
+# extra: distribution of the mean of training$time for 1000 samples
+# x: vector to resample; n: number of random half-size samples to draw.
+# Returns a numeric vector of n means, each taken over a random half of x.
+getTrainingMeans <- function(x, n=1000) {
+  halfsize <- round(length(x) / 2)
+  vapply(seq_len(n),
+         function(y) mean(x[sample(seq_along(x), halfsize)]),
+         numeric(1))
+}
+plot(density(getTrainingMeans(seaflow$time, 1000)), main="mean(training$time)")
+