> # Data Preprocessing
>
> titanic <- read.csv("titanic.csv")
> str(titanic)
'data.frame': 891 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr "" "C85" "" "C123" ...
$ Embarked : chr "S" "C" "S" "S" ...
> titanic <- titanic[, -1]
> titanic$Pclass <- as.factor(titanic$Pclass)
> titanic$Survived <- factor(titanic$Survived, levels = c(0,1), labels = c("dead", "survived"))
> str(titanic)
'data.frame': 891 obs. of 11 variables:
$ Survived: Factor w/ 2 levels "dead","survived": 1 2 2 2 1 1 1 1 2 2 ...
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr "" "C85" "" "C123" ...
$ Embarked: chr "S" "C" "S" "S" ...
> titanic[, c(1,2)] <- titanic[, c(2,1)]
> names(titanic)[1:2] <- c("Pclass","Survived")
> str(titanic)
'data.frame': 891 obs. of 11 variables:
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Survived: Factor w/ 2 levels "dead","survived": 1 2 2 2 1 1 1 1 2 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr "" "C85" "" "C123" ...
$ Embarked: chr "S" "C" "S" "S" ...
> titanic$Sex <- as.factor(titanic$Sex)
> titanic$Embarked <- as.factor(titanic$Embarked)
> table(titanic$Embarked)
C Q S
2 168 77 644
> levels(titanic$Embarked)
[1] "" "C" "Q" "S"
> levels(titanic$Embarked)[1] <- NA
> table(titanic$Embarked, useNA = "always")
C Q S <NA>
168 77 644 2
> titanic$Cabin <- ifelse(titanic$Cabin == "", NA, titanic$Cabin)
> str(titanic)
'data.frame': 891 obs. of 11 variables:
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Survived: Factor w/ 2 levels "dead","survived": 1 2 2 2 1 1 1 1 2 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr NA "C85" NA "C123" ...
$ Embarked: Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
>
>
> # Data Partitioning
>
> library(caret)
> test_idx <- createDataPartition(titanic$Survived, p=0.1)$Resample1
> titanic.test <- titanic[test_idx, ]
> titanic.train <- titanic[-test_idx, ]
> NROW(titanic.test)
[1] 90
> prop.table(table(titanic.test$Survived))
dead survived
0.6111111 0.3888889
> prop.table(table(titanic.train$Survived))
dead survived
0.6167291 0.3832709
> save(titanic, titanic.test, titanic.train, file = "titanic.RData")
>
>
> # Ready to do Cross-Validation
>
> createFolds(titanic.train$Survived, k=10)
$Fold01
[1] 3 13 26 31 62 67 68 81 90 96 116 126 127 156 157 160 171 180 181 195 201 206 219 227 243 280 306 322 323
[30] 327 329 331 338 351 409 414 417 425 426 428 432 451 491 503 506 514 520 533 536 548 566 571 604 613 625 628 629 630
[59] 640 641 647 653 654 663 672 675 685 690 696 698 700 701 711 725 745 753 758 759 769 779
...
$Fold10
[1] 4 10 18 32 39 40 46 50 55 60 61 65 66 71 72 79 82 91 100 124 125 143 179 186 202 211 220 222 224
[30] 229 259 265 281 289 309 321 326 335 342 345 359 363 369 375 384 408 416 433 448 471 479 481 484 487 489 494 511 521
[59] 534 574 576 583 586 601 607 616 638 662 669 680 686 688 695 723 738 739 742 755 767 791
>
> create_ten_fold_cv <- function() {
+ set.seed(137)
+ lapply(createFolds(titanic.train$Survived, k=10), function(idx) {
+ return(list(train=titanic.train[-idx, ]
+ , validation=titanic.train[idx, ]))
+ })
+ }
>
> x <- create_ten_fold_cv()
> str(x)
List of 10
$ Fold01:List of 2
..$ train :'data.frame': 721 obs. of 11 variables:
.. ..$ Pclass : Factor w/ 3 levels "1","2","3": 3 3 1 3 3 1 3 2 3 1 ...
.. ..$ Survived: Factor w/ 2 levels "dead","survived": 1 2 2 1 1 1 2 2 2 2 ...
.. ..$ Name : chr [1:721] "Braund, Mr. Owen Harris" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "Allen, Mr. William Henry" ...
.. ..$ Sex : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 1 1 1 1 ...
.. ..$ Age : num [1:721] 22 26 35 35 NA 54 27 14 4 58 ...
.. ..$ SibSp : int [1:721] 1 0 1 0 0 0 0 1 1 0 ...
.. ..$ Parch : int [1:721] 0 0 0 0 0 0 2 0 1 0 ...
.. ..$ Ticket : chr [1:721] "A/5 21171" "STON/O2. 3101282" "113803" "373450" ...
.. ..$ Fare : num [1:721] 7.25 7.92 53.1 8.05 8.46 ...
.. ..$ Cabin : chr [1:721] NA NA "C123" NA ...
.. ..$ Embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 2 3 3 1 3 3 ...
..$ validation:'data.frame': 80 obs. of 11 variables:
.. ..$ Pclass : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 1 3 3 3 3 ...
.. ..$ Survived: Factor w/ 2 levels "dead","survived": 1 2 1 2 1 2 1 1 2 1 ...
.. ..$ Name : chr [1:80] "Palsson, Master. Gosta Leonard" "Masselmani, Mrs. Fatima" "Emir, Mr. Farred Chehab" "Glynn, Miss. Mary Agatha" ...
.. ..$ Sex : Factor w/ 2 levels "female","male": 2 1 2 1 2 1 1 2 1 2 ...
.. ..$ Age : num [1:80] 2 NA NA NA 42 49 16 22 NA 26 ...
.. ..$ SibSp : int [1:80] 3 0 0 0 1 1 5 0 0 1 ...
.. ..$ Parch : int [1:80] 1 0 0 0 0 0 2 0 0 2 ...
.. ..$ Ticket : chr [1:80] "349909" "2649" "2631" "335677" ...
.. ..$ Fare : num [1:80] 21.07 7.22 7.22 7.75 52 ...
.. ..$ Cabin : chr [1:80] NA NA NA NA ...
.. ..$ Embarked: Factor w/ 3 levels "C","Q","S": 3 1 1 2 3 1 3 3 2 3 ...
...
$ Fold10:List of 2
..$ train :'data.frame': 721 obs. of 11 variables:
.. ..$ Pclass : Factor w/ 3 levels "1","2","3": 3 3 1 3 3 1 3 3 2 3 ...
.. ..$ Survived: Factor w/ 2 levels "dead","survived": 1 2 2 1 1 1 1 2 2 2 ...
.. ..$ Name : chr [1:721] "Braund, Mr. Owen Harris" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "Allen, Mr. William Henry" ...
.. ..$ Sex : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 2 1 1 1 ...
.. ..$ Age : num [1:721] 22 26 35 35 NA 54 2 27 14 4 ...
.. ..$ SibSp : int [1:721] 1 0 1 0 0 0 3 0 1 1 ...
.. ..$ Parch : int [1:721] 0 0 0 0 0 0 1 2 0 1 ...
.. ..$ Ticket : chr [1:721] "A/5 21171" "STON/O2. 3101282" "113803" "373450" ...
.. ..$ Fare : num [1:721] 7.25 7.92 53.1 8.05 8.46 ...
.. ..$ Cabin : chr [1:721] NA NA "C123" NA ...
.. ..$ Embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 2 3 3 3 1 3 ...
..$ validation:'data.frame': 80 obs. of 11 variables:
.. ..$ Pclass : Factor w/ 3 levels "1","2","3": 1 2 2 1 3 2 3 3 3 2 ...
.. ..$ Survived: Factor w/ 2 levels "dead","survived": 2 2 1 1 1 1 1 2 1 2 ...
.. ..$ Name : chr [1:80] "Bonnell, Miss. Elizabeth" "Hewlett, Mrs. (Mary D Kingcome) " "Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)" "Ostby, Mr. Engelhart Cornelius" ...
.. ..$ Sex : Factor w/ 2 levels "female","male": 1 1 1 2 2 2 2 2 2 1 ...
.. ..$ Age : num [1:80] 58 55 27 65 22 32 37 24 20 29 ...
.. ..$ SibSp : int [1:80] 0 0 1 0 0 0 2 0 0 1 ...
.. ..$ Parch : int [1:80] 0 0 0 1 0 0 0 0 0 0 ...
.. ..$ Ticket : chr [1:80] "113783" "248706" "11668" "113509" ...
.. ..$ Fare : num [1:80] 26.55 16 21 61.98 7.23 ...
.. ..$ Cabin : chr [1:80] "C103" NA NA "B30" ...
.. ..$ Embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 1 1 3 3 3 3 3 ...
> head(x$Fold01$train)
Pclass Survived Name Sex Age SibSp Parch Ticket Fare Cabin
1 3 dead Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 <NA>
3 3 survived Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 <NA>
4 1 survived Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123
5 3 dead Allen, Mr. William Henry male 35 0 0 373450 8.0500 <NA>
6 3 dead Moran, Mr. James male NA 0 0 330877 8.4583 <NA>
7 1 dead McCarthy, Mr. Timothy J male 54 0 0 17463 51.8625 E46
Embarked
1 S
3 S
4 S
5 S
6 Q
7 S
>
>
> # Data Exploration
>
> library(Hmisc)
>
> data <- x$Fold01$train
> summary(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data=data, method="reverse")
Descriptive Statistics by Survived
+------------+---+--------------------------+--------------------------+
| |N |dead |survived |
| | |(N=445) |(N=276) |
+------------+---+--------------------------+--------------------------+
|Pclass : 1 |721| 16% ( 70) | 41% (112) |
+------------+---+--------------------------+--------------------------+
| 2 | | 17% ( 75) | 25% ( 69) |
+------------+---+--------------------------+--------------------------+
| 3 | | 67% (300) | 34% ( 95) |
+------------+---+--------------------------+--------------------------+
|Sex : male |721| 86% (381) | 32% ( 87) |
+------------+---+--------------------------+--------------------------+
|Age |578| 21/28/39 | 19/29/36 |
+------------+---+--------------------------+--------------------------+
|SibSp : 0 |721| 74% (328) | 61% (167) |
+------------+---+--------------------------+--------------------------+
| 1 | | 17% ( 74) | 33% ( 91) |
+------------+---+--------------------------+--------------------------+
| 2 | | 3% ( 14) | 4% ( 11) |
+------------+---+--------------------------+--------------------------+
| 3 | | 2% ( 9) | 1% ( 4) |
+------------+---+--------------------------+--------------------------+
| 4 | | 2% ( 10) | 1% ( 3) |
+------------+---+--------------------------+--------------------------+
| 5 | | 1% ( 3) | 0% ( 0) |
+------------+---+--------------------------+--------------------------+
| 8 | | 2% ( 7) | 0% ( 0) |
+------------+---+--------------------------+--------------------------+
|Parch : 0 |721| 82% (364) | 67% (186) |
+------------+---+--------------------------+--------------------------+
| 1 | | 9% ( 39) | 18% ( 49) |
+------------+---+--------------------------+--------------------------+
| 2 | | 7% ( 32) | 14% ( 38) |
+------------+---+--------------------------+--------------------------+
| 3 | | 0% ( 2) | 1% ( 2) |
+------------+---+--------------------------+--------------------------+
| 4 | | 1% ( 3) | 0% ( 0) |
+------------+---+--------------------------+--------------------------+
| 5 | | 1% ( 4) | 0% ( 1) |
+------------+---+--------------------------+--------------------------+
| 6 | | 0% ( 1) | 0% ( 0) |
+------------+---+--------------------------+--------------------------+
|Fare |721| 7.85420/10.50000/26.00000|12.60625/26.00000/60.38957|
+------------+---+--------------------------+--------------------------+
|Embarked : C|719| 14% ( 61) | 26% ( 71) |
+------------+---+--------------------------+--------------------------+
| Q | | 8% ( 36) | 8% ( 22) |
+------------+---+--------------------------+--------------------------+
| S | | 78% (348) | 66% (181) |
+------------+---+--------------------------+--------------------------+
> data.complete <- data[complete.cases(data), ]
>
> install.packages("ellipse")
> library(ellipse)
>
> featurePlot(
+ data.complete[,
+ sapply(names(data.complete)
+ , function(n) { is.numeric(data.complete[, n]) } )]
+ , data.complete[, c("Survived")]
+ , "ellipse"
+ )
>
> mosaicplot(Survived ~ Pclass + Sex, data = data, color=T, main = "pclass and sex" )
> xtabs(~ Sex + Pclass, data = data)
Pclass
Sex 1 2 3
female 76 60 117
male 106 84 278
> xtabs(Survived == "survived" ~ Sex + Pclass, data = data)
Pclass
Sex 1 2 3
female 73 55 61
male 39 14 34
> xtabs(Survived == "survived" ~ Sex + Pclass, data = data) / xtabs(~ Sex + Pclass, data = data)
Pclass
Sex 1 2 3
female 0.9605263 0.9166667 0.5213675
male 0.3679245 0.1666667 0.1223022
>
>
> # Data Analysis by Decision Tree Model
>
> library(rpart)
> m <- rpart(
+ Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = titanic.train
+ )
> p <- predict(m, newdata=titanic.train, type="class")
> head(p)
1 3 4 5 6 7
dead survived survived dead dead dead
Levels: dead survived
>
> library(foreach)
> folds <- create_ten_fold_cv()
>
> rpart_result <- foreach(f=folds) %do% {
+ model_rpart <- rpart(
+ Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train
+ )
+ predicted <- predict(model_rpart, newdata=f$validation, type="class")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ }
>
> head(rpart_result)
[[1]]
[[1]]$actual
[1] dead survived dead survived dead survived dead dead survived dead dead dead dead ...
Levels: dead survived
[[1]]$predicted
8 20 27 33 36 53 72 81 83 94 114 122 136
dead survived dead survived dead survived dead dead survived dead survived dead dead
...
Levels: dead survived
...
>
> evaluation <- function(lst) {
+ accuracy <- sapply(lst, function(one_result) {
+ return(sum(one_result$predicted == one_result$actual) /
+ NROW(one_result$actual))
+ })
+ print(sprintf("MEAN +/- SD: %.3f +/- %.3f", mean(accuracy), sd(accuracy)))
+ return(accuracy)
+ }
>
> (rpart_accuracy <- evaluation(rpart_result))
[1] "MEAN +/- SD: 0.822 +/- 0.056"
[1] 0.8500000 0.9125000 0.7594937 0.8250000 0.8250000 0.8500000 0.7037037 0.8395062 0.8250000 0.8250000
>
> library(party)
> ctree_result <- foreach(f=folds) %do% {
+ model_ctree <- ctree(
+ Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train
+ )
+ predicted <- predict(model_ctree, newdata=f$validation, type="response")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ }
>
> (ctree_accuracy <- evaluation(ctree_result))
[1] "MEAN +/- SD: 0.828 +/- 0.047"
[1] 0.8625000 0.8875000 0.7974684 0.8250000 0.8000000 0.8750000 0.7283951 0.8518519 0.8125000 0.8375000
>
> plot(density(ctree_accuracy), main="rpart VS ctree")
> lines(density(rpart_accuracy), col="red", lty="dashed")
>
>
> # Data Advanced Analysis
>
> head(titanic)
Pclass Survived Name Sex Age SibSp Parch Ticket Fare
1 3 dead Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500
2 1 survived Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0 PC 17599 71.2833
3 3 survived Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250
4 1 survived Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000
5 3 dead Allen, Mr. William Henry male 35 0 0 373450 8.0500
6 3 dead Moran, Mr. James male NA 0 0 330877 8.4583
Cabin Embarked
1 <NA> S
2 C85 C
3 <NA> S
4 C123 S
5 <NA> S
6 <NA> Q
> View(titanic.train[order(titanic.train$Ticket), c("Ticket","Parch","Name","Cabin","Embarked")])
> sum(is.na(titanic.train$Ticket))
[1] 0
> sum(is.na(titanic.train$Embarked))
[1] 2
> sum(is.na(titanic.train$Cabin))
[1] 616
>
> library(plyr)
>
> family.result <- foreach(f=folds) %do% {
+ f$train$type <- "T"
+ f$validation$type <- "V"
+ all <- rbind(f$train, f$validation)
+ ctree_model <- ctree(
+ Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train
+ )
+ all$prob <- sapply(
+ predict(ctree_model, type="prob", newdata=all)
+ , function(result) { result[1] }
+ )
+
+ # 티켓 번호를 사용한 family_id
+ family_idx <- 0
+ ticket_based_family_id <- ddply(all, .(Ticket), function(rows) {
+ family_idx <<- family_idx + 1
+ return(data.frame(family_id=paste0("TICKET_", family_idx)))
+ })
+ all <- adply(all, 1
+ , function(row) {
+ family_id <- NA
+ if (!is.na(row$Ticket)) {
+ family_id <- subset(ticket_based_family_id
+ , Ticket == row$Ticket)$family_id
+ }
+ return(data.frame(family_id=family_id))
+ })
+
+ # avg_prob
+ all <- ddply(all, .(family_id), function(rows) {
+ rows$avg_prob <- mean(rows$prob)
+ return(rows)
+ })
+
+ # maybe_parent, maybe_child
+ all <- ddply(all, .(family_id), function(rows) {
+ rows$maybe_parent <- FALSE
+ rows$maybe_child <- FALSE
+ if (NROW(rows) == 1 || sum(rows$Parch) == 0
+ || NROW(rows) == sum(is.na(rows$Age))) {
+ return(rows)
+ }
+ max_age <- max(rows$Age, na.rm = T)
+ min_age <- min(rows$Age, na.rm = T)
+ return(adply(rows, 1, function(row) {
+ if (!is.na(row$Age) && !is.na(row$Sex)) {
+ row$maybe_parent <- (max_age - row$Age) < 10
+ row$maybe_child <- (row$Age - min_age) < 10
+ }
+ return(row)
+ }))
+ })
+
+ # avg_parent_prob, avg_child_prob
+ all <- ddply(all, .(family_id), function(rows) {
+ rows$avg_parent_prob <- rows$avg_prob
+ rows$avg_child_prob <- rows$avg_prob
+ if (NROW(rows) == 1 || sum(rows$Parch) == 0) {
+ return(rows)
+ }
+ parent_prob <- subset(rows, maybe_parent == TRUE)[, "prob"]
+ if (NROW(parent_prob) > 0) {
+ rows$avg_parent_prob <- mean(parent_prob)
+ }
+ child_prob <- subset(rows, maybe_child == TRUE)[, "prob"]
+ if (NROW(child_prob) > 0) {
+ rows$avg_child_prob <- mean(child_prob)
+ }
+ return(rows)
+ })
+
+ # ctree 모델
+ f$train <- subset(all, type == "T")
+ f$validation <- subset(all, type == "V")
+ (m <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ # + maybe_parent + maybe_child + age + sex + avg_prob + avg_parent_prob + avg_child_prob, data = f$train))
+ + maybe_parent + maybe_child + avg_prob + avg_parent_prob + avg_child_prob, data = f$train))
+ predicted <- predict(m, newdata=f$validation)
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ }
>
> family_accuracy <- evaluation(family.result)
[1] "MEAN +/- SD: 0.808 +/- 0.039"
>
>
> # Multiplization
>
> createMultiFolds(titanic.train$Survived, k=10, times = 3)
$Fold01.Rep1
[1] 1 2 3 4 5 7 8 9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 29 30 31 33
...
[697] 773 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 795 797 798 799 800 801
...
$Fold10.Rep1
...
$Fold01.Rep2
...
$Fold10.Rep2
...
$Fold01.Rep3
...
$Fold10.Rep3
...
>
> create_three_ten_fold_cv <- function() {
+ set.seed(137)
+ lapply(createMultiFolds(titanic.train$Survived, k=10, times = 3), function(idx) {
+ return(list(train=titanic.train[idx, ], validation=titanic.train[-idx, ]))
+ })
+ }
>
> folds <- create_three_ten_fold_cv()
>
> ctree_result <- foreach(f=folds) %do% {
+ model_ctree <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train)
+ predicted <- predict(model_ctree, newdata=f$validation, type="response")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ }
>
> (ctree_accuracy <- evaluation(ctree_result))
[1] "MEAN +/- SD: 0.826 +/- 0.047"
[1] 0.8625000 0.8875000 0.7974684 0.8250000 0.8000000 0.8750000 0.7283951 0.8518519 0.8125000 0.8375000 0.7777778
[12] 0.8481013 0.8125000 0.8395062 0.8000000 0.8625000 0.8888889 0.7250000 0.8607595 0.8375000 0.8734177 0.7875000
[23] 0.8500000 0.8250000 0.7530864 0.7375000 0.8375000 0.8641975 0.9000000 0.8125000
>
> system.time(ctree_result <- foreach(f=folds) %do% {
+ model_ctree <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train)
+ predicted <- predict(model_ctree, newdata=f$validation, type="response")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ })
사용자 시스템 elapsed
0.53 0.01 0.54
>
> library(doParallel)
>
> registerDoParallel(cores = 4)
>
> system.time(ctree_result <- foreach(f=folds) %dopar% {
+ model_ctree <- party::ctree(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train)
+ predicted <- predict(model_ctree, newdata=f$validation, type="response")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ })
사용자 시스템 elapsed
0.04 0.02 0.98
>
> registerDoParallel(cores = 2)
>
> system.time(ctree_result <- foreach(f=folds) %dopar% {
+ model_ctree <- party::ctree(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
+ , data = f$train)
+ predicted <- predict(model_ctree, newdata=f$validation, type="response")
+ return(list(actual=f$validation$Survived, predicted=predicted))
+ })
사용자 시스템 elapsed
0.03 0.03 1.72
출처 : R을 이용한 데이터 처리&분석 실무
'데이터분석 > R' 카테고리의 다른 글
[현장에서 바로 써먹는...] 텍스트마이닝 - 감성 분석 (0) | 2022.05.03 |
---|---|
[현장에서 바로 써먹는...] 텍스트마이닝 - 워드클라우드 (0) | 2022.05.02 |
[현장에서 바로 써먹는...] 인공 신경망과 딥러닝 - 분류 (0) | 2022.04.16 |
[현장에서 바로 써먹는...] 인공신경망과 딥러닝 - 회귀 (0) | 2022.04.13 |
[현장에서 바로 써먹는...] 군집분석 (0) | 2022.04.09 |