[실무 프로젝트로 배우는...] Machine Learning Basics

by 버섯도리 2022. 1. 31.

> ### 3.10 Machine Learning Basics for Used-Car Grade Analysis

> #### 3.10.1 Decision Trees

> # The entropy model
> # Entropy is the impurity measure used by decision trees: it quantifies how mixed the class labels at a node are.
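
For reference, if $p_k$ is the proportion of cases belonging to class $k$, the entropy of a node is

$$H = -\sum_{k} p_k \log_2 p_k$$

which is exactly the sum the loop below accumulates: a pure node gives $H = 0$, and the more evenly the classes are mixed, the larger $H$ becomes.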

> # Computing the impurity measure

> DF = data.frame(
+   TEXT = rep(c("A","B"),c(9,3)),
+   x = rep(1:4,3),
+   y = rep(1:3, each = 4)
+ )

> H = 0

> for(k in unique(DF$TEXT)) {
+   Prob = sum(DF$TEXT == k)/ nrow(DF) 
+   Value = Prob * log2(Prob)
+   H = H + Value
+ }
> H = H * (-1)
> print(H)
[1] 0.8112781
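
The same value can be computed without an explicit loop. A minimal vectorized sketch (my own, not from the book), using the DF just defined:

# Class proportions via table(), then the entropy sum in one shot
Prob_vec = as.numeric(table(DF$TEXT)) / nrow(DF)
-sum(Prob_vec * log2(Prob_vec))   # 0.8112781, matching the loop above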

> DF = data.frame(
+   TEXT = rep(c("A","B","C","D"),c(3,3,1,5)),
+   x = rep(1:4,3),
+   y = rep(1:3, each = 4)
+ )

> H = 0
> for(k in unique(DF$TEXT)) {
+   Prob = sum(DF$TEXT == k)/ nrow(DF) 
+   Value = Prob * log2(Prob)
+   H = H + Value
+ }
> H = H * (-1)
> print(H)
[1] 1.825011

> # Information gain
> # When a decision tree splits in the direction that reduces entropy, the amount of entropy removed is defined as the information gain (IG).
> # First, compute the entropy of each of the two subsets produced by the split rule.
> # Multiply each subset's entropy by the proportion of cases it contains, then subtract both weighted values from the parent entropy to obtain the information gain.
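
Written out (the standard definition, matching the computation below): if a parent node with entropy $H$ is split into two subsets holding $n_1$ and $n_2$ of its $n$ cases, with entropies $H_1$ and $H_2$, then

$$IG = H - \frac{n_1}{n} H_1 - \frac{n_2}{n} H_2$$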


> # A user-defined function that computes entropy
> Entropy_Function = function(DF) {
+   H = 0
+   
+   for(k in unique(DF$TEXT)){
+     Prob = sum(DF$TEXT == k) / nrow(DF)
+     Value = Prob * log2(Prob)
+     H = H + Value
+   }
+   
+   H = H * (-1)
+   print(H)
+   return(H)
+ }
> library(dplyr)

> DF_AC = DF %>%
+   dplyr::filter(TEXT %in% c("A","C"))
> DF_BD = DF %>%
+   dplyr::filter(TEXT %in% c("B","D"))

> H = Entropy_Function(DF = DF)
[1] 1.825011
> H_AC = Entropy_Function(DF = DF_AC)
[1] 0.8112781
> H_BD = Entropy_Function(DF = DF_BD)
[1] 0.954434

> IG = H - (4/12)*H_AC - (8/12)*H_BD
> IG
[1] 0.9182958
> # The information gain is 0.918. By default, a decision tree learns by generating the split rule that yields the largest information gain.
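
To make "the split rule with the largest information gain" concrete, here is a small sketch (my own illustration, not from the book) that scores one hypothetical candidate rule, x <= 2 versus x > 2, on DF with the Entropy_Function defined above:

# Hypothetical candidate rule: partition DF on x <= 2 (illustration only)
Left  = DF[DF$x <= 2, ]
Right = DF[DF$x >  2, ]
IG_x = Entropy_Function(DF) -
  (nrow(Left)  / nrow(DF)) * Entropy_Function(Left) -
  (nrow(Right) / nrow(DF)) * Entropy_Function(Right)
# A tree learner evaluates every candidate rule this way and keeps
# the one with the largest gain.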

> # C5.0 decision tree

> install.packages("C50")
> library(C50)

> Sample$price_G = as.factor(Sample$price_G)
> FEATURE = Sample[,c("mileage","mpg","engineSize")]
> RESPONSE = Sample[,c("price_G")]

> tree1 = C5.0(FEATURE, RESPONSE, control = C5.0Control(noGlobalPruning = FALSE, 
+     minCases = 150), trials = 10)
> # minCases : the minimum number of cases that each outcome of a split must contain
> # trials : the number of decision trees to build (the trees are generated via boosting)
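
As a side note, predict() on a C5.0 model can also return class probabilities instead of hard labels (type = "prob" in the C50 package), aggregated across the boosting trials. A quick sketch, assuming the same Test set used in the evaluation below:

# Per-class probabilities for the boosted model, one row per test case
prob1 = predict(tree1, newdata = Test, type = "prob")
head(prob1)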

> summary(tree1)

Call:
C5.0.default(x = FEATURE, y = RESPONSE, trials = 10, control = C5.0Control(noGlobalPruning = FALSE, minCases = 150))


C5.0 [Release 2.07 GPL Edition]   Tue Feb 01 17:10:29 2022
-------------------------------

Class specified by attribute `outcome'

Read 7467 cases (4 attributes) from undefined.data

-----  Trial 0:  -----
...

-----  Trial 9:  -----

Decision tree:

mpg > 48.7: 0 (1779.3/192)
mpg <= 48.7:
:...mileage > 51663: 0 (381.7/25)
    mileage <= 51663:
    :...mpg <= 35.3: 1 (376.6)
        mpg > 35.3:
        :...engineSize <= 1.9: 0 (554.4/90.9)
            engineSize > 1.9:
            :...mileage <= 985: 1 (200.1/20.4)
                mileage > 985:
                :...mileage > 40283: 0 (349/71.6)
                    mileage <= 40283:
                    :...mpg <= 38.2: 1 (354/60.5)
                        mpg > 38.2:
                        :...engineSize > 2.5: 1 (491.4/147)
                            engineSize <= 2.5:
                            :...mileage > 10581: 0 (356.7/9.8)
                                mileage <= 10581:
                                :...mpg <= 43.5: 0 (1280.7/444)
                                    mpg > 43.5: 1 (984.2/349.7)


Evaluation on training data (7467 cases):

Trial     Decision Tree   
-----   ----------------  
  Size      Errors  

   0      7  772(10.3%)
   1      5 1065(14.3%)
   2      8  965(12.9%)
   3      7  853(11.4%)
   4      9  898(12.0%)
   5      8 1000(13.4%)
   6      5 1019(13.6%)
   7      7  841(11.3%)
   8      7  746(10.0%)
   9     11  668( 8.9%)
boost         590( 7.9%)   <<


   (a)   (b)    <-classified as
  ----  ----
  5380   227    (a): class 0
   363  1497    (b): class 1


Attribute usage:

100.00% mileage
100.00% mpg
100.00% engineSize


Time: 0.1 secs

> plot(tree1)


> # Increasing minCases yields a simpler decision tree.
> tree2 = C5.0(FEATURE, RESPONSE, control = C5.0Control(noGlobalPruning = FALSE, 
+     minCases = 400), trials = 10)
> plot(tree2)


> # Performance evaluation

> library(caret)

> tree_pred1 = predict(tree1, newdata = Test)
> confusionMatrix(factor(tree_pred1, levels = c(1,0)), 
+                 factor(Test$price_G, levels = c(1,0)))
Confusion Matrix and Statistics

          Reference
Prediction    1    0
         1  601  117
         0  189 2294
                                          
               Accuracy : 0.9044          
                 95% CI : (0.8937, 0.9144)
    No Information Rate : 0.7532          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.7347          
                                          
 Mcnemar's Test P-Value : 4.933e-05       
                                          
            Sensitivity : 0.7608          
            Specificity : 0.9515          
         Pos Pred Value : 0.8370          
         Neg Pred Value : 0.9239          
             Prevalence : 0.2468          
         Detection Rate : 0.1878          
   Detection Prevalence : 0.2243          
      Balanced Accuracy : 0.8561          
                                          
       'Positive' Class : 1               
                                          
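
The headline statistics follow directly from the four cells of the table. A quick sanity check of the tree1 numbers (my arithmetic; positive class = 1):

# From the tree1 confusion matrix above
TP = 601; FP = 117; FN = 189; TN = 2294
(TP + TN) / (TP + FP + FN + TN)   # Accuracy    = 0.9044
TP / (TP + FN)                    # Sensitivity = 0.7608
TN / (TN + FP)                    # Specificity = 0.9515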

> tree_pred2 = predict(tree2, newdata = Test)
> confusionMatrix(factor(tree_pred2, levels = c(1,0)), 
+                 factor(Test$price_G, levels = c(1,0)))
Confusion Matrix and Statistics

          Reference
Prediction    1    0
         1  609  136
         0  181 2275
                                          
               Accuracy : 0.901           
                 95% CI : (0.8901, 0.9111)
    No Information Rate : 0.7532          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.7284          
                                          
 Mcnemar's Test P-Value : 0.01346         
                                          
            Sensitivity : 0.7709          
            Specificity : 0.9436          
         Pos Pred Value : 0.8174          
         Neg Pred Value : 0.9263          
             Prevalence : 0.2468          
         Detection Rate : 0.1903          
   Detection Prevalence : 0.2327          
      Balanced Accuracy : 0.8572          
                                          
       'Positive' Class : 1               
                                          

> # Classification performance hardly differs between the two minCases settings. When performance is this close, the simpler model is always the better choice.


> #### 3.10.2 Random Forest

> library(randomForest)

> rf.fit = randomForest(price_G ~ mileage + mpg + engineSize, 
+                       data = Sample, mtry = 2, ntree = 50)
> # mtry : the number of predictor variables randomly sampled as split candidates at each node
> # ntree : the total number of decision trees to grow
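
With only three predictors, mtry = 2 is close to the usual sqrt(p) default for classification, but it can also be chosen from the data. A hedged sketch using tuneRF() from the randomForest package, which searches over mtry values by out-of-bag (OOB) error (assumes Sample$price_G was converted to a factor, as above):

# Search mtry by OOB error; seed fixed because resampling is random
set.seed(123)
tuneRF(x = Sample[, c("mileage", "mpg", "engineSize")],
       y = Sample$price_G,
       ntreeTry = 50, stepFactor = 2, improve = 0.01)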

> y_pred = predict(rf.fit, newdata = Test)
> confusionMatrix(factor(y_pred, levels = c(1,0)), 
+                 factor(Test$price_G, levels = c(1,0)))
Confusion Matrix and Statistics

          Reference
Prediction    1    0
         1  647  123
         0  143 2288
                                          
               Accuracy : 0.9169          
                 95% CI : (0.9068, 0.9262)
    No Information Rate : 0.7532          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.7746          
                                          
 Mcnemar's Test P-Value : 0.244           
                                          
            Sensitivity : 0.8190          
            Specificity : 0.9490          
         Pos Pred Value : 0.8403          
         Neg Pred Value : 0.9412          
             Prevalence : 0.2468          
         Detection Rate : 0.2021          
   Detection Prevalence : 0.2405          
      Balanced Accuracy : 0.8840          
                                          
       'Positive' Class : 1               
                                          
> # Sensitivity increased substantially compared with the C5.0 trees (0.76 → 0.82).

> plot(rf.fit$err.rate[, 1], col = "red")

> # A random forest's err.rate element records the error rate as training progresses.
> # As the index (the number of trees) increases, the error rate generally decreases.
> # Beyond some number of trees the error rate stops dropping meaningfully; further training adds nothing, so adjust the ntree option there to manage training time efficiently.
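
One way to read that point off programmatically (a sketch; the "OOB" column of err.rate holds the out-of-bag error after each additional tree):

# Locate the tree count where the out-of-bag error bottoms out
oob = rf.fit$err.rate[, "OOB"]
which.min(oob)   # number of trees at the lowest OOB error
min(oob)         # the OOB error rate at that point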

Source: 실무 프로젝트로 배우는 데이터 분석 with R