> ### Chapter 7. 인공신경망과 딥 러닝
>
> ## Chapter 7-1. 닭의 무게를 예측할 수 있을까? (회귀)
>
> # 종란무게와 누적 사료량으로 닭의 무게 예측을 위한 데이터 불러오기
> w <- read.csv("ch7-1.csv", header = TRUE)
> head(w)
egg_weight acc_food weight
1 69 10602 4128
2 76 10640 4104
3 76 10898 4119
4 71 10384 4127
5 71 10709 4112
6 72 10768 4124
> str(w)
'data.frame': 300 obs. of 3 variables:
$ egg_weight: int 69 76 76 71 71 72 61 69 64 52 ...
$ acc_food : int 10602 10640 10898 10384 10709 10768 10077 10574 10256 9722 ...
$ weight : int 4128 4104 4119 4127 4112 4124 4093 4114 4104 4055 ...
>
> # ind라는 인덱스를 무작위로 만들어 8:2로 훈련, 테스트 셋 분할
> ind <- sample(1:nrow(w), nrow(w)*0.8, replace =F)
> r_train <- w[ind,] # 80%의 데이터를 훈련 셋으로 분할
> r_test <- w[-ind,] # 나머지 데이터를 테스트 셋으로 분할
> head(r_train)
egg_weight acc_food weight
65 57 9915 4074
154 64 10230 4091
205 80 10928 4104
11 71 10512 4120
10 52 9722 4055
219 73 10448 4129
>
> cor(r_train) # 상관분석을 통해 유의한 인자인지 확인
egg_weight acc_food weight
egg_weight 1.0000000 0.9446786 0.7617528
acc_food 0.9446786 1.0000000 0.7793485
weight 0.7617528 0.7793485 1.0000000
> # 훈련용 데이터셋 산점도 그리기
> plot(r_train)
>
> library(nnet)
>
> ?nnet
> # nnet 함수활용 은닉층 하나의 간단한 신경망 구현
> r_nn <- nnet(weight~., data = r_train, size = 3 , decay = 5e-4,
+ rang = 0.1, maxit = 500, linout = TRUE)
# weights: 13
initial value 4034735172.455049
final value 149709.581282
converged
>
> summary(r_nn) # 모델 확인
a 2-3-1 network with 13 weights
options were - linear output units decay=5e-04
b->h1 i1->h1 i2->h1
0.01 -0.09 -0.09
b->h2 i1->h2 i2->h2
-0.04 0.07 -0.01
b->h3 i1->h3 i2->h3
0.06 0.09 -0.05
b->o h1->o h2->o h3->o
4100.19 -0.06 -0.07 -0.06
>
> # r_test 데이터 셋을 이후에 쓸 예정으로 성능평가를 위한 test 데이터 셋 만들기
> test <- r_test
>
> # 신경망 모델을 이용해 예측값 생성
> test$pred <- predict(r_nn, newdata = r_test)
> head(test) # 데이터 확인
egg_weight acc_food weight pred
1 69 10602 4128 4100.187
4 71 10384 4127 4100.187
13 69 10298 4114 4100.187
38 78 10766 4096 4100.187
43 53 9963 4054 4100.187
46 71 10496 4112 4100.187
>
> # 산점도 확인
> plot(test)
>
> # 회귀 예측값 성능평가
> # (참고: 아래에서 R2가 NA가 되는 이유 — 위 head(test)에서 보듯 예측값이 모두
> #  4100.187로 상수이기 때문에 상관계산 시 표준편차가 0이 되어 R2를 구할 수 없음.
> #  입력 변수의 스케일 차이로 학습이 제대로 되지 않은 것이며, 아래 Normalization으로 해결됨)
> R2(test$pred, test$weight)
[,1]
[1,] NA
경고메시지(들):
In cor(obs, pred, use = ifelse(na.rm, "complete.obs", "everything")) :
표준편차가 0입니다
> RMSE(test$pred, test$weight)
[1] 25.12031
> MAE(test$pred, test$weight)
[1] 21.70624
>
> # Data Normalization
> # caret 패키지에서 preProcess() 함수로 스케일링을 제공하기 때문에 불러옴
> # (앞서 사용한 R2(), RMSE(), MAE() 함수도 caret 패키지에서 제공되는 함수임)
> library(caret)
>
> # preProcess() 함수에서 method를 range로 지정하면 Normalization 가능
> # (주의: 여기서는 전체 데이터 w로 스케일링 기준(min/max)을 계산하므로 테스트 셋 정보가
> #  훈련에 새어 들어가는 데이터 누수(data leakage) 소지가 있음 — 원칙적으로는
> #  훈련 셋 r_train만으로 preProcess를 적합해야 함)
> nor <- preProcess(w[,1:2], method="range")
>
> # predict() 함수를 이용해 r_train 데이터의 독립변수를 Normalization 실시
> r_x_train <- predict(nor, r_train[,1:2])
> summary(r_x_train)
egg_weight acc_food
Min. :0.0000 Min. :0.0000
1st Qu.:0.2143 1st Qu.:0.2967
Median :0.5000 Median :0.5271
Mean :0.4975 Mean :0.5065
3rd Qu.:0.7143 3rd Qu.:0.6987
Max. :1.0000 Max. :0.9685
> # predict() 함수를 이용해 r_test 데이터의 독립변수를 Normalization 실시
> r_x_test <- predict(nor, r_test[,1:2])
> summary(r_x_test)
egg_weight acc_food
Min. :0.0000 Min. :0.007331
1st Qu.:0.2143 1st Qu.:0.256232
Median :0.5536 Median :0.458211
Mean :0.4875 Mean :0.505682
3rd Qu.:0.7589 3rd Qu.:0.714626
Max. :1.0000 Max. :1.000000
>
> # Normalization 시킨 r_x_train 데이터 셋과 기존 종속변수를 열병합 실시
> r_n_train <- cbind(r_x_train, r_train[,3])
> names(r_n_train)[3] <- "weight" # 합쳐진 데이터 셋의 3번째 열 이름을 weight로 변경
>
> # Normalization 시킨 r_x_test 데이터 셋과 기존 종속변수를 열병합 실시
> r_n_test <- cbind(r_x_test, r_test[,3])
> names(r_n_test)[3] <- "weight" # 합쳐진 데이터 셋의 3번째 열 이름을 weight로 변경
>
> head(r_n_train)
egg_weight acc_food weight
65 0.1785714 0.18695015 4074
154 0.4285714 0.41788856 4091
205 1.0000000 0.92961877 4104
11 0.6785714 0.62463343 4120
10 0.0000000 0.04545455 4055
219 0.7500000 0.57771261 4129
> head(r_n_test)
egg_weight acc_food weight
1 0.60714286 0.6906158 4128
4 0.67857143 0.5307918 4127
13 0.60714286 0.4677419 4114
38 0.92857143 0.8108504 4096
43 0.03571429 0.2221408 4054
46 0.67857143 0.6129032 4112
>
> # nnet 함수활용 간단한 신경망 구현
> r_nn_s <- nnet(weight~., data = r_n_train, size = 3, decay = 5e-4,
+ rang = 0.1, maxit = 500, linout = TRUE)
# weights: 13
initial value 4035168897.112994
iter 10 value 150491.169529
iter 20 value 147954.731425
iter 30 value 133653.305602
iter 40 value 86942.878235
iter 50 value 70032.698819
iter 60 value 64944.518061
iter 70 value 60384.746717
iter 80 value 57454.783759
iter 90 value 55783.350577
iter 100 value 52899.973568
iter 110 value 45938.934744
iter 120 value 35247.970567
iter 130 value 34923.314561
iter 140 value 34674.639147
final value 34673.104067
converged
>
> summary(r_nn_s)
a 2-3-1 network with 13 weights
options were - linear output units decay=5e-04
b->h1 i1->h1 i2->h1
2.64 3.04 0.75
b->h2 i1->h2 i2->h2
-22.35 -1.65 -2.83
b->h3 i1->h3 i2->h3
25.59 4.71 6.36
b->o h1->o h2->o h3->o
1435.12 1245.07 -5.94 1441.98
>
> # 신경망 모델을 이용해 예측값 생성
> r_n_test$pred <- predict(r_nn_s, newdata = r_n_test)
> head(r_n_test)
egg_weight acc_food weight pred
1 0.60714286 0.6906158 4128 4113.890
4 0.67857143 0.5307918 4127 4114.659
13 0.60714286 0.4677419 4114 4112.403
38 0.92857143 0.8108504 4096 4119.313
43 0.03571429 0.2221408 4054 4058.349
46 0.67857143 0.6129032 4112 4115.103
>
> # 회귀모델 성능 평가
> R2(r_n_test$pred, r_n_test$weight) # R2
[,1]
[1,] 0.827669
> RMSE(r_n_test$pred, r_n_test$weight) # RMSE
[1] 10.39716
> MAE(r_n_test$pred, r_n_test$weight) # MAE
[1] 8.762227
>
>
> # H2O 활용 딥러닝 실시(회귀)
> install.packages("h2o") # h2o 패키지 설치
also installing the dependency ‘RCurl’
URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/RCurl_1.98-1.6.zip'을 시도합니다
Content type 'application/zip' length 3072932 bytes (2.9 MB)
downloaded 2.9 MB
URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/h2o_3.36.0.4.zip'을 시도합니다
Content type 'application/zip' length 178064119 bytes (169.8 MB)
downloaded 169.8 MB
package ‘RCurl’ successfully unpacked and MD5 sums checked
package ‘h2o’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\jbt\AppData\Local\Temp\Rtmpsnfzaw\downloaded_packages
> library(h2o) # h2o 라이브러리 불러오기
----------------------------------------------------------------------
Your next step is to start H2O:
> h2o.init()
For H2O package documentation, ask for help:
> ??h2o
After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit https://docs.h2o.ai
----------------------------------------------------------------------
다음의 패키지를 부착합니다: ‘h2o’
The following objects are masked from ‘package:stats’:
cor, sd, var
The following objects are masked from ‘package:base’:
%*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames, colnames<-, ifelse, is.character, is.factor,
is.numeric, log, log10, log1p, log2, round, signif, trunc
경고메시지(들):
패키지 ‘h2o’는 R 버전 4.0.5에서 작성되었습니다
> h2o.init() # h2o 라이브러리 기동
H2O is not running yet, starting it now...
Note: In case of errors look at the following log files:
C:\Users\jbt\AppData\Local\Temp\Rtmpsnfzaw\file610047a3e2abc/h2o_jbt_started_from_r.out
C:\Users\jbt\AppData\Local\Temp\Rtmpsnfzaw\file61004459f7124/h2o_jbt_started_from_r.err
java version "1.8.0_45"
Java(TM) SE Runtime Environment (build 1.8.0_45-b15)
Java HotSpot(TM) 64-Bit Server VM (build 25.45-b02, mixed mode)
Starting H2O JVM and connecting: Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 8 seconds 434 milliseconds
H2O cluster timezone: Asia/Seoul
H2O data parsing timezone: UTC
H2O cluster version: 3.36.0.4
H2O cluster version age: 14 days, 11 hours and 46 minutes
H2O cluster name: H2O_started_from_R_jbt_dak018
H2O cluster total nodes: 1
H2O cluster total memory: 3.53 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
R Version: R version 4.0.2 (2020-06-22)
>
> # 데이터 불러오기
> head(r_train)
egg_weight acc_food weight
65 57 9915 4074
154 64 10230 4091
205 80 10928 4104
11 71 10512 4120
10 52 9722 4055
219 73 10448 4129
>
> # H2O 전용 데이터프레임으로 변환
> hf_r_train <- as.h2o(r_train)
|=====================================================================| 100%
> hf_r_test <- as.h2o(r_test)
|=====================================================================| 100%
>
> head(hf_r_train)
egg_weight acc_food weight
1 57 9915 4074
2 64 10230 4091
3 80 10928 4104
4 71 10512 4120
5 52 9722 4055
6 73 10448 4129
> str(hf_r_train)
Class 'H2OFrame' <environment: 0x0000019368572228>
- attr(*, "op")= chr "Parse"
- attr(*, "id")= chr "r_train_sid_b41b_1"
- attr(*, "eval")= logi FALSE
- attr(*, "nrow")= int 240
- attr(*, "ncol")= int 3
- attr(*, "types")=List of 3
..$ : chr "int"
..$ : chr "int"
..$ : chr "int"
- attr(*, "data")='data.frame': 10 obs. of 3 variables:
..$ egg_weight: num 57 64 80 71 52 73 58 62 75 69
..$ acc_food : num 9915 10230 10928 10512 9722 ...
..$ weight : num 4074 4091 4104 4120 4055 ...
>
> # 설정된 심층 신경망과 역전파 알고리즘을 통하여 예측
> # hf_r_train 데이터 셋 활용 노드가 3개씩인 2개의 은닉층을 가진 DNN 구축
> fit <- h2o.deeplearning(x = 1:2, y = 3, training_frame = hf_r_train,
+ hidden = c(3, 3), epochs = 200,
+ standardize = TRUE)
|==================================================================================================================| 100%
>
> summary(fit)
Model Details:
==============
H2ORegressionModel: deeplearning
Model Key: DeepLearning_model_R_1649884908201_1
Status of Neuron Layers: predicting weight, regression, gaussian distribution, Quadratic loss, 25 weights/biases, 3.7 KB, 48,000 training samples, mini-batch size 1
layer units type dropout l1 l2 mean_rate rate_rms momentum mean_weight weight_rms mean_bias bias_rms
1 1 2 Input 0.00 % NA NA NA NA NA NA NA NA NA
2 2 3 Rectifier 0.00 % 0.000000 0.000000 0.001101 0.000381 0.000000 -0.455136 0.552540 0.758124 0.253093
3 3 3 Rectifier 0.00 % 0.000000 0.000000 0.057340 0.080426 0.000000 0.034545 1.200521 0.879458 0.786997
4 4 1 Linear NA 0.000000 0.000000 0.000914 0.000569 0.000000 0.232891 0.750817 -0.135004 0.000000
H2ORegressionMetrics: deeplearning
** Reported on training data. **
** Metrics reported on full training frame **
MSE: 75.84092
RMSE: 8.708669
MAE: 7.150456
RMSLE: 0.002119319
Mean Residual Deviance : 75.84092
Scoring History:
timestamp duration training_speed epochs iterations samples training_rmse training_deviance
1 2022-04-14 06:25:30 0.000 sec NA 0.00000 0 0.000000 NA NA
2 2022-04-14 06:25:30 0.361 sec 52173 obs/sec 10.00000 1 2400.000000 10.83041 117.29780
3 2022-04-14 06:25:31 0.486 sec 289156 obs/sec 200.00000 20 48000.000000 8.70867 75.84092
training_mae training_r2
1 NA NA
2 8.77397 0.80077
3 7.15046 0.87119
Variable Importances: (Extract with `h2o.varimp`)
=================================================
Variable Importances:
variable relative_importance scaled_importance percentage
1 egg_weight 1.000000 1.000000 0.740365
2 acc_food 0.350685 0.350685 0.259635
>
> # hf_r_test 데이터셋과 fit 모델을 이용해 만든 예측값을 r_pred에 입력
> r_pred <- h2o.predict(fit, newdata = hf_r_test)
|==================================================================================================================| 100%
> class(r_pred)
[1] "H2OFrame"
>
> # r_pred를 데이터프레임으로 변환
> df_r_pred <- as.data.frame(r_pred)
> class(df_r_pred)
[1] "data.frame"
>
> R2(df_r_pred$predict, r_test$weight) # R2
[1] 0.898282
> RMSE(df_r_pred$predict, r_test$weight) # RMSE
[1] 8.150211
> MAE(df_r_pred$predict, r_test$weight) # MAE
[1] 6.986426
출처 : 현장에서 바로 써먹는 데이터 분석 with R
'데이터분석 > R' 카테고리의 다른 글
| 글 | 날짜 |
|---|---|
| 타이타닉 데이터 분류 예측 (0) | 2022.04.25 |
| [현장에서 바로 써먹는...] 인공 신경망과 딥러닝 - 분류 (0) | 2022.04.16 |
| [현장에서 바로 써먹는...] 군집분석 (0) | 2022.04.09 |
| [현장에서 바로 써먹는...] 분류분석 (0) | 2022.04.09 |
| [현장에서 바로 써먹는...] 로지스틱 회귀 (0) | 2022.04.09 |