[실무 프로젝트로 배우는...] 예측 분석
> ### 3.8 중고차 판매 가격 예측을 위한 분석
>
> #### 3.8.1 범주형 변수를 활용한 예측 분석
>
> # 가변수를 활용한 회귀분석
>
> D_Reg = lm(log(price) ~ mileage + fuelType, data = Sample)
> summary(D_Reg)
Call:
lm(formula = log(price) ~ mileage + fuelType, data = Sample)
Residuals:
Min 1Q Median 3Q Max
-1.58062 -0.21701 -0.02386 0.18308 1.73167
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.037e+01 7.472e-03 1387.797 < 2e-16 ***
mileage -1.468e-05 1.719e-07 -85.397 < 2e-16 ***
fuelTypeHybrid 2.064e-01 6.898e-02 2.992 0.00278 **
fuelTypePetrol -1.646e-01 8.038e-03 -20.479 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3369 on 7463 degrees of freedom
Multiple R-squared: 0.4945, Adjusted R-squared: 0.4943
F-statistic: 2434 on 3 and 7463 DF, p-value: < 2.2e-16
>
> New_DF2 = data.frame(
+ mileage = rep(seq(0,1000000,by = 100000),3),
+ fuelType = rep(c("Hybrid","Petrol","Diesel"),c(11,11,11))
+ )
>
> Predicted1 = predict(D_Reg, newdata = New_DF2)
> New_DF2$Predicted1 = Predicted1
>
> ggplot(New_DF2) +
+ geom_point(aes(x = mileage, y = Predicted1, col = fuelType)) +
+ geom_line(aes(x = mileage, y = Predicted1,
+ col = fuelType, group = fuelType)) +
+ geom_hline(yintercept = 0, linetype = "dashed") +
+ ylab("Predicted price") +
+ theme_bw() +
+ theme(legend.position = "bottom")

>
> # 가변수를 활용한 일반화 가법모형
>
> D_GAM = gam(log(price) ~ s(mileage) + fuelType, data = Sample)
> summary(D_GAM)
Family: gaussian
Link function: identity
Formula:
log(price) ~ s(mileage) + fuelType
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.994265 0.005374 1859.791 < 2e-16 ***
fuelTypeHybrid 0.178238 0.067308 2.648 0.00811 **
fuelTypePetrol -0.144051 0.007959 -18.099 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(mileage) 8.165 8.819 915.3 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.519 Deviance explained = 52%
GCV = 0.10803 Scale est. = 0.10787 n = 7467
> # s(mileage)의 edf가 약 8.2로, 대략 8차 항 수준의 복잡도와 비슷함.
>
> Predicted2 = predict(D_GAM, newdata = New_DF2)
> New_DF2$Predicted2 = Predicted2
> New_DF2 %>%
+ rename(c("Predicted1" = "Linear",
+ "Predicted2" = "GAM")) %>%
+ melt(id.vars = c("mileage", "fuelType")) %>%
+ ggplot() +
+ geom_point(aes(x = mileage, y = value, col = fuelType)) +
+ geom_line(aes(x = mileage, y = value, col = fuelType, linetype = variable)) +
+ geom_hline(yintercept = 0, linetype = "dashed") +
+ ylab("Predicted price") +
+ theme_bw() +
+ theme(legend.position = "bottom")

> # 데이터의 비선형성을 유지하면서 fuelType의 수준에 따른 회귀선을 각각 구할 수 있다.
>
>
> #### 3.8.2 다중 회귀분석을 활용한 예측 분석
>
> Full_Model = lm(log(price) ~ mileage + mpg + engineSize + fuelType,
+ data = Sample)
> summary(Full_Model)
Call:
lm(formula = log(price) ~ mileage + mpg + engineSize + fuelType,
data = Sample)
Residuals:
Min 1Q Median 3Q Max
-1.84100 -0.10740 0.01035 0.11544 2.00184
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.038e+01 2.467e-02 420.57 <2e-16 ***
mileage -1.221e-05 1.212e-07 -100.67 <2e-16 ***
mpg -1.264e-02 3.015e-04 -41.91 <2e-16 ***
engineSize 2.808e-01 5.625e-03 49.92 <2e-16 ***
fuelTypeHybrid 1.564e+00 5.168e-02 30.27 <2e-16 ***
fuelTypePetrol -1.097e-01 6.383e-03 -17.19 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.212 on 7461 degrees of freedom
Multiple R-squared: 0.7999, Adjusted R-squared: 0.7998
F-statistic: 5966 on 5 and 7461 DF, p-value: < 2.2e-16
> # 예측자의 투입이 증가함에 따라 회귀선의 설명력이 79.99%로 상승함.
>
> # 다중공선성
> # 다중공선성(multicollinearity)을 진단하기 위해 분산팽창지수(variance inflation factor, VIF)를 확인
> vif(Full_Model)
GVIF Df GVIF^(1/(2*Df))
mileage 1.328626 1 1.152660
mpg 2.654201 1 1.629172
engineSize 1.921767 1 1.386278
fuelType 2.257592 2 1.225777
> # (G)VIF 값이 모두 3보다 작으며, 따라서 다중공선성 문제는 크지 않다고 판단할 수 있다.
>
> # 다중 일반화 가법모형
> Full_GAM = gam(log(price) ~ s(mileage) + s(mpg) + engineSize + fuelType,
+ data = Sample)
> summary(Full_GAM)
Family: gaussian
Link function: identity
Formula:
log(price) ~ s(mileage) + s(mpg) + engineSize + fuelType
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.494274 0.015487 613.04 <2e-16 ***
engineSize 0.254749 0.006645 38.34 <2e-16 ***
fuelTypeHybrid -0.218949 0.124405 -1.76 0.0785 .
fuelTypePetrol -0.125616 0.007064 -17.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(mileage) 5.037 6.198 1668.1 <2e-16 ***
s(mpg) 8.344 8.814 197.2 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.813 Deviance explained = 81.4%
GCV = 0.042024 Scale est. = 0.041926 n = 7467
> # 일반화 가법모형으로 예측자를 여러 개 활용하니 설명력이 81.3%로 더 상승함.
>
>
> #### 3.8.3 예측 모형의 성능 평가
>
> # MSE 계산
>
> Reg2_P = predict(Reg2, newdata = TEST)
> Reg3_P = predict(Reg3, newdata = TEST)
> GAM_P = predict(GAM, newdata = TEST)
> D_Reg_P = predict(D_Reg, newdata = TEST)
> D_GAM_P = predict(D_GAM, newdata = TEST)
> Full_Model_P = predict(Full_Model, newdata = TEST)
> Full_GAM_P = predict(Full_GAM, newdata = TEST)
>
> Reg2_MSE = mean((Reg2_P - log(TEST$price))^2)
> Reg3_MSE = mean((Reg3_P - log(TEST$price))^2)
> GAM_MSE = mean((GAM_P - log(TEST$price))^2)
> D_Reg_MSE = mean((D_Reg_P - log(TEST$price))^2)
> D_GAM_MSE = mean((D_GAM_P - log(TEST$price))^2)
> Full_Model_MSE = mean((Full_Model_P - log(TEST$price))^2)
> Full_GAM_MSE = mean((Full_GAM_P - log(TEST$price))^2)
>
> Result = data.frame(
+ Model = c("Linear","Polynomial","GAM",
+ "Dummy_Linear","Dummy_GAM",
+ "Full_Linear","Full_GAM"),
+ Value = c(Reg2_MSE, Reg3_MSE, GAM_MSE,
+ D_Reg_MSE,D_GAM_MSE,Full_Model_MSE,Full_GAM_MSE)
+ )
>
> ggplot(Result) +
+ geom_bar(aes(x = reorder(Model,Value), y = Value, fill = Model),
+ stat = 'identity', alpha = 0.8) +
+ guides(fill = FALSE) +
+ ylab("MSE") + xlab("") +
+ coord_flip() +
+ theme_bw() +
+ theme(legend.position = "bottom")
경고메시지(들):
`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

> # 단순 선형회귀모형(Linear)의 MSE가 가장 크고, 예측자를 여러 개 활용한 다중 가법모형(Full_GAM)의 MSE가 가장 낮다.
>
> # k-fold 교차검증법
>
> set.seed(1234)
> ID = sample(rep(seq(5), length = nrow(Audi)))
> # 데이터를 5개의 폴드(fold)로 나누어 5-fold 교차검증을 진행
>
> Matrix = matrix(data = 0, nrow = 5, ncol = 7)
>
> for(k in 1:5) {
+
+ TEST = Audi[ID == k,]
+ TRAIN = Audi[ID != k,]
+
+ Linear = lm(log(price) ~ mileage, data = TRAIN)
+ Poly = lm(log(price) ~ poly(mileage,2), data = TRAIN)
+ GAM = gam(log(price) ~ s(mileage), data = TRAIN)
+ D_Reg = lm(log(price) ~ mileage + fuelType, data = TRAIN)
+ D_GAM = gam(log(price) ~ s(mileage) + fuelType, data = TRAIN)
+ Full_Model = lm(log(price) ~ mileage + mpg + engineSize + fuelType,
+ data = TRAIN)
+ Full_GAM = gam(log(price) ~ s(mileage) + s(mpg) + engineSize + fuelType,
+ data = TRAIN)
+
+ Linear_P = predict(Linear, newdata = TEST)
+ Poly_P = predict(Poly, newdata = TEST)
+ GAM_P = predict(GAM, newdata = TEST)
+ D_Reg_P = predict(D_Reg, newdata = TEST)
+ D_GAM_P = predict(D_GAM, newdata = TEST)
+ Full_Model_P = predict(Full_Model, newdata = TEST)
+ Full_GAM_P = predict(Full_GAM, newdata = TEST)
+
+ Linear_MSE = mean((Linear_P - log(TEST$price))^2)
+ Poly_MSE = mean((Poly_P - log(TEST$price))^2)
+ GAM_MSE = mean((GAM_P - log(TEST$price))^2)
+ D_Reg_MSE = mean((D_Reg_P - log(TEST$price))^2)
+ D_GAM_MSE = mean((D_GAM_P - log(TEST$price))^2)
+ Full_Model_MSE = mean((Full_Model_P - log(TEST$price))^2)
+ Full_GAM_MSE = mean((Full_GAM_P - log(TEST$price))^2)
+
+ Matrix[k,1:7] = c(Linear_MSE, Poly_MSE, GAM_MSE, D_Reg_MSE,
+ D_GAM_MSE, Full_Model_MSE, Full_GAM_MSE)
+
+ }
>
> Matrix = as.data.frame(Matrix)
> colnames(Matrix) = c("Linear","Polynomial","GAM",
+ "Dummy_Linear","Dummy_GAM",
+ "Full_Linear","Full_GAM")
> Matrix
Linear Polynomial GAM Dummy_Linear Dummy_GAM Full_Linear Full_GAM
1 0.1187959 0.1167826 0.1101573 0.1120776 0.10570781 0.04227763 0.04038662
2 0.1218905 0.1193234 0.1146295 0.1158817 0.11022267 0.04796794 0.04342912
3 0.1191377 0.1166760 0.1126962 0.1105547 0.10569429 0.03858679 0.03790995
4 0.1241025 0.1234796 0.1177354 0.1202511 0.11524348 0.04454187 0.05853426
5 0.1106108 0.1086388 0.1049174 0.1039181 0.09986993 0.04451998 0.04025552
>
> Matrix %>%
+ melt() %>%
+ ggplot() +
+ geom_boxplot(aes(x = variable, y = value, fill = variable),
+ alpha = 0.6) +
+ xlab("Model") + ylab("MSE") +
+ guides(fill = FALSE) +
+ theme_bw() +
+ coord_flip()
Using as id variables
경고메시지(들):
`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

> # 예측자를 여러 개 사용한 일반화 가법모형과 다중 회귀모형의 MSE가 다른 분석방법보다 많이 낮은 것을 확인할 수 있다.
출처 : 실무 프로젝트로 배우는 데이터 분석 with R