[실무 프로젝트로 배우는...] Building a Sentiment Dictionary through Review Data Analysis
> ### 7.2 Data Preprocessing
>
> #### 7.2.1 Loading the Data and Packages
>
> library(data.table)
> library(ggplot2)
> library(dplyr)
> library(reshape)
>
> Download URL: https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews
> DIR = "F:/1_Study/1_BigData/12_R/02_Practical-R/ReviewData/"
> Womens = fread(paste0(DIR, "Womens Clothing E-Commerce Reviews.csv"))
> str(Womens)
Classes ‘data.table’ and 'data.frame': 23486 obs. of 11 variables:
$ V1 : chr "0" "1" "2" "3" ...
$ Clothing ID : chr "767" "1080" "1077" "1049" ...
$ Age : chr "33" "34" "60" "50" ...
$ Title : chr "" "" "Some major design flaws" "My favorite buy!" ...
$ Review Text : chr "Absolutely wonderful - silky and sexy and comfortable" "Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have"| __truncated__ "I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small "| __truncated__ "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!" ...
$ Rating : int 4 5 3 5 5 2 5 4 5 5 ...
$ Recommended IND : int 1 1 0 1 1 0 1 1 1 1 ...
$ Positive Feedback Count: int 0 4 0 0 6 4 1 4 0 0 ...
$ Division Name : chr "Initmates" "General" "General" "General Petite" ...
$ Department Name : chr "Intimate" "Dresses" "Dresses" "Bottoms" ...
$ Class Name : chr "Intimates" "Dresses" "Dresses" "Pants" ...
- attr(*, ".internal.selfref")=<externalptr>
>
>
> ### 7.3 Basic Analysis of the Review Data
>
> #### 7.3.1 Data Preprocessing
>
> # Binning a continuous variable with cut()
>
> Age_G = cut(as.numeric(Womens$Age),
+ breaks = seq(10,100,by = 10),
+ include.lowest = TRUE,
+ right = FALSE,
+ labels = paste0(seq(10,90,by = 10),"th"))
> Age_G
[1] 30th 30th 60th 50th 40th 40th 30th 30th 20th 30th 50th 30th 50th 40th 50th 40th 30th 40th 30th 40th 30th 50th 30th
[24] 30th 50th 30th 30th 30th 50th 20th 30th 40th 20th 30th 30th 60th 20th 30th 30th 50th 40th 40th 20th 60th 40th 40th
[47] 40th 50th 50th 30th 40th 40th 30th 30th 60th 60th 30th 30th 30th 50th 40th 30th 20th 50th 40th 50th 60th 40th 30th
...
[990] 60th 60th 50th 60th 40th 40th 70th 30th 30th 20th 30th
[ reached getOption("max.print") -- omitted 22486 entries ]
Levels: 10th 20th 30th 40th 50th 60th 70th 80th 90th
> Womens$Age_G = Age_G
> summary(Womens$Age_G)
10th 20th 30th 40th 50th 60th 70th 80th 90th
44 2887 7702 6127 3948 2305 364 94 15
>
> # Text data preprocessing
>
> set.seed(1234)
> SL = sample(1:nrow(Womens), nrow(Womens) * 0.2, replace = FALSE)
> Womens_T = Womens[SL,]
>
>
> TEXT = as.character(Womens_T$`Review Text`)
> TEXT[1]
[1] "Wore my new red cardigan for the first time today to work. fit is tts. this is a true red and not neon or orange tinted. it wraps nicely in front and has a nice flair at the hips.. i also like the detail in the knit. i like to push my sleeves up and love the fact that they don't slide down my arms all day. i'll wear this cardigan a lot."
>
> # Case conversion
> tolower(TEXT[1])
[1] "wore my new red cardigan for the first time today to work. fit is tts. this is a true red and not neon or orange tinted. it wraps nicely in front and has a nice flair at the hips.. i also like the detail in the knit. i like to push my sleeves up and love the fact that they don't slide down my arms all day. i'll wear this cardigan a lot."
>
> # Removing non-alphabetic characters
> gsub("[A-Z]"," ",TEXT[1])
[1] " ore my new red cardigan for the first time today to work. fit is tts. this is a true red and not neon or orange tinted. it wraps nicely in front and has a nice flair at the hips.. i also like the detail in the knit. i like to push my sleeves up and love the fact that they don't slide down my arms all day. i'll wear this cardigan a lot."
> gsub("[a-z]"," ",TEXT[1])
[1] "W . . . .. . ' . ' ."
> gsub("[^a-z]"," ",TEXT[1])
[1] " ore my new red cardigan for the first time today to work fit is tts this is a true red and not neon or orange tinted it wraps nicely in front and has a nice flair at the hips i also like the detail in the knit i like to push my sleeves up and love the fact that they don t slide down my arms all day i ll wear this cardigan a lot "
>
> #### 7.3.2 Basic Data Analysis
>
> # Data aggregation and visualization
>
> # Which items sold the most...
> Product_Ranking = Womens %>%
+ group_by(`Department Name`,`Class Name`,`Clothing ID`) %>%
+ summarise(Count = n()) %>%
+ arrange(-Count)
`summarise()` has grouped output by 'Department Name', 'Class Name'. You can override using the `.groups` argument.
>
> Product_Ranking
# A tibble: 1,207 x 4
# Groups: Department Name, Class Name [21]
`Department Name` `Class Name` `Clothing ID` Count
<chr> <chr> <chr> <int>
1 Dresses Dresses 1078 1024
2 Tops Knits 862 806
3 Dresses Dresses 1094 756
4 Dresses Dresses 1081 582
5 Tops Knits 872 545
6 Tops Blouses 829 527
7 Dresses Dresses 1110 480
8 Tops Knits 868 430
9 Tops Fine gauge 895 404
10 Tops Sweaters 936 358
# ... with 1,197 more rows
>
> # Analyzing sales volume broken down by age group
> Product_Ranking2 = Womens %>%
+ dplyr::filter(Age_G %in% c("20th","30th","40th","50th","60th", "70th")) %>%
+ group_by(`Age_G`,`Department Name`,`Class Name`,`Clothing ID`) %>%
+ summarise(Count = n()) %>%
+ arrange(-Count) %>%
+ ungroup() %>%
+ group_by(Age_G) %>%
+ top_n(n = 5,wt = Count) %>%
+ mutate(Rank = row_number()) %>%
+ arrange(Age_G)
`summarise()` has grouped output by 'Age_G', 'Department Name', 'Class Name'. You can override using the `.groups` argument.
>
> Product_Ranking2
# A tibble: 30 x 6
# Groups: Age_G [6]
Age_G `Department Name` `Class Name` `Clothing ID` Count Rank
<fct> <chr> <chr> <chr> <int> <int>
1 20th Dresses Dresses 1078 138 1
2 20th Dresses Dresses 1094 122 2
3 20th Tops Knits 862 97 3
4 20th Dresses Dresses 1081 82 4
5 20th Dresses Dresses 1110 65 5
6 30th Dresses Dresses 1078 339 1
7 30th Dresses Dresses 1094 267 2
8 30th Tops Knits 862 239 3
9 30th Dresses Dresses 1081 195 4
10 30th Tops Knits 872 185 5
# ... with 20 more rows
>
> ggplot(Product_Ranking2) +
+ geom_bar(aes(x = Rank, y = Count, fill = `Department Name`),
+ stat = 'identity') +
+ geom_label(aes(x = Rank, y = 100, label = paste0(`Class Name`,"-",`Clothing ID`))) +
+ scale_x_reverse(breaks = 1:7) +
+ coord_flip() +
+ theme_bw() +
+ theme(legend.position = "bottom") +
+ facet_wrap(~Age_G)
>
> ggplot(Womens) +
+ geom_bar(aes(x = Rating)) +
+ theme_bw()
>
> Rating_Ranking = Womens %>%
+ group_by(`Clothing ID`,
+ `Department Name`,`Class Name`) %>%
+ summarise(Count = n(),
+ Mean = mean(Rating)) %>%
+ ungroup() %>%
+ top_n(n = 100, wt = Count) %>%
+ top_n(n = 10, wt = Mean)
`summarise()` has grouped output by 'Clothing ID', 'Department Name'. You can override using the `.groups` argument.
>
> Rating_Ranking
# A tibble: 10 x 5
`Clothing ID` `Department Name` `Class Name` Count Mean
<chr> <chr> <chr> <int> <dbl>
1 1008 Bottoms Skirts 186 4.46
2 1022 Bottoms Jeans 205 4.44
3 1025 Bottoms Jeans 125 4.46
4 1066 Bottoms Pants 89 4.44
5 834 Tops Blouses 150 4.54
6 869 Tops Knits 50 4.5
7 886 Tops Knits 106 4.44
8 939 Tops Sweaters 83 4.48
9 964 Jackets Jackets 77 4.62
10 984 Jackets Jackets 175 4.46
>
> Rating_Ranking_Total = Womens %>%
+ group_by(`Clothing ID`,
+ `Department Name`,`Class Name`) %>%
+ summarise(Count = n(),
+ Mean = mean(Rating))
`summarise()` has grouped output by 'Clothing ID', 'Department Name'. You can override using the `.groups` argument.
>
> ggplot(Rating_Ranking_Total) +
+ geom_label(aes(x = Count, y = Mean, col = `Department Name`,
+ label = `Clothing ID`)) +
+ theme_bw() +
+ theme(legend.position = "bottom")
> # Products 1078 and 862 sell in large volumes, but their average ratings are not especially high (see the quick check below).
> # Most products do not sell in particularly large volumes.
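> # (Supplementary check, not from the book) The claim about items 1078 and 862 can be
> # verified directly from the aggregated table:
> Rating_Ranking_Total %>%
+   dplyr::filter(`Clothing ID` %in% c("1078", "862"))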
>
> # Review text mining
>
> library(tm)
>
> TEXT_tolower = tolower(TEXT)
> CORPUS = Corpus(VectorSource(TEXT_tolower))
> CORPUS_TM = tm_map(CORPUS,removePunctuation)
Warning message(s):
In tm_map.SimpleCorpus(CORPUS, removePunctuation) :
transformation drops documents
> CORPUS_TM = tm_map(CORPUS_TM, removeNumbers)
Warning message(s):
In tm_map.SimpleCorpus(CORPUS_TM, removeNumbers) :
transformation drops documents
> CORPUS_TM = tm_map(CORPUS_TM,removeWords, c(stopwords("english")))
Warning message(s):
In tm_map.SimpleCorpus(CORPUS_TM, removeWords, c(stopwords("english"))) :
transformation drops documents
>
> TDM = TermDocumentMatrix(CORPUS_TM)
> TDM_Matrix = as.matrix(TDM)
>
> Freq = rowSums(TDM_Matrix)
> TERM_FREQ = data.frame(
+ Words = names(Freq),
+ Freq = Freq
+ )
>
> TERM_FREQ = TERM_FREQ %>%
+ arrange(-Freq)
> TERM_FREQ
Words Freq
dress dress 2135
love love 1797
size size 1770
like like 1408
top top 1403
...
whole whole 46
[ reached 'max' / getOption("max.print") -- omitted 7181 rows ]
>
> # Keyword tokenization
>
> install.packages("tokenizers")
> library(tokenizers)
> tokenize_word_stems(TEXT[1])
[[1]]
[1] "wore" "my" "new" "red" "cardigan" "for" "the" "first" "time" "today"
[11] "to" "work" "fit" "is" "tts" "this" "is" "a" "true" "red"
[21] "and" "not" "neon" "or" "orang" "tint" "it" "wrap" "nice" "in"
[31] "front" "and" "has" "a" "nice" "flair" "at" "the" "hip" "i"
[41] "also" "like" "the" "detail" "in" "the" "knit" "i" "like" "to"
[51] "push" "my" "sleev" "up" "and" "love" "the" "fact" "that" "they"
[61] "don't" "slide" "down" "my" "arm" "all" "day" "i'll" "wear" "this"
[71] "cardigan" "a" "lot"
>
> Sentence = ""
> for(tk in unlist(tokenize_word_stems(TEXT[1]))) {
+ Sentence = paste(Sentence, tk)
+ }
> Sentence
[1] " wore my new red cardigan for the first time today to work fit is tts this is a true red and not neon or orang tint it wrap nice in front and has a nice flair at the hip i also like the detail in the knit i like to push my sleev up and love the fact that they don't slide down my arm all day i'll wear this cardigan a lot"
>
> TEXT_Token = c()
> for(i in 1:length(TEXT)) {
+ Words_token = unlist(tokenize_word_stems(TEXT[i]))
+
+ Sentence = ""
+ for(tk in Words_token){
+ Sentence = paste(Sentence, tk)
+ }
+
+ TEXT_Token[i] = Sentence
+ }
>
> CORPUS_Token = Corpus(VectorSource(TEXT_Token))
> CORPUS_TM_Token = tm_map(CORPUS_Token,removePunctuation)
> CORPUS_TM_Token = tm_map(CORPUS_TM_Token, removeNumbers)
> CORPUS_TM_Token = tm_map(CORPUS_TM_Token,removeWords, c(stopwords("english")))
>
> TDM_Token = TermDocumentMatrix(CORPUS_TM_Token)
> TDM_Matrix_Token = as.matrix(TDM_Token)
> Freq_Token = rowSums(TDM_Matrix_Token)
>
> TERM_FREQ_Token = data.frame(
+ Words = names(Freq_Token),
+ Freq = Freq_Token
+ )
>
> TERM_FREQ_Token = TERM_FREQ_Token %>%
+ arrange(-Freq)
> head(TERM_FREQ_Token)
Words Freq
dress dress 2454
love love 2328
fit fit 2241
size size 2155
look look 1823
veri veri 1671
>
> nrow(TERM_FREQ)
[1] 7681
> nrow(TERM_FREQ_Token)
[1] 4612
>
> # Stemming and tokenization can make individual keywords harder to interpret at a glance, but reducing the vocabulary this much matters greatly for the efficiency of the analyses that follow.
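> # (Supplementary) A quick check of how much the vocabulary shrank:
> nrow(TERM_FREQ_Token) / nrow(TERM_FREQ)   # 4612 / 7681, roughly 0.60 -- about 40% fewer keywords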
>
> # Word cloud
>
> library(wordcloud)
>
> wordcloud(words = TERM_FREQ_Token$Words,
+ freq = TERM_FREQ_Token$Freq,
+ max.words = 300,
+ random.order = FALSE,
+ random.color = TRUE,
+ colors = brewer.pal(8, "Dark2"))
> # The random.order = FALSE option places higher-frequency keywords closer to the center.
>
>
> ### 7.4 Modeling for Sentiment Dictionary Construction
>
> Positive_Feedback_Analysis = Womens %>%
+ mutate(Positive_Binary = ifelse(`Positive Feedback Count` > 0, 1, 0)) %>%
+ select(`Positive Feedback Count`, Positive_Binary)
>
> Positive_DATA = Positive_Feedback_Analysis[SL,]
> summary(as.factor(Positive_DATA$Positive_Binary))
0 1
2221 2476
>
> #### 7.4.1 Creating the Dataset for Keyword Scoring
>
> # Creating the DTM
> DTM_Token = DocumentTermMatrix(CORPUS_TM_Token)
> DTM_Matrix_Token = as.matrix(DTM_Token)
>
> # Extracting the top keywords
>
> Many_Words = colSums(DTM_Matrix_Token) > quantile(colSums(DTM_Matrix_Token),
+ probs = 0.99)
>
> DTM_Matrix_Token_Selected = DTM_Matrix_Token[,Many_Words]
> ncol(DTM_Matrix_Token_Selected)
[1] 47
> DTM_DF_Token_Selected = as.data.frame(DTM_Matrix_Token_Selected)
> Positive_DATA = cbind(Positive_DATA, DTM_DF_Token_Selected)
>
> # Splitting into training/test sets
>
> set.seed(123)
> SL2 = sample(1:nrow(Positive_DATA), nrow(Positive_DATA) * 0.7, replace = FALSE)
>
> Positive_DATA_Train = Positive_DATA[SL2,]
> Positive_DATA_Test = Positive_DATA[-SL2,]
>
> #### 7.4.2 High-Dimensional Classification Models
>
> # Variable selection
>
> Start_Time = Sys.time()
> GLM = step(glm(Positive_Binary ~ ., data = Positive_DATA_Train[,-1],
+ family = binomial(link = "logit")), direction = "backward")
...
Step: AIC=4393.42
Positive_Binary ~ also + fit + like + wear + run + size + veri +
fabric + top + usual + look + tri + beauti + dress + materi +
realli + length + one + back + much + skirt
Df Deviance AIC
<none> 4349.4 4393.4
- one 1 4351.5 4393.5
- run 1 4351.6 4393.6
- also 1 4352.1 4394.1
- size 1 4352.3 4394.3
- dress 1 4352.4 4394.4
- materi 1 4352.5 4394.5
- beauti 1 4353.3 4395.3
- look 1 4353.6 4395.6
- fabric 1 4353.8 4395.8
- much 1 4354.1 4396.1
- wear 1 4354.2 4396.2
- top 1 4354.3 4396.3
- fit 1 4354.4 4396.4
- usual 1 4354.9 4396.9
- back 1 4355.1 4397.1
- length 1 4355.6 4397.6
- like 1 4356.3 4398.3
- skirt 1 4356.7 4398.7
- realli 1 4357.1 4399.1
- veri 1 4359.2 4401.2
- tri 1 4363.3 4405.3
> End_Time = Sys.time()
> difftime(End_Time,Start_Time, unit = "secs")
Time difference of 34.49855 secs
>
> summary(GLM)
Call:
glm(formula = Positive_Binary ~ also + fit + like + wear + run +
size + veri + fabric + top + usual + look + tri + beauti +
dress + materi + realli + length + one + back + much + skirt,
family = binomial(link = "logit"), data = Positive_DATA_Train[,
-1])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1233 -1.1339 0.7476 1.1148 1.4434
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.60654 0.06884 -8.811 < 2e-16 ***
also 0.17633 0.10848 1.626 0.104051
fit 0.12085 0.05420 2.230 0.025768 *
like 0.15880 0.06123 2.594 0.009494 **
wear 0.13607 0.06256 2.175 0.029635 *
run 0.16075 0.10873 1.478 0.139296
size 0.08636 0.05090 1.697 0.089753 .
veri 0.18394 0.05919 3.107 0.001887 **
fabric 0.16818 0.08080 2.081 0.037392 *
top 0.11434 0.05244 2.180 0.029229 *
usual 0.25957 0.11182 2.321 0.020276 *
look 0.11598 0.05683 2.041 0.041251 *
tri 0.34715 0.09550 3.635 0.000278 ***
beauti 0.18437 0.09356 1.970 0.048783 *
dress 0.06560 0.03843 1.707 0.087829 .
materi 0.18162 0.10488 1.732 0.083345 .
realli 0.23836 0.08722 2.733 0.006278 **
length 0.26693 0.10851 2.460 0.013893 *
one 0.12825 0.08901 1.441 0.149629
back 0.22119 0.09408 2.351 0.018720 *
much 0.22018 0.10272 2.143 0.032079 *
skirt 0.24632 0.09372 2.628 0.008584 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4540.8 on 3286 degrees of freedom
Residual deviance: 4349.4 on 3265 degrees of freedom
AIC: 4393.4
Number of Fisher Scoring iterations: 4
>
> # Draw an ROC curve to evaluate the fitted logistic regression model
> library(pROC)
> Predict_GLM = predict(GLM, newdata = Positive_DATA_Test,
+ type = 'response')
> ROC_GLM = roc(Positive_DATA_Test$Positive_Binary, Predict_GLM)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
> plot.roc(ROC_GLM,
+ print.auc = TRUE)
> # Limitations of the logistic regression model fitted above:
> # 1) It does not use all of the predictors (keywords).
> # 2) Keywords were chosen simply by raw frequency, and raw frequency can be a misleading measure of importance depending on the situation.
> # ==> In text mining, TF-IDF (Term Frequency-Inverse Document Frequency) is often used alongside raw frequency to represent keyword importance; a sketch follows below.
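> # (Supplementary sketch, not run in the book) One way to build a TF-IDF weighted
> # document-term matrix with the tm package; weightTfIdf() down-weights terms that
> # appear in many documents, unlike the raw-frequency DTM used above.
> DTM_TfIdf = DocumentTermMatrix(CORPUS_TM_Token,
+                                control = list(weighting = weightTfIdf))
> TfIdf_Total = sort(colSums(as.matrix(DTM_TfIdf)), decreasing = TRUE)
> head(TfIdf_Total, 10)   # keywords with the largest total TF-IDF weight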
>
> # Ridge and Lasso regression
>
> install.packages("glmnet")
> library(glmnet)
> x = as.matrix(Positive_DATA_Train[,c(-1,-2)])
> y = as.matrix(Positive_DATA_Train[,2])
> Start_Time = Sys.time()
> Ridge = glmnet(x,y, alpha = 0, family = "binomial")
> Lasso = glmnet(x,y, alpha = 1, family = "binomial")
> End_Time = Sys.time()
> difftime(End_Time,Start_Time, unit = "secs")
Time difference of 0.325763 secs
>
> Ridge_Beta = as.data.frame(t(as.matrix(Ridge$beta)))
> Ridge_Beta$Lambda = Ridge$lambda
> Ridge_Beta_M = Ridge_Beta %>%
+ reshape::melt(id.vars = c("Lambda"))
>
> ggplot(Ridge_Beta_M) +
+ geom_line(aes(x = log(Lambda), y = value, group = variable, col = variable),
+ alpha = 0.6) +
+ guides(col = FALSE) +
+ ylab("Coefficients") + xlab("Log Lambda") +
+ scale_color_grey() +
+ theme_bw() +
+ theme(text = element_text(size = 15, face = "bold")) +
+ ggtitle("Ridge Regreesion")
Warning message(s):
`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
>
> Lasso_Beta = as.data.frame(t(as.matrix(Lasso$beta)))
> Lasso_Beta$Lambda = Lasso$lambda
> Lasso_Beta_M = Lasso_Beta %>%
+ reshape::melt(id.vars = c("Lambda"))
>
> ggplot(Lasso_Beta_M) +
+ geom_line(aes(x = log(Lambda), y = value, group = variable, col = variable),
+ alpha = 0.6) +
+ guides(col = FALSE) +
+ ylab("Coefficients") + xlab("Log Lambda") +
+ scale_color_grey() +
+ theme_bw() +
+ theme(text = element_text(size = 15, face = "bold")) +
+ ggtitle("Lasso Regreesion")
Warning message(s):
`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
>
> # The x-axis is the tuning parameter (lambda) and the y-axis shows the regression coefficients of the predictors.
> # As the tuning parameter increases, the coefficients shrink toward 0.
> # Ridge regression cannot set coefficients exactly to 0, so they only converge toward it.
> # Lasso regression, in contrast, shrinks predictor coefficients all the way to 0 (see the quick check below).
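> # (Supplementary check, not from the book) Counting exact zeros at the largest penalty
> # illustrates the difference: Lasso zeroes out every coefficient there, while Ridge does not.
> sum(as.matrix(Lasso$beta)[, 1] == 0)   # first column = largest lambda; all coefficients are 0
> sum(as.matrix(Ridge$beta)[, 1] == 0)   # typically 0 exact zeros for Ridge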
>
> Lambda = sort(unique(Lasso_Beta_M$Lambda))
> Lambda_S = Lambda[c(1,15,35)]
>
> # Lasso coefficients as a function of the tuning parameter
> Lasso_Beta_M %>%
+ dplyr::filter(Lambda %in% Lambda_S) %>%
+ group_by(Lambda) %>%
+ top_n(n = 20, wt = value) %>%
+ ggplot() +
+ geom_bar(aes(x = reorder(variable,value), y = value), stat = 'identity') +
+ facet_wrap(~Lambda) +
+ xlab("Words") + ylab("Coefficient") +
+ coord_flip() +
+ theme_bw() +
+ theme(axis.text.x = element_text(size = 10),
+ axis.text.y = element_text(size = 7.5))
> # As the tuning parameter grows, the keyword coefficients are driven to 0.
> # In regularized regression, finding the tuning parameter that yields the best performance is therefore essential.
>
> # Estimating the optimal model
>
> DTM_DF_Token = as.data.frame(DTM_Matrix_Token)
> ncol(DTM_DF_Token)
[1] 4612
>
> Positive_Feedback_Analysis = Womens %>%
+ mutate(Positive_Binary = ifelse(`Positive Feedback Count` > 0, 1, 0)) %>%
+ select(`Positive Feedback Count`, Positive_Binary)
>
> Positive_DATA = Positive_Feedback_Analysis[SL,]
> Positive_DATA = cbind(Positive_DATA, DTM_DF_Token)
>
> set.seed(123)
> SL2 = sample(1:nrow(Positive_DATA), nrow(Positive_DATA) * 0.7, replace = FALSE)
>
> Positive_DATA_Train = Positive_DATA[SL2,]
> Positive_DATA_Test = Positive_DATA[-SL2,]
>
> set.seed(123)
> CV_ID = sample(rep(seq(4), length=nrow(Positive_DATA_Train)))
>
> # Estimating the optimal model - K-fold cross-validation (K = 4)
>
> x = as.matrix(Positive_DATA_Train[,c(-1,-2)])
> y = as.matrix(Positive_DATA_Train[,2])
>
> Start_Time = Sys.time()
> CV_Lasso = cv.glmnet(x,y, alpha = 1, foldid = CV_ID, family = "binomial")
> End_Time = Sys.time()
> difftime(End_Time,Start_Time, unit = "secs")
Time difference of 51.91798 secs
>
> Lasso_Min = CV_Lasso$lambda.min
> Lasso_1se = CV_Lasso$lambda.1se
> # lambda.min : the tuning parameter with the smallest cross-validation error
> # lambda.1se : the largest lambda whose CV error is within one standard error of that minimum (the one-standard-error rule)
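> # (Supplementary check, not from the book) The two choices can be compared by counting
> # how many coefficients remain non-zero (the counts include the intercept):
> sum(coef(CV_Lasso, s = "lambda.min") != 0)
> sum(coef(CV_Lasso, s = "lambda.1se") != 0)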
>
> Lasso_Optimal = glmnet(x,y, lambda = Lasso_Min, family = "binomial")
>
> CV_Coef = as.vector(Lasso_Optimal$beta)
> CV_Coef_Index = which(CV_Coef != 0)
> CV_Coef2 = CV_Coef[CV_Coef_Index]
>
> Lasso_Coef = data.frame(
+ Predictors = CV_Coef_Index,
+ Words = colnames(x)[CV_Coef_Index],
+ Coefficients = CV_Coef2
+ )
>
> knitr::kable(Lasso_Coef %>%
+ arrange(-Coefficients) %>%
+ top_n(n = 10))
Selecting by Coefficients
| Predictors|Words | Coefficients|
|----------:|:-------|------------:|
| 833|whim | 0.5310100|
| 224|pictur | 0.4872785|
| 708|wed | 0.3732927|
| 813|bad | 0.3246705|
| 942|suggest | 0.3142197|
| 206|appear | 0.3139376|
| 2984|manag | 0.2977191|
| 1553|modest | 0.2915151|
| 409|bulki | 0.2834490|
| 848|toward | 0.2819394|
>
> knitr::kable(Lasso_Coef %>%
+ arrange(Coefficients) %>%
+ slice(1:10))
| Predictors|Words | Coefficients|
|----------:|:-------|------------:|
| 1029|balanc | -0.5928775|
| 1380|sunday | -0.4524296|
| 1717|reserv | -0.4042919|
| 251|sale | -0.3661852|
| 1544|fill | -0.2943380|
| 588|soften | -0.1482404|
| 1611|everyon | -0.1326617|
| 1455|viscos | -0.1268542|
| 957|scream | -0.1059502|
| 36|got | -0.0466150|
>
> # Keywords with positive coefficients contributed to a review receiving positive feedback, while keywords with negative coefficients are associated with the opposite (a sketch for exporting them as a sentiment dictionary follows below).
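> # (Supplementary sketch, not from the book) The non-zero coefficients can be saved as a
> # reusable sentiment dictionary; Sentiment_Dict and the file name are only examples.
> Sentiment_Dict = Lasso_Coef %>%
+   mutate(Polarity = ifelse(Coefficients > 0, "positive", "negative")) %>%
+   arrange(-Coefficients)
> fwrite(Sentiment_Dict, paste0(DIR, "Sentiment_Dictionary.csv"))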
>
> # Comparing classification model performance
>
> x_test = as.matrix(Positive_DATA_Test[,c(-1,-2)])
> y_test = as.matrix(Positive_DATA_Test[,2])
>
> x_selected = x[,CV_Coef_Index]
> x_test_selected = x_test[,CV_Coef_Index]
>
> GLM = glm(y ~ ., data = as.data.frame(x_selected),
+ family = binomial())
Warning message(s):
glm.fit: fitted probabilities numerically 0 or 1 occurred
> GLM_Probs = predict(GLM, newdata = as.data.frame(x_test_selected),
+ type = "response")
>
> library(randomForest)
> RF = randomForest(as.factor(y) ~ .,data = x_selected,
+ mtry = 30, ntree = 1000, importance = T)
> RF_Probs = predict(RF, x_test_selected, type = "prob")
>
> Lasso_Probs = predict(Lasso_Optimal,newx = x_test, type = 'response')
>
> library(pROC)
>
> GLM_ROC = roc(as.vector(y_test), GLM_Probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
> RF_ROC = roc(as.vector(y_test), RF_Probs[,1])
Setting levels: control = 0, case = 1
Setting direction: controls > cases
> Lasso_ROC = roc(as.vector(y_test), Lasso_Probs[,1])
Setting levels: control = 0, case = 1
Setting direction: controls < cases
>
> par(mfrow = c(1,3))
> plot.roc(GLM_ROC, print.auc = TRUE, print.thres = TRUE,main = "Logistic Regression")
> plot.roc(RF_ROC, print.auc = TRUE, print.thres = TRUE, main = "Random Forest")
> plot.roc(Lasso_ROC, print.auc = TRUE, print.thres = TRUE, main = "Lasso Regression")
> # Lasso regression shows the highest AUC.
> # A high AUC alone does not guarantee that the model performs well, so it is worth considering additional preprocessing or alternative approaches that could improve the model further.
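> # (Supplementary, not from the book) The three AUC values can also be compared numerically:
> c(GLM = auc(GLM_ROC), RF = auc(RF_ROC), Lasso = auc(Lasso_ROC))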
>
>
> ### 7.5 Customer Review Sentiment Analysis
>
> #### 7.5.1 Running the Sentiment Analysis
>
> # Sentiment analysis using a package
>
> install.packages("syuzhet")
> library(syuzhet)
>
> TEXT = Womens_T$`Review Text`
> sentiment_vector = get_sentiment(TEXT, method="syuzhet")
> summary(sentiment_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.950 1.250 2.300 2.419 3.500 8.500
>
> Womens_T$Sentiment = sentiment_vector
>
> Womens_T %>%
+ dplyr::filter(`Department Name` != "") %>%
+ ggplot() +
+ geom_density(aes(x = Sentiment, fill = `Department Name`),
+ alpha = 0.4) +
+ geom_vline(xintercept = 0, linetype = 'dashed') +
+ theme_bw() +
+ guides(fill = guide_legend(nrow = 1)) +
+ theme(legend.position = "bottom") +
+ facet_wrap(~ `Department Name`)
>
> # Sentiment analysis using the sentiment dictionary we built
>
> x_words = rbind(as.data.frame(x_selected),
+ as.data.frame(x_test_selected))
>
> Lasso_Sentiment = x_words * Lasso_Coef$Coefficients
>
> Womens_T$Sentiment2 = rowSums(Lasso_Sentiment)
>
> Womens_T %>%
+ dplyr::filter(`Department Name` != "") %>%
+ ggplot() +
+ geom_density(aes(x = Sentiment2, fill = `Department Name`),
+ alpha = 0.4) +
+ geom_vline(xintercept = 0, linetype = 'dashed') +
+ theme_bw() +
+ guides(fill = guide_legend(nrow = 1)) +
+ theme(legend.position = "bottom") +
+ facet_wrap(~ `Department Name`)
Source: 실무 프로젝트로 배우는 데이터 분석 with R