
[실무 프로젝트로 배우는...] Market Data Analysis

버섯도리 2022. 2. 8. 06:38

> ## 8. Market Data Analysis

> ### 8.2 Data Preprocessing

> #### 8.2.1 Loading Data and Packages

> library(dplyr)
> library(data.table)
> library(ggplot2)
> library(reshape)

> # Download URL: https://www.kaggle.com/psparks/instacart-market-basket-analysis
> DIR = "F:/1_Study/1_BigData/12_R/02_Practical-R/MarketData/"
> LISTFILES = list.files(DIR)

> for(k in 1:length(LISTFILES)) {
+   assign(gsub(".csv","",LISTFILES[k]),
+          fread(paste0(DIR,LISTFILES[k])))
+ }

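As a side note, the assign() loop above creates one free-standing object per CSV file. A minimal alternative sketch (same DIR and LISTFILES as above; the list name MarketData is only illustrative) keeps everything in a single named list:

# Sketch: read every CSV into one named list instead of separate objects
MarketData = lapply(LISTFILES, function(f) fread(paste0(DIR, f)))
names(MarketData) = gsub(".csv", "", LISTFILES)
# access e.g. MarketData$orders instead of a free-standing orders object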

> ### 8.3 Top-Selling Product Analysis

> #### 8.3.1 Sold Product Analysis

> # Purchased / reordered product analysis

> Order_Id = order_products__prior %>%
+   group_by(order_id) %>%
+   summarise(Count = n(),
+             Reordered = sum(reordered))

> nrow(Order_Id)
[1] 3214874
> summary(Order_Id$Count)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    5.00    8.00   10.09   14.00  145.00 
> summary(Order_Id$Reordered)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.000   4.000   5.949   8.000 130.000 

> Order_Id$Reorder_Perc = Order_Id$Reordered / Order_Id$Count * 100
> summary(Order_Id$Reorder_Perc)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   33.33   66.67   59.87   90.91  100.00 

> ggplot(Order_Id) +
+   geom_histogram(aes(x = Reorder_Perc), binwidth = 5) +
+   xlab("Reorder Proportion") + ylab("Count") +
+   theme_bw()


> Order_products_Department = order_products__prior %>%
+   inner_join(products, by = "product_id") %>%
+   inner_join(departments, by = "department_id")

> # Sales analysis by product category

> Order_ID_Department = Order_products_Department %>%
+   group_by(department) %>%
+   summarise(Count = n(),
+             Reordered = sum(reordered)) %>%
+   mutate(NonReorder = Count - Reordered,
+          Reorder_Perc = Reordered/Count * 100)

> knitr::kable(Order_ID_Department %>%
+                arrange(-Count))


|department      |   Count| Reordered| NonReorder| Reorder_Perc|
|:---------------|-------:|---------:|----------:|------------:|
|produce         | 9479291|   6160710|    3318581|     64.99125|
|dairy eggs      | 5414016|   3627221|    1786795|     66.99687|
|snacks          | 2887550|   1657973|    1229577|     57.41798|
|beverages       | 2690129|   1757892|     932237|     65.34601|
|frozen          | 2236432|   1211890|    1024542|     54.18855|
|pantry          | 1875577|    650301|    1225276|     34.67205|
|bakery          | 1176787|    739188|     437599|     62.81409|
|canned goods    | 1068058|    488535|     579523|     45.74049|
|deli            | 1051249|    638864|     412385|     60.77190|
|dry goods pasta |  866627|    399581|     467046|     46.10761|
|household       |  738666|    297075|     441591|     40.21777|
|breakfast       |  709569|    398013|     311556|     56.09222|
|meat seafood    |  708931|    402442|     306489|     56.76744|
|personal care   |  447123|    143584|     303539|     32.11286|
|babies          |  423802|    245369|     178433|     57.89708|
|international   |  269253|     99416|     169837|     36.92289|
|alcohol         |  153696|     87595|      66101|     56.99237|
|pets            |   97724|     58760|      38964|     60.12853|
|missing         |   69145|     27371|      41774|     39.58493|
|other           |   36291|     14806|      21485|     40.79799|
|bulk            |   34573|     19950|      14623|     57.70399|

> # Sales volume by category
> Order_ID_Department %>%
+   select(department, NonReorder, Reordered) %>%
+   as.data.frame() %>%
+   melt(id.vars = c("department")) %>%
+   ggplot() +
+   geom_bar(aes(x = reorder(department,value), y = value, fill = variable), stat = 'identity') +
+   xlab("Department") + ylab("Sales Count") +
+   labs(fill = "") +
+   coord_flip() +
+   theme_bw() +
+   theme(legend.position = "bottom")
Warning message(s): 
In melt(., id.vars = c("department")) :
  The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the next version, this warning will become an error.
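The warning above is raised because melt() is resolved through data.table and redirected to the deprecated reshape2 method. One way to avoid it, sketched below under the assumption that tidyr is installed (it is not loaded in the session above), is to reshape with pivot_longer() instead of melt():

# Sketch: reshape to long format with tidyr instead of melt()
library(tidyr)
Order_ID_Department %>%
  select(department, NonReorder, Reordered) %>%
  pivot_longer(cols = c(NonReorder, Reordered),
               names_to = "variable", values_to = "value") %>%
  ggplot() +
  geom_bar(aes(x = reorder(department, value), y = value, fill = variable),
           stat = 'identity') +
  xlab("Department") + ylab("Sales Count") +
  labs(fill = "") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "bottom")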

> # Reorder rate by category
> Order_ID_Department %>%
+   ggplot() +
+   geom_bar(aes(x = reorder(department, Reorder_Perc), y = Reorder_Perc), stat = 'identity') +
+   geom_label(aes(x = reorder(department, Reorder_Perc), y = Reorder_Perc,
+                  label = round(Reorder_Perc,2))) +
+   xlab("Department") + ylab("Reorder Percent (%)") +
+   labs(fill = "") +
+   coord_flip() +
+   theme_bw()


> # Sales analysis by individual product

> Order_ID_Product = Order_products_Department %>%
+   group_by(department, product_name) %>%
+   summarise(Count = n(),
+             Reordered = sum(reordered)) %>%
+   mutate(NonReorder = Count - Reordered,
+          Reorder_Perc = Reordered/Count * 100)
`summarise()` has grouped output by 'department'. You can override using the `.groups` argument.

> Order_ID_Product %>%
+   ungroup() %>%
+   top_n(n = 10, wt = Count)
# A tibble: 10 x 6
   department product_name            Count Reordered NonReorder Reorder_Perc
   <chr>      <chr>                   <int>     <int>      <int>        <dbl>
 1 dairy eggs Organic Whole Milk     137905    114510      23395         83.0
 2 produce    Bag of Organic Bananas 379450    315913      63537         83.3
 3 produce    Banana                 472565    398609      73956         84.4
 4 produce    Large Lemon            152657    106255      46402         69.6
 5 produce    Limes                  140627     95768      44859         68.1
 6 produce    Organic Avocado        176815    134044      42771         75.8
 7 produce    Organic Baby Spinach   241921    186884      55037         77.3
 8 produce    Organic Hass Avocado   213584    170131      43453         79.7
 9 produce    Organic Strawberries   264683    205845      58838         77.8
10 produce    Strawberries           142951     99802      43149         69.8

> # Define a function that plots product-level sales for a given category
> Product_Graph = function(Order_ID_Product, department_name) {
+   
+   Graph = Order_ID_Product %>%
+     dplyr::filter(department == department_name) %>%
+     ungroup() %>%
+     top_n(n = 20, wt = Count) %>%
+     select(department, product_name, NonReorder, Reordered) %>%
+     as.data.frame() %>%
+     melt(id.vars = c("department","product_name")) %>%
+     ggplot() +
+     geom_bar(aes(x = reorder(product_name,value), y = value, fill = variable), stat = 'identity') +
+     xlab("product name") + ylab("Sales Count") +
+     labs(fill = "") +
+     coord_flip() +
+     theme_bw() +
+     theme(legend.position = "bottom") +
+     ggtitle(department_name)
+   
+   return(Graph)
+   
+ }

> Product_Graph(Order_ID_Product = Order_ID_Product,
+               department_name = "snacks")
Warning message(s): 
In melt(., id.vars = c("department", "product_name")) :
  The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the next version, this warning will become an error.

> ### 8.4 Customer Purchase Pattern Analysis

> #### 8.4.1 Customer Revisit Timing Analysis

> orders_prior = orders %>%
+   dplyr::filter(eval_set == "prior")

> User_Orders = orders_prior %>%
+   group_by(user_id) %>%
+   summarise(Count = n(),
+             Revisit_Days_Mean = mean(days_since_prior_order, na.rm = TRUE),
+             Revisit_Days_Median = median(days_since_prior_order, na.rm = TRUE),
+             Revisit_Days_Mean = round(Revisit_Days_Mean,2),
+             Revisit_Days_Median = round(Revisit_Days_Median,2))

> knitr::kable(User_Orders[1:10,])


| user_id| Count| Revisit_Days_Mean| Revisit_Days_Median|
|-------:|-----:|-----------------:|-------------------:|
|       1|    10|             19.56|                20.0|
|       2|    14|             15.23|                13.0|
|       3|    12|             12.09|                11.0|
|       4|     5|             13.75|                17.0|
|       5|     4|             13.33|                11.0|
|       6|     3|              9.00|                 9.0|
|       7|    20|             10.68|                 7.0|
|       8|     3|             30.00|                30.0|
|       9|     3|             18.00|                18.0|
|      10|     5|             19.75|                18.5|
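Minor style note: the summarise() call above first computes each revisit statistic and then overwrites it with a rounded copy. An equivalent, slightly tidier sketch wraps round() directly:

# Equivalent sketch: round inside the summarise call
User_Orders = orders_prior %>%
  group_by(user_id) %>%
  summarise(Count = n(),
            Revisit_Days_Mean = round(mean(days_since_prior_order, na.rm = TRUE), 2),
            Revisit_Days_Median = round(median(days_since_prior_order, na.rm = TRUE), 2))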

> User_Orders_Info = orders_prior %>%
+   inner_join(Order_Id, by = "order_id") %>%
+   group_by(user_id) %>%
+   summarise(Count = n(),
+             Revisit_Days_Mean = mean(days_since_prior_order, na.rm = TRUE),
+             Revisit_Days_Median = median(days_since_prior_order, na.rm = TRUE),
+             Revisit_Days_Mean = round(Revisit_Days_Mean,2),
+             Revisit_Days_Median = round(Revisit_Days_Median,2),
+             Reorder_Mean = mean(Reordered))

> knitr::kable(User_Orders_Info[1:10,])


| user_id| Count| Revisit_Days_Mean| Revisit_Days_Median| Reorder_Mean|
|-------:|-----:|-----------------:|-------------------:|------------:|
|       1|    10|             19.56|                20.0|    4.1000000|
|       2|    14|             15.23|                13.0|    6.6428571|
|       3|    12|             12.09|                11.0|    4.5833333|
|       4|     5|             13.75|                17.0|    0.2000000|
|       5|     4|             13.33|                11.0|    3.5000000|
|       6|     3|              9.00|                 9.0|    0.6666667|
|       7|    20|             10.68|                 7.0|    6.9000000|
|       8|     3|             30.00|                30.0|    4.3333333|
|       9|     3|             18.00|                18.0|    6.0000000|
|      10|     5|             19.75|                18.5|    9.8000000|

> #### 8.4.2 Analysis of Items Purchased by Customers

> # Association rule analysis

> # Random sampling of the data
> set.seed(1234)
> Order_id = unique(Order_products_Department$order_id)
> SL = sample(1:length(Order_id), length(Order_id) * 0.1)
> Sample_Order_id = Order_id[SL]
> Sample_Orders = Order_products_Department %>%
+   dplyr::filter(order_id %in% Sample_Order_id)

> # Create the transaction matrix

> library(arules)
> install.packages("arulesViz")
> library(arulesViz)

> DepartmentList = split(Sample_Orders$department, Sample_Orders$order_id)
> DepartmentList_trans = as(DepartmentList, "transactions")
Warning message(s): 
In asMethod(object) : removing duplicated items in transactions
> summary(DepartmentList_trans)
transactions as itemMatrix in sparse format with
 321487 rows (elements/itemsets/transactions) and
 21 columns (items) and a density of 0.2257279 

most frequent items:
   produce dairy eggs  beverages     snacks     frozen    (Other) 
    241354     218462     145446     138744     118269     661665 

element (itemset/transaction) length distribution:
sizes
    1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    18 
28687 39610 46590 49013 44822 36720 28012 19879 12878  7923  4247  1984   789   262    55    15     1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    3.00    4.00    4.74    6.00   18.00 

includes extended item information - examples:
   labels
1 alcohol
2  babies
3  bakery

includes extended transaction information - examples:
  transactionID
1            11
2            15
3            18
> # most frequent items = the most frequently purchased product categories
> # includes extended item information = the labels (identifiers) for the categories
> # transactionID = the order (purchase) number
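The "removing duplicated items" warning is expected here: an order usually contains several products from the same department, while a transaction can hold each item only once. A small sketch that makes the deduplication explicit (the resulting transactions are the same, just without the warning):

# Sketch: drop duplicate departments within each order before coercing
DepartmentList_unique = lapply(DepartmentList, unique)
DepartmentList_trans2 = as(DepartmentList_unique, "transactions")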

> # Run the association analysis

> rules1 = apriori(DepartmentList_trans,
+                  parameter = list(support = 0.2, confidence = 0.1, minlen = 2))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target  ext
        0.1    0.1    1 none FALSE            TRUE       5     0.2      2     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 64297 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[21 item(s), 321487 transaction(s)] done [0.08s].
sorting and recoding items ... [9 item(s)] done [0.01s].
creating transaction tree ... done [0.14s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [38 rule(s)] done [0.00s].
creating S4 object  ... done [0.03s].
> rules1
set of 38 rules 

> rule.list1 = as.data.frame(inspect(rules1))
     lhs                       rhs          support   confidence coverage  lift      count 
[1]  {deli}                 => {produce}    0.2068451 0.8607710  0.2403021 1.1465593  66498
[2]  {produce}              => {deli}       0.2068451 0.2755206  0.7507426 1.1465593  66498
[3]  {bakery}               => {dairy eggs} 0.2260465 0.8243735  0.2742039 1.2131418  72671
[4]  {dairy eggs}           => {bakery}     0.2260465 0.3326482  0.6795360 1.2131418  72671
[5]  {bakery}               => {produce}    0.2296765 0.8376119  0.2742039 1.1157111  73838
[6]  {produce}              => {bakery}     0.2296765 0.3059324  0.7507426 1.1157111  73838
[7]  {pantry}               => {dairy eggs} 0.2697216 0.7751070  0.3479799 1.1406416  86712
[8]  {dairy eggs}           => {pantry}     0.2697216 0.3969203  0.6795360 1.1406416  86712
[9]  {pantry}               => {produce}    0.2868701 0.8243870  0.3479799 1.0980954  92225
[10] {produce}              => {pantry}     0.2868701 0.3821151  0.7507426 1.0980954  92225
[11] {frozen}               => {dairy eggs} 0.2850815 0.7749283  0.3678811 1.1403786  91650
[12] {dairy eggs}           => {frozen}     0.2850815 0.4195238  0.6795360 1.1403786  91650
[13] {frozen}               => {produce}    0.2990043 0.8127743  0.3678811 1.0826270  96126
[14] {produce}              => {frozen}     0.2990043 0.3982780  0.7507426 1.0826270  96126
[15] {beverages}            => {snacks}     0.2285878 0.5052597  0.4524164 1.1707491  73488
[16] {snacks}               => {beverages}  0.2285878 0.5296661  0.4315696 1.1707491  73488
[17] {beverages}            => {dairy eggs} 0.3212416 0.7100573  0.4524164 1.0449149 103275
[18] {dairy eggs}           => {beverages}  0.3212416 0.4727367  0.6795360 1.0449149 103275
[19] {beverages}            => {produce}    0.3335500 0.7372633  0.4524164 0.9820453 107232
[20] {produce}              => {beverages}  0.3335500 0.4442934  0.7507426 0.9820453 107232
[21] {snacks}               => {dairy eggs} 0.3224267 0.7471026  0.4315696 1.0994304 103656
[22] {dairy eggs}           => {snacks}     0.3224267 0.4744807  0.6795360 1.0994304 103656
[23] {snacks}               => {produce}    0.3372827 0.7815257  0.4315696 1.0410035 108432
[24] {produce}              => {snacks}     0.3372827 0.4492654  0.7507426 1.0410035 108432
[25] {dairy eggs}           => {produce}    0.5524485 0.8129789  0.6795360 1.0828996 177605
[26] {produce}              => {dairy eggs} 0.5524485 0.7358693  0.7507426 1.0828996 177605
[27] {dairy eggs,pantry}    => {produce}    0.2323422 0.8614148  0.2697216 1.1474169  74695
[28] {pantry,produce}       => {dairy eggs} 0.2323422 0.8099214  0.2868701 1.1918741  74695
[29] {dairy eggs,produce}   => {pantry}     0.2323422 0.4205681  0.5524485 1.2085990  74695
[30] {dairy eggs,frozen}    => {produce}    0.2443489 0.8571195  0.2850815 1.1416955  78555
[31] {frozen,produce}       => {dairy eggs} 0.2443489 0.8172087  0.2990043 1.2025980  78555
[32] {dairy eggs,produce}   => {frozen}     0.2443489 0.4423017  0.5524485 1.2022953  78555
[33] {beverages,dairy eggs} => {produce}    0.2616809 0.8145921  0.3212416 1.0850484  84127
[34] {beverages,produce}    => {dairy eggs} 0.2616809 0.7845326  0.3335500 1.1545121  84127
[35] {dairy eggs,produce}   => {beverages}  0.2616809 0.4736747  0.5524485 1.0469883  84127
[36] {dairy eggs,snacks}    => {produce}    0.2706983 0.8395655  0.3224267 1.1183133  87026
[37] {produce,snacks}       => {dairy eggs} 0.2706983 0.8025860  0.3372827 1.1810793  87026
[38] {dairy eggs,produce}   => {snacks}     0.2706983 0.4899975  0.5524485 1.1353847  87026
> rule.list1[order(-rule.list1$support),][1:5,]
              lhs             rhs   support confidence  coverage      lift  count
[25] {dairy eggs} =>    {produce} 0.5524485  0.8129789 0.6795360 1.0828996 177605
[26]    {produce} => {dairy eggs} 0.5524485  0.7358693 0.7507426 1.0828996 177605
[23]     {snacks} =>    {produce} 0.3372827  0.7815257 0.4315696 1.0410035 108432
[24]    {produce} =>     {snacks} 0.3372827  0.4492654 0.7507426 1.0410035 108432
[19]  {beverages} =>    {produce} 0.3335500  0.7372633 0.4524164 0.9820453 107232
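As a sanity check on the metrics, rule [1] {deli} => {produce} can be reproduced by hand from the output above: support is the rule count over the number of transactions, confidence is rule support over the lhs support (coverage), and lift is confidence over the rhs support (taken from rule [26], where produce is the lhs).

# Worked check of rule [1] using the numbers printed above
66498 / 321487          # support    ~ 0.2068451
0.2068451 / 0.2403021   # confidence ~ 0.8607710 (support / coverage of {deli})
0.8607710 / 0.7507426   # lift       ~ 1.1465593 (confidence / support of {produce})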

> # Visualize the association rules

> plot(rules1)

> # A scatter plot of support versus confidence is produced.

> # Display the generated rules as a network graph
> plot(rules1,
+      method = "graph",
+      control = list(type="items"),
+      vertex.label.cex = 0.1,
+      edge.arrow.size = 0.1,
+      edge.arrow.width = 2)
Warning: Unknown control parameters: type, vertex.label.cex, edge.arrow.size, edge.arrow.width
Available control parameters (with default values):
layout  =  stress
circular  =  FALSE
ggraphdots  =  NULL
edges  =  <environment>
nodes  =  <environment>
nodetext  =  <environment>
colors  =  c("#EE0000FF", "#EEEEEEFF")
engine  =  ggplot2
max  =  100
verbose  =  FALSE
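The warning shows that current arulesViz releases ignore the igraph-style control parameters passed here. A minimal sketch that plots the same network without them (whether the engine argument is available depends on the installed arulesViz version):

# Network graph of the rules without the deprecated control parameters
plot(rules1, method = "graph")
# plot(rules1, method = "graph", engine = "igraph")  # if the igraph engine is preferred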

> # The more often a category is purchased, the closer it sits to the center of the graph (the produce and dairy eggs categories clearly sell the most).

> # Display the support and lift of the generated rules as a grouped matrix
> plot(rules1, method = "grouped matrix")
Registered S3 methods overwritten by 'registry':
  method               from 
  print.registry_field proxy
  print.registry_entry proxy

> # "7 rules: {produce}" means that when produce was purchased, the seven categories {bakery, pantry, deli, dairy eggs, frozen, snacks, beverages} were purchased together with it.

> #### 8.4.3 Clustering Customers by Purchase Pattern

> # Build the customer pattern data

> # Using the cast() function from the reshape package

> Orders_Department_Matrix = Order_products_Department %>%
+   slice(1:10000) %>%
+   select(order_id, department) %>%
+   mutate(Count = 1) %>%
+   reshape::cast(order_id~department,
+                 fun.aggregate = sum)
Using Count as value column.  Use the value argument to cast to override this choice

> Orders_Department_Matrix
   order_id alcohol babies bakery beverages breakfast bulk canned goods dairy eggs deli dry goods pasta frozen household
1         2       0      0      0         0         0    0            0          1    0               0      0         0
2         3       0      0      1         0         0    0            0          3    0               0      0         0
3         4       0      0      1         3         4    0            0          0    0               0      0         0
4         5       0      0      0         1         0    0            0          3    1               2      0         3
5         6       0      0      0         1         0    0            0          0    0               0      0         2
...
   international meat seafood missing other pantry personal care pets produce snacks
1              0            0       0     0      5             0    0       3      0
2              0            1       0     0      0             0    0       3      0
3              0            0       0     0      0             1    0       0      4
4              1            1       0     0      2             1    0       7      4
5              0            0       0     0      0             0    0       0      0
...
 [ reached 'max' / getOption("max.print") -- omitted 932 rows ]
> # This approach is not recommended for large data because it is far too slow.
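Since data.table is already attached, a faster alternative for this wide pivot is dcast() from data.table; a sketch of the same order_id x department count matrix (object names here are illustrative):

# Sketch: the same count matrix with data.table::dcast(), which scales much better
DT = as.data.table(Order_products_Department)[1:10000, .(order_id, department)]
DT[, Count := 1]
Orders_Department_DT = dcast(DT, order_id ~ department,
                             value.var = "Count", fun.aggregate = sum)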

> # Using Corpus() and DocumentTermMatrix() from the tm package

> library(tm)
> Order_products_Department$department2 = gsub(" ","",Order_products_Department$department)

> Corpus = Corpus(VectorSource(Order_products_Department$department2[1:10000]))
> DTM = DocumentTermMatrix(Corpus)
> DTM_Matrix = as.matrix(DTM)
> DTM_DF = as.data.frame(DTM_Matrix)
> DTM_DF$order_id = Order_products_Department$order_id[1:10000]
> Orders_Department_DF = DTM_DF

> # Build the purchase information data

> set.seed(123)
> SL = sample(1:length(Order_id), length(Order_id) * 0.1)
> Sample_Order_id = Order_id[SL]
> Sample_Orders = Order_products_Department %>%
+   dplyr::filter(order_id %in% Sample_Order_id)

> Corpus2 = Corpus(VectorSource(Sample_Orders$department2))
> DTM2 = DocumentTermMatrix(Corpus2)
> DTM_Matrix2 = as.matrix(DTM2)
> DTM_DF2 = as.data.frame(DTM_Matrix2)
> DTM_DF2$order_id = Sample_Orders$order_id
> Orders_Department_DF2 = DTM_DF2

> DIR2 = "F:/1_Study/1_BigData/12_R/02_Practical-R/MarketData_output/"
> fwrite(Orders_Department_DF2,
+        paste0(DIR2,"Orders_Department_DF2.csv"))
                                                                                                                                     

> orders_prior = orders %>%
+   dplyr::filter(eval_set == "prior")

> Orders_Department_DF2_Sum = Orders_Department_DF2 %>%
+   group_by(order_id) %>%
+   summarise_all(.funs = sum)

> knitr::kable(Orders_Department_DF2_Sum[1:10,])


| order_id| household| produce| bulk| dairyeggs| beverages| babies| international| frozen| cannedgoods| pantry| snacks| meatseafood| deli| drygoodspasta| personalcare| bakery| breakfast| other| alcohol| pets| missing|
|--------:|---------:|-------:|----:|---------:|---------:|------:|-------------:|------:|-----------:|------:|------:|-----------:|----:|-------------:|------------:|------:|---------:|-----:|-------:|----:|-------:|
|       18|         1|      10|    1|         2|         1|      3|             1|      1|           1|      4|      1|           1|    1|             0|            0|      0|         0|     0|       0|    0|       0|
|       28|         0|       6|    0|         8|         0|      0|             0|      0|           0|      1|      0|           1|    0|             0|            0|      0|         0|     0|       0|    0|       0|
|       31|         0|       6|    0|         0|         1|      0|             0|      0|           0|      0|      1|           1|    0|             1|            0|      0|         0|     0|       0|    0|       0|
|       43|         1|       1|    0|         3|         0|      0|             0|      0|           0|      0|      1|           0|    0|             0|            1|      0|         0|     0|       0|    0|       0|
|       51|         1|       4|    0|         1|         1|      0|             0|      0|           0|      1|      0|           0|    0|             0|            0|      1|         0|     0|       0|    0|       0|
|       58|         0|       4|    0|         0|         0|      0|             0|      0|           0|      0|      0|           0|    1|             1|            0|      0|         0|     0|       0|    0|       0|
|       61|         0|       4|    0|         6|         0|      0|             0|      0|           0|      0|      1|           0|    0|             0|            0|      0|         1|     0|       0|    0|       0|
|       79|         2|       1|    0|         1|         3|      0|             0|      0|           0|      0|      0|           0|    0|             0|            0|      0|         0|     1|       0|    0|       0|
|       90|         0|       1|    0|         0|         0|      0|             0|      0|           3|      1|      0|           0|    0|             0|            0|      0|         0|     0|       0|    0|       0|
|      103|         0|       0|    0|         0|         4|      0|             0|      0|           0|      0|      0|           0|    0|             0|            0|      0|         0|     0|       0|    0|       0|

> Orders_Department_DF2_User = Orders_Department_DF2_Sum %>%
+   inner_join(orders_prior, by = 'order_id')

> User_Orders_Product = Orders_Department_DF2_User %>%
+   select(-eval_set,-order_number,-order_dow,-order_id,
+          -order_hour_of_day,-days_since_prior_order) %>%
+   group_by(user_id) %>%
+   summarise_all(.funs = mean)

> User_Orders_Product
# A tibble: 130,875 x 22
   user_id household produce  bulk dairyeggs beverages babies international frozen cannedgoods pantry snacks meatseafood
     <int>     <dbl>   <dbl> <dbl>     <dbl>     <dbl>  <dbl>         <dbl>  <dbl>       <dbl>  <dbl>  <dbl>       <dbl>
 1       1      0.25       0     0     1.25       1.25      0             0      0         0     0.25  2           0    
 2       2      0          2     0     6          2         0             0      0         0     2     4           0    
 3       3      1          2     0     3          1         0             0      0         0     0     2           0    
 4       7      0          2     0     0.667      1.67      0             0      0         0     0     0.667       0.333
 5       9      1          2     0     7          1         5             0      1         1     3     7           1    
 6      11      0          2     0     2          3         0             0      0         0     4     0           0    
 7      12      0          5     0     1          1         0             0      1         0     1     0           0    
 8      13      0          2     0     2.2        0         0             0      0         0.2   0.2   0.2         0    
 9      14      0          1     0     0          0         0             0      2         0.5   2     0           0    
10      15      0          0     0     0          1         0             0      0         0     0     2.33        0    
# ... with 130,865 more rows, and 9 more variables: deli <dbl>, drygoodspasta <dbl>, personalcare <dbl>, bakery <dbl>,
#   breakfast <dbl>, other <dbl>, alcohol <dbl>, pets <dbl>, missing <dbl>
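summarise_all() still works but has been superseded in newer dplyr releases; an equivalent sketch of the per-user means using across():

# Equivalent per-user means with across() instead of summarise_all()
User_Orders_Product = Orders_Department_DF2_User %>%
  select(-eval_set, -order_number, -order_dow, -order_id,
         -order_hour_of_day, -days_since_prior_order) %>%
  group_by(user_id) %>%
  summarise(across(everything(), mean))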

> # Cluster analysis

> Sample_Orders_Info = orders_prior %>%
+   dplyr::filter(order_id %in% Order_id) %>%
+   inner_join(Order_Id, by = "order_id") %>%
+   group_by(user_id) %>%
+   summarise(Count = n(),
+             Revisit_Days_Mean = mean(days_since_prior_order, na.rm = TRUE),
+             Revisit_Days_Mean = round(Revisit_Days_Mean,2),
+             Reorder_Mean = mean(Reordered))

> User_Orders_Info2 = Sample_Orders_Info %>%
+   inner_join(User_Orders_Product, by = 'user_id') %>%
+   as.data.frame()

> knitr::kable(User_Orders_Info2[1:10,])


| user_id| Count| Revisit_Days_Mean| Reorder_Mean| household| produce| bulk| dairyeggs| beverages| babies| international| frozen| cannedgoods| pantry|    snacks| meatseafood|      deli| drygoodspasta| personalcare| bakery| breakfast| other| alcohol| pets| missing|
|-------:|-----:|-----------------:|------------:|---------:|-------:|----:|---------:|---------:|------:|-------------:|------:|-----------:|------:|---------:|-----------:|---------:|-------------:|------------:|------:|---------:|-----:|-------:|----:|-------:|
|       1|    10|             19.56|     4.100000|      0.25|       0|    0| 1.2500000|  1.250000|      0|             0|      0|         0.0|   0.25| 2.0000000|   0.0000000| 0.0000000|             0|            0|    0.0|       0.0|     0|       0|    0|     0.0|
|       2|    14|             15.23|     6.642857|      0.00|       2|    0| 6.0000000|  2.000000|      0|             0|      0|         0.0|   2.00| 4.0000000|   0.0000000| 2.0000000|             0|            0|    1.0|       0.0|     0|       0|    0|     0.0|
|       3|    12|             12.09|     4.583333|      1.00|       2|    0| 3.0000000|  1.000000|      0|             0|      0|         0.0|   0.00| 2.0000000|   0.0000000| 0.0000000|             0|            0|    0.0|       0.0|     0|       0|    0|     0.0|
|       7|    20|             10.68|     6.900000|      0.00|       2|    0| 0.6666667|  1.666667|      0|             0|      0|         0.0|   0.00| 0.6666667|   0.3333333| 0.3333333|             0|            0|    0.0|       0.0|     0|       0|    0|     0.0|
|       9|     3|             18.00|     6.000000|      1.00|       2|    0| 7.0000000|  1.000000|      5|             0|      1|         1.0|   3.00| 7.0000000|   1.0000000| 0.0000000|             0|            0|    0.0|       1.0|     0|       0|    0|     0.0|
|      11|     7|             20.50|     4.714286|      0.00|       2|    0| 2.0000000|  3.000000|      0|             0|      0|         0.0|   4.00| 0.0000000|   0.0000000| 0.0000000|             0|            0|    0.0|       0.0|     0|       0|    0|     0.0|
|      12|     5|             25.00|     2.600000|      0.00|       5|    0| 1.0000000|  1.000000|      0|             0|      1|         0.0|   1.00| 0.0000000|   0.0000000| 0.0000000|             1|            1|    0.0|       0.0|     0|       0|    6|     0.0|
|      13|    12|              7.64|     4.333333|      0.00|       2|    0| 2.2000000|  0.000000|      0|             0|      0|         0.2|   0.20| 0.2000000|   0.0000000| 0.0000000|             0|            0|    0.6|       0.2|     0|       0|    0|     0.6|
|      14|    13|             22.08|     5.230769|      0.00|       1|    0| 0.0000000|  0.000000|      0|             0|      2|         0.5|   2.00| 0.0000000|   0.0000000| 0.5000000|             0|            0|    1.5|       0.0|     0|       1|    0|     0.0|
|      15|    22|             10.81|     2.681818|      0.00|       0|    0| 0.0000000|  1.000000|      0|             0|      0|         0.0|   0.00| 2.3333333|   0.0000000| 0.0000000|             0|            0|    0.0|       0.0|     0|       0|    0|     0.0|

> # Based on the data prepared so far, run a cluster analysis to group customers with similar purchase patterns

> # Clustering after min-max normalization

> Normalization = function(x){
+   y = (x-min(x))/(max(x)-min(x))
+   return(y)
+ }
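A quick check of what the min-max scaling does on a toy vector:

# Min-max normalization maps the smallest value to 0 and the largest to 1
Normalization(c(2, 5, 10))
# [1] 0.000 0.375 1.000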

> rownames(User_Orders_Info2) = User_Orders_Info2$user_id

> User_Orders_Info2_2 = User_Orders_Info2 %>%
+   select(-user_id)

> User_Orders_Normalization = apply(User_Orders_Info2_2,
+                                   MARGIN = 2,
+                                   FUN = Normalization)

> User_Orders_Normalization = as.data.frame(User_Orders_Normalization)

> library(factoextra)
> library(FactoMineR)

> set.seed(123)
> SL = sample(1:nrow(User_Orders_Normalization),
+             nrow(User_Orders_Normalization) * 0.05,
+             replace = FALSE)

> k_cluster1 = kmeans(User_Orders_Normalization[SL,], 2, nstart = 25)

> fviz_cluster(k_cluster1,
+              data = User_Orders_Normalization[SL,],
+              ellipse.type = "convex",
+              palette = "jco",
+              ggtheme = theme_minimal())


> k_cluster2 = kmeans(User_Orders_Normalization[SL,], 3, nstart = 25)

> fviz_cluster(k_cluster2,
+              data = User_Orders_Normalization[SL,],
+              ellipse.type = "convex",
+              palette = "jco",
+              ggtheme = theme_minimal())


> # With both K = 2 and K = 3 the clusters overlap heavily.

> # Clustering after standardization (z-score scaling)

> User_Orders_Scale = scale(User_Orders_Info2_2)
> User_Orders_Scale = as.data.frame(User_Orders_Scale)

> k_cluster3 = kmeans(User_Orders_Scale[SL,], 2, nstart = 25)

> fviz_cluster(k_cluster3,
+              data = User_Orders_Scale[SL,],
+              ellipse.type = "convex",
+              palette = "jco",
+              ggtheme = theme_minimal())


> k_cluster4 = kmeans(User_Orders_Scale[SL,], 3, nstart = 25)

> fviz_cluster(k_cluster4,
+              data = User_Orders_Scale[SL,],
+              ellipse.type = "convex",
+              palette = "jco",
+              ggtheme = theme_minimal())


> # The overlapping regions are noticeably smaller than after min-max normalization.
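The choice of K = 2 or 3 above is made by eye; factoextra (already attached) also provides fviz_nbclust() to compare candidate K values more systematically. A sketch on the same standardized sample:

# Sketch: elbow and average-silhouette plots for choosing K
fviz_nbclust(User_Orders_Scale[SL,], kmeans, method = "wss")
fviz_nbclust(User_Orders_Scale[SL,], kmeans, method = "silhouette")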

> # Cluster visualization

> Sample_User = User_Orders_Scale[SL,]
> Sample_User = as.data.frame(Sample_User)
> Sample_User$Cluster = k_cluster4$cluster

> Sample_User %>%
+   group_by(Cluster) %>%
+   summarise_all(.funs = mean) %>%
+   as.data.frame() %>%
+   melt(id.vars = c("Cluster")) %>%
+   ggplot() +
+   geom_point(aes(x = variable, y = value, col = as.factor(Cluster))) +
+   geom_path(aes(x = variable, y = value, col = as.factor(Cluster),
+                 group = as.factor(Cluster))) +
+   labs(col = "Cluster") +
+   xlab("") + ylab("") +
+   coord_polar() +
+   theme_bw() +
+   theme(legend.position = "bottom")
Warning message(s): 
In melt(., id.vars = c("Cluster")) :
  The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the next version, this warning will become an error.

> # This radar-style chart shows the mean value of each variable for each cluster.

> # Cluster 2 (green) has a higher mean for the Count variable than the other clusters.

> # Cluster 1 (red) shows generally low values across all variables.

> # Cluster 3 (blue) purchases frequently across many product categories.

> ### 8.5 Recommendation System

> #### 8.5.2 Implementing the Recommendation System - Cluster-Based

> install.packages("recommenderlab")
> library(recommenderlab)

> rownames(User_Orders_Info2) = User_Orders_Info2$user_id
> User_Orders_Info2_2 = User_Orders_Info2 %>%
+   select(-user_id)

> User_Orders_Scale = scale(User_Orders_Info2_2)
> User_Orders_Scale = as.data.frame(User_Orders_Scale)

> k_cluster_Total = kmeans(User_Orders_Scale, 3, nstart = 25)
> User_Orders_Info2_2$Cluster = k_cluster_Total$cluster

> # Build the data for the recommendation system

> # Build the data for the recommendation system - realRatingMatrix
> # Use each customer's average purchase quantity per category as the rating

> Cluster3 = User_Orders_Info2_2 %>%
+   dplyr::filter(Cluster == 3) %>%
+   select(-Count,-Revisit_Days_Mean,-Reorder_Mean, -Cluster)
> Cluster3$user_id = paste0("user",rownames(Cluster3))

> Cluster3 = Cluster3 %>%
+   melt(id.vars = c("user_id")) %>%
+   arrange(user_id)
Warning message(s): 
In melt(., id.vars = c("user_id")) :
  The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the next version, this warning will become an error.

> C3_Matrix = as(Cluster3, "realRatingMatrix")
> head(as(C3_Matrix, "data.frame"))
        user      item rating
1      user1 household      0
26144  user1   produce      2
52287  user1      bulk      0
78430  user1 dairyeggs      6
104573 user1 beverages      2
130716 user1    babies      0
> C3_DF = as.data.frame(as(C3_Matrix, "data.frame"))

> C3_DF$user = as.character(C3_DF$user)
> C3_DF$item = as.character(C3_DF$item)

> C3_DF %>%
+   dplyr::filter(user %in% c("user1","user2","user3","user4","user5")) %>%
+   ggplot() +
+   geom_tile(aes(x = user, y = item, fill = rating)) +
+   geom_vline(xintercept = seq(0.5,by = 1, length.out = 9),
+              linetype = 'dashed') +
+   geom_hline(yintercept = seq(0.5,by = 1, length.out = 26),
+              linetype = 'dashed') +
+   scale_fill_gradientn(colours = c("white","black"),
+                        values = c(0,0.5,1)) +
+   theme_minimal() +
+   labs(fill = "Rating") +
+   theme(legend.position = "bottom",
+         legend.key.width=unit(3,"cm"),
+         text = element_text(size = 15, face = "bold"),
+         legend.box.background =element_rect())

> # Visualize the customers' purchase records as a heat map
> # All five customers purchase dairyeggs frequently.

> C3_Ratings = getRatings(C3_Matrix)
> summary(C3_Ratings)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.9674  1.0000 44.0000 

> # User-based recommendation system (UBCF)

> SL = 1:round(nrow(C3_Matrix)*0.99)
> C3_Train = C3_Matrix[SL]
> C3_Test = C3_Matrix[-SL]
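The split above simply uses the first 99% of users as training data. For a randomized split, recommenderlab also provides evaluationScheme(); a sketch with illustrative parameter values (given = 5 assumes every user has at least five rated categories, which appears to hold for this dense matrix):

# Sketch: randomized train/test split via recommenderlab (illustrative parameters)
set.seed(123)
scheme = evaluationScheme(C3_Matrix, method = "split", train = 0.99, given = 5)
C3_Train2 = getData(scheme, "train")
C3_Test2  = getData(scheme, "known")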

> C3_UBCF = Recommender(C3_Train, method = "UBCF")

> C3_U_Predict = predict(C3_UBCF, C3_Test, type="ratingMatrix")
> M3_U = as(C3_U_Predict,"matrix")
> sort(colMeans(M3_U),decreasing = TRUE)
      produce     dairyeggs        snacks        frozen     beverages        pantry   cannedgoods        bakery 
  5.849511136   3.653764867   2.165279659   1.657472774   1.275934247   1.213372056   0.840879867   0.721148980 
drygoodspasta          deli     breakfast   meatseafood     household        babies  personalcare international 
  0.706848884   0.662119690   0.468265055   0.399293029   0.240509589   0.210590482   0.143619328   0.129676474 
      missing          pets          bulk         other       alcohol 
 -0.005934469  -0.020007760  -0.027432865  -0.028609904  -0.030971191 
> # The most frequently recommended categories are produce, dairyeggs, and snacks, in that order.

> # Recommended categories for individual users
> sort(unlist(as(C3_U_Predict,"list")[1]),decreasing = TRUE)
       user9764.frozen     user9764.dairyeggs        user9764.snacks       user9764.produce user9764.drygoodspasta 
           4.267444070            3.834786840            2.918881010            1.377616208            1.208052726 
       user9764.pantry        user9764.bakery   user9764.meatseafood          user9764.deli     user9764.breakfast 
           1.124289482            0.665140600            0.437739323            0.326362613            0.314000302 
    user9764.beverages   user9764.cannedgoods     user9764.household user9764.international  user9764.personalcare 
           0.297241038            0.086418526            0.036528459           -0.009330444           -0.015894607 
      user9764.missing        user9764.babies          user9764.bulk         user9764.other       user9764.alcohol 
          -0.029588681           -0.036197060           -0.075872601           -0.075872601           -0.075872601 
         user9764.pets 
          -0.075872601 
> sort(unlist(as(C3_U_Predict,"list")[2]),decreasing = TRUE)
       user9765.snacks       user9765.produce        user9765.frozen     user9765.dairyeggs        user9765.pantry 
             6.1775364              5.8083456              4.1918586              2.7318436              1.6520688 
user9765.drygoodspasta          user9765.deli     user9765.beverages        user9765.bakery   user9765.meatseafood 
             1.4238955              1.2085222              1.1048437              1.0807753              1.0002579 
  user9765.cannedgoods     user9765.breakfast  user9765.personalcare user9765.international     user9765.household 
             0.7699697              0.7626861              0.6752035              0.5489099              0.5304160 
         user9765.pets        user9765.babies          user9765.bulk         user9765.other       user9765.alcohol 
             0.4020718              0.4020411              0.3821886              0.3821886              0.3821886 
      user9765.missing 
             0.3821886 

> # Item-based recommendation system (IBCF)

> C3_IBCF = Recommender(C3_Train, method = "IBCF")
> C3_I_Predict = predict(C3_IBCF, C3_Test, type="ratingMatrix")
> M3_I = as(C3_I_Predict,"matrix")
> sort(colMeans(M3_I),decreasing = TRUE)
  cannedgoods drygoodspasta        bakery     household     breakfast          deli          pets       alcohol 
    1.6912742     1.2811703     1.1946219     1.0324000     1.0263149     1.0187827     1.0123656     1.0078731 
        other          bulk       missing  personalcare        babies international   meatseafood     dairyeggs 
    1.0048902     1.0023125     1.0017125     0.9854815     0.9854258     0.9765354     0.9344719     0.6007949 
      produce        snacks     beverages        pantry        frozen 
    0.3940936     0.3828307     0.3347450     0.1488792     0.1366518 
> # With item-based recommendation, cannedgoods and drygoodspasta are recommended most often.

> # Recommendation results for an individual user
> sort(unlist(as(C3_I_Predict,"list")[1]),decreasing = TRUE)
  user9764.cannedgoods user9764.drygoodspasta user9764.international          user9764.pets       user9764.alcohol 
          1.530601e+00           8.366094e-01           5.497031e-01           5.478477e-01           5.469612e-01 
         user9764.bulk         user9764.other       user9764.missing          user9764.deli     user9764.household 
          5.469222e-01           5.467807e-01           5.462298e-01           4.871058e-01           4.839347e-01 
 user9764.personalcare        user9764.babies   user9764.meatseafood        user9764.bakery     user9764.breakfast 
          4.752139e-01           4.697888e-01           4.658389e-01           4.639385e-01           4.594893e-01 
    user9764.beverages        user9764.snacks       user9764.produce     user9764.dairyeggs        user9764.frozen 
          4.173209e-01           3.838209e-01           3.168960e-01           1.729550e-01           8.655586e-02 
       user9764.pantry 
          1.110223e-16 


Source: 실무 프로젝트로 배우는 데이터 분석 with R (Practical Data Analysis with R)