> ### 2.12 중고차 데이터를 활용한 데이터 전처리 2 - dplyr 패키지
>
>
> #### 2.12.1 체인 연산자
>
> library(dplyr)
> colMeans(filter(.data = Audi, year > 2016)[,c("tax","mpg","engineSize")])
tax mpg engineSize
139.188766 47.049449 1.922512
>
> Audi %>%
+ filter(year > 2016) %>%
+ select(tax,mpg,engineSize) %>%
+ colMeans()
tax mpg engineSize
139.188766 47.049449 1.922512
>
>
> #### 2.12.2 데이터셋에 새로운 변수 추가
>
> Audi2 = Audi %>%
+ mutate(tax2 = ifelse(tax > 100 , 1,0),
+ engineSize2 = round(engineSize))
>
> head(Audi2)
model year price transmission mileage fuelType tax mpg engineSize tax2 engineSize2
1 A1 2017 12500 Manual 15735 Petrol 150 55.4 1.4 1 1
2 A6 2016 16500 Automatic 36203 Diesel 20 64.2 2.0 0 2
3 A1 2016 11000 Manual 29946 Petrol 30 55.4 1.4 0 1
4 A4 2017 16800 Automatic 25952 Diesel 145 67.3 2.0 1 2
5 A3 2019 17300 Manual 1998 Petrol 145 49.6 1.0 1 1
6 A1 2016 13900 Automatic 32260 Petrol 30 58.9 1.4 0 1
>
>
> #### 2.12.3 데이터 집계
>
> # 그룹 간 빈도 수 계산 – group_by() + n()
> Audi3 = Audi %>%
+ group_by(transmission) %>%
+ summarise(Count = n())
>
> Audi3
# A tibble: 3 x 2
transmission Count
<chr> <int>
1 Automatic 2708
2 Manual 4369
3 Semi-Auto 3591
>
> Audi4 = Audi %>%
+ group_by(transmission) %>%
+ summarise(Count = n(),
+ Price_Mean = mean(price),
+ Price_Sd = sd(price))
>
> Audi4
# A tibble: 3 x 4
transmission Count Price_Mean Price_Sd
<chr> <int> <dbl> <dbl>
1 Automatic 2708 28205. 13209.
2 Manual 4369 16101. 5519.
3 Semi-Auto 3591 27162. 11979.
>
> # Group변수가 2개일 떄
>
> Audi5 = Audi %>%
+ group_by(transmission,year) %>%
+ summarise(Count = n(),
+ Price_Mean = mean(price))
`summarise()` has grouped output by 'transmission'. You can override using the `.groups` argument.
>
> head(Audi5)
# A tibble: 6 x 4
# Groups: transmission [1]
transmission year Count Price_Mean
<chr> <int> <int> <dbl>
1 Automatic 1997 1 4650
2 Automatic 1998 1 4999
3 Automatic 2003 3 3492.
4 Automatic 2004 3 6891.
5 Automatic 2005 1 5895
6 Automatic 2006 4 6459.
>
> # 그룹 간 비율 계산
>
> Audi6 = Audi %>%
+ group_by(transmission,year) %>%
+ summarise(Count = n(),
+ Price_Mean = mean(price)) %>%
+ mutate(Perc = Count/sum(Count))
`summarise()` has grouped output by 'transmission'. You can override using the `.groups` argument.
>
> head(Audi6)
# A tibble: 6 x 5
# Groups: transmission [1]
transmission year Count Price_Mean Perc
<chr> <int> <int> <dbl> <dbl>
1 Automatic 1997 1 4650 0.000369
2 Automatic 1998 1 4999 0.000369
3 Automatic 2003 3 3492. 0.00111
4 Automatic 2004 3 6891. 0.00111
5 Automatic 2005 1 5895 0.000369
6 Automatic 2006 4 6459. 0.00148
>
>
> #### 2.12.4 데이터 정렬
>
> # 오름차순
> Audi7 = Audi %>%
+ arrange(price)
>
> head(Audi7)
model year price transmission mileage fuelType tax mpg engineSize
1 A3 2003 1490 Automatic 101000 Diesel 240 44.5 1.9
2 A4 2004 1699 Manual 110000 Diesel 315 36.2 2.5
3 A3 2005 1975 Manual 104000 Petrol 240 38.1 2.0
4 TT 2002 1990 Manual 131925 Petrol 325 30.1 1.8
5 A3 2009 2490 Manual 152034 Diesel 150 55.4 1.9
6 A6 2008 2490 Manual 323000 Diesel 200 44.1 2.0
>
> # 내림차순
> Audi8 = Audi %>%
+ arrange(-price)
>
> head(Audi8)
model year price transmission mileage fuelType tax mpg engineSize
1 R8 2020 145000 Semi-Auto 2000 Petrol 145 21.1 5.2
2 R8 2020 137995 Semi-Auto 70 Petrol 145 21.1 5.2
3 R8 2019 137500 Semi-Auto 10 Petrol 150 21.4 5.2
4 R8 2019 135000 Automatic 1000 Petrol 145 32.8 5.2
5 R8 2020 133900 Semi-Auto 333 Petrol 145 21.4 5.2
6 R8 2019 129000 Semi-Auto 4000 Petrol 145 21.4 5.2
>
> # 상위 n개 추출 - top_n()
>
> Audi20 = Audi %>%
+ arrange(-price) %>%
+ top_n(n = 5,wt = price)
>
> Audi20
model year price transmission mileage fuelType tax mpg engineSize
1 R8 2020 145000 Semi-Auto 2000 Petrol 145 21.1 5.2
2 R8 2020 137995 Semi-Auto 70 Petrol 145 21.1 5.2
3 R8 2019 137500 Semi-Auto 10 Petrol 150 21.4 5.2
4 R8 2019 135000 Automatic 1000 Petrol 145 32.8 5.2
5 R8 2020 133900 Semi-Auto 333 Petrol 145 21.4 5.2
>
>
> #### 2.12.5 조건에 따른 데이터 추출
>
> # 추출 조건이 1개인 경우
> Audi9 = Audi %>%
+ filter(year > 2016)
>
> head(Audi9)
model year price transmission mileage fuelType tax mpg engineSize
1 A1 2017 12500 Manual 15735 Petrol 150 55.4 1.4
2 A4 2017 16800 Automatic 25952 Diesel 145 67.3 2.0
3 A3 2019 17300 Manual 1998 Petrol 145 49.6 1.0
4 A3 2017 16100 Manual 28955 Petrol 145 58.9 1.4
5 A3 2017 16400 Manual 21695 Petrol 30 58.9 1.4
6 A3 2017 14500 Automatic 26156 Petrol 145 58.9 1.4
>
> # 추출 조건이 2개인 경우
> Audi10 = Audi %>%
+ filter(year > 2016 & transmission == "Manual")
>
> head(Audi10)
model year price transmission mileage fuelType tax mpg engineSize
1 A1 2017 12500 Manual 15735 Petrol 150 55.4 1.4
2 A3 2019 17300 Manual 1998 Petrol 145 49.6 1.0
3 A3 2017 16100 Manual 28955 Petrol 145 58.9 1.4
4 A3 2017 16400 Manual 21695 Petrol 30 58.9 1.4
5 A1 2018 15800 Manual 10793 Petrol 145 56.5 1.4
6 Q3 2017 16500 Manual 21369 Petrol 125 51.4 1.4
>
>
> #### 2.12.6 데이터 병합
>
> Audi12 = Audi %>%
+ group_by(model) %>%
+ summarise(Count = n()) %>%
+ filter(model %in% c(" A1"," A2"," A3"," A4"))
>
> head(Audi12)
# A tibble: 4 x 2
model Count
<chr> <int>
1 " A1" 1347
2 " A2" 1
3 " A3" 1929
4 " A4" 1381
>
> Audi13 = Audi %>%
+ group_by(model) %>%
+ summarise(Mean = mean(price)) %>%
+ filter(model %in% c(" A3"," A4"," A5", " A6"))
>
> head(Audi13)
# A tibble: 4 x 2
model Mean
<chr> <dbl>
1 " A3" 17409.
2 " A4" 20255.
3 " A5" 23577.
4 " A6" 22695.
>
> # left join
> Audi14 = Audi12 %>%
+ left_join(Audi13, by = "model")
>
> head(Audi14)
# A tibble: 4 x 3
model Count Mean
<chr> <int> <dbl>
1 " A1" 1347 NA
2 " A2" 1 NA
3 " A3" 1929 17409.
4 " A4" 1381 20255.
>
> # right join
> Audi15 = Audi12 %>%
+ right_join(Audi13, by = "model")
>
> head(Audi15)
# A tibble: 4 x 3
model Count Mean
<chr> <int> <dbl>
1 " A3" 1929 17409.
2 " A4" 1381 20255.
3 " A5" NA 23577.
4 " A6" NA 22695.
>
> # full join
> Audi16 = Audi12 %>%
+ full_join(Audi13, by = "model")
>
> head(Audi16)
# A tibble: 6 x 3
model Count Mean
<chr> <int> <dbl>
1 " A1" 1347 NA
2 " A2" 1 NA
3 " A3" 1929 17409.
4 " A4" 1381 20255.
5 " A5" NA 23577.
6 " A6" NA 22695.
>
>
> #### 2.12.7 데이터 샘플링
>
> # 무작위 데이터 추출 - sample_n(), sample_frac()
> set.seed(123)
> Audi17 = Audi %>%
+ sample_n(size = 5)
>
> Audi17
model year price transmission mileage fuelType tax mpg engineSize
1 A4 2017 19220 Manual 14378 Diesel 30 67.3 2.0
2 Q5 2016 28698 Automatic 44419 Diesel 235 42.2 3.0
3 A4 2018 20290 Automatic 19450 Petrol 150 50.4 1.4
4 A3 2018 14500 Semi-Auto 22496 Diesel 145 72.4 1.6
5 TT 2015 16406 Semi-Auto 12123 Petrol 160 44.8 2.0
>
> Audi18 = Audi %>%
+ sample_frac(size = 0.05)
>
> head(Audi18)
model year price transmission mileage fuelType tax mpg engineSize
1 A3 2018 18995 Manual 5864 Petrol 145 47.1 1.0
2 Q2 2017 19490 Manual 13248 Petrol 125 50.4 1.4
3 A8 2019 48000 Automatic 5900 Diesel 145 50.4 3.0
4 A6 2015 21990 Automatic 37200 Diesel 235 44.8 3.0
5 A3 2017 19990 Manual 31191 Diesel 145 58.9 2.0
6 A1 2017 13199 Manual 14087 Petrol 30 55.4 1.4
>
>
> #### 2.18.8 중복 데이터 처리
>
>
> # 중복 데이터 처리 기본
>
> # 중복 데이터 생성
>
> set.seed(1234)
> Audi19 = Audi %>%
+ mutate(Unique_Key = 1:nrow(Audi)) %>%
+ sample_frac(size = 1.5,replace = TRUE)
>
> head(Audi19)
model year price transmission mileage fuelType tax mpg engineSize Unique_Key
1 Q5 2017 23990 Automatic 27000 Diesel 205 45.6 3.0 7452
2 TT 2016 14919 Manual 37352 Diesel 30 62.8 2.0 8016
3 Q3 2019 25900 Manual 6997 Diesel 150 42.8 2.0 7162
4 A1 2015 12000 Manual 35441 Diesel 0 80.7 1.6 8086
5 Q5 2015 24879 Automatic 39353 Diesel 235 42.8 3.0 7269
6 RS3 2020 42490 Automatic 320 Petrol 145 29.7 2.5 9196
> dim(Audi19)
[1] 16002 10
>
> # 중복 수 확인
> sum(duplicated(Audi19$Unique_Key))
[1] 7727
>
> # 중복 데이터 행 제거
> Audi19_2 = Audi19[!duplicated(Audi19$Unique_Key),]
> dim(Audi19_2)
[1] 8275 10
>
> # 중복 키가 2개 이상일 경우 - duplicated()
>
> # 중복 데이터 생성
> set.seed(1234)
> Audi19_3 = Audi %>%
+ mutate(Unique_Key = 1:nrow(Audi),
+ Unique_Key2 = nrow(Audi):1) %>%
+ sample_frac(size = 1.5,replace = TRUE)
>
> head(Audi19_3)
model year price transmission mileage fuelType tax mpg engineSize Unique_Key Unique_Key2
1 Q5 2017 23990 Automatic 27000 Diesel 205 45.6 3.0 7452 3217
2 TT 2016 14919 Manual 37352 Diesel 30 62.8 2.0 8016 2653
3 Q3 2019 25900 Manual 6997 Diesel 150 42.8 2.0 7162 3507
4 A1 2015 12000 Manual 35441 Diesel 0 80.7 1.6 8086 2583
5 Q5 2015 24879 Automatic 39353 Diesel 235 42.8 3.0 7269 3400
6 RS3 2020 42490 Automatic 320 Petrol 145 29.7 2.5 9196 1473
> dim(Audi19_3)
[1] 16002 11
>
> # 중복된 행의 수 확인
>
> sum(duplicated(Audi19_3[,c("Unique_Key","Unique_Key2")]))
[1] 7727
>
> # 중복 데이터 행 제거
>
> Audi19_4 = Audi19_3[!duplicated(Audi19_3[,c("Unique_Key","Unique_Key2")]),]
> dim(Audi19_4)
[1] 8275 11
>
> # dplyr 패키지를 활용한 중복 데이터 제거 - distinct()
>
> Audi19_5 = Audi19_3 %>%
+ distinct(Unique_Key, Unique_Key2, .keep_all = TRUE)
>
> dim(Audi19_5)
[1] 8275 11
출처 : 실무 프로젝트로 배우는 데이터 분석 with R
'데이터분석 > R' 카테고리의 다른 글
[실무 프로젝트로 배우는...] 데이터 시각화 기초 (ggplot2 패키지) (0) | 2022.01.21 |
---|---|
[실무 프로젝트로 배우는...] dplyr 응용 (0) | 2022.01.21 |
[실무 프로젝트로 배우는...] 데이터 전처리 1 - apply (0) | 2022.01.19 |
[ADP] 순차 패턴 분석 (Sequence Pattern Analysis) (0) | 2022.01.16 |
[ADP] 연관성 분석 (Association Analysis) (0) | 2022.01.16 |