[ADP] 그래프 작성

데이터분석/R

[ADP] 그래프 작성

버섯도리 2022. 1. 15. 11:43

> ## 3장. 시각화 구현
> # 02. 분석 도구를 이용한 시각화 구현 : R
>
> # 1. 그래프 작성
>
>
> # 1) XY 그래프
>
> library(ggplot2)
>
> data("ChickWeight")
> head(ChickWeight)
Grouped Data: weight ~ Time | Chick
  weight Time Chick Diet
1     42    0     1    1
2     51    2     1    1
3     59    4     1    1
4     64    6     1    1
5     76    8     1    1
6     93   10     1    1
>
> ggplot(ChickWeight, aes(x=Time, y=weight, colour=Diet, group=Chick)) + geom_line()

> # 결과는 병아리별 체중 변화를 먹이별 색상으로 보여주지만, 어느 먹이(Diet)가 효율적인지는 알기 어렵다.
>
> ggplot(ChickWeight, aes(x=Time, y=weight, colour=Diet)) + geom_point(alpha=0.3) + geom_smooth(formula = y ~ x, alpha=0.2,size=1,method = 'loess')

> # geom_point로 점의 투명도를 지정해 표시하고, geom_smooth로 신뢰구간 음영의 투명도와 평활선(loess)의 굵기를 조정
>
>
> # 2) 히스토그램(Histogram)
>
> ggplot(subset(ChickWeight,Time==21), aes(x=weight,colour=Diet)) + geom_density()

> # Time이 21인 경우 weight가 어느 구간에 걸쳐 있는지, 분포 차이가 있는지를 파악
>
> ggplot(subset(ChickWeight,Time==21), aes(x=weight,fill=Diet)) + geom_histogram(colour='black',binwidth=50) + facet_grid(Diet~.)

> # 밀도 그래프처럼 분포가 연속적인 선으로 겹쳐 표시되어 내용을 파악하기 어렵거나 분류 유형이 많은 경우 히스토그램을 이용
>
>
> # 3) 포인트 그래프
>
> data("mtcars")
> head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
> p <- qplot(wt, mpg, colour=hp, data = mtcars)
> p = p + coord_cartesian(ylim = c(0,40)) + scale_color_continuous(breaks=c(100,300)) + guides(colour='colorbar')
> p

> # ylim으로 y축의 범위를 지정, breaks로 범례에 표시할 hp 눈금(100, 300)을 지정, guides(colour=..)는 hp의 수치에 따른 색의 범위를 알려준다.
>
> m <- mtcars[1:10,]
> p%+%m

> # 10건만 추출해 그래프 표현
>

> # 4) 막대그래프

> c <- ggplot(mtcars, aes(factor(cyl)))
> c + geom_bar()

> c + geom_bar(fill='red')

> c + geom_bar(colour='red')

> c + geom_bar(fill='white',colour='red')

>
> k <- ggplot(mtcars, aes(factor(cyl), fill=factor(vs)))
> k + geom_bar()

> # fill 옵션으로 vs 변수 추가 표시
>
> library(ggplot2movies)
> m <- ggplot(movies, aes(x=rating))
> m + geom_histogram(aes(fill=..count..))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

> # 히스토그램 형식으로 표시한 후에 연속형 Count를 색상으로 표시

> # 5) 선그래프
>
> data("economics")
> head(economics)
# A tibble: 6 x 6
  date         pce    pop psavert uempmed unemploy
  <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
1 1967-07-01  507. 198712    12.6     4.5     2944
2 1967-08-01  510. 198911    12.6     4.7     2945
3 1967-09-01  516. 199113    11.9     4.6     2958
4 1967-10-01  512. 199311    12.9     4.9     3143
5 1967-11-01  517. 199498    12.8     4.7     3066
6 1967-12-01  525. 199657    11.8     4.8     3018
> # date는 월별로 정보를 수집한 시점, psavert는 개인 저축률, pce는 개인소비 지출
> # unemploy는 실업자 수, uempmed는 실업기간 중앙값, pop은 총인구
> b <- ggplot(economics, aes(x=date, y=unemploy))
> b + geom_line()

> b + geom_line(colour='red')

> b + geom_line(colour='red',size=3)

> b + geom_line(linetype=2)

> # 6) 효과주기
>
> df <- data.frame(x=rnorm(5000), y=rnorm(5000))
> h <- ggplot(df, aes(x,y))
> h + geom_point()

> h + geom_point(alpha=0.5)

>
> p <- ggplot(mtcars, aes(wt,mpg))
> p + geom_point(size=4)

> p + geom_point(aes(colour=factor(cyl)), size=4)

> p + geom_point(aes(shape=factor(cyl)), size=4)

>
> library(reshape2)
> library(plyr)
>
> rescale01 <- function(x)(x-min(x))/diff(range(x))
> # 데이터 정규화(min-max): 각 변수를 0~1 범위로 변환
>
> ec_scaled <- data.frame(
+ date = economics$date,
+ colwise(rescale01)(economics[,-(1:2)])
+ )
>
> ecm <- melt(ec_scaled, id='date')
>
> f <- ggplot(ecm, aes(date, value))
> f + geom_line(aes(linetype=variable))

> # melt 함수로 긴(long) 형식으로 변환한 후 변수별로 서로 다른 선 종류(linetype)로 시각화
>
>

> # 7) 그 외 다양한 그래프
>
> # ① 줄기잎그림
> library(aplpack)
> score <- c(1,2,3,4,10,2,30,42,31,50,80,76,90,87,21,43,65,76,32,12,34,54)
> score
[1]  1  2  3  4 10  2 30 42 31 50 80 76 90 87 21 43 65 76 32 12 34 54
> stem.leaf(score)
1 | 2: represents 12
leaf unit: 1
            n: 22
   5    0 | 12234
   7    1 | 02
   8    2 | 1
  (4)   3 | 0124
  10    4 | 23
   8    5 | 04
   6    6 | 5
   5    7 | 66
   3    8 | 07
   1    9 | 0
> # 줄기잎그림은 원래의 변량을 정확히 알 수 있을 뿐만 아니라 자료의 전체적인 분포 상태도 쉽게 파악할 수 있다.
>
> # ② 얼굴그림
> WorldPhones
     N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
1951  45939  21574 2876   1815    1646     89      555
1956  60423  29990 4708   2568    2366   1411      733
1957  64721  32510 5230   2695    2526   1546      773
1958  68484  35218 6662   2845    2691   1663      836
1959  71799  37598 6856   3000    2868   1769      911
1960  76036  40341 8220   3145    3054   1905     1008
1961  79831  43173 9053   3338    3224   2005     1076
> faces(WorldPhones)
effect of variables:
modified item       Var
"height of face   " "N.Amer"
"width of face    " "Europe"
"structure of face" "Asia"
"height of mouth  " "S.Amer"
"width of mouth   " "Oceania"
"smiling          " "Africa"
"height of eyes   " "Mid.Amer"
"width of eyes    " "N.Amer"
"height of hair   " "Europe"
"width of hair   "  "Asia"
"style of hair   "  "S.Amer"
"height of nose  "  "Oceania"
"width of nose   "  "Africa"
"width of ear    "  "Mid.Amer"
"height of ear   "  "N.Amer"

> # 연도별 지역별 전화 수의 변화를 얼굴 각 부위의 크기와 모양으로 보여주는 시각화
>
> # ③ 별그림
> stars(WorldPhones)

출처 : 2020 데이터 분석 전문가 ADP 필기 한 권으로 끝내기