[R 통계분석] 가설검정

데이터분석/R

[R 통계분석] 가설검정

버섯도리 2022. 8. 12. 17:42

Section 01. 가설검정

> # 자유도가 14인 t-분포에서 유의수준 α = 0.05일 때의 기각역
>
> par(mar=c(0.5,1,1,1))
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="t", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ul <- qt(1-(alpha/2), df=14)
> ll <- -ul
> polygon(c(-3, x[x<ll], ll), c(0, y[x<ll], 0), col=2)
> polygon(c(ul, x[x>ul], 3), c(0, y[x>ul], 0), col=2)
> text(-2.2, 0.1, expression(plain(P)(T<t) == 0.025), cex=0.7)
> text(2.2, 0.1, expression(plain(P)(T>t) == 0.025), cex=0.7)
> text(ll, -0.02, expression(-t[0.025]==-2.14), cex=0.8)
> text(ul, -0.02, expression(t[0.025]==2.14), cex=0.8)

> # 검정의 종류에 따른 기각역
>
> par(mar=c(0,1,1,1))
>
> # 양쪽검정
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ul <- qt(1-(alpha/2), df=14)
> ll <- -ul
> polygon(c(-3, x[x<ll], ll), c(0, y[x<ll], 0), col=2)
> polygon(c(ul, x[x>ul], 3), c(0, y[x>ul], 0), col=2)
> text(-2.2, 0.1, expression(plain(P)(T<c[l]) == over(alpha, 2)))
> text(2.2, 0.1, expression(plain(P)(T>c[u]) == over(alpha, 2)))
> text(ll, -0.02, expression(c[l]==-t[alpha / 2]), cex=1.2)
> text(ul, -0.02, expression(c[u]==t[alpha / 2]), cex=1.2)

>
> # (왼쪽) 한쪽검정
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ll <- qt(alpha, df=14)
> polygon(c(-3, x[x<ll], ll), c(0, y[x<ll], 0), col=2)
> text(-2.2, 0.1, expression(plain(P)(T<c[l]) == alpha))
> text(ll, -0.02, expression(c[l]==-t[alpha]), cex=1.2)

>
> # (오른쪽) 한쪽검정
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ul <- qt(1-alpha, df=14)
> polygon(c(ul, x[x>ul], 3), c(0, y[x>ul], 0), col=2)
> text(2.2, 0.1, expression(plain(P)(T>c[u]) == alpha))
> text(ul, -0.02, expression(c[u]==t[alpha]), cex=1.2)

> # 기각과 검정통계량
>
> # 사이즈코리아(https://sizekorea.kr) -> 인체정보 알아보기 -> 인체치수조사 보고서 -> 6차 인체치수조사 (2010~14)
> data <- read.csv("./data/2016.6th.csv", header=T)
> str(data)
'data.frame': 7532 obs. of  155 variables:
$ 성별                        : chr  "남" "남" "남" "남" ...
$ 나이                        : int  7 7 7 7 7 7 7 7 7 7 ...
$ 오른쪽어깨경사각            : num  18 24 16 31 19 21 20 22 12 29 ...
$ 왼쪽어깨경사각              : num  22 26 26 26 24 17 17 18 20 25 ...
$ 머리위로뻗은주먹높이        : num  1543 1305 1487 1280 1480 ...
$ 키                          : num  1330 1160 1243 1144 1277 ...
...
  [list output truncated]
> tmp <- subset(data, data$나이 == 7)
> height.p <- tmp$키
>
> set.seed(9)
> height <- height.p[sample(length(height.p), 15)]
> height
[1] 1178 1196 1340 1231 1288 1177 1243 1209 1246 1230 1234 1197 1180 1281 1220
>
> xbar <- mean(height)
> xbar
[1] 1230
> > mu0 <- 1220
> s <- sd(height)
> s
[1] 45.75244
> n <- length(height)
> t.t <- (xbar - mu0) / (s / sqrt(n))
> t.t
[1] 0.8465086
>
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="t", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ul <- qt(1-(alpha/2), df=14)
> ll <- -ul
> polygon(c(-3, x[x<ll], ll), c(0, y[x<ll], 0), col=2)
> polygon(c(ul, x[x>ul], 3), c(0, y[x>ul], 0), col=2)
>
> arrows(t.t, 0.05, t.t, 0, length=0.1)
> text(t.t, 0.07, paste("t=", round(t.t, 3)))
>
> text(ll, -0.02, expression(-t[0.025]==-2.14))
> text(ul, -0.02, expression(t[0.025]==2.14))
>

> # 검정통계량으로부터 구한 유의확률
>
> par(mar=c(0,1,1,1))
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=14)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="t", ylab="", col = "blue")
> abline(h=0, col = "blue")
> alpha <- 0.05
> ul <- qt(1-(alpha/2), df=14)
> ll <- -ul
> polygon(c(-3, x[x<ll], ll), c(0, y[x<ll], 0), col=2)
> polygon(c(ul, x[x>ul], 3), c(0, y[x>ul], 0), col=2)
> text(ll, -0.02, expression(-t[0.025]==-2.14))
> text(ul, -0.02, expression(t[0.025]==2.14))
>
> t.t <- 0.8465086
> p.value <- 1 - pt(t.t, df=14)
> p.value
[1] 0.2057528
> polygon(c(t.t, x[x>t.t], 3), c(0, y[x>t.t], 0), density=20, angle=45, col = "blue")
> text(t.t, -0.02, paste("t=", round(t.t, 3)))
>
> text(1.7, 0.2, paste("P(T>t)=",round(p.value, 3)))
> arrows(1.7, 0.18, 1.5, dt(1.5, df=14), length=0.1)
>

Section 02. 단일 모집단의 가설검정

> ## 01) 단일 모집단의 평균검정
>

> # 여아 신생아의 몸무게의 평균이 2800(g)보다 더 높아졌다는 주장을 할 수 있는지 검정 실시

>
> data <- read.table("http://jse.amstat.org/datasets/babyboom.dat.txt", header=F)
> str(data)
'data.frame':   44 obs. of  4 variables:
$ V1: int  5 104 118 155 257 405 407 422 431 708 ...
$ V2: int  1 1 2 2 2 1 1 2 2 2 ...
$ V3: int  3837 3334 3554 3838 3625 2208 1745 2846 3166 3520 ...
$ V4: int  5 64 78 115 177 245 247 262 271 428 ...
> names(data) <- c("time", "gender", "weight", "minutes")
> tmp <- subset(data, gender==1)
> weight <- tmp[[3]]
>
> barx <- mean(weight)
> s <- sd(weight)
> n <- length(weight)
> h0 <- 2800
> # 검정통계량 계산
> ( t.t <- (barx - h0) / (s / sqrt(n)) )
[1] 2.233188
>
> alpha <- 0.05
> # 임계값 계산 : 대립가설이 큰지 여부에 대한 것(오른쪽 한쪽검정)이므로 오른쪽 임계값을 구한다.
> ( c.u <- qt(1-alpha, df=n-1) )
[1] 1.739607
> # 유의확률(p-value) 계산
> ( p.value <- 1 - pt(t.t, df=n-1) )
[1] 0.01963422
> # p-value 값이 유의수준(alpha=0.05)보다 작으므로 귀무가설을 기각한다.
> # 즉 여아 신생아의 몸무게의 평균이 2800(g)보다 더 높아졌다고 결론을 얻을 수 있다.
>
> # t.test 함수 사용
> t.test(weight, mu=2800, alternative="greater")

    One Sample t-test

data:  weight
t = 2.2332, df = 17, p-value = 0.01963
alternative hypothesis: true mean is greater than 2800
95 percent confidence interval:
2873.477      Inf
sample estimates:
mean of x
3132.444

>
> # 임계값, 검정통계량, 유의확률 그래프
> par(mar=c(0,1,1,1))
> x <- seq(-3, 3, by=0.001)
> y <- dt(x, df=n-1)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.38), main="", xlab="t", ylab="", col = "blue")
> abline(h=0, col = "blue")
>
> polygon(c(c.u, x[x>c.u], 3), c(0, y[x>c.u], 0), col=2)
> text(c.u, -0.02, expression(t[0.05]==1.74))
> text(1.8, 0.2, expression(alpha == 0.05), cex=0.8)
> arrows(1.8, 0.18, 1.8, 0.09, length=0.05, col = "orange")
>
> polygon(c(t.t, x[x>t.t], 3), c(0, y[x>t.t], 0), density=20, angle=45, col = "blue")
> text(t.t, -0.02, paste("t=", round(t.t, 3)), pos=4)
> text(2.5, 0.1, expression(plain(P)(T>2.233) == 0.0196), cex=0.8)
> arrows(2.5, 0.08, 2.5, 0.03, length=0.05, col = "orange")

> ## 02) 단일 모집단의 비율검정
>
> # 야구공의 불량률이 10%를 넘는다고 주장을 할 수 있는지 검정 실시
>
> tmp <- read.table("./data/restitution.txt", header=T)
> str(tmp)
'data.frame': 100 obs. of  1 variable:
$ rst: num  0.425 0.425 0.429 0.428 0.433 ...
> rel <- ifelse(tmp$rst < 0.4134 | tmp$rst > 0.4374, 1, 0)
>
> n <- length(rel)
> # 불량품의 갯수
> nos <- sum(rel)
> # 불량률
> sp <- nos / n
> # 귀무가설에서의 모비율
> hp <- 0.1
> # 검정통계량 계산
> (z <- (sp - hp) / sqrt( ( hp*(1-hp) )/n ) )
[1] 0.3333333
>
> alpha <- 0.05
> # 임계값 계산
> ( c.u <- qnorm(1-alpha) )
[1] 1.644854
> # 유의확률(p-value) 계산
> ( p.value <- 1 - pnorm(z) )
[1] 0.3694413
> # p-value 값이 유의수준(alpha=0.05)보다 크므로 대립가설을 기각한다.
> # 즉 "야구공의 불량률이 10%를 넘는다".는 통계적으로 유의한 결론을 얻을 수 없다.
>
> # prop.test 함수 사용
> prop.test(nos, n, p=0.1, alternative="greater", correct=FALSE)

1-sample proportions test without continuity correction

data:  nos out of n, null probability 0.1
X-squared = 0.11111, df = 1, p-value = 0.3694
alternative hypothesis: true p is greater than 0.1
95 percent confidence interval:
0.0684615 1.0000000
sample estimates:
   p
0.11

> # prop.test 함수는 표준정규분포 대신 χ2 분포의 통계량을 사용한다.
> # 자유도가 1인 χ2 분포는 하나의 표준정규분포를 제곱한 것으로 sqrt(0.11111)은 앞서 구한 검정통계량 Z(0.33333)와 같다.
> # 유의확률(0.3694) 또한 동일하다.
>
> # 임계값, 검정통계량, 유의확률 그래프
> par(mar=c(0,1,1,1))
> x <- seq(-3, 3, by=0.001)
> y <- dnorm(x)
> plot(x, y, type="l", axes=F, ylim=c(-0.02, 0.4), main="", xlab="z", ylab="", col = "blue")
> abline(h=0, col = "blue")
>
> polygon(c(c.u, x[x>c.u], 3), c(0, y[x>c.u], 0), col=2)
> text(c.u, -0.02, expression(z[0.05]==1.645))
>
> polygon(c(z, x[x>z], 3), c(0, y[x>z], 0), density=20, angle=45, col = "blue")
> text(z, -0.02, paste("z=", round(z, 3)))
> text(1.2, 0.3, paste("P(Z>z)=", round(p.value, 3)), cex=0.8)