다변량 자료와 정규분포

데이터사이언스/데이터 분석 2022. 8. 30. 21:07

목적:

setwd("D:/Find in data/Findindata 블로그/데이터사이언스/데이터 분석")
getwd()

## [1] "D:/Find in data/Findindata 블로그/데이터사이언스/데이터 분석"

rm(list=ls())      #모든 정의된 변수 초기화

1. 평균벡터와 분산-공분산행렬

다음은 2001년에 집계된 네 개의 주요 자원들로부터의 에너지 소비량 x1: petroleum(석유), x2: natural gas(천연가스), x3: hydroelectric power(수력 발전), x4: nuclear electric power(원자력 발전) in quadrillions (1015) of BTUs 을 포함한 자료를 이용하여 구한 표본평균과 표본공분산이다.

Xbar<- c(0.766, 0.508, 0.438, 0.161)

S<- matrix(c(0.856,0.635,0.173,0.096,0.635,
             0.568,0.128,0.067,0.173,0.128,
             0.171,0.039,0.096,0.067,0.039,
             0.043), nrow=4)

(1)-1 전체 에너지 소비량의 표본평균은

sum(Xbar)

## [1] 1.873

(1)-2 전체 에너지 소비량의 표본분산은

sum(S)

## [1] 3.914

(2)-1 석유와 천연가스의 에너지 소비량 차이의 표본평균은

Xbar[1]-Xbar[2]

## [1] 0.258

(2)-2 석유와 천연가스의 에너지 소비량 차이의 표본분산은

S[1,1]+S[2,2]-2*S[1,2]

## [1] 0.154

(3)- 전체에너지소비량과 석유와 천연가스의 에너지 소비량 차이의 표본 공분산은

S[1,1]+S[1,3]+S[1,4]-S[2,2]-S[2,3]-S[2,4]

## [1] 0.362

2. 이변량 정규분포

(1)- 표본의 평균벡터와 분산공분산 행렬을 모수와 비교

library(MASS)

n = 100                                     #표본의 크기
mu = c(0,0)
Sigma = matrix(c(2, 1, 1, 2), nrow = 2)     #분포 파라미터 설정

options(max.print=1000000)                  #출력옵션제한

n.rep = 1000                                #표본의 개수 
xbar.mat = matrix(rep(0,n.rep*2), ncol = 2) #빈 벡터 생성 
S.mat = matrix(rep(0,n.rep*4), ncol = 4)    #빈 행렬 생성

set.seed(913)                               #반복을 위한 난수     


for(i in 1:n.rep){                          #1000개의 표본 추출
  x = mvrnorm(n, mu, Sigma)                 #크기가 100인 표본   (for문 사용) 
  
  xbar.mat[i,] = colMeans(x)                #표본들의 평균벡터 저장
  S.mat[i,] = matrix(cov(x), nrow = 1)      #표본들의 분산공분산행렬저장
}                                           

colMeans(xbar.mat) #통계량

## [1] 0.002935786 0.003084619

mu                 #모수

## [1] 0 0

colMeans(S.mat)    #통계량

## [1] 2.0017090 0.9997019 0.9997019 1.9928098

Sigma              #모수

##      [,1] [,2]
## [1,]    2    1
## [2,]    1    2

#히스토그램으로 결과 확인 
#평균벡터

par(mfrow = c(2, 1)) 
hist(xbar.mat[,1], breaks = 50, col="skyblue", main = "", xlab = "Sample Mean of X1")
hist(xbar.mat[,2], breaks = 50, col="skyblue", main = "", xlab = "Sample Mean of X2")

#분산공분산 행렬

par(mfrow = c(2, 2))
hist(S.mat[,1], breaks = 50, col="588", main = "", xlab = "Sample Variance of X1")
hist(S.mat[,2], breaks = 50, col="588", main = "", xlab = "Sample Covariance of X1 and X2")
hist(S.mat[,3], breaks = 50, col="588", main = "", xlab = "Sample Covariance of X2 and X1")
hist(S.mat[,4], breaks = 50, col="588", main = "", xlab = "Sample Variance of X2")

(2)- 표본 분산공분산행렬의 행렬식과 trace를 모수와 비교

n.rep = 1000
det.mat = matrix(rep(0,n.rep), ncol = 1) 
tr.mat = matrix(rep(0,n.rep), ncol = 1)    #빈 행렬 생성

set.seed(913) 

for(i in 1:n.rep){
  x = mvrnorm(n, mu, Sigma)                 #1000개의 표본 추출
  
  det.mat[i,] = det(cov(x))                 
  tr.mat[i,] = sum(diag(cov(x)))  
}

#히스토그램

par(mfrow = c(2, 1)) 
hist(det.mat[,1], breaks = 50, col="599", main = "", xlab = "det of S")
hist(tr.mat[,1], breaks = 50, col="599", main = "", xlab = "trace of S")

colMeans(det.mat)

## [1] 2.957251

det(Sigma)

## [1] 3

colMeans(tr.mat)

## [1] 3.994519

sum(diag(Sigma)) #모수와 거의 같은 값 확인

## [1] 4

(3)- 표본 분산공분산 행렬의 고유값과 고유벡터를 모수와 비교

#고유값에 대한 비교

n.rep = 1000
eigen_value.mat = matrix(rep(0,n.rep), ncol = 1)    #빈 행렬 생성

set.seed(913) 

for(i in 1:n.rep){
  x = mvrnorm(n, mu, Sigma)                 #1000개의 표본 추출
  
  eigen_value.mat[i,] = eigen(cov(x))$values[1]                
  
}

par(mfrow = c(1, 1))
hist(eigen_value.mat[,1], breaks = 50, col="599", main = "", xlab = "Leadig 고유값 of S")

#모수와 거의 같은 값이다.

colMeans(eigen_value.mat)

## [1] 3.012564

eigen(Sigma)$values[1]

## [1] 3

#고유벡터에 대한 비교

n.rep = 1000
e.mat = matrix(rep(0,n.rep*2), ncol = 2)    #빈 행렬 생성

set.seed(913) 

for(i in 1:n.rep){
  x = mvrnorm(n, mu, Sigma)                 #1000개의 표본 추출
  
  e.mat[i,] = eigen(cov(x))$vectors[,1]                
  
}    
head(e.mat,10)

##             [,1]       [,2]
##  [1,]  0.6385921  0.7695454
##  [2,] -0.7358466 -0.6771482
##  [3,] -0.7484589 -0.6631812
##  [4,]  0.6950509  0.7189606
##  [5,] -0.7600826 -0.6498264
##  [6,]  0.7008541  0.7133046
##  [7,]  0.7053442  0.7088650
##  [8,] -0.8674867 -0.4974603
##  [9,] -0.7340555 -0.6790895
## [10,] -0.7322331 -0.6810541

plot(0,0,type="n",main="Sampling Variability of e1",xlab="X1",ylab="X2")

for(i in 1:1000){
  slope = e.mat[i,2]/e.mat[i,1]
  abline(0, slope, col="skyblue")
}

slope.true=eigen(Sigma)$vectors[2,1]/eigen(Sigma)$vectors[1,1]
abline(0, slope.true, col="red",lwd=2)

3. 정규성 검정

업로드된 T4-6.DAT은 130명의 페루 청소년들을 대상으로 실시한 심리검사 결과를 포함하고 있다. 이 자료의 첫 5개의 변수로는 independence, support, benevolence, conformity, leadership이 있고, 이들의 점수를 관측값으로 가지고 있다.

mind<-read.table("T4-6.DAT")
dim(mind)

## [1] 130   7

mind_data<- mind[, -c(6,7)]
dim(mind_data)

## [1] 130   5

head(mind_data)

##   V1 V2 V3 V4 V5
## 1 27 13 14 20 11
## 2 12 13 24 25  6
## 3 14 20 15 16  7
## 4 18 20 17 12  6
## 5  9 22 22 21  6
## 6 18 15 17 25  9

(1)- 각 변수에 대한 Q-Q plot

par(mfrow = c(1, 1)) 

qqnorm(mind_data[,1], col=2, pch=16,  main="independence")
qqline(mind_data[,1])

qqnorm(mind_data[,2], col=2, pch=16,  main="support")
qqline(mind_data[,2])

qqnorm(mind_data[,3], col=2, pch=16,  main="benevolence")
qqline(mind_data[,3])

qqnorm(mind_data[,4], col=2, pch=16,  main="conformity")
qqline(mind_data[,4])

qqnorm(mind_data[,5], col=2, pch=16,  main="leadership")
qqline(mind_data[,5])

(2)- chi-square plot for the five variables

dev.mat = scale(mind_data, center=T, scale=F) 
S = cov(mind_data)
S.inv = solve(S)

d2 = rep(0,130) 
for(i in 1:130){
  d2[i] = t(dev.mat[i,]) %*% S.inv %*% dev.mat[i,] 
}

par(mfrow=c(1,1))
plot(qchisq(((1:130)-.5)/130, 5), 
     sort(d2), type="n",
     xlab = "Quantile of Chi-square with df = 5",
     ylab = "d-square")
abline(0,1, col="grey")
points(qchisq(((1:130)-.5)/130, 5), sort(d2), pch=16, col=2)

감사합니다

저작자표시 비영리 변경금지 (새창열림)

'데이터사이언스 > 데이터 분석' 카테고리의 다른 글

유용한 파이썬 코드 모음 (데이터 분석) (0)	2023.02.24
시계열 데이터에 대한 ARIMA 모델 with R (0)	2023.02.22
딥러닝(CNN)을 이용한 음성분류 (0)	2023.02.21
이미지에 대한 PCA와 클러스터링, LDA (0)	2023.02.21
실험계획법 with R (0)	2022.08.30

ABOUT ME

Find in data Find in data

1. 평균벡터와 분산-공분산행렬

2. 이변량 정규분포

3. 정규성 검정

감사합니다

'데이터사이언스 > 데이터 분석' 카테고리의 다른 글

티스토리툴바

ABOUT ME

1. 평균벡터와 분산-공분산행렬

2. 이변량 정규분포

3. 정규성 검정

감사합니다

'데이터사이언스 > 데이터 분석' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바