-
FDA에 대하여데이터사이언스/데이터 분석 2024. 3. 15. 22:29
Prediction Error (PE), bias and variances from Prediction Sum of Squares (PRESS) and Generalized Cross-Validation (GCV) statistics considering N = 1000 simulations for 10, 30, 50 100 and 200 observations.
#Simulate rm(list=ls()) set.seed(123) V1 <- rnorm(1000, mean = 0, sd = sqrt(290)) V2 <- rnorm(1000, mean = 0, sd = sqrt(300)) X1 <- V1 + rnorm(1000, mean = 0, sd = 1) X2 <- V1 + rnorm(1000, mean = 0, sd = 1) X3 <- V1 + rnorm(1000, mean = 0, sd = 1) X4 <- V2 + rnorm(1000, mean = 0, sd = 1) X5 <- V2 + rnorm(1000, mean = 0, sd = 1) X6 <- V2 + rnorm(1000, mean = 0, sd = 1) X <- cbind(X1, X2, X3, X4, X5, X6) n <- c(10, 30, 50, 100, 200) loocv <- function(x){ h = lm.influence(x)$h mean((residuals(x)/(1-h))^2) } # loocv method gcv <- function (x){ LL <- logLik(object = x) n <- length(x$residuals) df <- attr(LL, "df") mean(x$residuals^2)/(1 - df/n)^2 } #gcv method # Calculate bias and variance for each value of n PRESS <- GCV <- bias_PRESS <- bias_GCV <- var_PRESS <- var_GCV <- vector(length = length(n)) for (i in 1:length(n)) { sample_n <- sample(1:1000,n[i]) X_n <- X[sample_n, ] y <- X_n %*% matrix(c(1, 2, 3, 4, 5, 6), ncol = 1) + rnorm(n[i], mean = 0, sd = 1) p <- ncol(X_n) H <- X_n %*% solve(t(X_n) %*% X_n) %*% t(X_n) h <- diag(H) B <- diag(1/(1-h),n[i],n[i]) glm_y_x <- glm(y~X_n) PRESS[i] <- loocv(glm_y_x) GCV[i] <- gcv(glm_y_x) bias_PRESS[i] <- var(y) * sum(1 / (1 - h)) - var(y) * (n[i] + p) bias_GCV[i] <- n[i] * var(y) / (1 - p/n[i]) - var(y) * (n[i] + p) var_PRESS[i] <- 2 * sum(diag(B^2 %*% (diag(n[i]) - H) %*% B^2 %*% (diag(n[i]) - H))) * var(y) var_GCV[i] <- 2 * n[i] * var(y) / (1 - p/n[i])^3 } data.frame(n = n, PRESS = PRESS, GCV = GCV, bias_PRESS = bias_PRESS, var_PRESS = var_PRESS, bias_GCV = bias_GCV, var_GCV = var_GCV)
n PRESS GCV bias_PRESS var_PRESS bias_GCV var_GCV 1 10 1.3209362 2.3873777 2661463.93 100072121 1301809.99 45201736 2 30 0.8392764 0.8710279 141776.65 8992164 107774.09 8419851 3 50 0.9955050 1.0507783 107408.65 13982999 75348.94 13513856 4 100 1.0602863 1.0778219 38866.91 19224236 30398.34 19112682 5 200 1.2291556 1.2475732 19700.92 34949963 14771.74 34887979
Conclusion : PRESS and GCV statistics are almost unbiased statistics of prediction error
'데이터사이언스 > 데이터 분석' 카테고리의 다른 글
추천시스템 이해 (2) 2024.03.17 머신러닝(분류): 신용카드 사기 검출 분석 (0) 2023.02.28 유용한 파이썬 코드 모음 (데이터 분석) (0) 2023.02.24 시계열 데이터에 대한 ARIMA 모델 with R (0) 2023.02.22 딥러닝(CNN)을 이용한 음성분류 (0) 2023.02.21