ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • FDA에 대하여
    데이터사이언스/데이터 분석 2024. 3. 15. 22:29

     

     

     

     

     

     

     

     

    Prediction Error (PE), bias and variances from Prediction Sum of Squares (PRESS) and Generalized Cross-Validation (GCV) statistics considering N = 1000 simulations for 10, 30, 50 100 and 200 observations.

     

     

    #Simulate
    rm(list=ls())
    set.seed(123)
    
    V1 <- rnorm(1000, mean = 0, sd = sqrt(290))
    V2 <- rnorm(1000, mean = 0, sd = sqrt(300))
    
    X1 <- V1 + rnorm(1000, mean = 0, sd = 1)
    X2 <- V1 + rnorm(1000, mean = 0, sd = 1)
    X3 <- V1 + rnorm(1000, mean = 0, sd = 1)
    X4 <- V2 + rnorm(1000, mean = 0, sd = 1)
    X5 <- V2 + rnorm(1000, mean = 0, sd = 1)
    X6 <- V2 + rnorm(1000, mean = 0, sd = 1)
    
    X <- cbind(X1, X2, X3, X4, X5, X6)
    
    n <- c(10, 30, 50, 100, 200)
    
    loocv <- function(x){
      h = lm.influence(x)$h
      mean((residuals(x)/(1-h))^2)
    } # loocv method
    
    gcv <- function (x){
      LL <- logLik(object = x)
      n <- length(x$residuals)
      df <- attr(LL, "df")
      mean(x$residuals^2)/(1 - df/n)^2
    } #gcv method
    
    # Calculate bias and variance for each value of n
    PRESS <- GCV <- bias_PRESS <- bias_GCV <- var_PRESS <- var_GCV <- vector(length = length(n))
    
    
    
    for (i in 1:length(n)) {
      sample_n <- sample(1:1000,n[i])
      X_n <- X[sample_n, ]
      y <- X_n %*% matrix(c(1, 2, 3, 4, 5, 6), ncol = 1) + rnorm(n[i], mean = 0, sd = 1)
      p <- ncol(X_n)
      H <- X_n %*% solve(t(X_n) %*% X_n) %*% t(X_n)
      h <- diag(H)
      B <- diag(1/(1-h),n[i],n[i])
      glm_y_x <- glm(y~X_n)
      
      PRESS[i] <- loocv(glm_y_x)
      GCV[i] <- gcv(glm_y_x)
      
      bias_PRESS[i] <- var(y) * sum(1 / (1 - h)) - var(y) * (n[i] + p)
      bias_GCV[i] <- n[i] * var(y) / (1 - p/n[i]) - var(y) * (n[i] + p)
      
      var_PRESS[i] <- 2 * sum(diag(B^2 %*% (diag(n[i]) - H) %*%
                                     B^2 %*% (diag(n[i]) - H))) * var(y)
      var_GCV[i] <- 2 * n[i] * var(y) / (1 - p/n[i])^3
    }
    
    data.frame(n = n, PRESS = PRESS, GCV = GCV,
               bias_PRESS = bias_PRESS, var_PRESS = var_PRESS,
               bias_GCV = bias_GCV, var_GCV = var_GCV)

     

     

        n     PRESS       GCV bias_PRESS var_PRESS   bias_GCV  var_GCV
    1  10 1.3209362 2.3873777 2661463.93 100072121 1301809.99 45201736
    2  30 0.8392764 0.8710279  141776.65   8992164  107774.09  8419851
    3  50 0.9955050 1.0507783  107408.65  13982999   75348.94 13513856
    4 100 1.0602863 1.0778219   38866.91  19224236   30398.34 19112682
    5 200 1.2291556 1.2475732   19700.92  34949963   14771.74 34887979

     

     

     

    Conclusion : PRESS and GCV statistics are almost unbiased statistics of prediction error

     

     

    댓글