library(QREM)
# https://github.com/haimbar/QREM
library("SEMMS")
# https://github.com/haimbar/SEMMS

library("rqPen") # to compare with a Lasso-based method

# Mixed Effect Modeling and Variable Selection for Quantile Regression
# Bar, Booth, and Wells (Statistical Modelling 2021)

# Code to create Table 1 (variable selection)

# multiple variables, non-constant variance:
sims <- list(seed=11001, n=200,  err=function(x) {rnorm(length(x), 0, 0.1+x)},
             errvar=1, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2), 
             qns=seq(0.1,0.9,by=0.1),
             B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1)))

# to try just one run with a subset of 50 predictors:
data(simLargeP) # simulation 5 in the paper
qn <- 0.25
res <- QREM_vs(simLargeP, 1, 2:51, qn=qn)
dfsemms <- simLargeP[,c(1, 1+res$fittedSEMMS$gam.out$nn)]
qremFit <- QREM(lm, y~., dfsemms, qn=qn)
ests <- rbind(qremFit$coef$beta,
              sqrt(diag(bcov(qremFit,linmod=y~., df=dfsemms, qn=0.2))))
rownames(ests) <- c("Estimate","s.d")
print(ests)

########
# The full simulation takes a while to complete (100 reps * 9 quantiles)

res <- matrix(0,ncol=11,nrow=100*9)
confmat <- function(allidx, actual, estimated) {
  TP <- which(estimated  %in% actual)
  FP <- which(estimated %in% setdiff(allidx,actual))
  TN <- which(setdiff(allidx,estimated) %in% setdiff(allidx,actual))
  FN <- which(setdiff(allidx,estimated) %in% actual)
  c(length(TP), length(FP),length(TN), length(FN))
}

cnt <- 1
simno <- 1
n <- sims$n
qns <- sims$qns
reps <- sims$reps
coefs <- sims$coefs
lp <- as.list(attr(terms(sims$mod), "variables"))[-(1:2)]
truepreds = rep(0,length(lp))
for(i in 1:length(lp)) {
  truepreds[i] = gsub("[a-zA-Z]","",lp[[i]], perl=TRUE)
}
truepreds <- as.numeric(truepreds)
xrng <- matrix(sims$xrng,  nrow=(length(coefs)-1), ncol=2)
for (i in 1:reps) {
  set.seed(500000 + sims$seed + i)
  X <- matrix(0,nrow=n, ncol=length(coefs))
  X[,1] <- rep(1,n)
  for (jj in 1:(length(coefs)-1)) {
    X[,jj+1] <- runif(n, min=xrng[jj,1], max=xrng[jj,2])
  }
  colnames(X) <- c("const", paste("X",1:(length(coefs)-1),sep=""))
  if (sims$errvar == 0) {
    errs <- sims$err(rep(0,n))
  } else {
    if (length(sims$errvar) == 2) {
      errs <- sims$err(X[,sims$errvar[1]+1], X[,sims$errvar[2]+1])
    } else {
      errs <- sims$err(X[,sims$errvar+1])
    }
  }
  y <- X%*%coefs + errs
  dframe <- data.frame(y,X[,-1], matrix(rnorm((500-ncol(X)+1)*n,0,0.1), 
                                        nrow=n, ncol=(500-ncol(X)+1)))
  X <- as.matrix(dframe[,-1])
  y <- dframe[,1]
  for (qn in qns) {
    L1fit <- rq.lasso.fit(X,y,lambda=0.1,tau = qn)
    selectedL1 <- which(L1fit$coefficients[-1]!=0)
    if (length(selectedL1) > 2) {
      resQREM <- QREM_vs(simLargeP, 1, 2:501, qn=qn, nnset=selectedL1)
    } else {
      resQREM <- QREM_vs(simLargeP, 1, 2:501, qn=qn)
    }
    selected <- resQREM$fittedSEMMS$gam.out$nn
    res[cnt,]  <- c(simno, i, qn, confmat(1:500, truepreds, selected),
                    confmat(1:500, truepreds, selectedL1))
    cat(c(simno, i, qn, confmat(1:500, truepreds, selected), 
          confmat(1:500, truepreds, selectedL1)),"\n")
    cnt <- cnt+1
  }
}

# True positives (QREM), actual number of predictors is 5
print(table(res[,4],as.factor(res[,3])))
# True positives (Lasso), actual number of predictors is 5
print(table(res[,8],as.factor(res[,3])))

# False positives (QREM)
print(table(res[,5],as.factor(res[,3])))
# False positives (Lasso)
print(table(res[,9],as.factor(res[,3])))
