
library(Rcpp)
library(inline)
library(RcppArmadillo)
library(betareg)
library(mixtools)
library(ggplot2)
sourceCpp("./extra/simplex_mixture.cpp")
source("./extra/simplex_mixture_cpp.R")
source("./extra/extra_functions.R")

##########
# http://datam.i2r.a-star.edu.sg/datasets/krbd/LungCancer/LungCancer-Michigan.html
# http://www.google.ne/patents/US6335170
# Read the comma-separated expression table and discard its column 7130.
dd <- read.table(
  "data/LungCancer-Michigan/lung-michigan.data",
  sep = ","
)[, -7130]

# The first column of the names file supplies the column names for `dd`.
gens <- as.character(
  read.table("data/LungCancer-Michigan/lung-michigan.names")[, 1]
)
colnames(dd) <- gens

# Pairwise correlations between all columns of `dd`; only the correlations
# with column 6735 are used to build the response below.
correlations <- cor(dd)
target_cor <- as.numeric(correlations[, 6735])

# Keep every correlation that is not exactly 1 (presumably to remove the
# correlation of column 6735 with itself) and rescale from [-1, 1] to [0, 1].
# NOTE(review): any other column whose correlation is exactly 1 is dropped by
# the same filter, and NA correlations are carried through -- confirm intent.
TH <- (target_cor[target_cor != 1] + 1) / 2

##############

# Two-component simplex mixture fitted to TH with the project's EM routine
# (simplexEM is defined in the sourced ./extra files).
# NOTE(review): no random seed is set before the fits in this section; if any
# of them uses random starting values the results will vary between runs.
modelo <- simplexEM(TH, 2, tol = 0.9, method = "Nelder-Mead", hessian = TRUE)

# Comparison model 1: two-component beta mixture with intercept-only
# submodels, using a logit link for the mean and a log link for the precision.
# (The original note here suggested also considering 'identity' links.)
modelo_b <- betamix(
  y ~ 1 | 1,
  data = data.frame(y = TH),
  k = 2,
  link = "logit",
  link.phi = "log"
)

# Undo the links on the fitted coefficients: inverse logit on the first
# coefficient column, exp on the second.
mu <- plogis(coef(modelo_b)[, 1])
phi <- exp(coef(modelo_b)[, 2])

# Prior (mixing) weights stored in the underlying flexmix fit.
pj_b <- modelo_b$flexmix@prior

# Comparison model 2: two-component normal mixture.
modelo_n <- normalmixEM(TH, k = 2)

# Per-observation log-likelihood contributions of the beta and normal
# mixtures. They are computed once here and reused by the log-likelihood
# summary, AIC, BIC and Vuong tests below.
ll_ind_beta <- betamix_loglik_ind(x = TH, pj = pj_b, mu, phi, log = TRUE)
ll_ind_norm <- normalmix_loglik_ind(
  TH, modelo_n$lambda, modelo_n$mu, modelo_n$sigma,
  log = TRUE
)

# -2 * log-likelihood in the order (simplex, beta, normal). modelo$loglik
# enters with the opposite sign to the summed contributions, i.e. it is used
# here as a negative log-likelihood.
neg2_loglik <- c(
  2 * modelo$loglik,
  -2 * sum(ll_ind_beta),
  -2 * sum(ll_ind_norm)
)

# Number of parameters charged to every model in the AIC/BIC penalties.
# NOTE(review): a two-component mixture with two parameters per component
# usually has 5 free parameters (the mixing weight is also estimated). The
# same count is applied to all three models, so their ranking is unaffected;
# confirm whether 4 is intended.
n_par <- 4
n_obs <- length(TH)

# sum of log-likelihood contributions
c(-modelo$loglik, sum(ll_ind_beta), sum(ll_ind_norm))

# AIC and BIC
neg2_loglik + 2 * n_par

neg2_loglik + log(n_obs) * n_par

# Vuong's test: simplex mixture against each competitor
vuong(modelo$loglik_ind, ll_ind_beta)
vuong(modelo$loglik_ind, ll_ind_norm)

# Histogram of TH with the three fitted curves overlaid.
# `datos` stacks three copies of the sorted data (x) next to the values
# returned by the y_to_hist* helpers (y); `cat` labels the copies 1, 2, 3 in
# the order simplex, beta, normal.
# NOTE(review): x is sort(TH) while the helpers receive the unsorted TH;
# presumably they return values ordered by sorted TH -- confirm in
# ./extra/extra_functions.R.
datos <- data.frame(
  x = rep(sort(TH), 3),
  y = c(
    y_to_hist(modelo, TH),
    y_to_hist_b(mu, phi, pj_b, p_inf = 0, TH),
    y_to_hist_n(modelo_n$mu, modelo_n$sigma, modelo_n$lambda, TH)
  ),
  cat = factor(rep(1:3, each = length(TH)))
)

ggplot(data = datos, aes(x, y, group = cat)) +
  # The histogram only needs one copy of the data: the first length(TH) rows.
  # `..density..` is the legacy spelling of after_stat(density); it is kept so
  # the script also runs on older ggplot2 releases.
  geom_histogram(
    data = datos[seq_along(TH), ],
    aes(y = ..density..),
    colour = "black",
    fill = "white"
  ) +
  theme_bw() +
  xlab("TH") +
  ylab("density") +
  theme(
    axis.text = element_text(size = 15, colour = 1),
    axis.title = element_text(size = 20),
    legend.position = "none"
  ) +
  # One line type per model; `cat` is already a factor.
  # http://sape.inf.usi.ch/quick-reference/ggplot2/linetype
  geom_line(aes(linetype = cat)) +
  # http://stackoverflow.com/questions/14077274/ggplot2-manually-specifying-color-linetype-duplicate-legend
  scale_linetype_manual(values = c("solid", "dashed", "dotted"))
  
# Empirical Hellinger distance of each fitted mixture, printed in the order
# simplex, beta, normal. The helpers come from the sourced ./extra files.
hellinger_simplex <- emp_hellinger(TH, modelo)
hellinger_beta <- emp_hellinger_b(TH, modelo_b)
hellinger_normal <- emp_hellinger_n(TH, modelo_n)
c(hellinger_simplex, hellinger_beta, hellinger_normal)

