###################
## PRELIMINARIES ##
###################
#
## packages
library("sandwich")
library("lmtest")
library("MASS")
library("car")
library("pscl")
#
# DATA
# Deb and Trivedi (1997) analyze data on 4406 individuals, aged 66 and over, who are covered by Medicare, 
# a public insurance program. Originally obtained from the US National Medical Expenditure Survey (NMES) 
# for 1987/88, the data are available from the data archive of the Journal of Applied Econometrics. 
# It was prepared for an R package accompanying Kleiber and Zeileis (2008) and is also available as DebTrivedi.rda 
# in the Journal of Statistical Software together with Zeileis (2006). The objective is to model the demand for 
# medical care -as captured by the number of physician/non-physician office and hospital outpatient visits- by the 
# covariates available for the patients.
#
setwd("C:/Users/Fotis/Dropbox/Uni of Athens/Teaching/Maths/GLNLM/R_labs/Poisson_Regression/v27i08.R")
load("DebTrivedi.rda")
dt <- DebTrivedi[, c(1, 6:8, 13, 15, 18)]
#
###################
## VISUALIZATION ##
###################
#
## convenience functions for visualization
clog <- function(x) log(x + 0.5)
cfac <- function(x, breaks = NULL) {
  if(is.null(breaks)) breaks <- unique(quantile(x, 0:10/10))
  x <- cut(x, breaks, include.lowest = TRUE, right = FALSE)
  levels(x) <- paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1,
    c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""), sep = "")
  return(x)
}
#
## visualization: dependent variables
plot(table(dt$ofp), xlab = "Number of physician office visits",ylab = "Frequency", axes = FALSE)
axis(2)
axis(1, at = 0:18 * 5, labels = FALSE)
axis(1, at = 0:9 * 10)
#
## visualization transformations
par(mfrow = c(1, 2))
plot(ofp ~ numchron, data = dt)
plot(clog(ofp) ~ cfac(numchron), data = dt)
par(mfrow = c(1, 1))
#
## visualization of each regressor
plot(clog(ofp) ~ health, data = dt, varwidth = TRUE,
  ylab = "Physician office visits (in clogs)", xlab = "Self-perceived health status", main = "health")
plot(clog(ofp) ~ cfac(numchron), data = dt,
  ylab = "Physician office visits (in clogs)", xlab = "Number of chronic conditions", main = "numchron")
plot(clog(ofp) ~ privins, data = dt, varwidth = TRUE,
  ylab = "Physician office visits (in clogs)", xlab = "Covered by private insurance", main = "privins")
plot(clog(ofp) ~ cfac(hosp, c(0:2, 8)), data = dt,
  ylab = "Physician office visits (in clogs)", xlab = "Number of hospital stays", main = "hosp")
plot(clog(ofp) ~ gender, data = dt, varwidth = TRUE,
  ylab = "Physician office visits (in clogs)", xlab = "Gender", main = "gender")
plot(cfac(ofp, c(0:2, 4, 6, 10, 100)) ~ school, data = dt, breaks = 9,
  ylab = "Physician office visits (number of visits)", xlab = "Number of years of education", main = "school")
#
#
##############
## MODELING ##
##############
#
# FULL MODEL ANALYSIS (step)
#
fm1 <- glm(ofp ~ ., data = DebTrivedi, family = poisson);					                summary(fm1)
fm2 <- glm(ofp ~ .-faminc, data = DebTrivedi, family = poisson);				          summary(fm2)
fm3 <- glm(ofp ~ .-faminc-emer, data = DebTrivedi, family = poisson);			        summary(fm3)
fm4 <- glm(ofp ~ .-faminc-emer-opp, data = DebTrivedi, family = poisson);		      summary(fm4)
fm5 <- glm(ofp ~ .-faminc-emer-opp-region, data = DebTrivedi, family = poisson);	summary(fm5)
#
# RESTRICTED MODEL
#
## Poisson regression
fm_pois <- glm(ofp ~ ., data = dt, family = poisson)
summary(fm_pois)

## adjusted Poisson regression
coeftest(fm_pois, vcov = sandwich)

## quasi-Poisson regression (assumes that the mean is not equal to the variance, actually Var>Mean)
fm_qpois <- glm(ofp ~ ., data = dt, family = quasipoisson)
summary(fm_qpois)

## negative binomial regression
fm_nbin <- glm.nb(ofp ~ ., data = dt)
summary(fm_nbin)

## hurdle NB regression
fm_hurdle0 <- hurdle(ofp ~ ., data = dt, dist = "negbin")
summary(fm_hurdle0)
fm_hurdle <- hurdle(ofp ~ . | hosp + numchron + privins + school + gender,
  data = dt, dist = "negbin")
waldtest(fm_hurdle0, fm_hurdle)
lrtest(fm_hurdle0, fm_hurdle)

## zero-inflated NB regression
fm_zinb0 <- zeroinfl(ofp ~ ., data = dt, dist = "negbin")
fm_zinb <- zeroinfl(ofp ~ . | hosp + numchron + privins + school + gender,
  data = dt, dist = "negbin")
waldtest(fm_zinb0, fm_zinb)
summary(fm_zinb)


######################
## MODELING SUMMARY ##
######################

## coefficients
fm <- list("ML-Pois" = fm_pois, "Quasi-Pois" = fm_qpois, "NB" = fm_nbin,
  "Hurdle-NB" = fm_hurdle, "ZINB" = fm_zinb)
sapply(fm, function(x) coef(x)[1:8])
#
## standard errors
cbind("ML-Pois" = sqrt(diag(vcov(fm_pois))),
  "Adj-Pois" = sqrt(diag(sandwich(fm_pois))),
  sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8]))
#
## log-likelihood
rbind(logLik = sapply(fm, function(x) round(logLik(x), digits = 0)),
  Df = sapply(fm, function(x) attr(logLik(x), "df")))
#
## zero-counts
round(c("Obs" = sum(dt$ofp < 1),
  "ML-Pois" = sum(dpois(0, fitted(fm_pois))),
  "NB" = sum(dnbinom(0, mu = fitted(fm_nbin), size = fm_nbin$theta)),
  "NB-Hurdle" = sum(predict(fm_hurdle, type = "prob")[,1]),
  "ZINB" = sum(predict(fm_zinb, type = "prob")[,1])))
#
## zero model coefficients
t(sapply(fm[4:5], function(x) round(x$coefficients$zero, digits = 3)))
#
###################################
## CAMERON & TRIVEDI REPLICATION ##
###################################
#
## data
dt2 <- DebTrivedi[, -(2:6)]
dt2$region <- relevel(dt2$region, "other")
#
## model
fm_hurdle2 <- hurdle(ofp ~ ., data = dt2, dist = "negbin")
#
## summary
cfz <- coef(fm_hurdle2, model = "zero")
cfc <- coef(fm_hurdle2, model = "count")
se <- sqrt(diag(sandwich(fm_hurdle2)))
round(cbind(zero = cfz, zero_t = cfz/se[-seq(along = cfc)], 
  count = cfc, count_t = cfc/se[seq(along = cfc)]),
  digits = 3)[c(3, 2, 4, 5, 7, 6, 8, 9:17, 1),]
logLik(fm_hurdle2)
1/fm_hurdle2$theta