################### ## PRELIMINARIES ## ################### # ## packages library("sandwich") library("lmtest") library("MASS") library("car") library("pscl") # # DATA # Deb and Trivedi (1997) analyze data on 4406 individuals, aged 66 and over, who are covered by Medicare, # a public insurance program. Originally obtained from the US National Medical Expenditure Survey (NMES) # for 1987/88, the data are available from the data archive of the Journal of Applied Econometrics. # It was prepared for an R package accompanying Kleiber and Zeileis (2008) and is also available as DebTrivedi.rda # in the Journal of Statistical Software together with Zeileis (2006). The objective is to model the demand for # medical care -as captured by the number of physician/non-physician office and hospital outpatient visits- by the # covariates available for the patients. # setwd("C:/Users/Fotis/Dropbox/Uni of Athens/Teaching/Maths/GLNLM/R_labs/Poisson_Regression/v27i08.R") load("DebTrivedi.rda") dt <- DebTrivedi[, c(1, 6:8, 13, 15, 18)] # ################### ## VISUALIZATION ## ################### # ## convenience functions for visualization clog <- function(x) log(x + 0.5) cfac <- function(x, breaks = NULL) { if(is.null(breaks)) breaks <- unique(quantile(x, 0:10/10)) x <- cut(x, breaks, include.lowest = TRUE, right = FALSE) levels(x) <- paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1, c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""), sep = "") return(x) } # ## visualization: dependent variables plot(table(dt$ofp), xlab = "Number of physician office visits",ylab = "Frequency", axes = FALSE) axis(2) axis(1, at = 0:18 * 5, labels = FALSE) axis(1, at = 0:9 * 10) # ## visualization transformations par(mfrow = c(1, 2)) plot(ofp ~ numchron, data = dt) plot(clog(ofp) ~ cfac(numchron), data = dt) par(mfrow = c(1, 1)) # ## visualization of each regressor plot(clog(ofp) ~ health, data = dt, varwidth = TRUE, ylab = "Physician office visits (in clogs)", xlab = "Self-perceived health status", main = "health") plot(clog(ofp) ~ cfac(numchron), data = dt, ylab = "Physician office visits (in clogs)", xlab = "Number of chronic conditions", main = "numchron") plot(clog(ofp) ~ privins, data = dt, varwidth = TRUE, ylab = "Physician office visits (in clogs)", xlab = "Covered by private insurance", main = "privins") plot(clog(ofp) ~ cfac(hosp, c(0:2, 8)), data = dt, ylab = "Physician office visits (in clogs)", xlab = "Number of hospital stays", main = "hosp") plot(clog(ofp) ~ gender, data = dt, varwidth = TRUE, ylab = "Physician office visits (in clogs)", xlab = "Gender", main = "gender") plot(cfac(ofp, c(0:2, 4, 6, 10, 100)) ~ school, data = dt, breaks = 9, ylab = "Physician office visits (number of visits)", xlab = "Number of years of education", main = "school") # # ############## ## MODELING ## ############## # # FULL MODEL ANALYSIS (step) # fm1 <- glm(ofp ~ ., data = DebTrivedi, family = poisson); summary(fm1) fm2 <- glm(ofp ~ .-faminc, data = DebTrivedi, family = poisson); summary(fm2) fm3 <- glm(ofp ~ .-faminc-emer, data = DebTrivedi, family = poisson); summary(fm3) fm4 <- glm(ofp ~ .-faminc-emer-opp, data = DebTrivedi, family = poisson); summary(fm4) fm5 <- glm(ofp ~ .-faminc-emer-opp-region, data = DebTrivedi, family = poisson); summary(fm5) # # RESTRICTED MODEL # ## Poisson regression fm_pois <- glm(ofp ~ ., data = dt, family = poisson) summary(fm_pois) ## adjusted Poisson regression coeftest(fm_pois, vcov = sandwich) ## quasi-Poisson regression (assumes that the mean is not equal to the variance, actually Var>Mean) fm_qpois <- glm(ofp ~ ., data = dt, family = quasipoisson) summary(fm_qpois) ## negative binomial regression fm_nbin <- glm.nb(ofp ~ ., data = dt) summary(fm_nbin) ## hurdle NB regression fm_hurdle0 <- hurdle(ofp ~ ., data = dt, dist = "negbin") summary(fm_hurdle0) fm_hurdle <- hurdle(ofp ~ . | hosp + numchron + privins + school + gender, data = dt, dist = "negbin") waldtest(fm_hurdle0, fm_hurdle) lrtest(fm_hurdle0, fm_hurdle) ## zero-inflated NB regression fm_zinb0 <- zeroinfl(ofp ~ ., data = dt, dist = "negbin") fm_zinb <- zeroinfl(ofp ~ . | hosp + numchron + privins + school + gender, data = dt, dist = "negbin") waldtest(fm_zinb0, fm_zinb) summary(fm_zinb) ###################### ## MODELING SUMMARY ## ###################### ## coefficients fm <- list("ML-Pois" = fm_pois, "Quasi-Pois" = fm_qpois, "NB" = fm_nbin, "Hurdle-NB" = fm_hurdle, "ZINB" = fm_zinb) sapply(fm, function(x) coef(x)[1:8]) # ## standard errors cbind("ML-Pois" = sqrt(diag(vcov(fm_pois))), "Adj-Pois" = sqrt(diag(sandwich(fm_pois))), sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8])) # ## log-likelihood rbind(logLik = sapply(fm, function(x) round(logLik(x), digits = 0)), Df = sapply(fm, function(x) attr(logLik(x), "df"))) # ## zero-counts round(c("Obs" = sum(dt$ofp < 1), "ML-Pois" = sum(dpois(0, fitted(fm_pois))), "NB" = sum(dnbinom(0, mu = fitted(fm_nbin), size = fm_nbin$theta)), "NB-Hurdle" = sum(predict(fm_hurdle, type = "prob")[,1]), "ZINB" = sum(predict(fm_zinb, type = "prob")[,1]))) # ## zero model coefficients t(sapply(fm[4:5], function(x) round(x$coefficients$zero, digits = 3))) # ################################### ## CAMERON & TRIVEDI REPLICATION ## ################################### # ## data dt2 <- DebTrivedi[, -(2:6)] dt2$region <- relevel(dt2$region, "other") # ## model fm_hurdle2 <- hurdle(ofp ~ ., data = dt2, dist = "negbin") # ## summary cfz <- coef(fm_hurdle2, model = "zero") cfc <- coef(fm_hurdle2, model = "count") se <- sqrt(diag(sandwich(fm_hurdle2))) round(cbind(zero = cfz, zero_t = cfz/se[-seq(along = cfc)], count = cfc, count_t = cfc/se[seq(along = cfc)]), digits = 3)[c(3, 2, 4, 5, 7, 6, 8, 9:17, 1),] logLik(fm_hurdle2) 1/fm_hurdle2$theta