library(dplyr)
library(pROC)
Generate artifical dataset
set.seed(1234)
# number of people
n <- 1000
age <- sample(18:85, size=n, replace=TRUE)
edu <- sample(c("high school", "college", "university"), size=n, replace=TRUE)
sex <- sample(c("male", "female"), size=n, replace=TRUE)
res <- runif(n)*0.95 + age/85*0.05 > 0.95
data <- data.frame(age, sex, edu, res)
Short overview
summary(data)
## age sex edu res
## Min. :18.00 female:515 college :350 Mode :logical
## 1st Qu.:35.00 male :485 high school:344 FALSE:972
## Median :52.00 university :306 TRUE :28
## Mean :51.99
## 3rd Qu.:69.00
## Max. :85.00
summary(glm(res ~ ., data=data, family=binomial))
##
## Call:
## glm(formula = res ~ ., family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3381 -0.2728 -0.2254 -0.1858 2.8822
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.88685 0.71676 -6.818 9.23e-12 ***
## age 0.02211 0.01022 2.163 0.0305 *
## sexmale 0.06268 0.38466 0.163 0.8706
## eduhigh school 0.11152 0.46708 0.239 0.8113
## eduuniversity 0.09909 0.47982 0.207 0.8364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 255.44 on 999 degrees of freedom
## Residual deviance: 250.37 on 995 degrees of freedom
## AIC: 260.37
##
## Number of Fisher Scoring iterations: 6
Resample to balance classes
pos <- which(data$res == TRUE)
diff <- nrow(data) - 2*length(pos)
upsample <- sample(pos, size=diff, replace=TRUE)
data.upsampled <- rbind(data, data[upsample,])
Generate model and a short summary
model <- glm(res ~ ., data=data.upsampled, family=binomial)
summary(model)
##
## Call:
## glm(formula = res ~ ., family = binomial, data = data.upsampled)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.57489 -1.12560 0.06165 1.14330 1.49921
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.617702 0.175712 -9.207 < 2e-16 ***
## age 0.025623 0.002620 9.781 < 2e-16 ***
## sexmale 0.008431 0.094359 0.089 0.92880
## eduhigh school 0.212338 0.114104 1.861 0.06276 .
## eduuniversity 0.329840 0.116985 2.820 0.00481 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2695.0 on 1943 degrees of freedom
## Residual deviance: 2588.8 on 1939 degrees of freedom
## AIC: 2598.8
##
## Number of Fisher Scoring iterations: 4
# investigate the "education" - factor
table(data.upsampled$edu, data.upsampled$res)
##
## FALSE TRUE
## college 341 300
## high school 334 340
## university 297 332
table(data$edu, data$res)
##
## FALSE TRUE
## college 341 9
## high school 334 10
## university 297 9