library(dplyr)
library(pROC)

Generate artifical dataset

set.seed(1234)

# number of people
n <- 1000

age <- sample(18:85, size=n, replace=TRUE)
edu <- sample(c("high school", "college", "university"), size=n, replace=TRUE)
sex <- sample(c("male", "female"), size=n, replace=TRUE)
res <- runif(n)*0.95 + age/85*0.05 > 0.95

data <- data.frame(age, sex, edu, res)

Short overview

summary(data)
##       age            sex               edu         res         
##  Min.   :18.00   female:515   college    :350   Mode :logical  
##  1st Qu.:35.00   male  :485   high school:344   FALSE:972      
##  Median :52.00                university :306   TRUE :28       
##  Mean   :51.99                                                 
##  3rd Qu.:69.00                                                 
##  Max.   :85.00
summary(glm(res ~ ., data=data, family=binomial))
## 
## Call:
## glm(formula = res ~ ., family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3381  -0.2728  -0.2254  -0.1858   2.8822  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.88685    0.71676  -6.818 9.23e-12 ***
## age             0.02211    0.01022   2.163   0.0305 *  
## sexmale         0.06268    0.38466   0.163   0.8706    
## eduhigh school  0.11152    0.46708   0.239   0.8113    
## eduuniversity   0.09909    0.47982   0.207   0.8364    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 255.44  on 999  degrees of freedom
## Residual deviance: 250.37  on 995  degrees of freedom
## AIC: 260.37
## 
## Number of Fisher Scoring iterations: 6

Resample to balance classes

pos <- which(data$res == TRUE)
diff <- nrow(data) - 2*length(pos)

upsample <- sample(pos, size=diff, replace=TRUE)

data.upsampled <- rbind(data, data[upsample,])

Generate model and a short summary

model <- glm(res ~ ., data=data.upsampled, family=binomial)
summary(model)
## 
## Call:
## glm(formula = res ~ ., family = binomial, data = data.upsampled)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.57489  -1.12560   0.06165   1.14330   1.49921  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.617702   0.175712  -9.207  < 2e-16 ***
## age             0.025623   0.002620   9.781  < 2e-16 ***
## sexmale         0.008431   0.094359   0.089  0.92880    
## eduhigh school  0.212338   0.114104   1.861  0.06276 .  
## eduuniversity   0.329840   0.116985   2.820  0.00481 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2695.0  on 1943  degrees of freedom
## Residual deviance: 2588.8  on 1939  degrees of freedom
## AIC: 2598.8
## 
## Number of Fisher Scoring iterations: 4
# investigate the "education" - factor
table(data.upsampled$edu, data.upsampled$res)
##              
##               FALSE TRUE
##   college       341  300
##   high school   334  340
##   university    297  332
table(data$edu, data$res)
##              
##               FALSE TRUE
##   college       341    9
##   high school   334   10
##   university    297    9