#DV: STATUS #IV: age, length of service, gender,Business unit
termination<-read.csv("MFG10YearTerminationData.csv")
unique(termination$STATUS)
## [1] "ACTIVE" "TERMINATED"
termination$STATUS<-ifelse(termination$STATUS=="TERMINATED", 1, 0)
data_term<-termination %>%
mutate(recorddate_key = mdy(str_sub(recorddate_key,1,-6))) %>%
arrange(EmployeeID,STATUS,desc(recorddate_key)) %>%
group_by(EmployeeID,STATUS) %>%
summarise(age = max(age),
length_of_service = max(length_of_service),
gender = unique(gender_full)[1],
BUSINESS_UNIT = unique(BUSINESS_UNIT)[1]
)
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by EmployeeID and STATUS.
## ℹ Output is grouped by EmployeeID.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(EmployeeID, STATUS))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
set.seed(2026)
train_id<-sample(nrow(data_term),round(nrow(data_term)*.7,0))
trainData<-data_term[train_id,]
testData<-data_term[-train_id,]
model<-glm(STATUS~age+length_of_service+gender+BUSINESS_UNIT,data = data_term,family = binomial)
summary(model)
##
## Call:
## glm(formula = STATUS ~ age + length_of_service + gender + BUSINESS_UNIT,
## family = binomial, data = data_term)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.368435 0.222798 -6.142 0.000000000815 ***
## age 0.104130 0.003086 33.739 < 0.0000000000000002 ***
## length_of_service -0.234562 0.007962 -29.460 < 0.0000000000000002 ***
## genderMale -0.212242 0.068181 -3.113 0.00185 **
## BUSINESS_UNITSTORES -2.087879 0.181983 -11.473 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7518.0 on 7622 degrees of freedom
## Residual deviance: 6104.4 on 7618 degrees of freedom
## AIC: 6114.4
##
## Number of Fisher Scoring iterations: 5
Here, Estimate = Log Odds, Odds Ratio = exp(Estimate)
exp(coef(model))
## (Intercept) age length_of_service genderMale
## 0.2545049 1.1097449 0.7909170 0.8087691
## BUSINESS_UNITSTORES
## 0.1239498
Here, the value of odds ratio for age is 1.11 which indicates that for per unit increment of age, the probability of termination will be significantly increased by (1.11-1)x100% = 11%.
Here, the value of odds ratio for length_of_service is 0.79 which indicates that for per unit increment of length_of_service, the probability of termination will be significantly decreased by (1-0.79)x100% = 21%.
Here, the value of odds ratio for male is 0.81 which indicates that for male employees, the probability of termination will be significantly decreased by (1-0.81)x100% = 19% compared to female employees.
pred_prob<-predict(model,testData,type = "response")
pred_cat<-ifelse(pred_prob>0.5,1,0)
confusionMatrix(factor(pred_cat),factor(testData$STATUS),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1660 264
## 1 156 207
##
## Accuracy : 0.8164
## 95% CI : (0.7999, 0.832)
## No Information Rate : 0.7941
## P-Value [Acc > NIR] : 0.004115
##
## Kappa : 0.3864
##
## Mcnemar's Test P-Value : 0.0000001779
##
## Sensitivity : 0.43949
## Specificity : 0.91410
## Pos Pred Value : 0.57025
## Neg Pred Value : 0.86279
## Prevalence : 0.20595
## Detection Rate : 0.09051
## Detection Prevalence : 0.15872
## Balanced Accuracy : 0.67679
##
## 'Positive' Class : 1
##
Accuracy : (1660+207)/2287 = 0.8164 (81.64%) Sensitivity (True Positive/ Correctly Predicated Termination): 207/(264+207) = 0.4395 (43.95%) Specificity (True Negative/ Correctly Predicted Active): 1660/(1660+156) = 0.9141 (91.41%)
roc_ob<-roc(testData$STATUS, pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_ob,main = "ROC Curve",col = "darkblue")
auc(roc_ob)
## Area under the curve: 0.812