Employee Churn Prediction Analysis¶

Author: Allan Almaraz
Course: BANA 273 - Machine Learning
Institution: University of California, Irvine
Date: December 2025


Project Overview¶

This analysis develops machine learning models to predict employee churn using logistic regression and decision tree classifiers. The primary goal is to identify at-risk employees with high recall to enable proactive retention strategies.

Key Metrics:

  • Dataset: 4,653 employees
  • Best Model: Tuned Decision Tree (79% recall)
  • Primary Drivers: Tenure, Compensation Tier, Education Level

1. Setup and Data Import¶

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
df.head()
Out[2]:
education joiningyear city paymenttier age male everbenched experienceincurrentdomain leaveornot
0 0 2017 0 0 34 1 0 0 0
1 0 2013 1 2 28 0 0 3 1
2 0 2014 2 0 38 0 0 2 0
3 1 2016 0 0 27 1 0 5 1
4 1 2017 1 0 24 1 1 2 1

2. Exploratory Data Analysis¶

In [3]:
print(df.shape, '\n')
print(df.dtypes)
display(df.describe().T.round(2))
(4653, 9) 

education                    int64
joiningyear                  int64
city                         int64
paymenttier                  int64
age                          int64
male                         int64
everbenched                  int64
experienceincurrentdomain    int64
leaveornot                   int64
dtype: object
count mean std min 25% 50% 75% max
education 4653.0 0.26 0.52 0.0 0.0 0.0 0.0 2.0
joiningyear 4653.0 2015.06 1.86 2012.0 2013.0 2015.0 2017.0 2018.0
city 4653.0 0.77 0.82 0.0 0.0 1.0 1.0 2.0
paymenttier 4653.0 0.30 0.56 0.0 0.0 0.0 0.0 2.0
age 4653.0 29.39 4.83 22.0 26.0 28.0 32.0 41.0
male 4653.0 0.60 0.49 0.0 0.0 1.0 1.0 1.0
everbenched 4653.0 0.10 0.30 0.0 0.0 0.0 0.0 1.0
experienceincurrentdomain 4653.0 2.91 1.56 0.0 2.0 3.0 4.0 7.0
leaveornot 4653.0 0.34 0.48 0.0 0.0 0.0 1.0 1.0
In [7]:
import matplotlib.pyplot as plt

bar_color = "#4C72B0"
bar_edge = "black"
bar_edge_width = 0.5


# Percent Labels
def add_percent_labels(ax, total):
    for p in ax.patches:
        height = p.get_height()
        ax.annotate(f"{height/total*100:.1f}%", (p.get_x() + p.get_width()/2, height), ha='center', va='bottom', fontsize=9)


# DV: leaveornot
plt.figure(figsize=(7,4))
counts = df['leaveornot'].value_counts().sort_index()

ax = counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width)

plt.title("LeaveOrNot Distribution")
plt.xlabel("Stay (0) / Leave (1)")
plt.ylabel("Count")
plt.xticks(rotation=0)

add_percent_labels(ax, len(df))
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
#plt.savefig("churn_distribution.png", dpi=300, bbox_inches="tight")
#plt.show()
No description has been provided for this image
In [ ]:
# IVs Subplot

edu_map = {0: "Bachelors", 1: "Masters", 2: "PhD"}
df['education_label'] = df['education'].map(edu_map)
tier_map = {0: "Low Pay", 1: "Mid Pay", 2: "High Pay"}
df['payment_label'] = df['paymenttier'].map(tier_map)
city_map = {0: "City 0", 1: "City 1", 2: "City 2"}
df['city_label'] = df['city'].map(city_map)

fig, axes = plt.subplots(2, 3, figsize=(16, 8))
axes = axes.flatten()
total_n = len(df)

# Age
ax = axes[0]
ax.hist(df['age'], bins=15, color=bar_color, edgecolor=bar_edge)
ax.set_title("Age Distribution (Histogram)")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
ax.grid(alpha=0.3)

# Gender
ax = axes[1]
gender_counts = df['male'].value_counts().sort_index()
gender_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Gender Distribution")
ax.set_xlabel("Male (1) / Female (0)")
ax.set_ylabel("Count")
ax.set_xticklabels(["Female (0)", "Male (1)"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Education
ax = axes[2]
edu_counts = df['education_label'].value_counts().loc[["Bachelors","Masters","PhD"]]
edu_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Education Level")
ax.set_xlabel("Education")
ax.set_ylabel("Count")
ax.set_xticklabels(["Bachelors", "Masters", "PhD"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Ever Benched
ax = axes[3]
bench_counts = df['everbenched'].value_counts().sort_index()
bench_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Ever Benched")
ax.set_xlabel("No (0) / Yes (1)")
ax.set_ylabel("Count")
ax.set_xticklabels(["No (0)", "Yes (1)"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# City
ax = axes[4]
city_counts = df['city_label'].value_counts().sort_index()
city_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("City Distribution")
ax.set_xlabel("City")
ax.set_ylabel("Count")
ax.set_xticklabels(city_counts.index, rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Payment Tier
ax = axes[5]
pay_counts = df['payment_label'].value_counts().loc[["Low Pay","Mid Pay","High Pay"]]
pay_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Payment Tier")
ax.set_xlabel("Tier")
ax.set_ylabel("Count")
ax.set_xticklabels(["Low Pay", "Mid Pay", "High Pay"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()


# Saving Figure
fig.savefig("descriptive_attributes_subplot.png", dpi=300, bbox_inches="tight")
plt.show()
No description has been provided for this image
In [9]:
# Correlation Heatmap

import seaborn as sns
numeric_cols = ['joiningyear', 'paymenttier', 'age', 'experienceincurrentdomain']

corr_df = df[numeric_cols + ['leaveornot']].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_df, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation Heatmap: Numeric Features & LeaveOrNot")
plt.tight_layout()

# Save Figure
#plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches="tight")
#plt.show()
No description has been provided for this image
In [10]:
# IVs for Each DV

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Make sure DV is numeric 0/1
df["leaveornot"] = df["leaveornot"].astype(int)

# Label mappings
edu_map    = {0: "Bachelors", 1: "Masters", 2: "PhD"}
city_map   = {0: "Bangalore", 1: "Pune", 2: "New Delhi"}
tier_map   = {0: "Low Pay", 1: "Mid Pay", 2: "High Pay"}
gender_map = {0: "Female", 1: "Male"}
bench_map  = {0: "No", 1: "Yes"}

df["education_label"] = df["education"].map(edu_map)
df["city_label"]      = df["city"].map(city_map)
df["payment_label"]   = df["paymenttier"].map(tier_map)
df["gender_label"]    = df["male"].map(gender_map)
df["benched_label"]   = df["everbenched"].map(bench_map)

# Colors
BLUE   = "#4C72B0"   # Stay (0)
ORANGE = "#DD8452"   # Leave (1)


# Grouped IV Visualizations for Churned and Non-Churned

def grouped_bar_with_percent(ax, df, feature, title):
    """
    Plot grouped bar chart (counts on y-axis) with percent labels above each bar.
    Percentages are computed within each category of `feature`,
    so that Stay% + Leave% = 100% for each group.
    """
    # Contingency table: rows = feature categories, cols = leaveornot (0=stay,1=leave)
    ct = pd.crosstab(df[feature], df["leaveornot"]).reindex(columns=[0, 1], fill_value=0)

    # Percentages within each category
    pct = ct.div(ct.sum(axis=1), axis=0)

    categories = ct.index.tolist()
    x = np.arange(len(categories))
    width = 0.4

    # Bars for Stay (0) and Leave (1)
    bars_stay = ax.bar(x - width/2, ct[0], width, label="Stay (0)", color=BLUE, edgecolor="black", linewidth=0.5)
    bars_leave = ax.bar(x + width/2, ct[1], width, label="Leave (1)", color=ORANGE, edgecolor="black", linewidth=0.5)

    ax.set_title(title)
    ax.set_xlabel(feature.replace("_", " ").title())
    ax.set_ylabel("Count")
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.grid(axis="y", alpha=0.3)
    ax.legend(title="LeaveOrNot")

    # Headroom for labels
    max_height = max(max(ct[0]), max(ct[1]))
    ax.set_ylim(0, max_height * 1.15)

    # Add percentage labels above each bar
    for i, cat in enumerate(categories):
        h0 = bars_stay[i].get_height()
        p0 = pct.loc[cat, 0] * 100
        ax.annotate(f"{p0:.1f}%", (bars_stay[i].get_x() + bars_stay[i].get_width()/2., h0), ha="center", va="bottom", fontsize=10)

        # Leave
        h1 = bars_leave[i].get_height()
        p1 = pct.loc[cat, 1] * 100
        ax.annotate(f"{p1:.1f}%", (bars_leave[i].get_x() + bars_leave[i].get_width()/2., h1), ha="center", va="bottom", fontsize=10)


# 3×3 Figure
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()

# 1) Age boxplot
sns.boxplot(data=df, x="leaveornot", y="age", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[0])
axes[0].set_title("Age by LeaveOrNot")
axes[0].set_xlabel("LeaveOrNot (0 = Stay, 1 = Leave)")
axes[0].set_ylabel("Age")
axes[0].grid(axis="y", alpha=0.3)

# 2) Experience boxplot
sns.boxplot(data=df, x="leaveornot", y="experienceincurrentdomain", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[1])
axes[1].set_title("Experience in Current Domain by LeaveOrNot")
axes[1].set_xlabel("LeaveOrNot (0 = Stay, 1 = Leave)")
axes[1].set_ylabel("Years of Experience")
axes[1].grid(axis="y", alpha=0.3)

# 3) Payment Tier
grouped_bar_with_percent(axes[2], df, "payment_label", "Payment Tier vs LeaveOrNot")

# 4) Gender
grouped_bar_with_percent(axes[3], df, "gender_label", "Gender vs LeaveOrNot")

# 5) Education
grouped_bar_with_percent(axes[4], df, "education_label", "Education vs LeaveOrNot")

# 6) Ever Benched
grouped_bar_with_percent(axes[5], df, "benched_label", "Ever Benched vs LeaveOrNot")

# 7) City
grouped_bar_with_percent(axes[6], df, "city_label", "City vs LeaveOrNot")

# Turn off unused subplots
axes[7].axis("off")
axes[8].axis("off")


plt.tight_layout()
#fig.savefig("features_vs_churn_overview_grouped.png", dpi=300, bbox_inches="tight")
#plt.show()
/var/folders/qv/_ft96x6x6h1_2t8_53rz9xp00000gn/T/ipykernel_51215/3613661694.py:80: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="leaveornot", y="age", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[0])
/var/folders/qv/_ft96x6x6h1_2t8_53rz9xp00000gn/T/ipykernel_51215/3613661694.py:87: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="leaveornot", y="experienceincurrentdomain", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[1])
No description has been provided for this image

3. Which Models Should we Focus on?¶

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split,StratifiedKFold,GridSearchCV,cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
pd.set_option("display.max_colwidth", None)

RANDOM_STATE = 42

# Load and prep data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)
X_raw = df.drop(columns=["leaveornot"])
cat_cols = ["education", "city", "paymenttier"]

# One-hot encode categoricals (drop first category as reference)
X_cat = pd.get_dummies(X_raw[cat_cols].astype("category"),prefix=cat_cols,drop_first=True)

# Keep numeric / already-binary columns as they are
X_num = X_raw.drop(columns=cat_cols)

# Final feature matrix used by all models
X = pd.concat([X_num, X_cat], axis=1)

# Train–test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# CV setup + recall scorer
recall_scorer = make_scorer(recall_score, pos_label=1)

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=RANDOM_STATE)

# Models + hyperparameter grids
models = {"Logistic Regression": {"estimator": LogisticRegression(max_iter=1000,class_weight="balanced",solver="liblinear"),
                                  "param_grid": {"C": [0.01, 0.1, 1, 10],"penalty": ["l1", "l2"]},"key_param": "C"},
          
          "Decision Tree": {"estimator": DecisionTreeClassifier(random_state=RANDOM_STATE,class_weight="balanced"),
                            "param_grid": {"max_depth": [3, 4, 6, 8],"min_samples_split": [5, 10, 20],"min_samples_leaf": [2, 5, 10]},
                            "key_param": "max_depth"},
          
          "Random Forest": {"estimator": RandomForestClassifier(random_state=RANDOM_STATE,class_weight="balanced",n_jobs=-1),
                            "param_grid": {"n_estimators": [100, 200],"max_depth": [4, 6, 8],"min_samples_split": [5, 10],
                                           "min_samples_leaf": [2, 5]},"key_param": "n_estimators"}}

#GridSearchCV + evaluation loop (recall only)
results = []
best_estimators = {}

for name, cfg in models.items():
    print(f"\n==== Fitting {name} ====")

    grid = GridSearchCV(estimator=cfg["estimator"],param_grid=cfg["param_grid"],scoring=recall_scorer,cv=cv,n_jobs=-1,verbose=1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    best_estimators[name] = best_model

    # 5-fold CV recall (from GridSearch best score)
    cv_recall = grid.best_score_

    # Test set predictions (threshold 0.50)
    y_pred = best_model.predict(X_test)
    test_recall = recall_score(y_test, y_pred, pos_label=1)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix – {name}:")
    print(cm)
    print("Test Recall (leave=1):", round(test_recall, 2))

    results.append({"Model": name,"Best_Params": grid.best_params_,"Key_Tunable_Param": cfg["key_param"],
                    "CV Recall": cv_recall,"Test Recall": test_recall})

# Comparison table (recall-focused)
results_df = pd.DataFrame(results)

for col in ["CV Recall", "Test Recall"]:
    results_df[col] = results_df[col].round(2)

results_df = results_df[["Model", "Best_Params", "Key_Tunable_Param","CV Recall", "Test Recall"]]
results_df = results_df.sort_values(by="CV Recall",ascending=False).reset_index(drop=True)

print("\n=== Model Comparison (Employment Churn, recall-focused) ===")
display(results_df)
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
print(f"\nRecommended model (by 5-Fold CV Recall): {best_model_name}")
==== Fitting Logistic Regression ====
Fitting 5 folds for each of 8 candidates, totalling 40 fits
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
Confusion Matrix – Logistic Regression:
[[666 250]
 [177 303]]
Test Recall (leave=1): 0.63

==== Fitting Decision Tree ====
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Confusion Matrix – Decision Tree:
[[823  93]
 [145 335]]
Test Recall (leave=1): 0.7

==== Fitting Random Forest ====
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Confusion Matrix – Random Forest:
[[811 105]
 [133 347]]
Test Recall (leave=1): 0.72

=== Model Comparison (Employment Churn, recall-focused) ===
Model Best_Params Key_Tunable_Param CV Recall Test Recall
0 Random Forest {'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100} n_estimators 0.73 0.72
1 Decision Tree {'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 5} max_depth 0.72 0.70
2 Logistic Regression {'C': 1, 'penalty': 'l1'} C 0.64 0.63
Recommended model (by 5-Fold CV Recall): Random Forest

4. Decision Tree Models¶

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (confusion_matrix,classification_report,make_scorer,recall_score)

RANDOM_STATE = 42

# Load data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
X = df.drop(columns=["leaveornot"])
y = df["leaveornot"].astype(int)   # 1 = leaves, 0 = stays
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# Basic Decision Tree (entropy)
dt = DecisionTreeClassifier(criterion="entropy",random_state=RANDOM_STATE)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Confusion matrix 
cm = confusion_matrix(y_test, y_pred_dt)
cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print("Basic Decision Tree:\n")
display(cm_df)
print() 

# Classification report
print(classification_report(y_test, y_pred_dt, target_names=["Stay", "Churn"]))

# 5-Fold CV on training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall = cross_val_score(dt,X_train,y_train,cv=cv,scoring=recall_scorer)
print("\n5-Fold CV Recall (Churn = 1):", np.round(cv_recall, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall.mean(), 3), "\n")

# Visualize top levels of the tree
plt.figure(figsize=(18, 10))
plot_tree(dt,feature_names=X.columns,class_names=["Stay", "Leave"],filled=True,rounded=True,fontsize=10,max_depth=2)
plt.title("Top Levels of Decision Tree (Employment Churn, Entropy Criterion)")
plt.show()
Basic Decision Tree:

Predicted Stay (0) Predicted Churn (1)
Actual Stay (0) 814 102
Actual Churn (1) 175 305
              precision    recall  f1-score   support

        Stay       0.82      0.89      0.85       916
       Churn       0.75      0.64      0.69       480

    accuracy                           0.80      1396
   macro avg       0.79      0.76      0.77      1396
weighted avg       0.80      0.80      0.80      1396


5-Fold CV Recall (Churn = 1): [0.696 0.701 0.634 0.629 0.688]
Mean CV Recall (Churn = 1): 0.67 

No description has been provided for this image
In [14]:
import pandas as pd
import numpy as np

# Extract feature importances from your fitted decision tree
importances = dt.feature_importances_

# Build table
dt_importance_table = pd.DataFrame({"Feature": X_train.columns,"Information_Gain": importances})

# Improved Impact Labeling (3-tier system)
def dt_impact(val):
    if val > 0.15:
        return "Key driver of churn"
    elif val > 0.03:
        return "Secondary driver"
    else:
        return "Minimal / no impact"

dt_importance_table["Impact"] = dt_importance_table["Information_Gain"].apply(dt_impact)

# Sort by importance, descending
dt_importance_table = dt_importance_table.sort_values(by="Information_Gain", ascending=False)

# Reset index for clean display
dt_importance_table.reset_index(drop=True, inplace=True)

# Display table (rounded)
dt_importance_table.round(2)
Out[14]:
Feature Information_Gain Impact
0 joiningyear 0.31 Key driver of churn
1 age 0.18 Key driver of churn
2 paymenttier 0.13 Secondary driver
3 experienceincurrentdomain 0.10 Secondary driver
4 city 0.10 Secondary driver
5 education 0.08 Secondary driver
6 male 0.06 Secondary driver
7 everbenched 0.03 Minimal / no impact
Exception ignored in: <function ResourceTracker.__del__ at 0x10375dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102a89bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107051bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105071bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1029f9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102c1dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106d71bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10283dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes

Improved Decision Tree¶

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (confusion_matrix,classification_report,make_scorer,recall_score)

RANDOM_STATE = 42

# Load data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
X = df.drop(columns=["leaveornot"])
y = df["leaveornot"].astype(int)   # 1 = leaves, 0 = stays
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# 2. Tuned Decision Tree (entropy, balanced)
dt_best = DecisionTreeClassifier(random_state=RANDOM_STATE,class_weight="balanced",max_depth=8,min_samples_leaf=10,
                                 min_samples_split=5,criterion="entropy")
dt_best.fit(X_train, y_train)

# Predictions with custom probability threshold
custom_threshold = 0.35   # adjust as needed
y_proba = dt_best.predict_proba(X_test)[:, 1]
y_pred_adj = (y_proba >= custom_threshold).astype(int)

# Confusion matrix (pretty table)
cm_adj = confusion_matrix(y_test, y_pred_adj)
cm_df_adj = pd.DataFrame(cm_adj,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print(f"\nTuned Decision Tree (Adjusted Threshold = {custom_threshold}):")
display(cm_df_adj)
print() 

# Classification report
print(classification_report(y_test, y_pred_adj, target_names=["Stay", "Churn"]))

# 5-Fold CV Recall on training data (standard 0.50 threshold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall_scores = cross_val_score(dt_best,X_train,y_train,cv=cv,scoring=recall_scorer)
print("\nCV Recall (Churn = 1, standard threshold):", np.round(cv_recall_scores, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall_scores.mean(), 3), "\n")

# Visualize top levels of the tuned tree
plt.figure(figsize=(18, 10))
plot_tree(dt_best,feature_names=X.columns,class_names=["Stay", "Leave"],filled=True,rounded=True,fontsize=10,max_depth=2)
plt.title("Top Levels of Tuned Decision Tree (Employment Churn)")
plt.show()
Tuned Decision Tree (Adjusted Threshold = 0.35):
Predicted Stay (0) Predicted Churn (1)
Actual Stay (0) 687 229
Actual Churn (1) 102 378
              precision    recall  f1-score   support

        Stay       0.87      0.75      0.81       916
       Churn       0.62      0.79      0.70       480

    accuracy                           0.76      1396
   macro avg       0.75      0.77      0.75      1396
weighted avg       0.79      0.76      0.77      1396


CV Recall (Churn = 1, standard threshold): [0.728 0.763 0.75  0.692 0.701]
Mean CV Recall (Churn = 1): 0.727 

No description has been provided for this image
In [17]:
#Feature Importance Table for Tuned DT Model

import pandas as pd
import numpy as np

# Extract feature importances from tuned decision tree
importances_best = dt_best.feature_importances_

# Build dataframe
dt_best_importance_table = pd.DataFrame({"Feature": X_train.columns,"Information_Gain": importances_best})

# 3-Tier impact logic
def dt_best_impact(val):
    if val > 0.15:
        return "Key driver of churn"
    elif val > 0.03:
        return "Secondary driver"
    else:
        return "Minimal / no impact"

dt_best_importance_table["Impact"] = dt_best_importance_table["Information_Gain"].apply(dt_best_impact)

# Sort results by information gain
dt_best_importance_table = dt_best_importance_table.sort_values(by="Information_Gain", ascending=False)

# Reset index for clean display
dt_best_importance_table.reset_index(drop=True, inplace=True)

# Display table rounded to 3 decimals
dt_best_importance_table.round(2)
Out[17]:
Feature Information_Gain Impact
0 joiningyear 0.36 Key driver of churn
1 paymenttier 0.21 Key driver of churn
2 city 0.13 Secondary driver
3 education 0.12 Secondary driver
4 male 0.07 Secondary driver
5 age 0.05 Secondary driver
6 experienceincurrentdomain 0.04 Secondary driver
7 everbenched 0.02 Minimal / no impact

5. Logit Regression Models¶

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, recall_score, make_scorer

RANDOM_STATE = 42

df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)

# 2. Create dummy variables for categorical features
#    education:  0=Bachelors, 1=Masters, 2=PhD
#    city:       0=Bangalore, 1=Pune, 2=New Delhi
#    paymenttier:0=Low, 1=Mid, 2=High

X_raw = df.drop(columns=["leaveornot"])
X = pd.get_dummies(X_raw,columns=["education", "city", "paymenttier"],drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42,stratify=y)

# Basic logistic regression
logit = LogisticRegression(max_iter=500)
logit.fit(X_train, y_train)

y_pred = logit.predict(X_test)

# Confusion matrix 
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print("Basic Logistic Regression:")
display(cm_df)
print()   

# Classification report
print(classification_report(y_test, y_pred, target_names=["Stay", "Churn"]))

# 5-Fold CV Recall (churn = 1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall_scores = cross_val_score(logit,X_train,y_train,cv=cv,scoring=recall_scorer)

print("\n5-Fold CV Recall (Churn = 1):", np.round(cv_recall_scores, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall_scores.mean(), 3), '\n')
Basic Logistic Regression:
Predicted Stay (0) Predicted Churn (1)
Actual Stay (0) 828 88
Actual Churn (1) 284 196
              precision    recall  f1-score   support

        Stay       0.74      0.90      0.82       916
       Churn       0.69      0.41      0.51       480

    accuracy                           0.73      1396
   macro avg       0.72      0.66      0.66      1396
weighted avg       0.73      0.73      0.71      1396

/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
5-Fold CV Recall (Churn = 1): [0.397 0.42  0.321 0.308 0.438]
Mean CV Recall (Churn = 1): 0.377 

/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [26]:
import numpy as np
import pandas as pd

# Extract coefficients from your model
coef = logit.coef_[0]

# Compute odds ratios
odds = np.exp(coef)

# Build dataframe
odds_table = pd.DataFrame({"Feature": X.columns,"Coefficient": coef,"Odds_Ratio": odds})

# 4. Determine impact
def get_impact(row):
    if row["Odds_Ratio"] > 1.05:
        return "Increases churn"
    elif row["Odds_Ratio"] < 0.95:
        return "Decreases churn"
    else:
        return "No effect"

odds_table["Impact"] = odds_table.apply(get_impact, axis=1)

# 5. Sort by strongest effect (furthest from 1)
odds_table["Effect_Size"] = abs(odds_table["Odds_Ratio"] - 1)
odds_table = odds_table.sort_values(by="Effect_Size", ascending=False)

# 6. Drop helper column for clean presentation
odds_table = odds_table[["Feature", "Coefficient", "Odds_Ratio", "Impact"]]

# 7. Display final table
odds_table.reset_index(drop=True, inplace=True)
odds_table.round(2)
Out[26]:
Feature Coefficient Odds_Ratio Impact
0 education_1 0.90 2.45 Increases churn
1 paymenttier_1 0.85 2.34 Increases churn
2 everbenched 0.59 1.81 Increases churn
3 city_1 0.52 1.68 Increases churn
4 male -0.87 0.42 Decreases churn
5 city_2 -0.62 0.54 Decreases churn
6 paymenttier_2 0.18 1.20 Increases churn
7 education_2 0.17 1.18 Increases churn
8 experienceincurrentdomain -0.03 0.97 No effect
9 age -0.03 0.97 No effect
10 joiningyear 0.00 1.00 No effect

Improved Logit Regression Model¶

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split,StratifiedKFold,GridSearchCV,cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix

RANDOM_STATE = 42

# import data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)

# Create dummy variables for categorical features
#    education:  0=Bachelors, 1=Masters, 2=PhD
#    city:       0=Bangalore, 1=Pune, 2=New Delhi
#    paymenttier:0=Low, 1=Mid, 2=High

X_raw = df.drop(columns=["leaveornot"])
X = pd.get_dummies(X_raw,columns=["education", "city", "paymenttier"],drop_first=True)

# Train–test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

#Improved Logistic Regression (L1, balanced)
logit_best = LogisticRegression(max_iter=1000,class_weight="balanced",solver="liblinear",C=0.1,penalty="l1")

logit_best.fit(X_train, y_train)

# Standard prediction using default 0.50 threshold
y_pred = logit_best.predict(X_test)

# Confusion matrix (pretty table)
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])
print("Improved Logistic Regression:\n")
display(cm_df)
print()  

# Classification report
print(classification_report(y_test, y_pred, target_names=["Stay", "Churn"]))

# 5-Fold CV on training data (Recall for churn = 1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall = cross_val_score(logit_best,X_train,y_train,cv=cv,scoring=recall_scorer)

print("\nCV Recall (Churn = 1):", np.round(cv_recall, 3))
print("Mean Recall (Churn = 1):", round(cv_recall.mean(), 3), "\n")
Improved Logistic Regression:

Predicted Stay (0) Predicted Churn (1)
Actual Stay (0) 671 245
Actual Churn (1) 179 301
              precision    recall  f1-score   support

        Stay       0.79      0.73      0.76       916
       Churn       0.55      0.63      0.59       480

    accuracy                           0.70      1396
   macro avg       0.67      0.68      0.67      1396
weighted avg       0.71      0.70      0.70      1396


CV Recall (Churn = 1): [0.652 0.625 0.638 0.571 0.661]
Mean Recall (Churn = 1): 0.629 

In [30]:
coef_best = logit_best.coef_[0]
odds_best = np.exp(coef_best)

odds_best_table = pd.DataFrame({"Feature": X_train.columns,"Coefficient": coef_best,"Odds_Ratio": odds_best})

# Rename dummy variables to human-readable labels
rename_map = { "education_1": "Masters vs Bachelors","education_2": "PhD vs Bachelors",
              "city_1": "Pune vs Bangalore","city_2": "New Delhi vs Bangalore",
              "paymenttier_1": "Mid Pay vs Low Pay","paymenttier_2": "High Pay vs Low Pay"}

odds_best_table["Feature"] = odds_best_table["Feature"].replace(rename_map)

# Label impact categories
def get_impact(row):
    if row["Odds_Ratio"] > 1.05:
        return "Increases churn"
    elif row["Odds_Ratio"] < 0.95:
        return "Decreases churn"
    else:
        return "No effect"

odds_best_table["Impact"] = odds_best_table.apply(get_impact, axis=1)

# Sort by effect size
odds_best_table["Effect_Size"] = abs(odds_best_table["Odds_Ratio"] - 1)
odds_best_table = odds_best_table.sort_values(by="Effect_Size", ascending=False)

# Final clean table
odds_best_table = odds_best_table[["Feature", "Coefficient", "Odds_Ratio", "Impact"]]
odds_best_table = odds_best_table.reset_index(drop=True)

print("\nFinal Odds Ratios Table:")
display(odds_best_table.round(3))
Final Odds Ratios Table:
Feature Coefficient Odds_Ratio Impact
0 Masters vs Bachelors 0.894 2.444 Increases churn
1 Mid Pay vs Low Pay 0.767 2.154 Increases churn
2 Pune vs Bangalore 0.461 1.585 Increases churn
3 male -0.795 0.452 Decreases churn
4 everbenched 0.413 1.511 Increases churn
5 New Delhi vs Bangalore -0.512 0.599 Decreases churn
6 experienceincurrentdomain -0.031 0.970 No effect
7 age -0.029 0.971 No effect
8 joiningyear 0.000 1.000 No effect
9 PhD vs Bachelors 0.000 1.000 No effect
10 High Pay vs Low Pay 0.000 1.000 No effect

6. Comparison¶

In [31]:
import numpy as np
import matplotlib.pyplot as plt

# Recall values from your models
recall_values = {"Logit": [0.41, 0.63],"Tree":  [0.64, 0.79] }
models = list(recall_values.keys())
base = [recall_values[m][0] for m in models]
improved = [recall_values[m][1] for m in models]

x = np.arange(len(models))  # positions for "Logit", "Tree"
width = 0.35

plt.figure(figsize=(10,6))

# Bars
bars1 = plt.bar(x - width/2, base, width,label="Base Model",color="#4e79a7",edgecolor="black",linewidth=0.7)
bars2 = plt.bar(x + width/2, improved, width,label="Improved Model",color="#59a14f",edgecolor="black",linewidth=0.7)

# Value labels above bars
for bars in [bars1, bars2]:
    for bar in bars:
        h = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2,h + 0.015,f"{h:.2f}",ha="center",fontsize=12)

# Formatting
plt.xticks(x, models, fontsize=12)
plt.ylim(0, 1.05)
plt.ylabel("Recall (Churn = 1)", fontsize=13)
plt.title("Recall Comparison: Logistic vs Tree Models (Base vs Improved)", fontsize=16)
plt.legend(fontsize=12)
plt.tight_layout()

# Save figure
#plt.savefig("recall_comparison_grouped.png", dpi=300, bbox_inches="tight")
#plt.show()
No description has been provided for this image
In [ ]: