import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
df.head()

print(df.shape, '\n')
print(df.dtypes)
display(df.describe().T.round(2))

(4653, 9) 

education                    int64
joiningyear                  int64
city                         int64
paymenttier                  int64
age                          int64
male                         int64
everbenched                  int64
experienceincurrentdomain    int64
leaveornot                   int64
dtype: object

import matplotlib.pyplot as plt

bar_color = "#4C72B0"
bar_edge = "black"
bar_edge_width = 0.5


# Percent Labels
def add_percent_labels(ax, total):
    for p in ax.patches:
        height = p.get_height()
        ax.annotate(f"{height/total*100:.1f}%", (p.get_x() + p.get_width()/2, height), ha='center', va='bottom', fontsize=9)


# DV: leaveornot
plt.figure(figsize=(7,4))
counts = df['leaveornot'].value_counts().sort_index()

ax = counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width)

plt.title("LeaveOrNot Distribution")
plt.xlabel("Stay (0) / Leave (1)")
plt.ylabel("Count")
plt.xticks(rotation=0)

add_percent_labels(ax, len(df))
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
#plt.savefig("churn_distribution.png", dpi=300, bbox_inches="tight")
#plt.show()

# IVs Subplot

edu_map = {0: "Bachelors", 1: "Masters", 2: "PhD"}
df['education_label'] = df['education'].map(edu_map)
tier_map = {0: "Low Pay", 1: "Mid Pay", 2: "High Pay"}
df['payment_label'] = df['paymenttier'].map(tier_map)
city_map = {0: "City 0", 1: "City 1", 2: "City 2"}
df['city_label'] = df['city'].map(city_map)

fig, axes = plt.subplots(2, 3, figsize=(16, 8))
axes = axes.flatten()
total_n = len(df)

# Age
ax = axes[0]
ax.hist(df['age'], bins=15, color=bar_color, edgecolor=bar_edge)
ax.set_title("Age Distribution (Histogram)")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
ax.grid(alpha=0.3)

# Gender
ax = axes[1]
gender_counts = df['male'].value_counts().sort_index()
gender_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Gender Distribution")
ax.set_xlabel("Male (1) / Female (0)")
ax.set_ylabel("Count")
ax.set_xticklabels(["Female (0)", "Male (1)"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Education
ax = axes[2]
edu_counts = df['education_label'].value_counts().loc[["Bachelors","Masters","PhD"]]
edu_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Education Level")
ax.set_xlabel("Education")
ax.set_ylabel("Count")
ax.set_xticklabels(["Bachelors", "Masters", "PhD"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Ever Benched
ax = axes[3]
bench_counts = df['everbenched'].value_counts().sort_index()
bench_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Ever Benched")
ax.set_xlabel("No (0) / Yes (1)")
ax.set_ylabel("Count")
ax.set_xticklabels(["No (0)", "Yes (1)"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# City
ax = axes[4]
city_counts = df['city_label'].value_counts().sort_index()
city_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("City Distribution")
ax.set_xlabel("City")
ax.set_ylabel("Count")
ax.set_xticklabels(city_counts.index, rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

# Payment Tier
ax = axes[5]
pay_counts = df['payment_label'].value_counts().loc[["Low Pay","Mid Pay","High Pay"]]
pay_counts.plot(kind='bar',color=bar_color,edgecolor=bar_edge,linewidth=bar_edge_width,ax=ax)
ax.set_title("Payment Tier")
ax.set_xlabel("Tier")
ax.set_ylabel("Count")
ax.set_xticklabels(["Low Pay", "Mid Pay", "High Pay"], rotation=0)
add_percent_labels(ax, total_n)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()


# Saving Figure
fig.savefig("descriptive_attributes_subplot.png", dpi=300, bbox_inches="tight")
plt.show()

# Correlation Heatmap

import seaborn as sns
numeric_cols = ['joiningyear', 'paymenttier', 'age', 'experienceincurrentdomain']

corr_df = df[numeric_cols + ['leaveornot']].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_df, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation Heatmap: Numeric Features & LeaveOrNot")
plt.tight_layout()

# Save Figure
#plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches="tight")
#plt.show()

# IVs for Each DV

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Make sure DV is numeric 0/1
df["leaveornot"] = df["leaveornot"].astype(int)

# Label mappings
edu_map    = {0: "Bachelors", 1: "Masters", 2: "PhD"}
city_map   = {0: "Bangalore", 1: "Pune", 2: "New Delhi"}
tier_map   = {0: "Low Pay", 1: "Mid Pay", 2: "High Pay"}
gender_map = {0: "Female", 1: "Male"}
bench_map  = {0: "No", 1: "Yes"}

df["education_label"] = df["education"].map(edu_map)
df["city_label"]      = df["city"].map(city_map)
df["payment_label"]   = df["paymenttier"].map(tier_map)
df["gender_label"]    = df["male"].map(gender_map)
df["benched_label"]   = df["everbenched"].map(bench_map)

# Colors
BLUE   = "#4C72B0"   # Stay (0)
ORANGE = "#DD8452"   # Leave (1)


# Grouped IV Visualizations for Churned and Non-Churned

def grouped_bar_with_percent(ax, df, feature, title):
    """
    Plot grouped bar chart (counts on y-axis) with percent labels above each bar.
    Percentages are computed within each category of `feature`,
    so that Stay% + Leave% = 100% for each group.
    """
    # Contingency table: rows = feature categories, cols = leaveornot (0=stay,1=leave)
    ct = pd.crosstab(df[feature], df["leaveornot"]).reindex(columns=[0, 1], fill_value=0)

    # Percentages within each category
    pct = ct.div(ct.sum(axis=1), axis=0)

    categories = ct.index.tolist()
    x = np.arange(len(categories))
    width = 0.4

    # Bars for Stay (0) and Leave (1)
    bars_stay = ax.bar(x - width/2, ct[0], width, label="Stay (0)", color=BLUE, edgecolor="black", linewidth=0.5)
    bars_leave = ax.bar(x + width/2, ct[1], width, label="Leave (1)", color=ORANGE, edgecolor="black", linewidth=0.5)

    ax.set_title(title)
    ax.set_xlabel(feature.replace("_", " ").title())
    ax.set_ylabel("Count")
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.grid(axis="y", alpha=0.3)
    ax.legend(title="LeaveOrNot")

    # Headroom for labels
    max_height = max(max(ct[0]), max(ct[1]))
    ax.set_ylim(0, max_height * 1.15)

    # Add percentage labels above each bar
    for i, cat in enumerate(categories):
        h0 = bars_stay[i].get_height()
        p0 = pct.loc[cat, 0] * 100
        ax.annotate(f"{p0:.1f}%", (bars_stay[i].get_x() + bars_stay[i].get_width()/2., h0), ha="center", va="bottom", fontsize=10)

        # Leave
        h1 = bars_leave[i].get_height()
        p1 = pct.loc[cat, 1] * 100
        ax.annotate(f"{p1:.1f}%", (bars_leave[i].get_x() + bars_leave[i].get_width()/2., h1), ha="center", va="bottom", fontsize=10)


# 3×3 Figure
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()

# 1) Age boxplot
sns.boxplot(data=df, x="leaveornot", y="age", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[0])
axes[0].set_title("Age by LeaveOrNot")
axes[0].set_xlabel("LeaveOrNot (0 = Stay, 1 = Leave)")
axes[0].set_ylabel("Age")
axes[0].grid(axis="y", alpha=0.3)

# 2) Experience boxplot
sns.boxplot(data=df, x="leaveornot", y="experienceincurrentdomain", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[1])
axes[1].set_title("Experience in Current Domain by LeaveOrNot")
axes[1].set_xlabel("LeaveOrNot (0 = Stay, 1 = Leave)")
axes[1].set_ylabel("Years of Experience")
axes[1].grid(axis="y", alpha=0.3)

# 3) Payment Tier
grouped_bar_with_percent(axes[2], df, "payment_label", "Payment Tier vs LeaveOrNot")

# 4) Gender
grouped_bar_with_percent(axes[3], df, "gender_label", "Gender vs LeaveOrNot")

# 5) Education
grouped_bar_with_percent(axes[4], df, "education_label", "Education vs LeaveOrNot")

# 6) Ever Benched
grouped_bar_with_percent(axes[5], df, "benched_label", "Ever Benched vs LeaveOrNot")

# 7) City
grouped_bar_with_percent(axes[6], df, "city_label", "City vs LeaveOrNot")

# Turn off unused subplots
axes[7].axis("off")
axes[8].axis("off")


plt.tight_layout()
#fig.savefig("features_vs_churn_overview_grouped.png", dpi=300, bbox_inches="tight")
#plt.show()

/var/folders/qv/_ft96x6x6h1_2t8_53rz9xp00000gn/T/ipykernel_51215/3613661694.py:80: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="leaveornot", y="age", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[0])
/var/folders/qv/_ft96x6x6h1_2t8_53rz9xp00000gn/T/ipykernel_51215/3613661694.py:87: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="leaveornot", y="experienceincurrentdomain", order=[0, 1], palette=[BLUE, ORANGE], ax=axes[1])

import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split,StratifiedKFold,GridSearchCV,cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
pd.set_option("display.max_colwidth", None)

RANDOM_STATE = 42

# Load and prep data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)
X_raw = df.drop(columns=["leaveornot"])
cat_cols = ["education", "city", "paymenttier"]

# One-hot encode categoricals (drop first category as reference)
X_cat = pd.get_dummies(X_raw[cat_cols].astype("category"),prefix=cat_cols,drop_first=True)

# Keep numeric / already-binary columns as they are
X_num = X_raw.drop(columns=cat_cols)

# Final feature matrix used by all models
X = pd.concat([X_num, X_cat], axis=1)

# Train–test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# CV setup + recall scorer
recall_scorer = make_scorer(recall_score, pos_label=1)

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=RANDOM_STATE)

# Models + hyperparameter grids
models = {"Logistic Regression": {"estimator": LogisticRegression(max_iter=1000,class_weight="balanced",solver="liblinear"),
                                  "param_grid": {"C": [0.01, 0.1, 1, 10],"penalty": ["l1", "l2"]},"key_param": "C"},
          
          "Decision Tree": {"estimator": DecisionTreeClassifier(random_state=RANDOM_STATE,class_weight="balanced"),
                            "param_grid": {"max_depth": [3, 4, 6, 8],"min_samples_split": [5, 10, 20],"min_samples_leaf": [2, 5, 10]},
                            "key_param": "max_depth"},
          
          "Random Forest": {"estimator": RandomForestClassifier(random_state=RANDOM_STATE,class_weight="balanced",n_jobs=-1),
                            "param_grid": {"n_estimators": [100, 200],"max_depth": [4, 6, 8],"min_samples_split": [5, 10],
                                           "min_samples_leaf": [2, 5]},"key_param": "n_estimators"}}

#GridSearchCV + evaluation loop (recall only)
results = []
best_estimators = {}

for name, cfg in models.items():
    print(f"\n==== Fitting {name} ====")

    grid = GridSearchCV(estimator=cfg["estimator"],param_grid=cfg["param_grid"],scoring=recall_scorer,cv=cv,n_jobs=-1,verbose=1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    best_estimators[name] = best_model

    # 5-fold CV recall (from GridSearch best score)
    cv_recall = grid.best_score_

    # Test set predictions (threshold 0.50)
    y_pred = best_model.predict(X_test)
    test_recall = recall_score(y_test, y_pred, pos_label=1)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix – {name}:")
    print(cm)
    print("Test Recall (leave=1):", round(test_recall, 2))

    results.append({"Model": name,"Best_Params": grid.best_params_,"Key_Tunable_Param": cfg["key_param"],
                    "CV Recall": cv_recall,"Test Recall": test_recall})

# Comparison table (recall-focused)
results_df = pd.DataFrame(results)

for col in ["CV Recall", "Test Recall"]:
    results_df[col] = results_df[col].round(2)

results_df = results_df[["Model", "Best_Params", "Key_Tunable_Param","CV Recall", "Test Recall"]]
results_df = results_df.sort_values(by="CV Recall",ascending=False).reset_index(drop=True)

print("\n=== Model Comparison (Employment Churn, recall-focused) ===")
display(results_df)
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
print(f"\nRecommended model (by 5-Fold CV Recall): {best_model_name}")

==== Fitting Logistic Regression ====
Fitting 5 folds for each of 8 candidates, totalling 40 fits

/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/svm/_base.py:1249: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(

Confusion Matrix – Logistic Regression:
[[666 250]
 [177 303]]
Test Recall (leave=1): 0.63

==== Fitting Decision Tree ====
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Confusion Matrix – Decision Tree:
[[823  93]
 [145 335]]
Test Recall (leave=1): 0.7

==== Fitting Random Forest ====
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Confusion Matrix – Random Forest:
[[811 105]
 [133 347]]
Test Recall (leave=1): 0.72

=== Model Comparison (Employment Churn, recall-focused) ===

Recommended model (by 5-Fold CV Recall): Random Forest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (confusion_matrix,classification_report,make_scorer,recall_score)

RANDOM_STATE = 42

# Load data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
X = df.drop(columns=["leaveornot"])
y = df["leaveornot"].astype(int)   # 1 = leaves, 0 = stays
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# Basic Decision Tree (entropy)
dt = DecisionTreeClassifier(criterion="entropy",random_state=RANDOM_STATE)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Confusion matrix 
cm = confusion_matrix(y_test, y_pred_dt)
cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print("Basic Decision Tree:\n")
display(cm_df)
print() 

# Classification report
print(classification_report(y_test, y_pred_dt, target_names=["Stay", "Churn"]))

# 5-Fold CV on training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall = cross_val_score(dt,X_train,y_train,cv=cv,scoring=recall_scorer)
print("\n5-Fold CV Recall (Churn = 1):", np.round(cv_recall, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall.mean(), 3), "\n")

# Visualize top levels of the tree
plt.figure(figsize=(18, 10))
plot_tree(dt,feature_names=X.columns,class_names=["Stay", "Leave"],filled=True,rounded=True,fontsize=10,max_depth=2)
plt.title("Top Levels of Decision Tree (Employment Churn, Entropy Criterion)")
plt.show()

Basic Decision Tree:

              precision    recall  f1-score   support

        Stay       0.82      0.89      0.85       916
       Churn       0.75      0.64      0.69       480

    accuracy                           0.80      1396
   macro avg       0.79      0.76      0.77      1396
weighted avg       0.80      0.80      0.80      1396


5-Fold CV Recall (Churn = 1): [0.696 0.701 0.634 0.629 0.688]
Mean CV Recall (Churn = 1): 0.67

import pandas as pd
import numpy as np

# Extract feature importances from your fitted decision tree
importances = dt.feature_importances_

# Build table
dt_importance_table = pd.DataFrame({"Feature": X_train.columns,"Information_Gain": importances})

# Improved Impact Labeling (3-tier system)
def dt_impact(val):
    if val > 0.15:
        return "Key driver of churn"
    elif val > 0.03:
        return "Secondary driver"
    else:
        return "Minimal / no impact"

dt_importance_table["Impact"] = dt_importance_table["Information_Gain"].apply(dt_impact)

# Sort by importance, descending
dt_importance_table = dt_importance_table.sort_values(by="Information_Gain", ascending=False)

# Reset index for clean display
dt_importance_table.reset_index(drop=True, inplace=True)

# Display table (rounded)
dt_importance_table.round(2)

Exception ignored in: <function ResourceTracker.__del__ at 0x10375dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102a89bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107051bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105071bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1029f9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102c1dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106d71bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10283dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (confusion_matrix,classification_report,make_scorer,recall_score)

RANDOM_STATE = 42

# Load data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
X = df.drop(columns=["leaveornot"])
y = df["leaveornot"].astype(int)   # 1 = leaves, 0 = stays
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

# 2. Tuned Decision Tree (entropy, balanced)
dt_best = DecisionTreeClassifier(random_state=RANDOM_STATE,class_weight="balanced",max_depth=8,min_samples_leaf=10,
                                 min_samples_split=5,criterion="entropy")
dt_best.fit(X_train, y_train)

# Predictions with custom probability threshold
custom_threshold = 0.35   # adjust as needed
y_proba = dt_best.predict_proba(X_test)[:, 1]
y_pred_adj = (y_proba >= custom_threshold).astype(int)

# Confusion matrix (pretty table)
cm_adj = confusion_matrix(y_test, y_pred_adj)
cm_df_adj = pd.DataFrame(cm_adj,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print(f"\nTuned Decision Tree (Adjusted Threshold = {custom_threshold}):")
display(cm_df_adj)
print() 

# Classification report
print(classification_report(y_test, y_pred_adj, target_names=["Stay", "Churn"]))

# 5-Fold CV Recall on training data (standard 0.50 threshold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall_scores = cross_val_score(dt_best,X_train,y_train,cv=cv,scoring=recall_scorer)
print("\nCV Recall (Churn = 1, standard threshold):", np.round(cv_recall_scores, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall_scores.mean(), 3), "\n")

# Visualize top levels of the tuned tree
plt.figure(figsize=(18, 10))
plot_tree(dt_best,feature_names=X.columns,class_names=["Stay", "Leave"],filled=True,rounded=True,fontsize=10,max_depth=2)
plt.title("Top Levels of Tuned Decision Tree (Employment Churn)")
plt.show()

Tuned Decision Tree (Adjusted Threshold = 0.35):

              precision    recall  f1-score   support

        Stay       0.87      0.75      0.81       916
       Churn       0.62      0.79      0.70       480

    accuracy                           0.76      1396
   macro avg       0.75      0.77      0.75      1396
weighted avg       0.79      0.76      0.77      1396


CV Recall (Churn = 1, standard threshold): [0.728 0.763 0.75  0.692 0.701]
Mean CV Recall (Churn = 1): 0.727

#Feature Importance Table for Tuned DT Model

import pandas as pd
import numpy as np

# Extract feature importances from tuned decision tree
importances_best = dt_best.feature_importances_

# Build dataframe
dt_best_importance_table = pd.DataFrame({"Feature": X_train.columns,"Information_Gain": importances_best})

# 3-Tier impact logic
def dt_best_impact(val):
    if val > 0.15:
        return "Key driver of churn"
    elif val > 0.03:
        return "Secondary driver"
    else:
        return "Minimal / no impact"

dt_best_importance_table["Impact"] = dt_best_importance_table["Information_Gain"].apply(dt_best_impact)

# Sort results by information gain
dt_best_importance_table = dt_best_importance_table.sort_values(by="Information_Gain", ascending=False)

# Reset index for clean display
dt_best_importance_table.reset_index(drop=True, inplace=True)

# Display table rounded to 3 decimals
dt_best_importance_table.round(2)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, recall_score, make_scorer

RANDOM_STATE = 42

df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)

# 2. Create dummy variables for categorical features
#    education:  0=Bachelors, 1=Masters, 2=PhD
#    city:       0=Bangalore, 1=Pune, 2=New Delhi
#    paymenttier:0=Low, 1=Mid, 2=High

X_raw = df.drop(columns=["leaveornot"])
X = pd.get_dummies(X_raw,columns=["education", "city", "paymenttier"],drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42,stratify=y)

# Basic logistic regression
logit = LogisticRegression(max_iter=500)
logit.fit(X_train, y_train)

y_pred = logit.predict(X_test)

# Confusion matrix 
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])

print("Basic Logistic Regression:")
display(cm_df)
print()   

# Classification report
print(classification_report(y_test, y_pred, target_names=["Stay", "Churn"]))

# 5-Fold CV Recall (churn = 1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall_scores = cross_val_score(logit,X_train,y_train,cv=cv,scoring=recall_scorer)

print("\n5-Fold CV Recall (Churn = 1):", np.round(cv_recall_scores, 3))
print("Mean CV Recall (Churn = 1):", round(cv_recall_scores.mean(), 3), '\n')

Basic Logistic Regression:

              precision    recall  f1-score   support

        Stay       0.74      0.90      0.82       916
       Churn       0.69      0.41      0.51       480

    accuracy                           0.73      1396
   macro avg       0.72      0.66      0.66      1396
weighted avg       0.73      0.73      0.71      1396

/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

5-Fold CV Recall (Churn = 1): [0.397 0.42  0.321 0.308 0.438]
Mean CV Recall (Churn = 1): 0.377

/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

import numpy as np
import pandas as pd

# Extract coefficients from your model
coef = logit.coef_[0]

# Compute odds ratios
odds = np.exp(coef)

# Build dataframe
odds_table = pd.DataFrame({"Feature": X.columns,"Coefficient": coef,"Odds_Ratio": odds})

# 4. Determine impact
def get_impact(row):
    if row["Odds_Ratio"] > 1.05:
        return "Increases churn"
    elif row["Odds_Ratio"] < 0.95:
        return "Decreases churn"
    else:
        return "No effect"

odds_table["Impact"] = odds_table.apply(get_impact, axis=1)

# 5. Sort by strongest effect (furthest from 1)
odds_table["Effect_Size"] = abs(odds_table["Odds_Ratio"] - 1)
odds_table = odds_table.sort_values(by="Effect_Size", ascending=False)

# 6. Drop helper column for clean presentation
odds_table = odds_table[["Feature", "Coefficient", "Odds_Ratio", "Impact"]]

# 7. Display final table
odds_table.reset_index(drop=True, inplace=True)
odds_table.round(2)

import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split,StratifiedKFold,GridSearchCV,cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix

RANDOM_STATE = 42

# import data
df = pd.read_excel("employment churn.xlsx").round(2)
df.columns = df.columns.str.lower()
y = df["leaveornot"].astype(int)

# Create dummy variables for categorical features
#    education:  0=Bachelors, 1=Masters, 2=PhD
#    city:       0=Bangalore, 1=Pune, 2=New Delhi
#    paymenttier:0=Low, 1=Mid, 2=High

X_raw = df.drop(columns=["leaveornot"])
X = pd.get_dummies(X_raw,columns=["education", "city", "paymenttier"],drop_first=True)

# Train–test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=RANDOM_STATE,stratify=y)

#Improved Logistic Regression (L1, balanced)
logit_best = LogisticRegression(max_iter=1000,class_weight="balanced",solver="liblinear",C=0.1,penalty="l1")

logit_best.fit(X_train, y_train)

# Standard prediction using default 0.50 threshold
y_pred = logit_best.predict(X_test)

# Confusion matrix (pretty table)
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm,index=["Actual Stay (0)", "Actual Churn (1)"],columns=["Predicted Stay (0)", "Predicted Churn (1)"])
print("Improved Logistic Regression:\n")
display(cm_df)
print()  

# Classification report
print(classification_report(y_test, y_pred, target_names=["Stay", "Churn"]))

# 5-Fold CV on training data (Recall for churn = 1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
recall_scorer = make_scorer(recall_score, pos_label=1)
cv_recall = cross_val_score(logit_best,X_train,y_train,cv=cv,scoring=recall_scorer)

print("\nCV Recall (Churn = 1):", np.round(cv_recall, 3))
print("Mean Recall (Churn = 1):", round(cv_recall.mean(), 3), "\n")

Improved Logistic Regression:

              precision    recall  f1-score   support

        Stay       0.79      0.73      0.76       916
       Churn       0.55      0.63      0.59       480

    accuracy                           0.70      1396
   macro avg       0.67      0.68      0.67      1396
weighted avg       0.71      0.70      0.70      1396


CV Recall (Churn = 1): [0.652 0.625 0.638 0.571 0.661]
Mean Recall (Churn = 1): 0.629

coef_best = logit_best.coef_[0]
odds_best = np.exp(coef_best)

odds_best_table = pd.DataFrame({"Feature": X_train.columns,"Coefficient": coef_best,"Odds_Ratio": odds_best})

# Rename dummy variables to human-readable labels
rename_map = { "education_1": "Masters vs Bachelors","education_2": "PhD vs Bachelors",
              "city_1": "Pune vs Bangalore","city_2": "New Delhi vs Bangalore",
              "paymenttier_1": "Mid Pay vs Low Pay","paymenttier_2": "High Pay vs Low Pay"}

odds_best_table["Feature"] = odds_best_table["Feature"].replace(rename_map)

# Label impact categories
def get_impact(row):
    if row["Odds_Ratio"] > 1.05:
        return "Increases churn"
    elif row["Odds_Ratio"] < 0.95:
        return "Decreases churn"
    else:
        return "No effect"

odds_best_table["Impact"] = odds_best_table.apply(get_impact, axis=1)

# Sort by effect size
odds_best_table["Effect_Size"] = abs(odds_best_table["Odds_Ratio"] - 1)
odds_best_table = odds_best_table.sort_values(by="Effect_Size", ascending=False)

# Final clean table
odds_best_table = odds_best_table[["Feature", "Coefficient", "Odds_Ratio", "Impact"]]
odds_best_table = odds_best_table.reset_index(drop=True)

print("\nFinal Odds Ratios Table:")
display(odds_best_table.round(3))

Final Odds Ratios Table:

import numpy as np
import matplotlib.pyplot as plt

# Recall values from your models
recall_values = {"Logit": [0.41, 0.63],"Tree":  [0.64, 0.79] }
models = list(recall_values.keys())
base = [recall_values[m][0] for m in models]
improved = [recall_values[m][1] for m in models]

x = np.arange(len(models))  # positions for "Logit", "Tree"
width = 0.35

plt.figure(figsize=(10,6))

# Bars
bars1 = plt.bar(x - width/2, base, width,label="Base Model",color="#4e79a7",edgecolor="black",linewidth=0.7)
bars2 = plt.bar(x + width/2, improved, width,label="Improved Model",color="#59a14f",edgecolor="black",linewidth=0.7)

# Value labels above bars
for bars in [bars1, bars2]:
    for bar in bars:
        h = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2,h + 0.015,f"{h:.2f}",ha="center",fontsize=12)

# Formatting
plt.xticks(x, models, fontsize=12)
plt.ylim(0, 1.05)
plt.ylabel("Recall (Churn = 1)", fontsize=13)
plt.title("Recall Comparison: Logistic vs Tree Models (Base vs Improved)", fontsize=16)
plt.legend(fontsize=12)
plt.tight_layout()

# Save figure
#plt.savefig("recall_comparison_grouped.png", dpi=300, bbox_inches="tight")
#plt.show()

Employee Churn Prediction Analysis¶

Project Overview¶

1. Setup and Data Import¶

2. Exploratory Data Analysis¶

3. Which Models Should we Focus on?¶

4. Decision Tree Models¶

Improved Decision Tree¶

5. Logit Regression Models¶

Improved Logit Regression Model¶

6. Comparison¶

	education	joiningyear	city	paymenttier	age	male	everbenched	experienceincurrentdomain	leaveornot
0	0	2017	0	0	34	1	0	0	0
1	0	2013	1	2	28	0	0	3	1
2	0	2014	2	0	38	0	0	2	0
3	1	2016	0	0	27	1	0	5	1
4	1	2017	1	0	24	1	1	2	1

	count	mean	std	min	25%	50%	75%	max
education	4653.0	0.26	0.52	0.0	0.0	0.0	0.0	2.0
joiningyear	4653.0	2015.06	1.86	2012.0	2013.0	2015.0	2017.0	2018.0
city	4653.0	0.77	0.82	0.0	0.0	1.0	1.0	2.0
paymenttier	4653.0	0.30	0.56	0.0	0.0	0.0	0.0	2.0
age	4653.0	29.39	4.83	22.0	26.0	28.0	32.0	41.0
male	4653.0	0.60	0.49	0.0	0.0	1.0	1.0	1.0
everbenched	4653.0	0.10	0.30	0.0	0.0	0.0	0.0	1.0
experienceincurrentdomain	4653.0	2.91	1.56	0.0	2.0	3.0	4.0	7.0
leaveornot	4653.0	0.34	0.48	0.0	0.0	0.0	1.0	1.0

	Model	Best_Params	Key_Tunable_Param	CV Recall	Test Recall
0	Random Forest	{'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}	n_estimators	0.73	0.72
1	Decision Tree	{'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 5}	max_depth	0.72	0.70
2	Logistic Regression	{'C': 1, 'penalty': 'l1'}	C	0.64	0.63

	Feature	Information_Gain	Impact
0	joiningyear	0.31	Key driver of churn
1	age	0.18	Key driver of churn
2	paymenttier	0.13	Secondary driver
3	experienceincurrentdomain	0.10	Secondary driver
4	city	0.10	Secondary driver
5	education	0.08	Secondary driver
6	male	0.06	Secondary driver
7	everbenched	0.03	Minimal / no impact

	Feature	Information_Gain	Impact
0	joiningyear	0.36	Key driver of churn
1	paymenttier	0.21	Key driver of churn
2	city	0.13	Secondary driver
3	education	0.12	Secondary driver
4	male	0.07	Secondary driver
5	age	0.05	Secondary driver
6	experienceincurrentdomain	0.04	Secondary driver
7	everbenched	0.02	Minimal / no impact

	Feature	Coefficient	Odds_Ratio	Impact
0	education_1	0.90	2.45	Increases churn
1	paymenttier_1	0.85	2.34	Increases churn
2	everbenched	0.59	1.81	Increases churn
3	city_1	0.52	1.68	Increases churn
4	male	-0.87	0.42	Decreases churn
5	city_2	-0.62	0.54	Decreases churn
6	paymenttier_2	0.18	1.20	Increases churn
7	education_2	0.17	1.18	Increases churn
8	experienceincurrentdomain	-0.03	0.97	No effect
9	age	-0.03	0.97	No effect
10	joiningyear	0.00	1.00	No effect

	Feature	Coefficient	Odds_Ratio	Impact
0	Masters vs Bachelors	0.894	2.444	Increases churn
1	Mid Pay vs Low Pay	0.767	2.154	Increases churn
2	Pune vs Bangalore	0.461	1.585	Increases churn
3	male	-0.795	0.452	Decreases churn
4	everbenched	0.413	1.511	Increases churn
5	New Delhi vs Bangalore	-0.512	0.599	Decreases churn
6	experienceincurrentdomain	-0.031	0.970	No effect
7	age	-0.029	0.971	No effect
8	joiningyear	0.000	1.000	No effect
9	PhD vs Bachelors	0.000	1.000	No effect
10	High Pay vs Low Pay	0.000	1.000	No effect

	Predicted Stay (0)	Predicted Churn (1)
Actual Stay (0)	814	102
Actual Churn (1)	175	305

	Predicted Stay (0)	Predicted Churn (1)
Actual Stay (0)	687	229
Actual Churn (1)	102	378

	Predicted Stay (0)	Predicted Churn (1)
Actual Stay (0)	671	245
Actual Churn (1)	179	301