import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# --- Global style settings ---
# Per-segment colour and display names; position i is used for cluster i.
CLUSTER_COLORS = [
    '#4A7FA5',
    '#C0554E',
    '#4E9A72',
    '#D4895A',
    '#7B6B9E',
]
CLUSTER_NAMES = [
    'C0: MSP Direct Bookers',
    'C1: MSP Deal Seekers',
    'C2: Inbound Visitors',
    'C3: Loyal Frequent Fliers',
    'C4: Returning Home Fliers',
]
# Two-line variants of the segment names (without the "C<i>:" prefix).
SHORT_NAMES = [
    'MSP Direct\nBookers',
    'MSP Deal\nSeekers',
    'Inbound\nVisitors',
    'Loyal Frequent\nFliers',
    'Returning Home\nFliers',
]

# Matplotlib defaults shared by every figure below.
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.family': 'Times New Roman',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

# Read the clustering dataset from the CSV file in the working directory.
clu = pd.read_csv('Clustering Data.csv')

# Report the table dimensions and the total number of missing cells.
n_rows, n_cols = clu.shape
n_missing = clu.isnull().sum().sum()
print(f'Dataset shape: {n_rows:,} rows × {n_cols} columns')
print(f'Missing values: {n_missing}')

Dataset shape: 15,144 rows × 90 columns
Missing values: 0

# Preview the clustering dataset structure (first rows of `clu`).
# As a bare expression the result is only rendered when this runs as the last
# statement of a notebook cell; in a plain script it is discarded.
clu.head()

# EDA: histograms of the three continuous features, one panel per feature.
# Each entry is (column name, panel title, bar colour).
continuous_vars = [
    ('avg_amt',         'Average Ticket Amount (normalized)',  '#4A7FA5'),
    ('group_size',      'Group Size (normalized)',             '#4E9A72'),
    ('days_pre_booked', 'Days Pre-Booked (normalized)',        '#D4895A'),
]

fig, axes = plt.subplots(2, 2, figsize=(8, 6))

# The 2x2 grid has one panel too many: draw on the first three (row-major
# order) and hide the bottom-right one.
ax_list = list(axes.flat)[:3]
axes[1, 1].set_visible(False)

for ax, (col, title, color) in zip(ax_list, continuous_vars):
    values = clu[col]
    mean_value = values.mean()
    ax.hist(values, bins=40, color=color, edgecolor='white', linewidth=0.5, alpha=0.85)
    # Dashed vertical line marks the column mean; its value goes in the legend.
    ax.axvline(mean_value, color='black', linestyle='--', linewidth=1.2,
               label=f'Mean: {mean_value:.3f}')
    ax.set_title(title, fontweight='bold', pad=20)
    ax.set_xlabel('Normalized value (0–1)', labelpad=10)
    ax.set_ylabel('Count', labelpad=10)
    ax.legend(fontsize=9)

plt.tight_layout()
plt.savefig('00 - eda_continuous_distributions.png', dpi=500, bbox_inches='tight')
plt.show()

def autopct_threshold(pct):
    """Format a pie-slice percentage, hiding labels for small slices.

    Returns ``pct`` rendered with one decimal and a trailing ``%`` when
    ``pct >= 5``; otherwise returns an empty string.
    """
    show_label = pct >= 5
    if show_label:
        return f'{pct:.1f}%'
    return ''

# Chart 1: Booking Channel Mix
# Pie chart of the column sums of the booking-channel columns.
channel_cols = [
    'BookingChannel_SCA_Website_Booking',
    'BookingChannel_Outside_Booking',
    'BookingChannel_Reservations_Booking',
    'BookingChannel_Tour_Operator_Portal',
    'BookingChannel_SY_Vacation',
    'BookingChannel_Other',
]
channel_labels = ['SCA Website', 'Outside/3rd Party', 'Reservations', 'Tour Operator', 'SY Vacation', 'Other']
colors_ch = ['#4A7FA5', '#C0554E', '#4E9A72', '#D4895A', '#7B6B9E', '#9E9E9E']
channel_counts = clu[channel_cols].sum().values

fig1, ax1 = plt.subplots(figsize=(7, 7))
wedges, _, autotexts = ax1.pie(
    channel_counts,
    labels=None,
    colors=colors_ch,
    autopct=autopct_threshold,
    startangle=90,
    pctdistance=1.18,
    wedgeprops={'edgecolor': 'white', 'linewidth': 1.5},
    radius=0.75,
)
for at in autotexts:
    at.set_fontsize(15)
    at.set_fontweight('bold')

# Slices under 5% carry no on-chart percentage (see autopct_threshold), so
# their share is appended to the legend entry instead.
total_ch = channel_counts.sum()
legend_labels_ch = []
for ch_label, ch_count in zip(channel_labels, channel_counts):
    ch_share = ch_count/total_ch*100
    legend_labels_ch.append(f'{ch_label} ({ch_share:.1f}%)' if ch_share < 5 else ch_label)

ax1.legend(wedges, legend_labels_ch, loc='upper center', bbox_to_anchor=(0.5, 0.15), ncol=3, fontsize=12, frameon=False)
ax1.set_title('Booking Channel Mix', fontweight='bold', fontsize=17, y=0.90)
plt.tight_layout()
plt.savefig('00 - eda_booking_channel.png', dpi=500, bbox_inches='tight')
plt.show()

def autopct_threshold(pct):
    """Return ``pct`` as a one-decimal percent string, or '' when it is not >= 5.

    This repeats the definition of the same name given earlier in the file,
    with the same behaviour.
    """
    return '' if not pct >= 5 else f'{pct:.1f}%'

# Chart 2: Ufly Loyalty Program Status
# Pie chart of the column sums of the three membership-status columns.
loyalty_cols   = ['UflyMemberStatus_non-ufly', 'UflyMemberStatus_Standard', 'UflyMemberStatus_Elite']
loyalty_labels = ['Non-Member', 'Standard', 'Elite']
colors_ly = ['#9E9E9E', '#D4895A', '#C0554E']
loyalty_counts = clu[loyalty_cols].sum().values

fig2, ax2 = plt.subplots(figsize=(6, 6))
wedges2, _, autotexts2 = ax2.pie(
    loyalty_counts,
    labels=None,
    colors=colors_ly,
    autopct=autopct_threshold,
    startangle=90,
    pctdistance=1.18,
    wedgeprops={'edgecolor': 'white', 'linewidth': 1.5},
    radius=0.75,
)
for at in autotexts2:
    at.set_fontsize(15)
    at.set_fontweight('bold')

# Slices under 5% have no on-chart percentage, so the legend shows it.
total_ly = loyalty_counts.sum()
legend_labels_ly = []
for ly_label, ly_count in zip(loyalty_labels, loyalty_counts):
    ly_share = ly_count/total_ly*100
    legend_labels_ly.append(f'{ly_label} ({ly_share:.1f}%)' if ly_share < 5 else ly_label)

ax2.legend(wedges2, legend_labels_ly, loc='upper center', bbox_to_anchor=(0.5, 0.15), ncol=3, fontsize=12, frameon=False)
ax2.set_title('Ufly Loyalty Program Status', fontweight='bold', fontsize=17, y=0.90)
plt.tight_layout()
plt.savefig('00 - eda_loyalty_status.png', dpi=500, bbox_inches='tight')
plt.show()

# EDA: Top destination airports
# Bar chart of the 12 destination columns with the highest column mean.

dest_prefix = 'true_destination_dest_'
dest_cols = [c for c in clu.columns if c.startswith(dest_prefix)]
dest_means = clu[dest_cols].mean().sort_values(ascending=False).head(12)
# Strip the column prefix so only the airport code is shown on the axis.
dest_labels = [c.replace(dest_prefix, '') for c in dest_means.index]

fig, ax = plt.subplots(figsize=(12, 4))
bars = ax.bar(dest_labels, dest_means.values, color='#4A7FA5', edgecolor='white', linewidth=0.8)
bars[0].set_color('#C0554E')  # highlight top destination

ax.set_title('Top 12 Destinations by Booking Frequency', fontweight='bold', fontsize=15)
ax.set_xlabel('Destination Airport Code', labelpad=20, fontsize=15)
ax.set_ylabel('Mean booking proportion (0–1)', labelpad=20, fontsize=15)
ax.tick_params(axis='both', labelsize=13)

# Print each bar's value just above its top edge.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, height + 0.003, f'{height:.3f}', ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.savefig('00 - eda_top_destinations.png', dpi=500, bbox_inches='tight')
plt.show()

# Select numeric features for clustering (exclude identifier columns)

exclude = ['uid', 'PNRLocatorID']
numeric_cols = clu.select_dtypes(include='number').columns
feature_cols = [c for c in numeric_cols if c not in exclude]

X = clu[feature_cols].copy()
print(f'Feature matrix shape: {X.shape}')
print('\nFeature categories:')
print('Booking behavior          : avg_amt, round_trip, group_size, group, days_pre_booked')

# (printed heading, substring that identifies the column family)
category_patterns = [
    ('Booking channels', 'BookingChannel'),
    ('Age groups', 'age_group'),
    ('Origin airports', 'true_origins'),
    ('Destination airports.', 'true_destination'),
    ('Loyalty status', 'Ufly'),
    ('Seasonality', 'seasonality'),
]
for heading, pattern in category_patterns:
    n_matching = sum(pattern in c for c in feature_cols)
    # Headings are left-padded to 26 characters so the colons line up.
    print(f'{heading:<26}: {n_matching} columns')

Feature matrix shape: (15144, 88)

Feature categories:
Booking behavior          : avg_amt, round_trip, group_size, group, days_pre_booked
Booking channels          : 6 columns
Age groups                : 5 columns
Origin airports           : 29 columns
Destination airports.     : 36 columns
Loyalty status            : 3 columns
Seasonality               : 4 columns

# Run elbow method and silhouette analysis for k = 2 through 10.
# For every k, one K-Means model is fitted on X; its inertia and a silhouette
# score estimated on a 3000-row sample are recorded.

k_range = range(2, 11)
inertias, silhouettes = [], []

print('Computing inertia and silhouette scores...')
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit(X).labels_
    sil = silhouette_score(X, labels, sample_size=3000, random_state=42)

    inertias.append(km.inertia_)
    silhouettes.append(sil)
    print(f'  k={k}: inertia={km.inertia_:,.0f}, silhouette={sil:.4f}')

Computing inertia and silhouette scores...
  k=2: inertia=59,918, silhouette=0.1151
  k=3: inertia=55,337, silhouette=0.1000
  k=4: inertia=52,394, silhouette=0.0942
  k=5: inertia=50,292, silhouette=0.1036
  k=6: inertia=48,803, silhouette=0.0992
  k=7: inertia=47,585, silhouette=0.0991
  k=8: inertia=46,479, silhouette=0.1025
  k=9: inertia=45,236, silhouette=0.1099
  k=10: inertia=44,749, silhouette=0.0987

# Elbow plot: inertia for every candidate k, with k = 5 marked as the choice.
ks = list(k_range)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(ks, inertias, 'o-', color='#4A7FA5', linewidth=2, markersize=8)
ax.axvline(5, color='#C0554E', linestyle='--', linewidth=1.5, label='k = 5 (selected)')

ax.set_title('Elbow Method - Inertia vs. k', fontweight='bold', pad=25, fontsize=20)
ax.set_xlabel('Number of clusters (k)', labelpad=15, fontsize=16)
ax.set_ylabel('Inertia (within-cluster sum of squares)', labelpad=15, fontsize=16)

# One tick per evaluated k.
ax.set_xticks(ks)
ax.tick_params(axis='both', labelsize=13)
ax.legend(fontsize=13)

plt.tight_layout()
plt.savefig('00 - elbow.png', dpi=500, bbox_inches='tight')
plt.show()

# Fit final K-Means model with k=5 and assign cluster labels.

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(X)

# Assign labels and reorder clusters by size (largest = Cluster 0).
# value_counts() lists the raw labels from most to least frequent, so a raw
# label's position in that listing becomes its new cluster id.
clu['cluster'] = kmeans.labels_
order = clu['cluster'].value_counts().index.tolist()
cmap = {}
for new, old in enumerate(order):
    cmap[old] = new
clu['cluster'] = clu['cluster'].map(cmap)

print('Cluster sizes:')
cluster_sizes = clu['cluster'].value_counts().sort_index()
for i, size in cluster_sizes.items():
    share = size/len(clu)*100
    print(f'  Cluster {i} ({CLUSTER_NAMES[i]}): {size:,} customers ({share:.1f}%)')

Cluster sizes:
  Cluster 0 (C0: MSP Direct Bookers): 4,127 customers (27.3%)
  Cluster 1 (C1: MSP Deal Seekers): 3,843 customers (25.4%)
  Cluster 2 (C2: Inbound Visitors): 2,436 customers (16.1%)
  Cluster 3 (C3: Loyal Frequent Fliers): 2,385 customers (15.7%)
  Cluster 4 (C4: Returning Home Fliers): 2,353 customers (15.5%)

# Columns shown in the cluster summary, mapped to their display headings.
profile_cols = {
    'avg_amt':                              'Avg Ticket Amount',
    'round_trip':                           'Round Trip Rate',
    'group_size':                           'Avg Group Size',
    'days_pre_booked':                      'Days Pre-Booked',
    'BookingChannel_SCA_Website_Booking':   'SCA Website %',
    'BookingChannel_Outside_Booking':       'Outside/3rd Party %',
    'BookingChannel_Tour_Operator_Portal':  'Tour Operator %',
    'UflyMemberStatus_non-ufly':            'Non-Ufly Member %',
    'UflyMemberStatus_Standard':            'Ufly Standard %',
    'UflyMemberStatus_Elite':               'Ufly Elite %',
    'age_group_18-24':                      'Age 18-24 %',
    'age_group_35-54':                      'Age 35-54 %',
    'age_group_55+':                        'Age 55+ %',
    'seasonality_Q1':                       'Q1 (Jan-Mar) %',
    'seasonality_Q4':                       'Q4 (Oct-Dec) %',
    'true_origins_ori_MSP':                 'Origin: MSP %',
    'true_destination_dest_MSP':            'Destination: MSP %',
}

# Compute means per cluster (only for profile columns present in the data).
present_cols = [c for c in profile_cols if c in clu.columns]
by_cluster = clu.groupby('cluster')
cluster_counts = by_cluster.size()

summary = by_cluster[present_cols].mean()
summary.insert(0, 'n_customers', cluster_counts)
summary.insert(1, 'pct_of_total', (cluster_counts / len(clu) * 100).round(1))
summary = summary.rename(columns=profile_cols)

# Row labels: "C<i>: <segment name>", taking the part of CLUSTER_NAMES[i]
# that follows its own "C<i>: " prefix.
row_labels = []
for i in summary.index:
    segment_name = CLUSTER_NAMES[i].split(": ")[1]
    row_labels.append(f'C{i}: {segment_name}')
summary.index = row_labels

# Display rounded: every column after the two count columns is rendered
# as a three-decimal string.
display_summary = summary.copy()
for col in display_summary.columns[2:]:
    display_summary[col] = display_summary[col].map('{:.3f}'.format)

print(display_summary.to_string())
print('\n* All feature values are normalized (0–1) means of the cluster centroid.')

                           n_customers  pct_of_total Avg Ticket Amount Round Trip Rate Avg Group Size Days Pre-Booked SCA Website % Outside/3rd Party % Tour Operator % Non-Ufly Member % Ufly Standard % Ufly Elite % Age 18-24 % Age 35-54 % Age 55+ % Q1 (Jan-Mar) % Q4 (Oct-Dec) % Origin: MSP % Destination: MSP %
C0: MSP Direct Bookers            4127          27.3             0.053           0.793          0.169           0.085         0.740               0.000           0.126             0.999           0.000        0.001       0.093       0.331     0.256          0.330          0.234         0.907              0.000
C1: MSP Deal Seekers              3843          25.4             0.053           0.794          0.150           0.088         0.000               1.000           0.000             0.998           0.000        0.002       0.111       0.350     0.213          0.263          0.259         0.888              0.001
C2: Inbound Visitors              2436          16.1             0.048           0.417          0.068           0.052         0.000               0.946           0.000             0.949           0.050        0.002       0.174       0.298     0.174          0.155          0.298         0.001              0.950
C3: Loyal Frequent Fliers         2385          15.7             0.060           0.790          0.112           0.098         0.679               0.218           0.013             0.000           0.990        0.010       0.047       0.356     0.395          0.268          0.319         0.946              0.001
C4: Returning Home Fliers         2353          15.5             0.050           0.428          0.106           0.074         0.968               0.000           0.000             0.689           0.306        0.006       0.106       0.275     0.283          0.163          0.290         0.001              0.979

* All feature values are normalized (0–1) means of the cluster centroid.

# Heatmap of per-cluster feature means.
# NOTE(review): heatmap_cols and heatmap_labels are not assigned anywhere in
# the visible part of this file — confirm they are defined before this point.
hm_data = clu.groupby('cluster')[heatmap_cols].mean()

# Row labels: "Cluster <i>:" on one line, then the short segment name with its
# internal line break replaced by a space.
hm_row_labels = []
for i in hm_data.index:
    flat_name = SHORT_NAMES[i].replace(chr(10), " ")
    hm_row_labels.append(f'Cluster {i}:\n{flat_name}')
hm_data.index = hm_row_labels
hm_data.columns = heatmap_labels

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    hm_data,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    linewidths=0.5,
    linecolor='white',
    ax=ax,
    cbar_kws={'label': 'Mean value (0–1)', 'shrink': 0.8, 'pad': 0.02},
    annot_kws={'size': 18, 'weight': 'bold'},
)
ax.set_title('Feature Means Across All 5 Customer Segments', fontsize=24, fontweight='bold', pad=30)
ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis='x', rotation=30, labelsize=18)
ax.tick_params(axis='y', rotation=0, labelsize=18)

# The last axes on the figure is styled as the colour bar.
cbar_ax = ax.figure.axes[-1]
cbar_ax.tick_params(labelsize=17)
cbar_ax.set_ylabel('Mean value (0–1)', fontsize=17, labelpad=15)

plt.tight_layout(pad=2)
plt.savefig('00 - heatmap_all_clusters.png', dpi=500, bbox_inches='tight')
plt.show()

# Two-line segment names used as x-tick labels by the two charts below.
# They are assigned here because these charts use FULL_NAMES before the later
# assignments of the same list further down the file; without this a clean
# top-to-bottom run stops with NameError.
FULL_NAMES = [
    'C0: MSP Direct\nBookers',
    'C1: MSP Deal\nSeekers',
    'C2: Inbound\nVisitors',
    'C3: Loyal Frequent\nFliers',
    'C4: Returning Home\nFliers'
]

# Average Ticket Amount by Segment
vals   = clu.groupby('cluster')['avg_amt'].mean()
# Standard error of the mean per cluster; computed but not drawn on the chart.
errors = clu.groupby('cluster')['avg_amt'].sem()

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(range(5), vals, color=CLUSTER_COLORS, edgecolor='white', linewidth=0.8)

ax.set_xticks(range(5))
ax.set_xticklabels(FULL_NAMES, fontsize=13)
ax.set_title('Average Ticket Amount by Segment', fontweight='bold', pad=20, fontsize=18)
ax.set_ylabel('Normalized value (0-1)', labelpad=10, fontsize=14)
ax.tick_params(axis='y', labelsize=13)
# Print each segment's mean just above its bar.
for bar, val in zip(bars, vals):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
            f'{val:.3f}', ha='center', va='bottom', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.savefig('00 - avg_ticket_amount.png', dpi=500, bbox_inches='tight')
plt.show()

# Booking Lead Time by Segment
vals   = clu.groupby('cluster')['days_pre_booked'].mean()
# Standard error of the mean per cluster; computed but not drawn on the chart.
errors = clu.groupby('cluster')['days_pre_booked'].sem()

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(range(5), vals, color=CLUSTER_COLORS, edgecolor='white', linewidth=0.8)

ax.set_xticks(range(5))
ax.set_xticklabels(FULL_NAMES, fontsize=13)
ax.set_title('Booking Lead Time by Segment', fontweight='bold', pad=20, fontsize=18)
ax.set_ylabel('Normalized days pre-booked (0-1)', labelpad=10, fontsize=14)
ax.tick_params(axis='y', labelsize=13)
# Print each segment's mean just above its bar.
for bar, val in zip(bars, vals):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
            f'{val:.3f}', ha='center', va='bottom', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.savefig('00 - booking_lead_time.png', dpi=500, bbox_inches='tight')
plt.show()

# Stacked bar chart showing booking channel composition for each cluster.
# NOTE(review): ch_data, channel_cols_plot, ch_labels and ch_colors are not
# assigned in the visible part of this file — confirm they exist before this
# point. ch_data is presumably a per-cluster table of channel means.

FULL_NAMES = [
    'C0: MSP Direct\nBookers',
    'C1: MSP Deal\nSeekers',
    'C2: Inbound\nVisitors',
    'C3: Loyal Frequent\nFliers',
    'C4: Returning Home\nFliers'
]

# Stacked bar chart
fig, ax = plt.subplots(figsize=(9, 6))
positions = range(5)
bottom = np.zeros(5)  # running top of the stack for each of the five bars
for col, label, color in zip(channel_cols_plot, ch_labels, ch_colors):
    vals = ch_data[col].values
    ax.bar(positions, vals, bottom=bottom, label=label, color=color, edgecolor='white', linewidth=0.6)
    # Label a layer at its vertical midpoint, but only when it exceeds 0.07.
    for i, v in enumerate(vals):
        if not v > 0.07:
            continue
        ax.text(i, bottom[i] + v/2, f'{v:.0%}', ha='center', va='center', fontsize=12,
                color='white', fontweight='bold')
    bottom += vals

ax.set_title('Booking Channel Composition by Customer Segment', fontweight='bold', fontsize=14, pad=30)
ax.set_ylabel('Proportion of bookings', fontsize=12, labelpad=10)
ax.set_xticks(positions)
ax.set_xticklabels(FULL_NAMES, fontsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)
# Legend sits below the axes, three entries per row.
ax.legend(title='Booking Channel', title_fontsize=12, ncol=3, frameon=False, fontsize=12,
          loc='upper center', bbox_to_anchor=(0.5, -0.15))
plt.tight_layout()
plt.savefig('00 - stacked_bar_channels.png', dpi=500, bbox_inches='tight')
plt.show()

# Two-line segment names used as x-tick labels.
FULL_NAMES = [
    'C0: MSP Direct\nBookers',
    'C1: MSP Deal\nSeekers',
    'C2: Inbound\nVisitors',
    'C3: Loyal Frequent\nFliers',
    'C4: Returning Home\nFliers'
]

# Grouped bar chart: one group per segment, one bar per loyalty status.
# NOTE(review): ly_data, loyalty_cols_plot, ly_labels and ly_colors are not
# assigned in the visible part of this file — confirm they exist before this
# point.
x = np.arange(5)
width = 0.25
fig, ax = plt.subplots(figsize=(9, 6))

for i, (col, label, color) in enumerate(zip(loyalty_cols_plot, ly_labels, ly_colors)):
    # Series i is shifted by (i - 1) bar widths from the group position.
    offset = (i - 1) * width
    bars = ax.bar(x + offset, ly_data[col], width, label=label,
                  color=color, edgecolor='white', linewidth=0.8)

ax.set_title('Ufly Loyalty Program Status by Customer Segment', fontweight='bold', fontsize=14, pad=20)
ax.set_ylabel('Proportion of customers', fontsize=12, labelpad=10)
ax.set_xticks(x)
ax.set_xticklabels(FULL_NAMES, fontsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)
ax.set_ylim(0, 1.1)
# Legend sits below the axes, all three entries on one row.
ax.legend(title='Ufly Status', ncol=3, frameon=False, title_fontsize=12, fontsize=12,
          loc='upper center', bbox_to_anchor=(0.45, -0.20))
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.savefig('00 - grouped_bar_loyalty.png', dpi=500, bbox_inches='tight')
plt.show()

# Grouped bar chart: per-cluster mean of each age-group column.
age_cols   = ['age_group_0-17', 'age_group_18-24', 'age_group_25-34', 'age_group_35-54', 'age_group_55+']
age_labels = ['0-17', '18-24', '25-34', '35-54', '55+']
age_colors = ['#4A7FA5', '#4E9A72', '#D4895A', '#7B6B9E', '#C0554E']

age_data = clu.groupby('cluster')[age_cols].mean()

x = np.arange(5)
width = 0.15
fig, ax = plt.subplots(figsize=(8, 5))

for i, (col, label, color) in enumerate(zip(age_cols, age_labels, age_colors)):
    # Five series per group: offsets run from -2 to +2 bar widths so the
    # middle series sits on the tick.
    offset = (i - 2) * width
    ax.bar(x + offset, age_data[col], width, label=f'Age {label}',
           color=color, edgecolor='white', linewidth=0.8)

ax.set_xticks(x)
ax.set_xticklabels(FULL_NAMES, fontsize=10)
ax.set_title('Age Group Distribution by Customer Segment', fontweight='bold', fontsize=14, pad=20)
ax.set_ylabel('Proportion of customers', fontsize=12, labelpad=10)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title='Age Group', ncol=5, frameon=False, title_fontsize=12, fontsize=12, loc='upper center', bbox_to_anchor=(0.5, -0.25))
plt.tight_layout()
# The grid must be added before saving: anything drawn after savefig appears
# only in the on-screen figure, not in the PNG.
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.savefig('00 - grouped_bar_age.png', dpi=500, bbox_inches='tight')
plt.show()

# Line chart: per-cluster mean of each quarterly seasonality column.
# season_cols must be assigned here; the groupby below needs it, and it lists
# one column per quarter in the same order as season_labels.
season_cols   = ['seasonality_Q1', 'seasonality_Q2', 'seasonality_Q3', 'seasonality_Q4']
season_labels = ['Q1\n(Jan-Mar)', 'Q2\n(Apr-Jun)', 'Q3\n(Jul-Sep)', 'Q4\n(Oct-Dec)']

season_data = clu.groupby('cluster')[season_cols].mean()

fig, ax = plt.subplots(figsize=(8, 6))
# One line per cluster, taken by row position from the grouped means.
for i in range(5):
    ax.plot(season_labels, season_data.iloc[i].values, 'o-', color=CLUSTER_COLORS[i], label=CLUSTER_NAMES[i], linewidth=2, markersize=5)

ax.set_title('Travel Seasonality by Customer Segment', fontweight='bold', fontsize=14, pad=20)
ax.set_ylabel('Proportion of bookings', labelpad=15, fontsize=12)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title='Cluster', title_fontsize=12, fontsize=12, frameon=False, loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=3)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig('00 - line_seasonality.png', dpi=500, bbox_inches='tight')
plt.show()

# Box plot of avg_amt for each of the five clusters, coloured per cluster.
fig, ax = plt.subplots(figsize=(9, 5))
# Fixed y-range; points above 0.35 fall outside the visible area.
ax.set_ylim(0, 0.35)

data_by_cluster = [clu[clu['cluster'] == i]['avg_amt'].values for i in range(5)]
bp = ax.boxplot(data_by_cluster, patch_artist=True, notch=False,
                medianprops={'color': 'white', 'linewidth': 2})
for patch, color in zip(bp['boxes'], CLUSTER_COLORS):
    patch.set_facecolor(color)
    patch.set_alpha(0.85)

# boxplot() returns two whisker and two cap artists per box, stored
# consecutively (lower, upper), so artist j belongs to box j // 2.
for element in ['whiskers', 'caps']:
    for j, item in enumerate(bp[element]):
        item.set_color(CLUSTER_COLORS[j // 2])

# One flier artist per box. Outliers are drawn as markers, so the marker edge
# is recoloured as well as the line colour.
for item, color in zip(bp['fliers'], CLUSTER_COLORS):
    item.set_color(color)
    item.set_markeredgecolor(color)

# Boxes are positioned at 1..5.
ax.set_xticks(range(1, 6))
ax.set_xticklabels(FULL_NAMES, fontsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.set_title('Distribution of Ticket Prices by Customer Segment', fontweight='bold', fontsize=16, pad=30)
ax.set_ylabel('Avg Ticket Amount (normalized 0-1)', labelpad=15, fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig('00 - boxplot_ticket_price.png', dpi=500, bbox_inches='tight')
plt.show()

# Small multiples: the five destination columns with the highest mean within
# each cluster, one panel per cluster on a 3x2 grid.
dest_prefix_all = 'true_destination_dest_'
dest_cols_all = [c for c in clu.columns if c.startswith(dest_prefix_all)]

single_line_names = [
    'MSP Direct Bookers',
    'MSP Deal Seekers',
    'Inbound Visitors',
    'Loyal Frequent Fliers',
    'Returning Home Fliers',
]

fig, axes = plt.subplots(3, 2, figsize=(10, 14))
axes = axes.flatten()

for i, ax in enumerate(axes[:5]):
    in_cluster = clu['cluster'] == i
    sub    = clu[in_cluster][dest_cols_all].mean().sort_values(ascending=False).head(5)
    labels = [c.replace(dest_prefix_all, '') for c in sub.index]
    bars   = ax.bar(labels, sub.values, color=CLUSTER_COLORS[i], edgecolor='white', linewidth=0.8)

    # Annotate bars, skipping those whose height is not above 0.001.
    for bar in bars:
        height = bar.get_height()
        if height > 0.001:
            ax.text(bar.get_x() + bar.get_width()/2, height + 0.003,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=12)

    ax.set_title(f'Cluster {i}: {single_line_names[i]}', fontsize=14, fontweight='bold',
                 color='black', pad=12)
    # Only panels with an even index (the left column) get a y-axis label.
    ylabel = 'Mean proportion' if i % 2 == 0 else ''
    ax.set_ylabel(ylabel, labelpad=12, fontsize=12)
    ax.set_xlabel('Destination', labelpad=12, fontsize=12)
    ax.tick_params(axis='x', labelsize=12, pad=6)
    ax.tick_params(axis='y', labelsize=12, pad=6)

# Five clusters on a six-panel grid: hide the unused last panel.
axes[5].set_visible(False)

fig.suptitle('Top 5 Destinations by Customer Segment', fontsize=16, fontweight='bold', y=1.02)
plt.subplots_adjust(hspace=0.4, top=0.96, bottom=0.04, left=0.08, right=0.97)
plt.savefig('00 - top_destinations_by_segment.png', dpi=500, bbox_inches='tight')
plt.show()

Sun Country Airlines — Customer Segmentation Analysis

0. Setup & Imports

1. Load & Inspect Data

2. Exploratory Data Analysis (EDA)

3. Feature Selection & Clustering Preparation

4. Choosing k: Elbow Method + Silhouette Analysis

5. K-Means Clustering (k=5)

6. Cluster Summary Table

7. Visualizations

7A. Cross-Cluster Comparison: Feature Heatmap

7B. Cross-Cluster Comparison: Ticket Price & Booking Lead Time

7C. Booking Channel Split by Segment

7D. Ufly Loyalty Status by Segment

7E. Age Distribution by Segment

7F. Travel Seasonality by Segment (Line Chart)

7G. Box Plots — Ticket Price Distribution by Segment

7J. Top Destinations by Segment

	uid	PNRLocatorID	avg_amt	round_trip	group_size	group	days_pre_booked	BookingChannel_Outside_Booking	...	UflyMemberStatus_non-ufly	UflyMemberStatus_Standard	seasonality_Q1	seasonality_Q3	seasonality_Q4
0	504554455244696420493F7C2067657420746869732072...	AADMLF	0.019524	0	0.000	0	0.029703	1	...	0	1	0	0	1
1	46495853454E44696420493F7C20676574207468697320...	AAFBOM	0.081774	1	0.000	0	0.039604	0	...	0	1	0	1	0
2	534355545444696420493F7C2067657420746869732072...	AAFILI	0.026650	0	0.125	1	0.069307	0	...	1	0	1	0	0
3	534355545444696420493F7C2067657420746869732072...	AAFILI	0.026650	0	0.125	1	0.069307	0	...	1	0	1	0	0
4	44554D4D414E4E44696420493F7C206765742074686973...	AAFRQI	0.000000	1	0.000	0	0.035361	1	...	1	0	0	0	1