import subprocess
subprocess.run(['pip3', 'install', 'textblob', 'seaborn', 'xlsxwriter', '--quiet'], capture_output=True)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

LAFC_BLACK = '#1A1A1A'
LAFC_GOLD  = '#C39E6A'
LAFC_WHITE = '#FFFFFF'

plt.rcParams.update({
    'figure.facecolor':  LAFC_WHITE,
    'axes.facecolor':    LAFC_WHITE,
    'axes.edgecolor':    LAFC_BLACK,
    'axes.labelcolor':   LAFC_BLACK,
    'xtick.color':       LAFC_BLACK,
    'ytick.color':       LAFC_BLACK,
    'text.color':        LAFC_BLACK,
    'font.family':       'sans-serif',
    'axes.spines.top':   False,
    'axes.spines.right': False,
})

SAVE_PATH = '/Users/new/Downloads/Projects/LAFC/'

df = pd.read_excel('/Users/new/Downloads/Projects/LAFC/rawdata.xlsx')

print(f'Shape: {df.shape}')
print(f'Columns: {list(df.columns)}')
print(f'Date range: {df["survey date"].min()} to {df["survey date"].max()}')
df.head(10)

Shape: (9246, 6)
Columns: ['survey date', 'response date', 'total spend', 'seating location', 'lafc member', 'feedback']
Date range: 2021-07-15 19:40:57 to 2026-03-16 19:17:44

null_summary = pd.DataFrame({'Null Count': df.isnull().sum(),'Null %': (df.isnull().sum() / len(df) * 100).round(1)})
print('=== NULL AUDIT ===')
print(null_summary)
print(f'\nSeating Location values: {df["seating location"].value_counts().to_dict()}')
print(f'\nLAFC Member (raw) values: {df["lafc member"].value_counts(dropna=False).to_dict()}')

=== NULL AUDIT ===
                  Null Count  Null %
survey date                0     0.0
response date              0     0.0
total spend             2559    27.7
seating location        2337    25.3
lafc member             7004    75.8
feedback                 244     2.6

Seating Location values: {'General': 3871, 'Floor/Pit': 1948, 'Premium': 868, 'Supporters Section': 157, 'VIP Floor/Pit': 65}

LAFC Member (raw) values: {nan: 7004, 'YES': 1677, 'NO': 565}

df['survey date']    = pd.to_datetime(df['survey date'])
df['response date']  = pd.to_datetime(df['response date'])

df['year']            = df['survey date'].dt.year
df['quarter']         = df['survey date'].dt.quarter
df['month']           = df['survey date'].dt.month
df['year_quarter']    = df['year'].astype(str) + ' Q' + df['quarter'].astype(str)
df['response_lag_days'] = (df['response date'] - df['survey date']).dt.days

EMPTY_RESPONSES = ['', 'no', 'n/a', 'na', 'none', 'nothing', 'nope', 'n.a.']
df['has_feedback'] = (df['feedback'].notna() &(~df['feedback'].str.strip().str.lower().isin(EMPTY_RESPONSES)))
df['has_spend'] = df['total spend'].notna()

# Renamed from 'lafc member' — likely reflects LAFC Account Manager registration,
# not season ticket membership. See data quality note below.
df['has_lafc_account'] = df['lafc member'].fillna('UNKNOWN')

# Spend tiers aligned to actual data distribution (median: $226, 75th pct: $399)
def spend_tier(x):
    if pd.isna(x):  return 'No Spend Data'
    if x < 100:     return 'A: Under $100'
    if x < 175:     return 'B: $100-$174'
    if x < 250:     return 'C: $175-$249'
    if x < 400:     return 'D: $250-$399'
    if x < 700:     return 'E: $400-$699'
    return          'F: $700+'

df['spend_tier'] = df['total spend'].apply(spend_tier)

print(f'Rows with meaningful feedback: {df["has_feedback"].sum():,} of {len(df):,} ({df["has_feedback"].mean()*100:.1f}%)')
print(f'Rows with spend data:          {df["has_spend"].sum():,} of {len(df):,} ({df["has_spend"].mean()*100:.1f}%)')

DISPLAY_COLS = ['survey date', 'year', 'year_quarter', 'seating location','total spend', 'spend_tier', 'has_lafc_account', 'has_feedback', 'feedback']
display(df[DISPLAY_COLS].head(8))

Rows with meaningful feedback: 8,580 of 9,246 (92.8%)
Rows with spend data:          6,687 of 9,246 (72.3%)

# Only unambiguous keywords used. Generic terms like 'floor seats', 'merch line',
# 'general admission' appear at both concerts AND matches and are intentionally excluded.

CONCERT_KW = [
    'concert', 'artist', 'band', 'music', 'performer',
    'encore', 'headliner', 'setlist', 'soundcheck', 'opening act',
    'ga pit', 'vip pit', 'fan pit', 'mosh',
    'kpop', 'k-pop', 'bts', 'ateez', 'itzy', 'txt', 'wango tango',
    'foo fighters', 'system of a down', 'soad', 'korn', 'the hives',
    'misfits', 'blackpink', 'stray kids', 'bad bunny',
]
ACFC_KW = [
    'acfc', 'angel city', 'nwsl', "women's", 'womens',
    'christen press', 'sophia smith', 'trinity rodman',
]
LAFC_KW = [
    'lafc', 'black and gold', 'endo', '3252', 'mls', 'season ticket',
    'dale', 'vela', 'bobblehead', 'scarf', 'shield',
    'home opener', 'supporters section', 'tifo',
]

def row_classify(row):
    text = str(row['feedback']).lower() if pd.notna(row['feedback']) else ''
    if any(k in text for k in CONCERT_KW): return 'Concert'
    if any(k in text for k in ACFC_KW):    return 'ACFC Match'
    if any(k in text for k in LAFC_KW):    return 'LAFC Match'
    return None

df['_row_type'] = df.apply(row_classify, axis=1)

# Batch-level majority vote — each survey date = one event
# Note: 2023-03-23 batch shows near-equal LAFC/ACFC votes (26 vs 25),
# suggesting a double-header day — classified as LAFC Match by plurality.
batch_type = (
    df[df['_row_type'].notna()]
    .groupby('survey date')['_row_type']
    .agg(lambda x: x.value_counts().index[0])
)

df['event_type'] = df['survey date'].map(batch_type)
df = df.drop(columns=['_row_type'])

print('=== EVENT TYPE DISTRIBUTION ===')
print(df['event_type'].value_counts())
print(f'\nConcert share: {(df["event_type"]=="Concert").mean()*100:.1f}%')
print(f'ACFC share:    {(df["event_type"]=="ACFC Match").mean()*100:.1f}%')
print(f'LAFC share:    {(df["event_type"]=="LAFC Match").mean()*100:.1f}%')

=== EVENT TYPE DISTRIBUTION ===
event_type
Concert       5656
LAFC Match    3422
ACFC Match     168
Name: count, dtype: int64

Concert share: 61.2%
ACFC share:    1.8%
LAFC share:    37.0%

def get_sentiment(text):
    if pd.isna(text) or str(text).strip().lower() in EMPTY_RESPONSES:
        return np.nan, 'No Feedback'
    score = TextBlob(str(text)).sentiment.polarity
    if   score >  0.1: label = 'Positive'
    elif score < -0.1: label = 'Negative'
    else:              label = 'Neutral'
    return round(score, 4), label

sentiment_results      = df['feedback'].apply(lambda x: get_sentiment(x))
df['sentiment_score']  = sentiment_results.apply(lambda x: x[0])
df['sentiment_label']  = sentiment_results.apply(lambda x: x[1])

print('=== SENTIMENT DISTRIBUTION ===')
print(df['sentiment_label'].value_counts())
print(f'\nAvg sentiment score: {df["sentiment_score"].mean():.3f}')
print(f'Pct Positive: {(df["sentiment_label"]=="Positive").mean()*100:.1f}%')
print(f'Pct Negative: {(df["sentiment_label"]=="Negative").mean()*100:.1f}%')

=== SENTIMENT DISTRIBUTION ===
sentiment_label
Positive       3630
Neutral        3196
Negative       1754
No Feedback     666
Name: count, dtype: int64

Avg sentiment score: 0.094
Pct Positive: 39.3%
Pct Negative: 19.0%

THEME_KEYWORDS = {
    'Parking & Transit': [
        'parking', 'traffic', 'uber', 'rideshare', 'lyft', 'transit',
        'expo line', 'freeway', 'congestion', 'figueroa', 'transportation',
        'prepaid parking', 'parking lot', 'parking fee', 'parking price',
        'parking cost', 'shuttle', 'exit the parking', 'leaving the parking',
    ],
    'Food & Beverage': [
        'food', 'beer', 'beverage', 'concession', 'menu', 'vendor',
        'alcohol', 'nachos', 'hot dog', 'pizza', 'burger', 'snack',
        'bartend', 'cocktail', 'pretzel', 'salad', 'drink',
        'water bottle', 'non-alcoholic', 'vegan', 'food option',
        'food quality', 'food price', 'food line', 'food selection',
        'dole whip', 'churro', 'ice cream', 'seltzer', 'heineken',
        'cheaper options', 'water dispenser', 'free water',
        'bar line', 'get a drink', 'buy a drink', 'more bars',
    ],
    'Entry & Operations': [
        'entry', 'entrance', 'wristband', 'gate', 'checkpoint',
        'signage', 'disorganized', 'chaos', 'clear bag',
        'metal detector', 'check-in', 'checkin', 'ingress', 'egress',
        'ticket scan', 'long line', 'wait time', 'queue',
        'line to get in', 'line was long', 'wait to get',
        'getting in', 'to enter', 'exit was', 'exiting',
        'manage lines', 'security check', 'security line',
        'getting into the venue', 'security procedure',
    ],
    'Staff & Service': [
        'staff', 'employee', 'worker', 'customer service', 'rude',
        'friendly staff', 'helpful staff', 'unhelpful', 'professional',
        'training', 'usher', 'knowledgeable', 'condescending',
        'security guard', 'security staff', 'security team',
        'security personnel', 'change security', 'new security',
        'security let', 'security was',
    ],
    'Facilities & Amenities': [
        'bathroom', 'restroom', 'toilet', 'toilet paper', 'dirty',
        'acoustics', 'lighting', 'accessible', 'accessibility',
        'elevator', 'staircase', 'floor access', 'obstructed view',
        'sound quality', 'sound system', 'broken seat',
        'seats are too', 'seat spacing', 'extra space',
        'seats too close', 'close together', 'ada', 'wheelchair',
        'washroom', 'more shade', 'floor seats', 'see the stage',
        'stage visibility', 'inclined floor', 'stage too small',
        'screens are small', 'echo', 'smoke', 'pyrotechnics',
    ],
    'Pricing & Value': [
        'expensive', 'overpriced', 'affordable', 'price',
        'too much', 'cost too', 'costs too', 'pricing',
        'ticket price', 'parking price', 'beer price',
        'food price', 'raise prices', 'lower prices',
        'price gouging', 'markup', 'worth the price',
        'not worth', 'rip-off', 'ripoff', 'cheaper',
        'less than $', 'more than $', 'cost $', 'costs $',
    ],
    'Atmosphere & Experience': [
        'atmosphere', 'vibe', 'energy', 'ambiance', 'community',
        'culture', 'tradition', '3252', 'electric',
        'fan experience', 'overall experience', 'first time',
        'love this', 'love the', 'loved the', 'love going',
        'amazing time', 'great time', 'had a blast',
        'disappointing', 'frustrating experience',
        'supporter section', 'flags in the',
    ],
    'Technology & App': [
        'app', 'mobile app', 'website', 'digital', 'qr code',
        'technology', 'mobile order', 'contactless', 'cashless',
        'preorder', 'pre-order', 'account manager', 'ticketmaster',
        'notification', 'online purchase', 'emails arrive', 'email late',
    ],
    'Merchandise': [
        'merch', 'merchandise', 'team store', 'jersey', 'bobblehead',
        'giveaway', 'souvenir', 'sold out', 'merch line',
        'merch booth', 'buy a jersey', 'merchandise selection',
        'collectors', 'game pins', 'luggage tag',
    ],
    'Membership & Loyalty': [
        'season ticket', 'membership', 'sth', 'loyalty', 'reward',
        'member benefit', 'perk', 'renewal', 'exclusive access',
        'club member', 'season pass', 'member discount',
        'ticket holder', 'season holder', 'became member',
        'as a member', 'member since',
    ],
}

def assign_themes(text):
    if pd.isna(text) or str(text).strip().lower() in EMPTY_RESPONSES:
        return []
    text_lower = str(text).lower()
    matched = [theme for theme, kws in THEME_KEYWORDS.items()
               if any(k in text_lower for k in kws)]
    return matched if matched else ['General / Other']

df['themes']        = df['feedback'].apply(assign_themes)
df['theme_count']   = df['themes'].apply(len)
df['primary_theme'] = df['themes'].apply(lambda x: x[0] if x else 'General / Other')

df_exploded = df.explode('themes').rename(columns={'themes': 'theme'})
df_exploded = df_exploded[df_exploded['theme'].notna()]

print('=== THEME FREQUENCY (all years) ===')
print(df_exploded['theme'].value_counts())
print(f'\nTotal theme assignments: {len(df_exploded):,}')
print(f'Avg themes per response: {df["theme_count"].mean():.2f}')
print(f'Responses with no theme match (General/Other): {(df["themes"].apply(lambda x: x == ["General / Other"])).sum():,}')

=== THEME FREQUENCY (all years) ===
theme
General / Other            2180
Food & Beverage            2021
Parking & Transit          1643
Entry & Operations         1611
Staff & Service            1272
Facilities & Amenities     1124
Pricing & Value             816
Technology & App            752
Merchandise                 474
Atmosphere & Experience     456
Membership & Loyalty        290
Name: count, dtype: int64

Total theme assignments: 12,639
Avg themes per response: 1.37
Responses with no theme match (General/Other): 2,180

# Chart 1: Overall Theme Frequency (% of total theme assignments)
theme_counts = (
    df_exploded[df_exploded['theme'] != 'General / Other']
    ['theme'].value_counts().sort_values()
)
theme_pct = (theme_counts / theme_counts.sum() * 100).round(1)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(theme_pct.index, theme_pct.values,
               color=LAFC_GOLD, edgecolor=LAFC_BLACK, linewidth=0.5)
ax.bar_label(bars, fmt='%.1f%%', padding=4, fontsize=9, color=LAFC_BLACK)
ax.set_xlabel('% of Total Feedback Mentions', fontsize=10)
ax.set_title('Fan Feedback Theme Frequency\n2021-2026 (All Events)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlim(0, theme_pct.max() * 1.2)
ax.xaxis.set_major_formatter(mticker.PercentFormatter())

plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_theme_frequency.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 2: Theme Frequency by Year (heatmap) — excludes General/Other and flags 2026 partial year
theme_year_pivot = (
    df_exploded[df_exploded['theme'] != 'General / Other']
    .groupby(['year', 'theme']).size().unstack(fill_value=0)
)

fig, ax = plt.subplots(figsize=(13, 6))
sns.heatmap(
    theme_year_pivot.T,
    annot=True, fmt='d', cmap='YlOrBr',
    linewidths=0.5, linecolor='white',
    ax=ax, cbar_kws={'label': 'Mention Count'}
)
ax.set_title('Feedback Theme Frequency by Year\n(Darker = More Mentions)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('')

new_labels = [str(int(float(t.get_text()))) if t.get_text() != '2026'
              else '2026*' for t in ax.get_xticklabels()]
ax.set_xticklabels(new_labels)

plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_theme_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 3: Fan Sentiment Distribution by Year (stacked bar, 2021-2026)
sent_year = df[df['sentiment_label'] != 'No Feedback'].groupby(
    ['year', 'sentiment_label']).size().unstack(fill_value=0)
sent_pct = sent_year.div(sent_year.sum(axis=1), axis=0) * 100

# Rename 2026 to flag partial year
sent_pct.index = [str(y) if y != 2026 else '2026*' for y in sent_pct.index]

fig, ax = plt.subplots(figsize=(10, 5))
sent_pct[['Positive', 'Neutral', 'Negative']].plot(
    kind='bar', stacked=True, ax=ax,
    color=[LAFC_GOLD, '#CCCCCC', LAFC_BLACK],
    edgecolor='white', linewidth=0.5
)
ax.set_title('Fan Sentiment Distribution by Year', fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('% of Responses', fontsize=10)
ax.yaxis.set_major_formatter(mticker.PercentFormatter())
ax.legend(title='Sentiment', bbox_to_anchor=(1.01, 1), loc='upper left')
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
fig.text(0.13, -0.04, '* 2026 data through March only', fontsize=7.5, color='gray', style='italic')
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_sentiment_year.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 4: Sentiment Breakdown by Theme (stacked 100% bar with % labels)
# Re-create df_exploded with sentiment columns now available
df_exploded = df.explode('themes').rename(columns={'themes': 'theme'})
df_exploded = df_exploded[df_exploded['theme'].notna()]

df_sent_theme = df_exploded[
    (df_exploded['theme'] != 'General / Other') &
    (df_exploded['sentiment_label'].isin(['Positive', 'Neutral', 'Negative']))
]

theme_sent = (
    df_sent_theme
    .groupby(['theme', 'sentiment_label']).size()
    .unstack(fill_value=0)
)
theme_sent_pct = theme_sent.div(theme_sent.sum(axis=1), axis=0) * 100
theme_sent_pct = theme_sent_pct[['Positive', 'Neutral', 'Negative']]
theme_sent_pct = theme_sent_pct.sort_values('Negative', ascending=True)

fig, ax = plt.subplots(figsize=(12, 7))

bars_pos = ax.barh(theme_sent_pct.index, theme_sent_pct['Positive'],
                   color=LAFC_GOLD, edgecolor='white', linewidth=0.5, label='Positive')
bars_neu = ax.barh(theme_sent_pct.index, theme_sent_pct['Neutral'],
                   left=theme_sent_pct['Positive'],
                   color='#CCCCCC', edgecolor='white', linewidth=0.5, label='Neutral')
bars_neg = ax.barh(theme_sent_pct.index, theme_sent_pct['Negative'],
                   left=theme_sent_pct['Positive'] + theme_sent_pct['Neutral'],
                   color=LAFC_BLACK, edgecolor='white', linewidth=0.5, label='Negative')

for i, theme in enumerate(theme_sent_pct.index):
    pos = theme_sent_pct.loc[theme, 'Positive']
    neu = theme_sent_pct.loc[theme, 'Neutral']
    neg = theme_sent_pct.loc[theme, 'Negative']
    if pos > 6:
        ax.text(pos / 2, i, f'{pos:.0f}%', ha='center', va='center',
                fontsize=8.5, color=LAFC_BLACK, fontweight='bold')
    if neu > 6:
        ax.text(pos + neu / 2, i, f'{neu:.0f}%', ha='center', va='center',
                fontsize=8.5, color=LAFC_BLACK, fontweight='bold')
    if neg > 6:
        ax.text(pos + neu + neg / 2, i, f'{neg:.0f}%', ha='center', va='center',
                fontsize=8.5, color='white', fontweight='bold')

ax.set_xlabel('% of Responses', fontsize=10)
ax.set_title('Sentiment Breakdown by Feedback Theme\n(Sorted by Most Negative)',
             fontsize=13, fontweight='bold', pad=12)
ax.xaxis.set_major_formatter(mticker.PercentFormatter())
ax.set_xlim(0, 100)
ax.axvline(x=50, color='gray', linestyle='--', linewidth=0.8, alpha=0.4)
ax.legend(title='Sentiment', bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_theme_sentiment.png', dpi=150, bbox_inches='tight')
plt.show()

# Setup: filter to rows with spend data
df_spend = df[df['has_spend']].copy()

print(f'Respondents with spend data: {len(df_spend):,} of {len(df):,} ({len(df_spend)/len(df)*100:.1f}%)')
print(f'Total captured spend: ${df_spend["total spend"].sum():,.0f}')
print(f'Avg spend per respondent: ${df_spend["total spend"].mean():,.2f}')
print(f'Median spend per respondent: ${df_spend["total spend"].median():,.2f}')

Respondents with spend data: 6,687 of 9,246 (72.3%)
Total captured spend: $2,263,087
Avg spend per respondent: $338.43
Median spend per respondent: $226.80

# Spend by seating tier
spend_by_tier = df_spend.groupby('seating location')['total spend'].agg(
    ['mean', 'median', 'sum', 'count']
).round(2).sort_values('mean', ascending=False)
spend_by_tier.columns = ['Avg Spend', 'Median Spend', 'Total Spend', 'Respondents']
spend_by_tier['% of Total Spend'] = (
    spend_by_tier['Total Spend'] / spend_by_tier['Total Spend'].sum() * 100
).round(1)

print('=== SPEND BY SEATING TIER ===')
print(spend_by_tier.to_string())
print('\nKey insight: Floor/Pit generates 41.9% of all captured revenue despite VIP Floor/Pit')
print('having the highest avg spend — volume beats rate. General (39.3%) and Floor/Pit')
print('together drive 81% of total revenue.')

=== SPEND BY SEATING TIER ===
                    Avg Spend  Median Spend  Total Spend  Respondents  % of Total Spend
seating location                                                                       
VIP Floor/Pit          572.57         499.0     37217.00           65               1.6
Floor/Pit              485.10         389.5    944982.21         1948              41.9
Premium                462.47         336.4    372291.93          805              16.5
General                239.52         178.5    887891.29         3707              39.3
Supporters Section     102.88          44.0     14814.45          144               0.7

Key insight: Floor/Pit generates 41.9% of all captured revenue despite VIP Floor/Pit
having the highest avg spend — volume beats rate. General (39.3%) and Floor/Pit
together drive 81% of total revenue.

# Spend by account status
account_spend = df_spend.groupby('has_lafc_account')['total spend'].agg(
    ['mean', 'median', 'count']
).round(2)
account_spend.columns = ['Avg Spend', 'Median Spend', 'Respondents']

print('=== SPEND BY ACCOUNT STATUS ===')
print(account_spend)
print('\nData quality notes:')
print('- NO group (n=27): too small for reliable conclusions')
print('- UNKNOWN (n=5,504): majority of spend data, most reliable segment')
print('- has_lafc_account likely reflects LAFC Account Manager registration, not season ticket membership')
print('\nHypothesis: account holders (YES) spend less per visit than UNKNOWN ($278 vs $350):')
print('- Account holders are likely more digitally savvy — they may seek presales, promo codes,')
print('  and discounted parking, resulting in lower per-visit spend')
print('- Lower per-visit spend does not mean lower lifetime value — frequency is the missing variable')
print('- Recommendation: track visit frequency alongside spend to measure true fan LTV')

=== SPEND BY ACCOUNT STATUS ===
                  Avg Spend  Median Spend  Respondents
has_lafc_account                                      
NO                   534.84        182.70           27
UNKNOWN              349.97        239.85         5504
YES                  278.90        169.44         1156

Data quality notes:
- NO group (n=27): too small for reliable conclusions
- UNKNOWN (n=5,504): majority of spend data, most reliable segment
- has_lafc_account likely reflects LAFC Account Manager registration, not season ticket membership

Hypothesis: account holders (YES) spend less per visit than UNKNOWN ($278 vs $350):
- Account holders are likely more digitally savvy — they may seek presales, promo codes,
  and discounted parking, resulting in lower per-visit spend
- Lower per-visit spend does not mean lower lifetime value — frequency is the missing variable
- Recommendation: track visit frequency alongside spend to measure true fan LTV

# Spend by event type — with data availability audit
print('=== SPEND DATA AVAILABILITY BY EVENT TYPE ===')
availability = df.groupby('event_type')['has_spend'].value_counts().unstack(fill_value=0)
availability.columns = ['No Spend Data', 'Has Spend Data']
availability['Total Respondents'] = availability.sum(axis=1)
availability['Spend Capture Rate'] = (
    availability['Has Spend Data'] / availability['Total Respondents'] * 100
).round(1)
display(availability)

print('\n=== SPEND BY EVENT TYPE (where data exists) ===')
display(df_spend.groupby('event_type')['total spend'].agg(['mean', 'median', 'count']).round(2))

print('\nCritical data quality flags:')
print('- ACFC Match: 0% spend capture — surveys never collected spend data')
print('- LAFC Match: only 35.7% spend capture vs 96.6% for Concerts')
print('- Concert vs LAFC Match spend comparison is NOT reliable due to this gap')
print('- Recommendation: standardize spend data collection across ALL event types')

=== SPEND DATA AVAILABILITY BY EVENT TYPE ===

=== SPEND BY EVENT TYPE (where data exists) ===

Critical data quality flags:
- ACFC Match: 0% spend capture — surveys never collected spend data
- LAFC Match: only 35.7% spend capture vs 96.6% for Concerts
- Concert vs LAFC Match spend comparison is NOT reliable due to this gap
- Recommendation: standardize spend data collection across ALL event types

# Overall spend distribution
print('=== OVERALL SPEND DISTRIBUTION ===')
print(df_spend['total spend'].describe(percentiles=[.10, .25, .50, .75, .90, .95]).round(2))

print('\n=== SPEND TIER BREAKDOWN ===')
print(df_spend['spend_tier'].value_counts().sort_index())
print('\n50% of fans spend under $227 per visit')
print('Top 10% of fans spend over $718 per visit')
print('Top 5% spend over $959 — premium upsell target segment')

=== OVERALL SPEND DISTRIBUTION ===
count    6687.00
mean      338.43
std       338.23
min        20.00
10%        91.06
25%       139.00
50%       226.80
75%       399.00
90%       718.00
95%       959.00
max      5500.00
Name: total spend, dtype: float64

=== SPEND TIER BREAKDOWN ===
spend_tier
A: Under $100    1118
B: $100-$174     1218
C: $175-$249     1232
D: $250-$399     1556
E: $400-$699      834
F: $700+          729
Name: count, dtype: int64

50% of fans spend under $227 per visit
Top 10% of fans spend over $718 per visit
Top 5% spend over $959 — premium upsell target segment

# Chart 5: Avg Fan Spend by Seating Tier
tier_order = spend_by_tier.sort_values('Avg Spend').index
colors = [LAFC_GOLD if t in ['VIP Floor/Pit', 'Floor/Pit', 'Premium'] else '#AAAAAA'for t in tier_order]

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.barh(tier_order, spend_by_tier.loc[tier_order, 'Avg Spend'],color=colors, edgecolor=LAFC_BLACK, linewidth=0.5)
ax.bar_label(bars, fmt='$%.0f', padding=4, fontsize=10)
ax.set_xlabel('Average Spend per Fan ($)', fontsize=10)
ax.set_title('Average Fan Spend by Seating Tier\n2021-2026',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlim(0, 700)
fig.text(
    0.13, -0.04,
    'Sample sizes — VIP Floor/Pit: n=65  |  Floor/Pit: n=1,948  |  '
    'Premium: n=805  |  General: n=3,707  |  Supporters Section: n=144\n'
    'Gold = premium tiers  |  Grey = value tiers  |  Survey-captured spend only',
    fontsize=7.5, color='gray', style='italic'
)
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_spend_tier.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 6: Revenue Share by Seating Tier (Donut)
revenue_share = spend_by_tier.sort_values('Total Spend', ascending=False)
tier_colors = {
    'Floor/Pit':          '#C39E6A',
    'General':            '#2E4057',
    'Premium':            '#1A1A1A',
    'VIP Floor/Pit':      '#E84855',
    'Supporters Section': '#3BB273',
}
donut_colors = [tier_colors[t] for t in revenue_share.index]

fig, ax = plt.subplots(figsize=(9, 7))
wedges, texts, autotexts = ax.pie(
    revenue_share['Total Spend'],
    autopct='%1.1f%%',
    colors=donut_colors,
    startangle=90,
    pctdistance=0.78,
    wedgeprops=dict(width=0.5, edgecolor='white', linewidth=2.5),
    labels=None,
)
for i, autotext in enumerate(autotexts):
    pct = revenue_share['Total Spend'].iloc[i] / revenue_share['Total Spend'].sum() * 100
    if pct < 3:
        autotext.set_text('')
    else:
        autotext.set_fontsize(10)
        autotext.set_fontweight('bold')
        autotext.set_color('white')

ax.legend(
    wedges,
    [f'{tier}  —  {pct:.1f}%' for tier, pct in
     zip(revenue_share.index,
         revenue_share['Total Spend'] / revenue_share['Total Spend'].sum() * 100)],
    title='Seating Tier',
    loc='center left',
    bbox_to_anchor=(1.0, 0.5),
    fontsize=10,
    title_fontsize=10
)
fig.suptitle('% of Total Captured Revenue by Seating Tier 2021-2026',
             fontsize=13, fontweight='bold', x=0.55, y=0.92)
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_revenue_share.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 7: Avg Spend per Event — Year Over Year (2021-2025, excludes partial 2026)
yearly_spend = (
    df_spend[df_spend['year'] < 2026]
    .groupby('year')['total spend'].mean().reset_index()
)

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(yearly_spend['year'], yearly_spend['total spend'],
        color=LAFC_GOLD, marker='o', linewidth=2.5,
        markersize=8, markerfacecolor=LAFC_BLACK)
for _, row in yearly_spend.iterrows():
    ax.annotate(f"${row['total spend']:.0f}",
                xy=(row['year'], row['total spend']),
                xytext=(0, 10), textcoords='offset points',
                ha='center', fontsize=9)
ax.set_title('Average Fan Spend per Event — Year Over Year\n2021-2025',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Avg Spend ($)', fontsize=10)
ax.set_xticks(yearly_spend['year'])
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_spend_yoy.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 8: Avg Spend by Seating Tier — Year Over Year (2021-2025)
# VIP Floor/Pit excluded (n=65 total, unreliable trend)
tier_year = (
    df_spend[
        (df_spend['year'] < 2026) &
        (df_spend['seating location'] != 'VIP Floor/Pit')
    ]
    .groupby(['year', 'seating location'])['total spend'].mean().unstack()
)

tier_styles = {
    'Floor/Pit':          {'color': '#C39E6A', 'marker': 'o', 'linestyle': '-',  'linewidth': 2.5},
    'General':            {'color': '#1A1A1A', 'marker': 's', 'linestyle': '-',  'linewidth': 2.5},
    'Premium':            {'color': '#2E4057', 'marker': '^', 'linestyle': '-',  'linewidth': 2.5},
    'Supporters Section': {'color': '#3BB273', 'marker': 'D', 'linestyle': '--', 'linewidth': 2},
}

fig, ax = plt.subplots(figsize=(11, 6))
for col in tier_year.columns:
    style = tier_styles.get(col, {})
    ax.plot(tier_year.index, tier_year[col],
            color=style['color'], marker=style['marker'],
            linestyle=style['linestyle'], linewidth=style['linewidth'],
            markersize=7, label=col)

ax.set_title('Average Spend by Seating Tier — Year Over Year\n2021-2025',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Avg Spend ($)', fontsize=10)
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
ax.legend(title='Seating Tier', bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=9)
ax.set_xticks(tier_year.index)
fig.text(
    0.13, -0.03,
    'VIP Floor/Pit excluded (n=65 total — insufficient for reliable trend)\n'
    'Floor/Pit n=1,877  |  General n=3,563  |  Premium n=767  |  Supporters Section n=135',
    fontsize=7.5, color='gray', style='italic'
)
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_spend_tier_yoy.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 9: Spend Tier Distribution Shift — Simplified to 3 tiers
df_spend_2125 = df_spend[df_spend['year'] < 2026].copy()

def broad_tier(x):
    if x < 175:  return 'Low (Under $175)'
    if x < 400:  return 'Mid ($175-$399)'
    return       'High ($400+)'

df_spend_2125['broad_tier'] = df_spend_2125['total spend'].apply(broad_tier)

tier_dist = (
    pd.crosstab(df_spend_2125['year'], df_spend_2125['broad_tier'], normalize='index')
    .mul(100).round(1)
)[['Low (Under $175)', 'Mid ($175-$399)', 'High ($400+)']]

tier_styles = {
    'Low (Under $175)': {'color': '#E84855', 'marker': 'o', 'linestyle': '--'},
    'Mid ($175-$399)':  {'color': '#2E4057', 'marker': 's', 'linestyle': '-'},
    'High ($400+)':     {'color': '#C39E6A', 'marker': '*', 'linestyle': '-'},
}

fig, ax = plt.subplots(figsize=(10, 5))

for tier in tier_dist.columns:
    style = tier_styles[tier]
    ax.plot(tier_dist.index, tier_dist[tier],
            color=style['color'], marker=style['marker'],
            linestyle=style['linestyle'], linewidth=2.5,
            markersize=8, label=tier)

    # Label only start (2021) and end (2025)
    start_val = tier_dist[tier].iloc[0]
    end_val   = tier_dist[tier].iloc[-1]

    ax.annotate(f'{start_val:.0f}%',
                xy=(2021, start_val),
                xytext=(-28, 0), textcoords='offset points',
                va='center', fontsize=9,
                color=style['color'], fontweight='bold')

    ax.annotate(f'{end_val:.0f}%',
                xy=(2025, end_val),
                xytext=(10, 0), textcoords='offset points',
                va='center', fontsize=9,
                color=style['color'], fontweight='bold')

ax.set_title('Fan Spend Distribution — Low vs. Mid vs. High Spenders\n2021-2025',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('% of Fans', fontsize=10)
ax.yaxis.set_major_formatter(mticker.PercentFormatter())
ax.set_xticks(tier_dist.index)
ax.set_xlim(2020.5, 2025.8)
ax.set_ylim(0, 70)
ax.yaxis.grid(True, linestyle='--', linewidth=0.5, color='#DDDDDD', alpha=0.7)
ax.set_axisbelow(True)
ax.legend(title='Spend Tier', loc='upper right', fontsize=9)

fig.text(
    0.13, -0.04,
    'Low spenders (under 175) dropped from 54% to 27% of fans (2021-2025)  |  High spenders (400+) grew from 9% to 39%\n'
    '2024 figures should be interpreted cautiously — lower survey volume (n=627) increases variance',
    fontsize=7.5, color='gray', style='italic'
)

plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_spend_tier_shift.png', dpi=150, bbox_inches='tight')
plt.show()

# Yearly KPI Summary (2021-2025)
yearly_kpi = df[df['year'] < 2026].groupby('year').agg(
    total_responses  = ('feedback', 'count'),
    with_feedback    = ('has_feedback', 'sum'),
    avg_spend        = ('total spend', 'mean'),
    median_spend     = ('total spend', 'median'),
    total_spend      = ('total spend', 'sum'),
    pct_with_account = ('has_lafc_account', lambda x: round((x == 'YES').sum() / len(x) * 100, 1)),
    avg_sentiment    = ('sentiment_score', 'mean'),
    avg_response_lag = ('response_lag_days', 'mean')
).reset_index()

yearly_kpi['feedback_rate'] = (yearly_kpi['with_feedback'] / yearly_kpi['total_responses'] * 100).round(1)
yearly_kpi = yearly_kpi.round(2)

print('=== YEARLY KPI SUMMARY (2021-2025) ===')
print(yearly_kpi.to_string())
print('\nKey observations:')
print('- Survey volume peaked in 2022 (n=3,085) and dropped to 858 in 2024 — reason unclear, worth investigating')
print(f'- Avg spend growth 2021-2025: ${yearly_kpi.loc[0, "avg_spend"]:.0f} to ${yearly_kpi.loc[4, "avg_spend"]:.0f} (+{((yearly_kpi.loc[4, "avg_spend"] / yearly_kpi.loc[0, "avg_spend"]) - 1) * 100:.0f}%)')
print(f'- Avg sentiment remarkably flat across all years (range: {yearly_kpi["avg_sentiment"].min():.2f} to {yearly_kpi["avg_sentiment"].max():.2f})')
print('- pct_with_account volatility likely reflects inconsistent survey question inclusion, not true behavioral change')
print('\n2026 excluded from this table — partial year (Jan-Mar only)')

=== YEARLY KPI SUMMARY (2021-2025) ===
   year  total_responses  with_feedback  avg_spend  median_spend  total_spend  pct_with_account  avg_sentiment  avg_response_lag  feedback_rate
0  2021             1713           1669     205.66        159.00    317339.93              17.1           0.08              3.32           97.4
1  2022             3085           2947     327.55        253.68    682931.50              15.8           0.09              4.31           95.5
2  2023             1853           1706     419.68        308.00    606438.47               7.7           0.11              3.97           92.1
3  2024              858            824     397.34        359.00    249131.75              29.5           0.06              2.73           96.0
4  2025             1163           1131     440.78        299.00    320448.15              31.6           0.10              3.11           97.2

Key observations:
- Survey volume peaked in 2022 (n=3,085) and dropped to 858 in 2024 — reason unclear, worth investigating
- Avg spend growth 2021-2025: $206 to $441 (+114%)
- Avg sentiment remarkably flat across all years (range: 0.06 to 0.11)
- pct_with_account volatility likely reflects inconsistent survey question inclusion, not true behavioral change

2026 excluded from this table — partial year (Jan-Mar only)

# Account status penetration by year (2021-2025)
member_year = df[df['year'] < 2026].groupby(['year', 'has_lafc_account']).size().unstack(fill_value=0)
member_pct  = member_year.div(member_year.sum(axis=1), axis=0) * 100

print('=== ACCOUNT STATUS PENETRATION BY YEAR (2021-2025) ===')
print(member_pct.round(1))
print('\nKey observations:')
print('- UNKNOWN % declining (75% in 2021 to 58% in 2025) — reflects improved data collection,')
print('  not necessarily more account holders')
print('- YES % appears to grow but may just reflect more surveys including the question')
print('- True account holder growth cannot be determined from this data alone')
print('- Recommendation: include account status question on ALL surveys for reliable loyalty baseline')

=== ACCOUNT STATUS PENETRATION BY YEAR (2021-2025) ===
has_lafc_account    NO  UNKNOWN   YES
year                                 
2021               7.8     75.1  17.1
2022               3.6     80.6  15.8
2023               2.9     89.3   7.7
2024              10.8     59.7  29.5
2025              10.6     57.8  31.6

Key observations:
- UNKNOWN % declining (75% in 2021 to 58% in 2025) — reflects improved data collection,
  not necessarily more account holders
- YES % appears to grow but may just reflect more surveys including the question
- True account holder growth cannot be determined from this data alone
- Recommendation: include account status question on ALL surveys for reliable loyalty baseline

# Chart 10: High-Value Fan Share Over Time (2021-2025)
threshold = df_spend['total spend'].quantile(0.75)
df_spend['is_high_value'] = df_spend['total spend'] >= threshold

hv_year = (
    df_spend[df_spend['year'] < 2026]
    .groupby('year')['is_high_value']
    .agg(['sum', 'count'])
)
hv_year['pct'] = (hv_year['sum'] / hv_year['count'] * 100).round(1)
hv_year = hv_year.reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(hv_year['year'], hv_year['pct'],
        color=LAFC_GOLD, marker='o', linewidth=2.5,
        markersize=9, markerfacecolor=LAFC_BLACK)

for _, row in hv_year.iterrows():
    ax.annotate(f"{row['pct']:.1f}%",
                xy=(row['year'], row['pct']),
                xytext=(0, 12), textcoords='offset points',
                ha='center', fontsize=10, fontweight='bold', color=LAFC_BLACK)

ax.set_title(f'High-Value Fan Share Over Time\n'
             f'High Value = Top 25% of Attendees by Spend (>= ${threshold:.0f}/visit)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('% of Attendees', fontsize=10)
ax.set_xticks(hv_year['year'])
ax.yaxis.set_major_formatter(mticker.PercentFormatter())
ax.set_ylim(0, 55)

fig.text(
    0.13, -0.03,
    '2024 dip likely reflects lower survey volume (n=627 vs n=1,445 in 2023) rather than a real behavioral change\n'
    '2026 excluded — partial year data',
    fontsize=7.5, color='gray', style='italic'
)
plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_high_value_growth.png', dpi=150, bbox_inches='tight')
plt.show()

# Chart 11a: Top 3 Themes — Negative Sentiment Rate Over Time
# % of theme mentions that are negative each year
top3_themes = ['Food & Beverage', 'Parking & Transit', 'Entry & Operations']

df_exploded = df.explode('themes').rename(columns={'themes': 'theme'})
df_exploded = df_exploded[df_exploded['theme'].notna()]

theme_counts_yr = df_exploded[
    (df_exploded['theme'].isin(top3_themes)) &
    (df_exploded['year'] < 2026)
].groupby(['year', 'theme']).size()

sent_counts_yr = df_exploded[
    (df_exploded['theme'].isin(top3_themes)) &
    (df_exploded['year'] < 2026)
].groupby(['year', 'theme', 'sentiment_label']).size().unstack(fill_value=0)

neg_rate = (
    sent_counts_yr['Negative']
    .div(theme_counts_yr)
    .mul(100).round(1)
    .unstack('theme')
)

theme_styles = {
    'Food & Beverage':    {'color': '#C39E6A'},
    'Parking & Transit':  {'color': '#E84855'},
    'Entry & Operations': {'color': '#2E4057'},
}

fig, ax = plt.subplots(figsize=(11, 6))

for theme, style in theme_styles.items():
    if theme not in neg_rate.columns:
        continue
    ax.plot(neg_rate.index, neg_rate[theme],
            color=style['color'], marker='o',
            linewidth=2.5, markersize=8, label=theme)
    ax.annotate(f"{neg_rate[theme].iloc[0]:.0f}%",
                xy=(2021, neg_rate[theme].iloc[0]),
                xytext=(-28, 0), textcoords='offset points',
                va='center', fontsize=8.5,
                color=style['color'], fontweight='bold')
    ax.annotate(f"{neg_rate[theme].iloc[-1]:.0f}%",
                xy=(2025, neg_rate[theme].iloc[-1]),
                xytext=(10, 0), textcoords='offset points',
                va='center', fontsize=8.5,
                color=style['color'], fontweight='bold')

ax.set_title('Three Persistent Pain Points — Negative Sentiment Rate Over Time\n'
             '% of Theme Mentions That Were Negative Each Year (2021-2025)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('% of Theme Mentions (Negative)', fontsize=10)
ax.yaxis.set_major_formatter(mticker.PercentFormatter())
ax.set_xticks(neg_rate.index)
ax.set_xlim(2020.7, 2025.5)
ax.set_ylim(0, 50)
ax.yaxis.grid(True, linestyle='--', linewidth=0.5, color='#DDDDDD', alpha=0.7)
ax.set_axisbelow(True)
ax.legend(title='Theme', loc='upper right', fontsize=9)
fig.text(
    0.13, -0.03,
    'Negative = TextBlob sentiment score < -0.1  |  % of mentions for that theme in that year\n'
    'Example: 32% means 32 out of every 100 parking mentions that year were negative. 2026 excluded.',
    fontsize=7.5, color='gray', style='italic'
)

plt.tight_layout()
plt.savefig(SAVE_PATH + 'chart_top3_negative_yoy.png', dpi=150, bbox_inches='tight')
plt.show()

# Re-create df_exploded with all engineered columns for export
df_exploded = df.explode('themes').rename(columns={'themes': 'theme'})
df_exploded = df_exploded[df_exploded['theme'].notna()]

df_export = df.copy()
df_export['themes'] = df_export['themes'].apply(
    lambda x: ' | '.join(x) if isinstance(x, list) else ''
)

OUTPUT_PATH = '/Users/new/Downloads/Projects/LAFC/lafc_clean_data.xlsx'

with pd.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter') as writer:

    df_export.to_excel(writer, sheet_name='Main Data', index=False)

    df_exploded[[
        'survey date', 'year', 'quarter', 'year_quarter', 'event_type',
        'seating location', 'has_lafc_account', 'total spend', 'spend_tier',
        'theme', 'sentiment_score', 'sentiment_label', 'response_lag_days'
    ]].to_excel(writer, sheet_name='Theme Detail (Tableau)', index=False)

    df_exploded.groupby(['year', 'theme']).size().reset_index(name='count').to_excel(
        writer, sheet_name='Theme by Year', index=False)

    df_spend.groupby(['seating location', 'year']).agg(
        avg_spend=('total spend', 'mean'),
        median_spend=('total spend', 'median'),
        total_spend=('total spend', 'sum'),
        count=('total spend', 'count')
    ).reset_index().to_excel(writer, sheet_name='Spend by Segment', index=False)

    df_spend.groupby(['has_lafc_account', 'year']).agg(
        avg_spend=('total spend', 'mean'),
        median_spend=('total spend', 'median'),
        count=('total spend', 'count')
    ).reset_index().to_excel(writer, sheet_name='Account Spend', index=False)

    df_spend.groupby(['event_type', 'year']).agg(
        avg_spend=('total spend', 'mean'),
        count=('total spend', 'count'),
        total_spend=('total spend', 'sum')
    ).reset_index().to_excel(writer, sheet_name='Event Type Spend', index=False)

    df[df['sentiment_label'] != 'No Feedback'].groupby(
        ['year', 'sentiment_label']).size().reset_index(name='count').to_excel(
        writer, sheet_name='Sentiment by Year', index=False)

    df_exploded[df_exploded['sentiment_label'] != 'No Feedback'].groupby(
        ['theme', 'sentiment_label']).size().reset_index(name='count').to_excel(
        writer, sheet_name='Theme Sentiment', index=False)

    df[['year', 'event_type', 'seating location', 'has_lafc_account',
        'response_lag_days', 'total spend', 'sentiment_score',
        'sentiment_label', 'has_feedback']].to_excel(
        writer, sheet_name='Fan Journey', index=False)

    yearly_kpi.to_excel(writer, sheet_name='Yearly KPIs', index=False)

print(f'Exported to: {OUTPUT_PATH}')
print('Sheets: Main Data, Theme Detail (Tableau), Theme by Year, Spend by Segment,')
print('        Account Spend, Event Type Spend, Sentiment by Year, Theme Sentiment,')
print('        Fan Journey, Yearly KPIs')

Exported to: /Users/new/Downloads/Projects/LAFC/lafc_clean_data.xlsx
Sheets: Main Data, Theme Detail (Tableau), Theme by Year, Spend by Segment,
        Account Spend, Event Type Spend, Sentiment by Year, Theme Sentiment,
        Fan Journey, Yearly KPIs

# ── Key Findings Summary ──
top_themes     = df_exploded[df_exploded['theme'] != 'General / Other']['theme'].value_counts().head(3)
avg_spend_2021 = yearly_kpi.loc[yearly_kpi['year'] == 2021, 'avg_spend'].values[0]
avg_spend_2025 = yearly_kpi.loc[yearly_kpi['year'] == 2025, 'avg_spend'].values[0]
spend_growth   = (avg_spend_2025 - avg_spend_2021) / avg_spend_2021 * 100
pct_positive   = (df['sentiment_label'] == 'Positive').mean() * 100
pct_negative   = (df['sentiment_label'] == 'Negative').mean() * 100
threshold      = df_spend['total spend'].quantile(0.75)
hv_2021        = (df_spend[df_spend['year'] == 2021]['total spend'] >= threshold).mean() * 100
hv_2025        = (df_spend[df_spend['year'] == 2025]['total spend'] >= threshold).mean() * 100
floor_pit_rev  = spend_by_tier.loc['Floor/Pit', '% of Total Spend']
general_rev    = spend_by_tier.loc['General', '% of Total Spend']
pricing_neg    = theme_sent_pct.loc['Pricing & Value', 'Negative'] if 'Pricing & Value' in theme_sent_pct.index else 0
atmos_pos      = theme_sent_pct.loc['Atmosphere & Experience', 'Positive'] if 'Atmosphere & Experience' in theme_sent_pct.index else 0

avail = df.groupby('event_type')['has_spend'].value_counts().unstack(fill_value=0)
avail.columns = ['No Spend', 'Has Spend']
avail['Total'] = avail.sum(axis=1)
avail['Capture %'] = (avail['Has Spend'] / avail['Total'] * 100).round(1)

print('=' * 62)
print('   LAFC & BMO STADIUM — KEY FINDINGS (2021-2025)')
print('=' * 62)

print(f"""
FINDING 1 — THREE OPERATIONAL PAIN POINTS, FIVE YEARS UNRESOLVED
  #1: {top_themes.index[0]:<32} ({top_themes.iloc[0]:,} theme mentions)
  #2: {top_themes.index[1]:<32} ({top_themes.iloc[1]:,} theme mentions)
  #3: {top_themes.index[2]:<32} ({top_themes.iloc[2]:,} theme mentions)
  Pricing & Value: highest negative sentiment rate at {pricing_neg:.0f}%
  All three themes appeared consistently across every year 2021-2025.

FINDING 2 — FAN SENTIMENT IS NET POSITIVE, BUT OPERATIONS DRAG IT DOWN
  Overall Positive:              {pct_positive:.1f}%
  Overall Negative:              {pct_negative:.1f}%
  Atmosphere & Experience:       {atmos_pos:.0f}% positive — strongest theme by far
  Fans love the product. Friction is operational and fixable.

FINDING 3 — FANS ARE SPENDING SIGNIFICANTLY MORE OVER TIME
  Avg spend 2021:   ${avg_spend_2021:,.0f}
  Avg spend 2025:   ${avg_spend_2025:,.0f}
  Growth:           +{spend_growth:.0f}%
  High-value fans (top 25% by spend): {hv_2021:.1f}% in 2021 to {hv_2025:.1f}% in 2025
  The fanbase is premiumizing — more fans spending more per visit each year.

FINDING 4 — REVENUE IS CONCENTRATED IN TWO SEATING TIERS
  Floor/Pit + General = {floor_pit_rev + general_rev:.0f}% of all captured revenue
  VIP Floor/Pit avg spend: $573 — but only 1.6% of total revenue
  Premium upsell is the clearest per-fan revenue growth opportunity.

FINDING 5 — CRITICAL DATA GAPS LIMIT DECISION-MAKING
  ACFC Match spend capture:   {avail.loc['ACFC Match', 'Capture %']:.0f}% — zero financial visibility
  LAFC Match spend capture:   {avail.loc['LAFC Match', 'Capture %']:.1f}% vs {avail.loc['Concert', 'Capture %']:.1f}% for Concerts
  has_lafc_account null rate:  75.8% — loyalty tracking is unreliable
  Standardizing data collection is a prerequisite to any revenue optimization.
""")
print('=' * 62)

==============================================================
   LAFC & BMO STADIUM — KEY FINDINGS (2021-2025)
==============================================================

FINDING 1 — THREE OPERATIONAL PAIN POINTS, FIVE YEARS UNRESOLVED
  #1: Food & Beverage                  (2,021 theme mentions)
  #2: Parking & Transit                (1,643 theme mentions)
  #3: Entry & Operations               (1,611 theme mentions)
  Pricing & Value: highest negative sentiment rate at 32%
  All three themes appeared consistently across every year 2021-2025.

FINDING 2 — FAN SENTIMENT IS NET POSITIVE, BUT OPERATIONS DRAG IT DOWN
  Overall Positive:              39.3%
  Overall Negative:              19.0%
  Atmosphere & Experience:       60% positive — strongest theme by far
  Fans love the product. Friction is operational and fixable.

FINDING 3 — FANS ARE SPENDING SIGNIFICANTLY MORE OVER TIME
  Avg spend 2021:   $206
  Avg spend 2025:   $441
  Growth:           +114%
  High-value fans (top 25% by spend): 8.6% in 2021 to 42.0% in 2025
  The fanbase is premiumizing — more fans spending more per visit each year.

FINDING 4 — REVENUE IS CONCENTRATED IN TWO SEATING TIERS
  Floor/Pit + General = 81% of all captured revenue
  VIP Floor/Pit avg spend: $573 — but only 1.6% of total revenue
  Premium upsell is the clearest per-fan revenue growth opportunity.

FINDING 5 — CRITICAL DATA GAPS LIMIT DECISION-MAKING
  ACFC Match spend capture:   0% — zero financial visibility
  LAFC Match spend capture:   35.7% vs 96.6% for Concerts
  has_lafc_account null rate:  75.8% — loyalty tracking is unreliable
  Standardizing data collection is a prerequisite to any revenue optimization.

==============================================================

	survey date	response date	total spend	seating location	lafc member	feedback
0	2022-03-18 21:11:36	2022-03-22 00:31:56	NaN	NaN	NO	Cheaper beer
1	2023-10-21 00:07:20	2023-10-24 00:45:22	426.75	General	NaN	Just to improve the diversity of food, include...
2	2023-10-21 00:07:20	2023-10-24 04:37:47	790.00	Floor/Pit	NaN	The floor area needs bathrooms without having ...
3	2023-03-23 12:17:31	2023-03-29 02:14:18	NaN	NaN	NaN	One of my favorite things last season was wait...
4	2021-11-12 01:22:36	2021-11-15 23:58:32	40.00	General	NaN	NaN
5	2021-11-12 01:22:36	2021-11-16 17:33:23	99.50	Floor/Pit	NaN	It took way too long to get in the GA pit entr...
6	2023-10-21 00:07:20	2023-10-24 02:27:20	827.50	Premium	NaN	Staff was extremely helpful, friendly and kind...
7	2021-11-12 01:22:36	2021-11-15 20:34:35	119.00	General	NaN	more affordable onsite parking
8	2023-03-23 12:17:31	2023-03-29 02:59:32	NaN	NaN	NaN	We were unable to pick up our season ticket ho...
9	2022-03-18 21:11:36	2022-03-21 21:54:08	NaN	NaN	NO	Great ambiance, very please with experience an...

LAFC & BMO Stadium — Fan Experience Analysis¶

Allan Almaraz | Analyst, Business & Data Strategy¶

0. Setup & Imports¶

1. Load & Inspect Raw Data¶

2. Data Cleaning & Feature Engineering¶

Data Quality Note — `has_lafc_account` Column¶

3. Event Type Inference¶

4. Sentiment Analysis¶

5. Analysis 1 — Feedback Theme Categorization¶

6. Analysis 2 — Spend & Segmentation¶

7. Analysis 3 — Fan Journey & Loyalty¶

8. Export — Tableau-Ready Excel File¶

9. Key Findings Summary¶

	survey date	year	year_quarter	seating location	total spend	spend_tier	has_lafc_account	has_feedback	feedback
0	2022-03-18 21:11:36	2022	2022 Q1	NaN	NaN	No Spend Data	NO	True	Cheaper beer
1	2023-10-21 00:07:20	2023	2023 Q4	General	426.75	E: $400-$699	UNKNOWN	True	Just to improve the diversity of food, include...
2	2023-10-21 00:07:20	2023	2023 Q4	Floor/Pit	790.00	F: $700+	UNKNOWN	True	The floor area needs bathrooms without having ...
3	2023-03-23 12:17:31	2023	2023 Q1	NaN	NaN	No Spend Data	UNKNOWN	True	One of my favorite things last season was wait...
4	2021-11-12 01:22:36	2021	2021 Q4	General	40.00	A: Under $100	UNKNOWN	False	NaN
5	2021-11-12 01:22:36	2021	2021 Q4	Floor/Pit	99.50	A: Under $100	UNKNOWN	True	It took way too long to get in the GA pit entr...
6	2023-10-21 00:07:20	2023	2023 Q4	Premium	827.50	F: $700+	UNKNOWN	True	Staff was extremely helpful, friendly and kind...
7	2021-11-12 01:22:36	2021	2021 Q4	General	119.00	B: $100-$174	UNKNOWN	True	more affordable onsite parking

	No Spend Data	Has Spend Data	Total Respondents	Spend Capture Rate
event_type
ACFC Match	168	0	168	0.0
Concert	191	5465	5656	96.6
LAFC Match	2200	1222	3422	35.7

LAFC & BMO Stadium — Fan Experience Analysis¶

Allan Almaraz | Analyst, Business & Data Strategy¶

0. Setup & Imports¶

1. Load & Inspect Raw Data¶

2. Data Cleaning & Feature Engineering¶

Data Quality Note — has_lafc_account Column¶

3. Event Type Inference¶

4. Sentiment Analysis¶

5. Analysis 1 — Feedback Theme Categorization¶

6. Analysis 2 — Spend & Segmentation¶

7. Analysis 3 — Fan Journey & Loyalty¶

8. Export — Tableau-Ready Excel File¶

9. Key Findings Summary¶

Data Quality Note — `has_lafc_account` Column¶