UFC Sell-Through Project - Visualization

Overview

The visualization module creates 10 plots for analysis and presentation:

| #  | Plot                | Purpose                      |
|----|---------------------|------------------------------|
| 1  | Distribution        | Target variable distribution |
| 2  | Feature Importance  | Model interpretability       |
| 3  | Time Series         | Trends over years            |
| 4  | By Event Type       | PPV vs Fight Night           |
| 5  | By Location         | Geographic patterns          |
| 6  | Actual vs Predicted | Model accuracy               |
| 7  | Model Comparison    | Baseline vs Improved         |
| 8  | Title Fight Impact  | Title fight effect           |
| 9  | Residuals           | Error analysis               |
| 10 | Correlation Heatmap | Feature relationships        |

Plot Functions

1. Sell-Through Distribution

def plot_sellthrough_distribution(df, output_dir):
    """Save a histogram of sell-through rates with mean and median markers.

    Args:
        df: DataFrame providing a ``sell_through`` column.
        output_dir: Directory that receives ``1_sellthrough_distribution.png``.
    """
    plt.figure(figsize=(10, 6))

    rates = df['sell_through']
    plt.hist(rates.dropna(), bins=30, edgecolor='black', alpha=0.7)

    # Axis text and title.
    plt.xlabel('Sell-Through Rate', fontsize=12)
    plt.ylabel('Number of Events', fontsize=12)
    plt.title(
        'Distribution of UFC Event Sell-Through Rates',
        fontsize=14,
        fontweight='bold',
    )

    # One dashed vertical reference line per summary statistic.
    summary = (
        ('Mean', rates.mean(), 'red'),
        ('Median', rates.median(), 'green'),
    )
    for stat_name, stat_value, line_color in summary:
        plt.axvline(
            stat_value,
            color=line_color,
            linestyle='--',
            label=f'{stat_name}: {stat_value:.2f}',
        )
    plt.legend()

    plt.tight_layout()
    target = os.path.join(output_dir, '1_sellthrough_distribution.png')
    plt.savefig(target, dpi=300)
    plt.close()

2. Feature Importance

def plot_feature_importance(metrics, output_dir, top_n=15):
    """Save a horizontal bar chart of the leading feature importances.

    Returns without plotting when ``metrics`` is empty/None or has no
    ``'feature_importance'`` entry.

    Args:
        metrics: Mapping whose ``'feature_importance'`` value is a sequence of
            records with ``'feature'`` and ``'importance'`` keys. Only the
            first ``top_n`` records are used, so the sequence is presumably
            already ordered most-important first (verify against the producer).
        output_dir: Directory that receives ``2_feature_importance.png``.
        top_n: Maximum number of features to display.
    """
    if not metrics or 'feature_importance' not in metrics:
        return

    importance = metrics['feature_importance'][:top_n]
    features = [entry['feature'] for entry in importance]
    values = [entry['importance'] for entry in importance]

    plt.figure(figsize=(10, 8))
    # barh draws the first item at the bottom, so reverse both lists to put
    # the first record at the top of the chart.
    plt.barh(features[::-1], values[::-1], color='steelblue')
    plt.xlabel('Importance', fontsize=12)
    plt.title(f'Top {top_n} Feature Importances', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '2_feature_importance.png'), dpi=300)
    plt.close()

3. Sell-Through Over Time

def plot_sellthrough_over_time(df, output_dir):
    """Save a line chart of yearly mean sell-through with a +/- 1 std band.

    The caller's DataFrame is left untouched: the year is derived into a
    separate series instead of being written back as a new column.

    Args:
        df: DataFrame providing ``event_date`` (parseable by
            ``pd.to_datetime``) and ``sell_through`` columns.
        output_dir: Directory that receives ``3_sellthrough_over_time.png``.
    """
    # Group on a derived key rather than assigning df['year'], which would
    # mutate the DataFrame shared with the other plot functions.
    years = pd.to_datetime(df['event_date']).dt.year.rename('year')
    yearly = (
        df['sell_through']
        .groupby(years)
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )

    # A year with a single event has an undefined (NaN) std; use a zero-width
    # band there so the shaded region stays continuous.
    spread = yearly['std'].fillna(0)

    plt.figure(figsize=(12, 6))
    plt.plot(yearly['year'], yearly['mean'], marker='o', linewidth=2, markersize=8)
    plt.fill_between(yearly['year'],
                     yearly['mean'] - spread,
                     yearly['mean'] + spread,
                     alpha=0.2)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Average Sell-Through Rate', fontsize=12)
    plt.title('UFC Sell-Through Rates Over Time', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '3_sellthrough_over_time.png'), dpi=300)
    plt.close()

4. By Event Type

def plot_sellthrough_by_event_type(df, output_dir):
    """Save a horizontal bar chart of mean sell-through per event type.

    Bars are ordered by ascending mean and carry the per-group standard
    deviation as horizontal error bars.

    Args:
        df: DataFrame providing ``event_type`` and ``sell_through`` columns.
        output_dir: Directory that receives ``4_sellthrough_by_event_type.png``.
    """
    per_type = (
        df.groupby('event_type')['sell_through']
        .agg(['mean', 'std', 'count'])
        .sort_values('mean', ascending=True)
    )

    plt.figure(figsize=(10, 6))
    plt.barh(
        per_type.index,
        per_type['mean'],
        xerr=per_type['std'],
        capsize=5,
        color='coral',
    )
    plt.xlabel('Average Sell-Through Rate', fontsize=12)
    plt.title('Sell-Through by Event Type', fontsize=14, fontweight='bold')
    plt.tight_layout()

    target = os.path.join(output_dir, '4_sellthrough_by_event_type.png')
    plt.savefig(target, dpi=300)
    plt.close()

5. By Location

def plot_sellthrough_by_location(df, output_dir, top_n=15):
    """Save a horizontal bar chart of the cities with the highest mean sell-through.

    Cities with fewer than three events are excluded before ranking.

    Args:
        df: DataFrame providing ``city`` and ``sell_through`` columns.
        output_dir: Directory that receives ``5_sellthrough_by_location.png``.
        top_n: Maximum number of cities to display.
    """
    location_stats = df.groupby('city')['sell_through'].agg(['mean', 'count'])
    location_stats = location_stats[location_stats['count'] >= 3]  # Min 3 events
    location_stats = location_stats.sort_values('mean', ascending=False).head(top_n)

    plt.figure(figsize=(10, 8))
    # barh draws the first item at the bottom, so reverse the ranking to put
    # the best city at the top of the chart.
    plt.barh(location_stats.index[::-1], location_stats['mean'][::-1], color='teal')
    plt.xlabel('Average Sell-Through Rate', fontsize=12)
    plt.title(f'Top {top_n} Cities by Sell-Through Rate', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '5_sellthrough_by_location.png'), dpi=300)
    plt.close()

6. Actual vs Predicted

def plot_actual_vs_predicted(df, data_dir, output_dir):
    """Save a scatter plot of actual vs predicted sell-through.

    Reads ``<data_dir>/models/predictions.csv`` and returns without plotting
    when that file does not exist. A dashed diagonal marks perfect prediction.

    Args:
        df: Not used by this function.
        data_dir: Root directory containing ``models/predictions.csv``.
        output_dir: Directory that receives ``6_actual_vs_predicted.png``.
    """
    predictions_file = os.path.join(data_dir, 'models', 'predictions.csv')
    if not os.path.exists(predictions_file):
        return

    predictions = pd.read_csv(predictions_file)

    plt.figure(figsize=(10, 8))
    actual = predictions['sell_through']
    predicted = predictions['prediction']
    plt.scatter(actual, predicted, alpha=0.6, s=50)

    # The reference diagonal spans the combined range of both series.
    low = min(actual.min(), predicted.min())
    high = max(actual.max(), predicted.max())
    diagonal = [low, high]
    plt.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect')

    plt.xlabel('Actual Sell-Through', fontsize=12)
    plt.ylabel('Predicted Sell-Through', fontsize=12)
    plt.title('Actual vs Predicted Sell-Through', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    target = os.path.join(output_dir, '6_actual_vs_predicted.png')
    plt.savefig(target, dpi=300)
    plt.close()

7. Model Comparison

def plot_model_comparison(data_dir, output_dir):
    """Save a bar chart comparing the R² of the baseline and improved models.

    Each model is included only if its metrics file exists under
    ``<data_dir>/models``; a missing ``'r2'`` key is plotted as 0. When
    neither file exists the function returns without plotting.

    Args:
        data_dir: Root directory containing ``models/metrics.json`` and/or
            ``models/metrics_improved.json``.
        output_dir: Directory that receives ``7_model_comparison.png``.
    """
    # (file name, bar label, bar colour). The colour is tied to the model, so
    # the improved model keeps its colour even when it is the only bar.
    candidates = (
        ("metrics.json", 'Baseline (9 features)', '#3498db'),
        ("metrics_improved.json", 'Improved (36 features)', '#2ecc71'),
    )

    metrics = []
    labels = []
    colors = []
    for filename, label, color in candidates:
        path = os.path.join(data_dir, "models", filename)
        if not os.path.exists(path):
            continue
        with open(path, encoding="utf-8") as f:
            metrics.append(json.load(f).get('r2', 0))
        labels.append(label)
        colors.append(color)

    # Nothing to compare; also avoids max() on an empty list below.
    if not metrics:
        return

    plt.figure(figsize=(8, 6))
    bars = plt.bar(labels, metrics, color=colors)

    # Print each score just above the end of its bar.
    for bar, val in zip(bars, metrics):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{val:.3f}', ha='center', fontsize=12, fontweight='bold')

    plt.ylabel('R² Score', fontsize=12)
    plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')

    # Keep 20% headroom above the best score. Guard against non-positive
    # scores, which would otherwise give an empty or inverted axis range.
    best = max(metrics)
    worst = min(metrics)
    upper = best * 1.2 if best > 0 else 0.1
    lower = min(0, worst * 1.2)
    plt.ylim(lower, upper)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '7_model_comparison.png'), dpi=300)
    plt.close()

8. Title Fight Impact

def plot_title_fight_impact(df, output_dir):
    """Save a box plot of sell-through for events with and without a title fight.

    The caller's DataFrame is left untouched: the label column is added to a
    derived copy.

    Args:
        df: DataFrame providing ``has_title`` (1/0) and ``sell_through`` columns.
        output_dir: Directory that receives ``8_title_fight_impact.png``.
    """
    # assign() returns a new DataFrame, so the shared input is not mutated.
    labelled = df.assign(
        title_status=df['has_title'].map({1: 'Has Title Fight', 0: 'No Title Fight'})
    )

    # Pass the axes explicitly: DataFrame.boxplot(by=...) otherwise opens its
    # own figure, leaving this one empty, unsaved and never closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    labelled.boxplot(column='sell_through', by='title_status', ax=ax)
    ax.set_xlabel('Event Type', fontsize=12)
    ax.set_ylabel('Sell-Through Rate', fontsize=12)
    ax.set_title('Impact of Title Fights', fontsize=14, fontweight='bold')
    # Clear the "Boxplot grouped by ..." super-title that pandas adds.
    fig.suptitle('')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, '8_title_fight_impact.png'), dpi=300)
    plt.close(fig)

9. Residuals Plot

def plot_residuals(df, data_dir, output_dir):
    """Save a scatter plot of residuals (actual - predicted) against predictions.

    Reads ``<data_dir>/models/predictions.csv`` and returns without plotting
    when that file does not exist, matching ``plot_actual_vs_predicted``.

    Args:
        df: Not used by this function.
        data_dir: Root directory containing ``models/predictions.csv``.
        output_dir: Directory that receives ``9_residuals.png``.
    """
    pred_path = os.path.join(data_dir, 'models', 'predictions.csv')
    # Skip quietly when no predictions have been produced yet, instead of
    # failing with FileNotFoundError.
    if not os.path.exists(pred_path):
        return

    pred_df = pd.read_csv(pred_path)
    residuals = pred_df['sell_through'] - pred_df['prediction']

    plt.figure(figsize=(10, 6))
    plt.scatter(pred_df['prediction'], residuals, alpha=0.6, s=50)
    # Zero line: points above are under-predictions, below are over-predictions.
    plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
    plt.xlabel('Predicted Sell-Through', fontsize=12)
    plt.ylabel('Residuals (Actual - Predicted)', fontsize=12)
    plt.title('Residuals Plot', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '9_residuals.png'), dpi=300)
    plt.close()

10. Correlation Heatmap

def plot_correlation_heatmap(df, output_dir, top_n=15):
    """Save a heatmap of correlations among sell-through and its top correlates.

    Features are ranked by absolute correlation with ``sell_through``; the
    heatmap covers ``sell_through`` plus the ``top_n`` highest-ranked numeric
    columns. Returns without plotting when ``sell_through`` is not numeric
    or is absent.

    Args:
        df: DataFrame of event features including ``sell_through``.
        output_dir: Directory that receives ``10_correlation_heatmap.png``.
        top_n: Maximum number of features shown alongside ``sell_through``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if 'sell_through' not in numeric_cols:
        return

    # Rank features by |correlation| with the target. Drop the target by
    # label rather than by position: a feature tied at |r| = 1 can sort ahead
    # of it, and a positional slice would then duplicate the target.
    target_corr = df[numeric_cols].corr()['sell_through'].abs()
    ranked = target_corr.drop('sell_through').sort_values(ascending=False)
    top_features = ['sell_through'] + ranked.head(top_n).index.tolist()

    corr_matrix = df[top_features].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title(f'Correlation Heatmap: Top {top_n} Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '10_correlation_heatmap.png'), dpi=300)
    plt.close()

Running Visualizations

python src/visualization/create_plots.py \
    --data-dir ./data \
    --output-dir ./visualizations

Output

visualizations/
├── 1_sellthrough_distribution.png
├── 2_feature_importance.png
├── 3_sellthrough_over_time.png
├── 4_sellthrough_by_event_type.png
├── 5_sellthrough_by_location.png
├── 6_actual_vs_predicted.png
├── 7_model_comparison.png
├── 8_title_fight_impact.png
├── 9_residuals.png
└── 10_correlation_heatmap.png