UFC Sell-Through Project - Visualization
Overview
The visualization module creates 10 plots for analysis and presentation:
| # | Plot | Purpose |
|---|---|---|
| 1 | Distribution | Target variable distribution |
| 2 | Feature Importance | Model interpretability |
| 3 | Time Series | Trends over years |
| 4 | By Event Type | PPV vs Fight Night |
| 5 | By Location | Geographic patterns |
| 6 | Actual vs Predicted | Model accuracy |
| 7 | Model Comparison | Baseline vs Improved |
| 8 | Title Fight Impact | Sell-through with vs. without a title fight |
| 9 | Residuals | Error analysis |
| 10 | Correlation Heatmap | Feature relationships |
Plot Functions
1. Sell-Through Distribution
def plot_sellthrough_distribution(df, output_dir):
    """Save a histogram of sell-through rates with mean and median markers.

    Args:
        df: DataFrame containing a ``sell_through`` column.
        output_dir: Directory that receives ``1_sellthrough_distribution.png``.
    """
    plt.figure(figsize=(10, 6))

    rates = df['sell_through']
    # Missing values are dropped for the histogram only; mean()/median()
    # below are taken on the full column.
    plt.hist(rates.dropna(), bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel('Sell-Through Rate', fontsize=12)
    plt.ylabel('Number of Events', fontsize=12)
    plt.title('Distribution of UFC Event Sell-Through Rates', fontsize=14, fontweight='bold')

    # Vertical reference lines for the two summary statistics.
    avg = rates.mean()
    mid = rates.median()
    plt.axvline(avg, color='red', linestyle='--', label=f'Mean: {avg:.2f}')
    plt.axvline(mid, color='green', linestyle='--', label=f'Median: {mid:.2f}')
    plt.legend()

    plt.tight_layout()
    target = os.path.join(output_dir, '1_sellthrough_distribution.png')
    plt.savefig(target, dpi=300)
    plt.close()
2. Feature Importance
def plot_feature_importance(metrics, output_dir, top_n=15):
    """Save a horizontal bar chart of the leading feature importances.

    Args:
        metrics: Mapping with a ``feature_importance`` list whose items carry
            ``feature`` and ``importance`` keys. The first ``top_n`` entries
            are plotted in the order given (presumably already sorted by
            descending importance -- confirm against the model code).
        output_dir: Directory that receives ``2_feature_importance.png``.
        top_n: Maximum number of entries to plot.

    Returns:
        None. Nothing is plotted when ``metrics`` is empty/None or has no
        ``feature_importance`` key.
    """
    if not metrics or 'feature_importance' not in metrics:
        return

    importance = metrics['feature_importance'][:top_n]
    features = [entry['feature'] for entry in importance]
    values = [entry['importance'] for entry in importance]

    plt.figure(figsize=(10, 8))
    # barh places the first item at the bottom, so reverse both lists to
    # show the first entry at the top of the chart.
    plt.barh(features[::-1], values[::-1], color='steelblue')
    plt.xlabel('Importance', fontsize=12)
    plt.title(f'Top {top_n} Feature Importances', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '2_feature_importance.png'), dpi=300)
    plt.close()
3. Sell-Through Over Time
def plot_sellthrough_over_time(df, output_dir):
    """Save a line chart of yearly mean sell-through with a +/- 1 std band.

    The year is derived from ``event_date`` without adding a column to the
    caller's DataFrame, so ``df`` is left unmodified.

    Args:
        df: DataFrame with ``event_date`` (parseable by ``pd.to_datetime``)
            and ``sell_through`` columns.
        output_dir: Directory that receives ``3_sellthrough_over_time.png``.
    """
    # Group by a standalone Series instead of writing df['year']; naming it
    # 'year' makes reset_index() produce the 'year' column used below.
    years = pd.to_datetime(df['event_date']).dt.year.rename('year')
    yearly = (
        df.groupby(years)['sell_through']
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    plt.plot(yearly['year'], yearly['mean'], marker='o', linewidth=2, markersize=8)
    plt.fill_between(
        yearly['year'],
        yearly['mean'] - yearly['std'],
        yearly['mean'] + yearly['std'],
        alpha=0.2,
    )
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Average Sell-Through Rate', fontsize=12)
    plt.title('UFC Sell-Through Rates Over Time', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '3_sellthrough_over_time.png'), dpi=300)
    plt.close()
4. By Event Type
def plot_sellthrough_by_event_type(df, output_dir):
    """Save horizontal bars of mean sell-through per event type.

    Bars are ordered by ascending mean and carry the per-group standard
    deviation as error bars.

    Args:
        df: DataFrame with ``event_type`` and ``sell_through`` columns.
        output_dir: Directory that receives ``4_sellthrough_by_event_type.png``.
    """
    summary = (
        df.groupby('event_type')['sell_through']
        .agg(['mean', 'std', 'count'])
        .sort_values('mean', ascending=True)
    )

    plt.figure(figsize=(10, 6))
    plt.barh(
        summary.index,
        summary['mean'],
        xerr=summary['std'],
        capsize=5,
        color='coral',
    )
    plt.xlabel('Average Sell-Through Rate', fontsize=12)
    plt.title('Sell-Through by Event Type', fontsize=14, fontweight='bold')
    plt.tight_layout()

    target = os.path.join(output_dir, '4_sellthrough_by_event_type.png')
    plt.savefig(target, dpi=300)
    plt.close()
5. By Location
def plot_sellthrough_by_location(df, output_dir, top_n=15, min_events=3):
    """Save horizontal bars of the cities with the highest mean sell-through.

    Args:
        df: DataFrame with ``city`` and ``sell_through`` columns.
        output_dir: Directory that receives ``5_sellthrough_by_location.png``.
        top_n: Maximum number of cities to show.
        min_events: Cities with fewer events than this are excluded so a
            single event cannot dominate the ranking.
    """
    location_stats = df.groupby('city')['sell_through'].agg(['mean', 'count'])
    location_stats = location_stats[location_stats['count'] >= min_events]
    location_stats = location_stats.sort_values('mean', ascending=False).head(top_n)

    plt.figure(figsize=(10, 8))
    # barh places the first item at the bottom; reverse so the best city
    # appears at the top of the chart.
    plt.barh(location_stats.index[::-1], location_stats['mean'][::-1], color='teal')
    plt.xlabel('Average Sell-Through Rate', fontsize=12)
    plt.title(f'Top {top_n} Cities by Sell-Through Rate', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '5_sellthrough_by_location.png'), dpi=300)
    plt.close()
6. Actual vs Predicted
def plot_actual_vs_predicted(df, data_dir, output_dir):
    """Save a scatter of actual vs. predicted sell-through.

    Reads ``<data_dir>/models/predictions.csv`` (columns ``sell_through`` and
    ``prediction``) and returns without plotting if that file is missing.

    Args:
        df: Not used by this function.
        data_dir: Root directory containing ``models/predictions.csv``.
        output_dir: Directory that receives ``6_actual_vs_predicted.png``.
    """
    csv_path = os.path.join(data_dir, 'models', 'predictions.csv')
    if not os.path.exists(csv_path):
        return

    preds = pd.read_csv(csv_path)
    actual = preds['sell_through']
    predicted = preds['prediction']

    plt.figure(figsize=(10, 8))
    plt.scatter(actual, predicted, alpha=0.6, s=50)

    # y = x reference line spanning the combined range of both series.
    lo = min(actual.min(), predicted.min())
    hi = max(actual.max(), predicted.max())
    plt.plot([lo, hi], [lo, hi], 'r--', linewidth=2, label='Perfect')

    plt.xlabel('Actual Sell-Through', fontsize=12)
    plt.ylabel('Predicted Sell-Through', fontsize=12)
    plt.title('Actual vs Predicted Sell-Through', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    target = os.path.join(output_dir, '6_actual_vs_predicted.png')
    plt.savefig(target, dpi=300)
    plt.close()
7. Model Comparison
def plot_model_comparison(data_dir, output_dir):
    """Save a bar chart comparing the R² of the baseline and improved models.

    Reads ``metrics.json`` and ``metrics_improved.json`` from
    ``<data_dir>/models``; a file that does not exist is skipped, and a file
    without an ``r2`` key contributes 0.

    Args:
        data_dir: Root directory containing the ``models`` folder.
        output_dir: Directory that receives ``7_model_comparison.png``.

    Returns:
        None. Nothing is plotted when neither metrics file exists.
    """
    models_dir = os.path.join(data_dir, "models")
    # Colour is tied to the model rather than to bar position, so a model
    # keeps its colour even when the other metrics file is absent.
    candidates = [
        ('Baseline (9 features)', 'metrics.json', '#3498db'),
        ('Improved (36 features)', 'metrics_improved.json', '#2ecc71'),
    ]

    labels = []
    scores = []
    colors = []
    for label, filename, color in candidates:
        path = os.path.join(models_dir, filename)
        if not os.path.exists(path):
            continue
        with open(path, encoding='utf-8') as f:
            scores.append(json.load(f).get('r2', 0))
        labels.append(label)
        colors.append(color)

    # Without this guard max(scores) below raises ValueError on an empty list.
    if not scores:
        return

    plt.figure(figsize=(8, 6))
    bars = plt.bar(labels, scores, color=colors)
    for bar, val in zip(bars, scores):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f'{val:.3f}', ha='center', fontsize=12, fontweight='bold')
    plt.ylabel('R² Score', fontsize=12)
    plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')

    # R² can be zero or negative; keep the axis upright and non-degenerate
    # in that case. For all-positive scores this is (0, max * 1.2) as before.
    best = max(scores)
    worst = min(scores)
    plt.ylim(min(0, worst * 1.2), best * 1.2 if best > 0 else 0.1)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '7_model_comparison.png'), dpi=300)
    plt.close()
8. Title Fight Impact
def plot_title_fight_impact(df, output_dir):
    """Save a box plot of sell-through split by title-fight status.

    The status labels are built on a temporary frame, so the caller's ``df``
    is left unmodified.

    Args:
        df: DataFrame with ``has_title`` (1/0) and ``sell_through`` columns.
            Rows whose ``has_title`` is neither 1 nor 0 map to NaN and fall
            out of the grouping.
        output_dir: Directory that receives ``8_title_fight_impact.png``.
    """
    status = df['has_title'].map({1: 'Has Title Fight', 0: 'No Title Fight'})
    plot_df = df[['sell_through']].assign(title_status=status)

    # DataFrame.boxplot(by=...) creates its own figure unless given an Axes;
    # passing ax keeps the requested figsize and leaves no orphaned figure.
    fig, ax = plt.subplots(figsize=(8, 6))
    plot_df.boxplot(column='sell_through', by='title_status', ax=ax)
    ax.set_xlabel('Event Type', fontsize=12)
    ax.set_ylabel('Sell-Through Rate', fontsize=12)
    ax.set_title('Impact of Title Fights', fontsize=14, fontweight='bold')
    # Clear the automatic "Boxplot grouped by ..." super-title pandas adds.
    fig.suptitle('')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, '8_title_fight_impact.png'), dpi=300)
    plt.close(fig)
9. Residuals Plot
def plot_residuals(df, data_dir, output_dir):
    """Save a scatter of residuals (actual - predicted) against predictions.

    Reads ``<data_dir>/models/predictions.csv`` (columns ``sell_through`` and
    ``prediction``).

    Args:
        df: Not used by this function.
        data_dir: Root directory containing ``models/predictions.csv``.
        output_dir: Directory that receives ``9_residuals.png``.

    Returns:
        None. Nothing is plotted when the predictions file is missing,
        matching ``plot_actual_vs_predicted``.
    """
    pred_path = os.path.join(data_dir, 'models', 'predictions.csv')
    if not os.path.exists(pred_path):
        return

    pred_df = pd.read_csv(pred_path)
    residuals = pred_df['sell_through'] - pred_df['prediction']

    plt.figure(figsize=(10, 6))
    plt.scatter(pred_df['prediction'], residuals, alpha=0.6, s=50)
    # Zero line: points above it are under-predictions, below over-predictions.
    plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
    plt.xlabel('Predicted Sell-Through', fontsize=12)
    plt.ylabel('Residuals (Actual - Predicted)', fontsize=12)
    plt.title('Residuals Plot', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '9_residuals.png'), dpi=300)
    plt.close()
10. Correlation Heatmap
def plot_correlation_heatmap(df, output_dir, top_n=15):
    """Save a correlation heatmap of the features most tied to sell-through.

    Features are ranked by the absolute value of their correlation with
    ``sell_through``; the heatmap shows ``sell_through`` plus the ``top_n``
    highest-ranked numeric columns.

    Args:
        df: DataFrame of features including ``sell_through``.
        output_dir: Directory that receives ``10_correlation_heatmap.png``.
        top_n: Number of features to show alongside ``sell_through``.

    Returns:
        None. Nothing is plotted when ``sell_through`` is not a numeric
        column of ``df``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'sell_through' not in numeric_cols:
        return

    # Remove the target by label before ranking rather than assuming it
    # sorts first: a feature tied at |r| == 1 could otherwise take its slot.
    target_corr = df[numeric_cols].corr()['sell_through'].abs()
    ranked = target_corr.drop('sell_through').sort_values(ascending=False)
    top_features = ['sell_through'] + ranked.head(top_n).index.tolist()

    corr_matrix = df[top_features].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title(f'Correlation Heatmap: Top {top_n} Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '10_correlation_heatmap.png'), dpi=300)
    plt.close()
Running Visualizations
python src/visualization/create_plots.py \
--data-dir ./data \
--output-dir ./visualizations
Output
visualizations/
├── 1_sellthrough_distribution.png
├── 2_feature_importance.png
├── 3_sellthrough_over_time.png
├── 4_sellthrough_by_event_type.png
├── 5_sellthrough_by_location.png
├── 6_actual_vs_predicted.png
├── 7_model_comparison.png
├── 8_title_fight_impact.png
├── 9_residuals.png
└── 10_correlation_heatmap.png