# UFC Sell-Through Project - External Data Collection

## Overview
Four external data sources supplement the core UFC stats:
| Source | Script | Records | Purpose |
|---|---|---|---|
| Wikipedia | scrape_attendance.py | ~177 | Attendance, gate revenue |
| Betting Odds | scrape_betting_odds.py | ~84,000 | Match competitiveness |
| Google Trends | fetch_google_trends.py | ~2.3M | Fighter popularity |
| Reddit | scrape_reddit_sentiment.py | ~380,000 | Fan sentiment |
## Wikipedia Attendance Scraper
Source: scrape_attendance.py
Scrapes UFC event Wikipedia pages for attendance data.
```python
import re

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}
REQUEST_DELAY = 1.0  # Seconds between requests (rate limiting)


def scrape_event_page(url):
    """Scrape attendance, gate, venue, and location from a UFC event page."""
    response = requests.get(url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')

    result = {'wikipedia_url': url}

    # Find infobox
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return None

    # Parse infobox rows
    for row in infobox.find_all('tr'):
        header = row.find('th')
        data = row.find('td')
        if not header or not data:
            continue

        header_text = header.get_text(strip=True).lower()
        data_text = data.get_text(strip=True)

        if 'attendance' in header_text:
            result['attendance'] = parse_number(data_text)
        elif 'gate' in header_text:
            result['gate_revenue'] = parse_currency(data_text)
        elif 'venue' in header_text:
            result['venue'] = data_text
        elif 'city' in header_text or 'location' in header_text:
            result['location'] = data_text

    return result


def parse_number(text):
    """Extract an integer from strings such as '~17,834+'."""
    # Remove ~, +, *, etc.
    text = text.replace('~', '').replace('+', '').replace('*', '')
    number_match = re.search(r'[\d,]+', text)
    if number_match:
        return int(number_match.group().replace(',', ''))
    return None


def parse_currency(text):
    """Convert currency strings such as '$5.2 million' to a dollar amount."""
    text = re.sub(r'[US\$£€]', '', text)
    # Handle "X million" format
    if 'million' in text.lower():
        number = float(re.search(r'[\d.]+', text).group())
        return number * 1_000_000
    return float(text.replace(',', '').strip())
```
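A minimal driver loop, sketched under the assumption of a hard-coded URL list (the real script builds its list from `events.csv`), shows how the scraper and the rate limit fit together:

```python
import time

import pandas as pd

# Hypothetical event URLs for illustration; the script derives these from events.csv
event_urls = [
    "https://en.wikipedia.org/wiki/UFC_300",
    "https://en.wikipedia.org/wiki/UFC_299",
]

records = []
for url in event_urls:
    row = scrape_event_page(url)
    if row is not None:
        records.append(row)
    time.sleep(REQUEST_DELAY)  # stay polite to Wikipedia between requests

attendance_df = pd.DataFrame(records)
attendance_df.to_csv("data/external/attendance.csv", index=False)
```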
### Running the Scraper

```bash
python src/etl/scrape_all_attendance.py \
  --events ./data/raw/events.csv \
  --output ./data/external/attendance_full.csv \
  --limit 200   # Optional: limit for testing
```
## Betting Odds Collection
Source: scrape_betting_odds.py
Collects historical betting lines (or generates synthetic data).
```python
import random

import pandas as pd


def generate_synthetic_odds(fights_df):
    """Generate realistic betting odds based on fighter performance."""
    odds_records = []
    for row in fights_df.itertuples():
        # Simulate odds based on outcomes
        if hasattr(row, 'outcome') and row.outcome == 'W':
            # Winner was likely favored
            f1_prob = random.uniform(0.55, 0.85)
        else:
            f1_prob = random.uniform(0.15, 0.45)
        f2_prob = 1 - f1_prob

        # Convert to American odds
        f1_odds = probability_to_american(f1_prob)
        f2_odds = probability_to_american(f2_prob)

        odds_records.append({
            'fight_id': row.fight_id if hasattr(row, 'fight_id') else None,
            'event_name': row.event,
            'fighter1_name': row.fighter1_name if hasattr(row, 'fighter1_name') else None,
            'fighter2_name': row.fighter2_name if hasattr(row, 'fighter2_name') else None,
            'f1_close_odds': f1_odds,
            'f2_close_odds': f2_odds,
            'implied_prob_f1': f1_prob,
            'implied_prob_f2': f2_prob,
            'source': 'synthetic'
        })
    return pd.DataFrame(odds_records)


def probability_to_american(prob):
    """Convert an implied win probability to American (moneyline) odds."""
    if prob >= 0.5:
        # Favorites get negative odds
        return int(-100 * prob / (1 - prob))
    else:
        # Underdogs get positive odds
        return int(100 * (1 - prob) / prob)


def calculate_betting_features(df):
    """Add betting-derived features."""
    # Odds spread (how lopsided the matchup is)
    df['odds_spread'] = abs(df['implied_prob_f1'] - df['implied_prob_f2'])
    # Competitive matchup flag
    df['is_competitive_matchup'] = (df['odds_spread'] < 0.10).astype(int)
    # Heavy favorite flag
    df['has_heavy_favorite'] = (
        (df['implied_prob_f1'] > 0.70) | (df['implied_prob_f2'] > 0.70)
    ).astype(int)
    return df
```
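As a quick sanity check of the odds conversion: a 65% implied probability maps to about -185 (favorite) and the 35% underdog to about +185. A minimal sketch using the functions above on a hypothetical matchup:

```python
# Worked example on a single hypothetical matchup
fav = probability_to_american(0.65)   # -185 (favorite)
dog = probability_to_american(0.35)   # +185 (underdog)

demo = pd.DataFrame([{'implied_prob_f1': 0.65, 'implied_prob_f2': 0.35}])
demo = calculate_betting_features(demo)
# odds_spread = 0.30, is_competitive_matchup = 0, has_heavy_favorite = 0
```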
### Running

```bash
python src/etl/scrape_betting_odds.py \
  --data-dir ./data \
  --output-dir ./data
```
## Google Trends
Source: fetch_google_trends.py
Fetches search interest for fighters using pytrends.
```python
import random
from datetime import timedelta

import pandas as pd
from pytrends.request import TrendReq


def fetch_fighter_trends(fighter_name, start_date, end_date, pytrends):
    """Fetch Google Trends for a single fighter."""
    # Add "UFC" to get relevant results
    keyword = f"{fighter_name} UFC"
    pytrends.build_payload(
        kw_list=[keyword],
        timeframe=f"{start_date} {end_date}",
        geo='',
        gprop=''
    )
    df = pytrends.interest_over_time()
    if df.empty:
        return pd.DataFrame()

    df = df.reset_index()
    df = df.rename(columns={keyword: "search_interest"})
    df["fighter_name"] = fighter_name
    return df[["date", "fighter_name", "search_interest"]]


def generate_synthetic_trends(fighter_names, events_df):
    """Generate realistic synthetic trends when the API is unavailable."""
    all_records = []
    date_range = pd.date_range('2015-01-01', '2025-12-31', freq='W')
    # Event dates drive the pre-event spikes (assumes events_df has an 'event_date' column)
    event_dates = pd.to_datetime(events_df['event_date'])

    for fighter in fighter_names:
        # Base interest varies by "star power"
        base_interest = random.randint(5, 40)
        for date in date_range:
            search_interest = base_interest + random.gauss(0, base_interest * 0.3)

            # Spike around events
            for event_date in event_dates:
                days_to_event = (event_date - date).days
                if 0 <= days_to_event <= 7:  # Pre-event spike
                    spike = random.uniform(1.5, 3.0) * (7 - days_to_event) / 7
                    search_interest *= (1 + spike)
                elif -14 <= days_to_event < 0:  # Post-event decay
                    search_interest *= random.uniform(0.8, 1.0)

            search_interest = max(0, min(100, search_interest))
            all_records.append({
                "date": date,
                "fighter_name": fighter,
                "search_interest": int(search_interest)
            })
    return pd.DataFrame(all_records)


def calculate_pre_event_buzz(trends_df, fights_df):
    """Calculate 7-day and 30-day pre-event buzz."""
    buzz_records = []
    for fight in fights_df.itertuples():
        event_date = fight.event_date
        for fighter in [fight.fighter1_name, fight.fighter2_name]:
            fighter_trends = trends_df[trends_df['fighter_name'] == fighter]

            # 7-day pre-event window
            mask_7d = (fighter_trends['date'] >= event_date - timedelta(days=7)) & \
                      (fighter_trends['date'] < event_date)
            buzz_7d = fighter_trends[mask_7d]['search_interest'].mean()

            # 30-day pre-event window
            mask_30d = (fighter_trends['date'] >= event_date - timedelta(days=30)) & \
                       (fighter_trends['date'] < event_date)
            buzz_30d = fighter_trends[mask_30d]['search_interest'].mean()

            buzz_records.append({
                'fighter_name': fighter,
                'event_date': event_date,
                'buzz_7d': buzz_7d,
                'buzz_30d': buzz_30d,
                'buzz_trend': buzz_7d / buzz_30d if buzz_30d > 0 else 1.0
            })
    return pd.DataFrame(buzz_records)
```
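A minimal calling pattern, assuming `fighters_df`, `events_df`, and `fights_df` inputs (column names are illustrative) and a plain `TrendReq` session; the real script may add retries and request throttling:

```python
# Sketch: use live trends when the API cooperates, otherwise fall back to synthetic data
pytrends = TrendReq(hl='en-US', tz=360)

frames = []
try:
    for name in fighters_df['name']:
        frames.append(fetch_fighter_trends(name, '2023-01-01', '2023-12-31', pytrends))
except Exception:
    # Rate-limited or blocked: replace everything with synthetic trends
    frames = [generate_synthetic_trends(fighters_df['name'].tolist(), events_df)]

trends_df = pd.concat(frames, ignore_index=True)
buzz_df = calculate_pre_event_buzz(trends_df, fights_df)
```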
### Running

```bash
python src/etl/fetch_google_trends.py \
  --data-dir ./data \
  --output-dir ./data
```
## Reddit Sentiment
Source: scrape_reddit_sentiment.py
Analyzes r/MMA discussion threads.
```python
import random

import numpy as np
import pandas as pd
from textblob import TextBlob


def analyze_sentiment(text):
    """Analyze sentiment using TextBlob."""
    blob = TextBlob(str(text))
    return {
        "polarity": blob.sentiment.polarity,          # -1 to 1
        "subjectivity": blob.sentiment.subjectivity   # 0 to 1
    }


def simple_sentiment(text):
    """Simple keyword-based sentiment (no external dependencies)."""
    text = text.lower()
    positive_words = [
        "hype", "excited", "amazing", "banger", "war", "knockout",
        "goat", "legend", "best", "can't wait", "insane"
    ]
    negative_words = [
        "boring", "trash", "robbery", "overrated", "ducking",
        "disappointed", "skip", "worst"
    ]
    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)
    total = pos_count + neg_count
    if total == 0:
        return {"polarity": 0.0, "subjectivity": 0.5}
    polarity = (pos_count - neg_count) / total
    return {"polarity": polarity, "subjectivity": min(1.0, total / 10)}


def aggregate_event_sentiment(comments_df):
    """Aggregate comment-level sentiment to the event level."""
    event_sentiment = comments_df.groupby('event_name').agg({
        'polarity': 'mean',
        'subjectivity': 'mean',
        'score': ['sum', 'count', 'mean']
    }).reset_index()
    event_sentiment.columns = [
        'event_name', 'avg_polarity', 'avg_subjectivity',
        'total_score', 'comment_count', 'avg_score'
    ]

    # Hype score: engagement-weighted sentiment
    event_sentiment['hype_score'] = (
        event_sentiment['avg_polarity'] *
        np.log1p(event_sentiment['total_score'])
    )

    # Categorize sentiment
    event_sentiment['sentiment_category'] = pd.cut(
        event_sentiment['avg_polarity'],
        bins=[-1, -0.1, 0.1, 1],
        labels=['Negative', 'Neutral', 'Positive']
    )
    return event_sentiment


def generate_synthetic_comments(events_df, comments_per_event=500):
    """Generate synthetic Reddit comments when live scraping is unavailable."""
    positive_templates = [
        "This card is gonna be a banger!",
        "Can't wait for the main event",
        "Finally a stacked card",
        "{fighter} is going to put on a show"
    ]
    negative_templates = [
        "Another weak card...",
        "Skip this one",
        "Who asked for this matchup?"
    ]
    neutral_templates = [
        "Interesting matchup",
        "Should be competitive",
        "Let's see what happens"
    ]

    all_comments = []
    for event in events_df.itertuples():
        for _ in range(comments_per_event):
            # Pick a sentiment category with fixed weights
            category = random.choices(
                ['positive', 'negative', 'neutral'],
                weights=[0.5, 0.2, 0.3]
            )[0]
            if category == 'positive':
                text = random.choice(positive_templates)
                polarity = random.uniform(0.3, 1.0)
            elif category == 'negative':
                text = random.choice(negative_templates)
                polarity = random.uniform(-1.0, -0.2)
            else:
                text = random.choice(neutral_templates)
                polarity = random.uniform(-0.2, 0.2)

            all_comments.append({
                'event_name': event.event,
                'comment_text': text,
                'score': random.randint(-5, 100),
                'polarity': polarity,
                'subjectivity': random.uniform(0.3, 0.9)
            })
    return pd.DataFrame(all_comments)
```
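As a quick illustration of the fallback scorer, two hypothetical comments (assuming the functions above are in scope):

```python
print(simple_sentiment("This card is a banger, can't wait"))
# {'polarity': 1.0, 'subjectivity': 0.2}

print(simple_sentiment("Boring card, skip this one"))
# {'polarity': -1.0, 'subjectivity': 0.2}

# The TextBlob scorer returns the same keys, so the two are interchangeable downstream
print(analyze_sentiment("This card is a banger, can't wait"))
```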
### Running

```bash
python src/etl/scrape_reddit_sentiment.py \
  --data-dir ./data \
  --output-dir ./data \
  --max-events 200 \
  --comments-per-event 500
```
## Data Pipeline Order

```bash
# 1. Download base data
python src/etl/ingest.py --data-dir ./data

# 2. Collect external data (can run in parallel)
python src/etl/scrape_betting_odds.py --data-dir ./data --output-dir ./data
python src/etl/fetch_google_trends.py --data-dir ./data --output-dir ./data
python src/etl/scrape_reddit_sentiment.py --data-dir ./data --output-dir ./data

# 3. Run Spark ETL (loads external data)
spark-submit src/etl/spark_etl.py --data-dir ./data --output-dir ./data
```
## Output Files

```
data/external/
├── attendance.csv        # Wikipedia attendance
├── attendance_full.csv   # Complete scrape
├── betting_odds.csv      # Betting lines
├── google_trends.csv     # Raw search interest
├── fighter_buzz.csv      # Pre-event buzz metrics
├── reddit_comments.csv   # Raw comments
└── event_sentiment.csv   # Aggregated sentiment
```
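A quick way to confirm the collection scripts produced usable files; the column peek below is illustrative, not a schema contract:

```python
from pathlib import Path

import pandas as pd

external = Path("data/external")
for name in [
    "attendance.csv", "betting_odds.csv", "google_trends.csv",
    "fighter_buzz.csv", "reddit_comments.csv", "event_sentiment.csv",
]:
    path = external / name
    if path.exists():
        df = pd.read_csv(path)
        print(f"{name}: {len(df):,} rows, first columns {list(df.columns)[:5]}")
    else:
        print(f"{name}: missing - rerun the matching collection script")
```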