UFC Sell-Through Project - External Data Collection

Overview

Four external data sources supplement the core UFC stats:

Source          Script                        Records     Purpose
Wikipedia       scrape_attendance.py          ~177        Attendance, gate revenue
Betting Odds    scrape_betting_odds.py        ~84,000     Match competitiveness
Google Trends   fetch_google_trends.py        ~2.3M       Fighter popularity
Reddit          scrape_reddit_sentiment.py    ~380,000    Fan sentiment

Wikipedia Attendance Scraper

Source: scrape_attendance.py

Scrapes UFC event Wikipedia pages for attendance data.

import re

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}
REQUEST_DELAY = 1.0  # Rate limiting

def scrape_event_page(url):
    response = requests.get(url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    
    result = {'wikipedia_url': url}
    
    # Find infobox
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return None
    
    # Parse infobox rows
    for row in infobox.find_all('tr'):
        header = row.find('th')
        data = row.find('td')
        
        if not header or not data:
            continue
        
        header_text = header.get_text(strip=True).lower()
        data_text = data.get_text(strip=True)
        
        if 'attendance' in header_text:
            result['attendance'] = parse_number(data_text)
        elif 'gate' in header_text:
            result['gate_revenue'] = parse_currency(data_text)
        elif 'venue' in header_text:
            result['venue'] = data_text
        elif 'city' in header_text or 'location' in header_text:
            result['location'] = data_text
    
    return result

def parse_number(text):
    # Remove ~, +, *, etc.
    text = text.replace('~', '').replace('+', '').replace('*', '')
    number_match = re.search(r'[\d,]+', text)
    if number_match:
        return int(number_match.group().replace(',', ''))
    return None

def parse_currency(text):
    # Strip currency symbols, then handle the "$X million" format
    text = re.sub(r'[US\$£€]', '', text)
    if 'million' in text.lower():
        number = float(re.search(r'[\d.]+', text).group())
        return number * 1_000_000
    # Fall back to plain numbers, ignoring trailing notes like "(estimated)"
    number_match = re.search(r'[\d,.]+', text)
    if number_match:
        return float(number_match.group().replace(',', ''))
    return None
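
The functions above handle a single page; a small driver loop ties them to the events list and applies REQUEST_DELAY between requests. A minimal sketch of such a hypothetical scrape_all helper, assuming the events CSV exposes a wikipedia_url column (scrape_all_attendance.py may differ in detail):

import time

import pandas as pd

def scrape_all(events_csv, output_csv, limit=None):
    """Scrape every event page listed in events_csv and write one CSV of results."""
    events = pd.read_csv(events_csv)
    urls = events['wikipedia_url'].dropna().tolist()
    if limit:
        urls = urls[:limit]

    records = []
    for url in urls:
        row = scrape_event_page(url)
        if row:
            records.append(row)
        time.sleep(REQUEST_DELAY)  # stay polite to Wikipedia between requests

    pd.DataFrame(records).to_csv(output_csv, index=False)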

Running the Scraper

python src/etl/scrape_all_attendance.py \
    --events ./data/raw/events.csv \
    --output ./data/external/attendance_full.csv \
    --limit 200  # Optional: limit for testing

Betting Odds Collection

Source: scrape_betting_odds.py

Collects historical betting lines (or generates synthetic data).

import random

import pandas as pd

def generate_synthetic_odds(fights_df):
    """Generate realistic betting odds based on fighter performance."""
    
    odds_records = []
    
    for row in fights_df.itertuples():
        # Simulate odds based on outcomes
        if hasattr(row, 'outcome') and row.outcome == 'W':
            # Winner was likely favored
            f1_prob = random.uniform(0.55, 0.85)
        else:
            f1_prob = random.uniform(0.15, 0.45)
        
        f2_prob = 1 - f1_prob
        
        # Convert to American odds
        f1_odds = probability_to_american(f1_prob)
        f2_odds = probability_to_american(f2_prob)
        
        odds_records.append({
            'fight_id': row.fight_id if hasattr(row, 'fight_id') else None,
            'event_name': row.event,
            'fighter1_name': row.fighter1_name if hasattr(row, 'fighter1_name') else None,
            'fighter2_name': row.fighter2_name if hasattr(row, 'fighter2_name') else None,
            'f1_close_odds': f1_odds,
            'f2_close_odds': f2_odds,
            'implied_prob_f1': f1_prob,
            'implied_prob_f2': f2_prob,
            'source': 'synthetic'
        })
    
    return pd.DataFrame(odds_records)

def probability_to_american(prob):
    """Convert probability to American odds."""
    if prob >= 0.5:
        return int(-100 * prob / (1 - prob))
    else:
        return int(100 * (1 - prob) / prob)

def calculate_betting_features(df):
    """Add betting-derived features."""
    
    # Odds spread (how lopsided)
    df['odds_spread'] = abs(df['implied_prob_f1'] - df['implied_prob_f2'])
    
    # Competitive matchup flag
    df['is_competitive_matchup'] = (df['odds_spread'] < 0.10).astype(int)
    
    # Heavy favorite flag
    df['has_heavy_favorite'] = (
        (df['implied_prob_f1'] > 0.70) | (df['implied_prob_f2'] > 0.70)
    ).astype(int)
    
    return df
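
A quick sanity check of the odds conversion and the derived features, on a hand-built two-row frame (values are illustrative):

import pandas as pd

# -300 for a 75% favorite, +300 for the matching 25% underdog
print(probability_to_american(0.75), probability_to_american(0.25))

toy = pd.DataFrame({
    'implied_prob_f1': [0.75, 0.52],
    'implied_prob_f2': [0.25, 0.48],
})
toy = calculate_betting_features(toy)
print(toy[['odds_spread', 'is_competitive_matchup', 'has_heavy_favorite']])
# Row 0: spread 0.50, heavy favorite; row 1: spread 0.04, flagged competitive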

Running

python src/etl/scrape_betting_odds.py \
    --data-dir ./data \
    --output-dir ./data

Google Trends Collection

Source: fetch_google_trends.py

Fetches search interest for fighters using pytrends.

import random
from datetime import timedelta

import pandas as pd
from pytrends.request import TrendReq

def fetch_fighter_trends(fighter_name, start_date, end_date, pytrends):
    """Fetch Google Trends for a single fighter."""
    
    # Add "UFC" to get relevant results
    keyword = f"{fighter_name} UFC"
    
    pytrends.build_payload(
        kw_list=[keyword],
        timeframe=f"{start_date} {end_date}",
        geo='',
        gprop=''
    )
    
    df = pytrends.interest_over_time()
    
    if df.empty:
        return pd.DataFrame()
    
    df = df.reset_index()
    df = df.rename(columns={keyword: "search_interest"})
    df["fighter_name"] = fighter_name
    
    return df[["date", "fighter_name", "search_interest"]]

def generate_synthetic_trends(fighter_names, events_df):
    """Generate realistic synthetic trends when API unavailable."""
    
    all_records = []
    date_range = pd.date_range('2015-01-01', '2025-12-31', freq='W')
    # Event dates drive the spike logic below (assumes an 'event_date' column)
    event_dates = pd.to_datetime(events_df['event_date'])
    
    for fighter in fighter_names:
        # Base interest varies by "star power"
        base_interest = random.randint(5, 40)
        
        for date in date_range:
            search_interest = base_interest + random.gauss(0, base_interest * 0.3)
            
            # Spike around events
            for event_date in event_dates:
                days_to_event = (event_date - date).days
                
                if 0 <= days_to_event <= 7:  # Pre-event spike
                    spike = random.uniform(1.5, 3.0) * (7 - days_to_event) / 7
                    search_interest *= (1 + spike)
                elif -14 <= days_to_event < 0:  # Post-event decay
                    search_interest *= random.uniform(0.8, 1.0)
            
            search_interest = max(0, min(100, search_interest))
            
            all_records.append({
                "date": date,
                "fighter_name": fighter,
                "search_interest": int(search_interest)
            })
    
    return pd.DataFrame(all_records)

def calculate_pre_event_buzz(trends_df, fights_df):
    """Calculate 7-day and 30-day pre-event buzz."""
    
    buzz_records = []
    
    for fight in fights_df.itertuples():
        event_date = fight.event_date
        
        for fighter in [fight.fighter1_name, fight.fighter2_name]:
            fighter_trends = trends_df[trends_df['fighter_name'] == fighter]
            
            # 7-day pre-event
            mask_7d = (fighter_trends['date'] >= event_date - timedelta(days=7)) & \
                      (fighter_trends['date'] < event_date)
            buzz_7d = fighter_trends[mask_7d]['search_interest'].mean()
            
            # 30-day pre-event
            mask_30d = (fighter_trends['date'] >= event_date - timedelta(days=30)) & \
                       (fighter_trends['date'] < event_date)
            buzz_30d = fighter_trends[mask_30d]['search_interest'].mean()
            
            buzz_records.append({
                'fighter_name': fighter,
                'event_date': event_date,
                'buzz_7d': buzz_7d,
                'buzz_30d': buzz_30d,
                'buzz_trend': buzz_7d / buzz_30d if buzz_30d > 0 else 1.0
            })
    
    return pd.DataFrame(buzz_records)
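
Chaining the pieces: a sketch with toy events/fights frames standing in for the real tables. The column names, the TrendReq settings, and the rate-limit fallback are assumptions about how fetch_google_trends.py wires this up:

# Toy inputs; column names follow the functions above
events_df = pd.DataFrame({'event_date': pd.to_datetime(['2024-04-13'])})
fights_df = pd.DataFrame({
    'event_date': pd.to_datetime(['2024-04-13']),
    'fighter1_name': ['Alex Pereira'],
    'fighter2_name': ['Jamahal Hill'],
})

pytrends = TrendReq(hl='en-US', tz=360)
try:
    frames = [
        fetch_fighter_trends(name, '2024-01-01', '2024-04-13', pytrends)
        for name in ['Alex Pereira', 'Jamahal Hill']
    ]
    trends_df = pd.concat(frames, ignore_index=True)
except Exception:
    # pytrends rate-limits aggressively; fall back to the synthetic generator
    trends_df = generate_synthetic_trends(['Alex Pereira', 'Jamahal Hill'], events_df)

buzz_df = calculate_pre_event_buzz(trends_df, fights_df)
print(buzz_df)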

Running

python src/etl/fetch_google_trends.py \
    --data-dir ./data \
    --output-dir ./data

Reddit Sentiment

Source: scrape_reddit_sentiment.py

Analyzes r/MMA discussion threads.

import random

import numpy as np
import pandas as pd
from textblob import TextBlob

def analyze_sentiment(text):
    """Analyze sentiment using TextBlob."""
    blob = TextBlob(str(text))
    return {
        "polarity": blob.sentiment.polarity,      # -1 to 1
        "subjectivity": blob.sentiment.subjectivity  # 0 to 1
    }

def simple_sentiment(text):
    """Simple word-based sentiment (no dependencies)."""
    text = text.lower()
    
    positive_words = [
        "hype", "excited", "amazing", "banger", "war", "knockout",
        "goat", "legend", "best", "can't wait", "insane"
    ]
    
    negative_words = [
        "boring", "trash", "robbery", "overrated", "ducking",
        "disappointed", "skip", "worst"
    ]
    
    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)
    
    total = pos_count + neg_count
    if total == 0:
        return {"polarity": 0.0, "subjectivity": 0.5}
    
    polarity = (pos_count - neg_count) / total
    return {"polarity": polarity, "subjectivity": min(1.0, total / 10)}

def aggregate_event_sentiment(comments_df):
    """Aggregate sentiment to event level."""
    
    event_sentiment = comments_df.groupby('event_name').agg({
        'polarity': 'mean',
        'subjectivity': 'mean',
        'score': ['sum', 'count', 'mean']
    }).reset_index()
    
    event_sentiment.columns = [
        'event_name', 'avg_polarity', 'avg_subjectivity',
        'total_score', 'comment_count', 'avg_score'
    ]
    
    # Calculate hype score (engagement-weighted sentiment)
    event_sentiment['hype_score'] = (
        event_sentiment['avg_polarity'] * 
        np.log1p(event_sentiment['total_score'])
    )
    
    # Categorize sentiment
    event_sentiment['sentiment_category'] = pd.cut(
        event_sentiment['avg_polarity'],
        bins=[-1, -0.1, 0.1, 1],
        labels=['Negative', 'Neutral', 'Positive']
    )
    
    return event_sentiment

def generate_synthetic_comments(events_df, comments_per_event=500):
    """Generate synthetic Reddit comments."""
    
    positive_templates = [
        "This card is gonna be a banger!",
        "Can't wait for the main event",
        "Finally a stacked card",
        "{fighter} is going to put on a show"
    ]
    
    negative_templates = [
        "Another weak card...",
        "Skip this one",
        "Who asked for this matchup?"
    ]
    
    neutral_templates = [
        "Interesting matchup",
        "Should be competitive",
        "Let's see what happens"
    ]
    
    all_comments = []
    
    for event in events_df.itertuples():
        for i in range(comments_per_event):
            # Random sentiment category
            category = random.choices(
                ['positive', 'negative', 'neutral'],
                weights=[0.5, 0.2, 0.3]
            )[0]
            
            if category == 'positive':
                text = random.choice(positive_templates)
                # Fill the {fighter} placeholder generically; no fighter names are available here
                text = text.replace('{fighter}', 'The main eventer')
                polarity = random.uniform(0.3, 1.0)
            elif category == 'negative':
                text = random.choice(negative_templates)
                polarity = random.uniform(-1.0, -0.2)
            else:
                text = random.choice(neutral_templates)
                polarity = random.uniform(-0.2, 0.2)
            
            all_comments.append({
                'event_name': event.event,
                'comment_text': text,
                'score': random.randint(-5, 100),
                'polarity': polarity,
                'subjectivity': random.uniform(0.3, 0.9)
            })
    
    return pd.DataFrame(all_comments)
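
A short end-to-end sketch of the sentiment path: score a few hand-written comments with simple_sentiment, then roll them up to event level (the comments and scores are illustrative; column names follow the functions above):

import pandas as pd

comments_df = pd.DataFrame({
    'event_name': ['UFC 300', 'UFC 300', 'UFC Fight Night'],
    'comment_text': [
        "This card is gonna be a banger!",
        "Can't wait for the main event",
        "Skip this one",
    ],
    'score': [120, 45, 8],
})

# Expand the {'polarity', 'subjectivity'} dicts into columns
sentiments = comments_df['comment_text'].apply(simple_sentiment).apply(pd.Series)
comments_df = pd.concat([comments_df, sentiments], axis=1)

event_sentiment = aggregate_event_sentiment(comments_df)
print(event_sentiment[['event_name', 'avg_polarity', 'hype_score', 'sentiment_category']])
# UFC 300 rolls up Positive; the Fight Night row rolls up Negative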

Running

python src/etl/scrape_reddit_sentiment.py \
    --data-dir ./data \
    --output-dir ./data \
    --max-events 200 \
    --comments-per-event 500

Data Pipeline Order

# 1. Download base data
python src/etl/ingest.py --data-dir ./data

# 2. Collect external data (can run in parallel)
python src/etl/scrape_betting_odds.py --data-dir ./data --output-dir ./data
python src/etl/fetch_google_trends.py --data-dir ./data --output-dir ./data
python src/etl/scrape_reddit_sentiment.py --data-dir ./data --output-dir ./data

# 3. Run Spark ETL (loads external data)
spark-submit src/etl/spark_etl.py --data-dir ./data --output-dir ./data

Output Files

data/external/
├── attendance.csv           # Wikipedia attendance
├── attendance_full.csv      # Complete scrape
├── betting_odds.csv         # Betting lines
├── google_trends.csv        # Raw search interest
├── fighter_buzz.csv         # Pre-event buzz metrics
├── reddit_comments.csv      # Raw comments
└── event_sentiment.csv      # Aggregated sentiment