UFC Sell-Through Project - Feature Engineering
Overview
The feature engineering module builds features in four categories, which are combined into the 36 event-level features listed at the end of this document:
- Fighter Statistics - Rolling performance metrics
- Matchup Features - Physical and historical comparisons
- Event Features - Card-level aggregations
- External Features - Graph (PageRank), betting odds, search trends, Reddit sentiment
Window Functions
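PySpark window functions enable rolling calculations without loops. Both windows below end at -1 (the row immediately before the current fight), so every statistic is computed only from a fighter's earlier fights and never from the fight being described: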
from pyspark.sql import Window
from pyspark.sql import functions as F
# Career window: every earlier fight of the same fighter, oldest first,
# stopping one row before the current fight.
win_all = (
    Window.partitionBy("fighter_name")
    .orderBy("event_date")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Recent-form window: the five fights immediately preceding the current one.
win_n = (
    Window.partitionBy("fighter_name")
    .orderBy("event_date")
    .rowsBetween(-5, -1)
)
Fighter Statistics
make_fighter_stats()
def make_fighter_stats(fights, stats, n=5):
    """Build per-fighter, per-fight rolling and career statistics.

    Every fight is expanded into two rows (one per fighter). Each row is
    annotated with statistics computed only from that fighter's *earlier*
    fights, so no feature depends on the outcome of the fight it describes.

    Args:
        fights: DataFrame with columns fight_id, event_id, event_date,
            fighter1_name, fighter2_name, winner_name, method_category,
            is_title_fight and weight_class.
        stats: DataFrame with columns fight_id, fighter_name,
            sig_strikes_landed, sig_strikes_attempted, takedowns_landed
            and takedowns_attempted.
        n: Number of preceding fights in the "recent form" window. It sets
            both the window size and the ``win_rate_last{n}`` /
            ``finish_rate_last{n}`` column names.

    Returns:
        DataFrame with columns fight_id, event_id, event_date, fighter_name,
        win_rate_last{n}, finish_rate_last{n}, total_fights, days_off,
        strike_acc, td_acc and title_fights. For a fighter's first recorded
        fight the windows are empty, so the averages, the title-fight sum
        and days_off are null and total_fights is 0.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # fight_id is a tie-breaker: a fighter can have several bouts on the same
    # event_date, and row-based frames are nondeterministic on ties.
    fighter_window = Window.partitionBy("fighter_name").orderBy("event_date", "fight_id")
    # Both frames end at -1 so the current fight never feeds its own features.
    win_all = fighter_window.rowsBetween(Window.unboundedPreceding, -1)
    # Built from n so the frame always matches the column name.
    win_n = fighter_window.rowsBetween(-n, -1)

    def fighter_view(name_col):
        # One row per fight from the perspective of the fighter in name_col.
        view = fights.select(
            "fight_id", "event_id", "event_date",
            F.col(name_col).alias("fighter_name"),
            "winner_name", "method_category", "is_title_fight", "weight_class",
        )
        return view.withColumn(
            "won",
            (F.col("fighter_name") == F.col("winner_name")).cast(IntegerType()),
        )

    # Union both perspectives (both views share the same column order).
    all_fights = fighter_view("fighter1_name").union(fighter_view("fighter2_name"))

    # A finish only counts for the fighter who won by KO/TKO or submission.
    all_fights = all_fights.withColumn(
        "finished",
        F.when(
            (F.col("won") == 1)
            & F.col("method_category").isin(["KO/TKO", "Submission"]),
            1,
        ).otherwise(0),
    )

    # Attach per-fight striking and takedown numbers; a left join keeps
    # fights that have no stats row.
    all_fights = all_fights.join(
        stats.select(
            "fight_id", "fighter_name", "sig_strikes_landed",
            "sig_strikes_attempted", "takedowns_landed", "takedowns_attempted",
        ),
        on=["fight_id", "fighter_name"],
        how="left",
    )

    # Per-fight accuracies; fights with zero attempts yield null and are
    # therefore ignored by avg().
    strike_accuracy = F.when(
        F.col("sig_strikes_attempted") > 0,
        F.col("sig_strikes_landed") / F.col("sig_strikes_attempted"),
    )
    td_accuracy = F.when(
        F.col("takedowns_attempted") > 0,
        F.col("takedowns_landed") / F.col("takedowns_attempted"),
    )

    result = (
        all_fights
        # Recent form over the previous n fights.
        .withColumn(f"win_rate_last{n}", F.avg("won").over(win_n))
        .withColumn(f"finish_rate_last{n}", F.avg("finished").over(win_n))
        # Number of earlier fights on record.
        .withColumn("total_fights", F.count("*").over(win_all))
        # Days since the fighter's previous fight.
        .withColumn("last_fight", F.lag("event_date", 1).over(fighter_window))
        .withColumn("days_off", F.datediff("event_date", "last_fight"))
        # Career accuracy = mean of the per-fight accuracies of earlier fights.
        .withColumn("strike_acc", F.avg(strike_accuracy).over(win_all))
        .withColumn("td_acc", F.avg(td_accuracy).over(win_all))
        # Number of earlier title fights.
        .withColumn(
            "title_fights",
            F.sum(F.col("is_title_fight").cast(IntegerType())).over(win_all),
        )
    )

    return result.select([
        "fight_id", "event_id", "event_date", "fighter_name",
        f"win_rate_last{n}", f"finish_rate_last{n}",
        "total_fights", "days_off", "strike_acc", "td_acc", "title_fights",
    ])
Fighter Features Table
| Feature | Description | Calculation |
|---|---|---|
| win_rate_last5 | Win rate in last 5 fights | avg(won) over last-5 window |
| finish_rate_last5 | Finish rate in last 5 fights | avg(finished) over last-5 window |
| total_fights | Career fight count (prior fights) | count(*) over career |
| days_off | Days since last fight | datediff(event_date, previous event_date) |
| strike_acc | Career strike accuracy | avg(landed / attempted) over career |
| td_acc | Career takedown accuracy | avg(landed / attempted) over career |
| title_fights | Career title fight count | sum(is_title_fight) over career |
Matchup Features
make_matchup_features()
def make_matchup_features(fights, fighter_feats, fighters):
    """Derive per-fight matchup features: rematch flags and physical gaps.

    Args:
        fights: DataFrame with fight_id, event_id, event_date,
            fighter1_name, fighter2_name and is_title_fight.
        fighter_feats: Accepted for interface compatibility; not read here.
        fighters: DataFrame with fighter_name, height_inches, reach_inches
            and dob.

    Returns:
        DataFrame with fight_id, event_id, fighter1_name, fighter2_name,
        is_rematch, is_rivalry, is_title_fight, reach_diff, height_diff
        and age_diff.
    """
    # Order-independent pair key: (A, B) and (B, A) map to the same pair.
    lower_name = F.least("fighter1_name", "fighter2_name")
    upper_name = F.greatest("fighter1_name", "fighter2_name")
    df = fights.withColumn("pair1", lower_name).withColumn("pair2", upper_name)

    # Count, for every fight, the strictly earlier fights between the same pair.
    earlier = fights.select(
        lower_name.alias("pair1"),
        upper_name.alias("pair2"),
        F.col("event_date").alias("prev_event_date"),
    )
    current = df.select(
        "fight_id", F.col("event_date").alias("curr_date"), "pair1", "pair2"
    )
    prev_counts = (
        earlier.join(current, on=["pair1", "pair2"])
        .filter(F.col("prev_event_date") < F.col("curr_date"))
        .groupBy("fight_id")
        .agg(F.count("*").alias("times_fought"))
    )
    # Fights with no earlier meeting are absent from prev_counts; treat as 0.
    df = df.join(prev_counts, on="fight_id", how="left").fillna({"times_fought": 0})

    # Rematch = met at least once before; rivalry = met at least twice before.
    met_before = F.col("times_fought")
    df = df.withColumn("is_rematch", (met_before > 0).cast(IntegerType()))
    df = df.withColumn("is_rivalry", (met_before >= 2).cast(IntegerType()))

    def physical(name_alias, height_alias, reach_alias, dob_alias):
        # Physical attributes renamed so they can be joined for one corner.
        return fighters.select(
            F.col("fighter_name").alias(name_alias),
            F.col("height_inches").alias(height_alias),
            F.col("reach_inches").alias(reach_alias),
            F.col("dob").alias(dob_alias),
        )

    df = df.join(
        physical("fighter1_name", "h1", "r1", "dob1"),
        on="fighter1_name",
        how="left",
    )
    df = df.join(
        physical("fighter2_name", "h2", "r2", "dob2"),
        on="fighter2_name",
        how="left",
    )

    # Absolute gaps between the two fighters.
    df = df.withColumn("reach_diff", F.abs(F.col("r1") - F.col("r2")))
    df = df.withColumn("height_diff", F.abs(F.col("h1") - F.col("h2")))

    # Ages in years on fight day, then their absolute difference.
    years_f1 = F.datediff("event_date", "dob1") / 365.25
    years_f2 = F.datediff("event_date", "dob2") / 365.25
    df = df.withColumn("age_diff", F.abs(years_f1 - years_f2))

    output_columns = [
        "fight_id", "event_id", "fighter1_name", "fighter2_name",
        "is_rematch", "is_rivalry", "is_title_fight",
        "reach_diff", "height_diff", "age_diff",
    ]
    return df.select(output_columns)
Matchup Features Table
| Feature | Description |
|---|---|
| is_rematch | Fighters have met before |
| is_rivalry | Met 2+ times before |
| reach_diff | Absolute reach difference (inches) |
| height_diff | Absolute height difference (inches) |
| age_diff | Absolute age difference (years) |
Event Features
make_event_features()
def make_event_features(events, fights, matchups, fighter_feats):
    """Aggregate fight, matchup and fighter features up to the event level.

    Args:
        events: DataFrame with event_id, event_date, event_type, city
            and country.
        fights: DataFrame with event_id and is_title_fight.
        matchups: DataFrame with event_id, is_rematch, is_rivalry,
            reach_diff, height_diff and age_diff.
        fighter_feats: DataFrame with event_id, total_fights and
            win_rate_last5.

    Returns:
        ``events`` with the card-level aggregates, calendar features and
        location/type flags appended. Aggregates are left-joined, so events
        without matching rows keep nulls in those columns.
    """
    def as_int(column):
        return column.cast(IntegerType())

    # Card size and title-fight counts.
    title_flag = as_int(F.col("is_title_fight"))
    fight_counts = fights.groupBy("event_id").agg(
        F.count("*").alias("num_fights"),
        F.sum(title_flag).alias("num_title_fights"),
        F.max(title_flag).alias("has_title"),
    )

    # Rematch and rivalry presence on the card.
    rematch_counts = matchups.groupBy("event_id").agg(
        F.sum(as_int(F.col("is_rematch"))).alias("num_rematches"),
        F.max("is_rivalry").alias("has_rivalry"),
    )

    # Mean physical differentials across the card.
    matchup_stats = matchups.groupBy("event_id").agg(
        F.avg("reach_diff").alias("avg_reach_diff"),
        F.avg("height_diff").alias("avg_height_diff"),
        F.avg("age_diff").alias("avg_age_diff"),
    )

    # Experience and recent form of the fighters on the card.
    fighter_avg = fighter_feats.groupBy("event_id").agg(
        F.avg("total_fights").alias("avg_exp"),
        F.avg("win_rate_last5").alias("avg_win_rate"),
        F.max("total_fights").alias("max_exp"),
    )

    # Attach every aggregate to the event rows, in this fixed order.
    df = events
    for aggregate in (fight_counts, rematch_counts, matchup_stats, fighter_avg):
        df = df.join(aggregate, on="event_id", how="left")

    # Date of the previous event of the same event_type.
    win_prev = Window.partitionBy("event_type").orderBy("event_date")
    previous_event_date = F.lag("event_date", 1).over(win_prev)

    return (
        df
        # Calendar features (Spark dayofweek: 1 = Sunday ... 7 = Saturday).
        .withColumn("day_of_week", F.dayofweek("event_date"))
        .withColumn("month", F.month("event_date"))
        .withColumn("is_saturday", as_int(F.col("day_of_week") == 7))
        .withColumn("days_since_last", F.datediff("event_date", previous_event_date))
        # Location and event-type flags.
        .withColumn("is_vegas", as_int(F.lower(F.col("city")).contains("las vegas")))
        .withColumn("is_usa", as_int(F.col("country") == "USA"))
        .withColumn("is_ppv", as_int(F.col("event_type") == "PPV"))
    )
Event Features Table
| Feature | Description |
|---|---|
| num_fights | Total fights on card |
| num_title_fights | Number of title fights |
| has_title | At least one title fight |
| num_rematches | Rematch count |
| has_rivalry | Card has a rivalry |
| avg_reach_diff | Mean reach differential |
| avg_height_diff | Mean height differential |
| avg_age_diff | Mean age differential |
| avg_exp | Average fighter experience |
| avg_win_rate | Average fighter win rate (last 5 fights) |
| max_exp | Most experienced fighter |
| day_of_week | Day (1=Sunday, 7=Saturday) |
| month | Month number |
| is_saturday | Saturday event |
| days_since_last | Days since previous event of the same type |
| is_vegas | Las Vegas location |
| is_usa | USA location |
| is_ppv | Pay-per-view event |
Extended Features (External Data)
Graph Features
| Feature | Description |
|---|---|
| avg_combined_pagerank | Mean PageRank sum |
| max_combined_pagerank | Max star power |
| avg_pagerank_diff | Mean PageRank differential |
| avg_network_size | Mean opponent count |
| num_same_community_fights | Same-division matchups |
Betting Features
| Feature | Description |
|---|---|
| avg_betting_spread | Mean odds spread |
| max_betting_spread | Biggest mismatch |
| min_betting_spread | Closest matchup |
| num_competitive_fights | Spread < 10% |
| num_heavy_favorites | Favorite > 70% |
Sentiment Features
| Feature | Description |
|---|---|
| avg_buzz_7d | Mean 7-day search interest |
| max_buzz_7d | Peak search interest |
| total_buzz_7d | Sum of all buzz |
| avg_buzz_diff | Mean buzz differential |
| reddit_sentiment | Average r/MMA sentiment |
| reddit_hype | Engagement-weighted hype |
| reddit_engagement | Total Reddit engagement |
| reddit_comments | Comment count |
Complete Feature List (36 Features)
# The 36 event-level model inputs, grouped by source. Order is significant
# for any consumer that indexes into this list.
feature_columns = [
    # Base features: card composition
    "num_fights",
    "num_title_fights",
    "has_title",
    "num_rematches",
    "has_rivalry",
    # Base features: fighter experience and form
    "avg_exp",
    "avg_win_rate",
    "max_exp",
    # Base features: physical differentials
    "avg_reach_diff",
    "avg_height_diff",
    "avg_age_diff",
    # Base features: calendar
    "day_of_week",
    "month",
    "is_saturday",
    "days_since_last",
    # Base features: location and event type
    "is_vegas",
    "is_usa",
    "is_ppv",
    # Graph features
    "avg_combined_pagerank",
    "max_combined_pagerank",
    "avg_pagerank_diff",
    "avg_network_size",
    "num_same_community_fights",
    # Betting features
    "avg_betting_spread",
    "max_betting_spread",
    "min_betting_spread",
    "num_competitive_fights",
    "num_heavy_favorites",
    # Sentiment features
    "avg_buzz_7d",
    "max_buzz_7d",
    "total_buzz_7d",
    "avg_buzz_diff",
    "reddit_sentiment",
    "reddit_hype",
    "reddit_engagement",
    "reddit_comments",
]
Running Feature Engineering
# Submit the feature engineering script to Spark; --data-dir and --output-dir both point at ./data.
spark-submit src/features/feature_engineering.py \
--data-dir ./data \
--output-dir ./data \
--no-external # Skip external data if not available
Output
Features saved to Parquet:
data/features/fighter_features/
data/features/matchup_features/
data/features/event_features/