UFC Sell-Through Project - Graph Analytics
Overview
The graph module builds a fighter network where:
- Nodes = Fighters
- Edges = Fights between them
This enables computation of:
- PageRank - Identifies important fighters based on opponent quality
- Connected Components - Finds isolated fighter groups
- Label Propagation - Community/division detection
- Triangle Count - Tight-knit fighter clusters
Dependencies
# GraphFrames package for spark-submit
spark-submit \
--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
src/graph/fighter_network.py
Building the Fighter Graph
create_spark_session()
def create_spark_session(checkpoint_dir="/tmp/graphframes-checkpoints"):
    """Create (or reuse) the Spark session used for the fighter-network job.

    The session is named "UFC-Fighter-Network", requests 4g of driver memory
    and pulls in the GraphFrames package. Spark's log level is lowered to
    WARN.

    Args:
        checkpoint_dir: Directory registered as the Spark checkpoint
            directory. GraphFrames' default connected-components algorithm
            refuses to run without one. Pass ``None`` to leave the
            checkpoint directory untouched (for example when the caller
            configures it separately, or needs a cluster-visible path).

    Returns:
        The active ``SparkSession``.
    """
    spark = (
        SparkSession.builder
        .appName("UFC-Fighter-Network")
        .config("spark.driver.memory", "4g")
        .config(
            "spark.jars.packages",
            "graphframes:graphframes:0.8.2-spark3.2-s_2.12",
        )
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel("WARN")

    # connectedComponents() checkpoints intermediate results; without a
    # checkpoint directory it fails at call time rather than here.
    if checkpoint_dir is not None:
        spark.sparkContext.setCheckpointDir(checkpoint_dir)

    return spark
build_fighter_graph()
from graphframes import GraphFrame
def build_fighter_graph(fights_df, events_df=None):
    """Build a GraphFrame whose vertices are fighters and whose edges are fights.

    Args:
        fights_df: DataFrame of fights. It must provide ``fighter1_name`` and
            ``fighter2_name``, or a ``bout`` column of the form
            "A vs B" / "A vs. B" from which both names are extracted.
        events_df: Accepted for interface compatibility; not used here.

    Returns:
        A ``(graph, nodes)`` tuple: the ``GraphFrame`` and the vertex
        DataFrame with columns ``id`` and ``name``.
    """
    # Derive both fighter names from the bout text when they are missing.
    if "fighter1_name" not in fights_df.columns and "bout" in fights_df.columns:
        fights_df = fights_df.withColumn(
            "fighter1_name",
            F.trim(F.regexp_extract(F.col("bout"), r"^(.+?)\s+vs\.?\s+", 1)),
        ).withColumn(
            "fighter2_name",
            F.trim(F.regexp_extract(F.col("bout"), r"\s+vs\.?\s+(.+)$", 1)),
        )

    # Unique fighter names from either side of a fight. regexp_extract
    # yields "" when the pattern does not match, so blank and null names are
    # dropped; otherwise every unparseable bout would collapse into one
    # bogus "" fighter with self-loop edges that distort the graph metrics.
    all_fighters = (
        fights_df.select(F.col("fighter1_name").alias("name"))
        .union(fights_df.select(F.col("fighter2_name").alias("name")))
        .filter(F.col("name").isNotNull() & (F.col("name") != ""))
        .distinct()
    )

    # Assign a numeric surrogate id per fighter. The ids produced by
    # monotonically_increasing_id() depend on partitioning, and this frame
    # is evaluated several times (vertices, both edge joins, the count
    # below). Caching pins a single assignment so vertices and edges agree.
    nodes = (
        all_fighters
        .withColumn("id", F.monotonically_increasing_id())
        .select("id", "name")
        .cache()
    )

    # Translate each (fighter1, fighter2) pair into (src, dst) ids. The
    # inner joins discard pairs whose name was filtered out above.
    fight_pairs = fights_df.select(
        F.col("fighter1_name").alias("src_name"),
        F.col("fighter2_name").alias("dst_name"),
    )
    src_lookup = nodes.select(
        F.col("name").alias("src_name"), F.col("id").alias("src")
    )
    dst_lookup = nodes.select(
        F.col("name").alias("dst_name"), F.col("id").alias("dst")
    )
    edges = (
        fight_pairs
        .join(src_lookup, on="src_name")
        .join(dst_lookup, on="dst_name")
        .select("src", "dst")
    )

    graph = GraphFrame(nodes, edges)
    print(f"Graph created: {nodes.count()} fighters, {edges.count()} fights")
    return graph, nodes
PageRank
PageRank measures fighter "importance" based on:
- Number of fights (more fights = more connections)
- Quality of opponents (being matched against high-PageRank fighters boosts your score; edges record bouts, not who won)
def compute_pagerank(graph, reset_probability=0.15, max_iter=20):
    """Run PageRank on the fighter graph and print the highest-ranked fighters.

    Args:
        graph: GraphFrame of fighters and fights.
        reset_probability: Passed to GraphFrames as ``resetProbability``.
        max_iter: Passed to GraphFrames as ``maxIter``.

    Returns:
        DataFrame with columns ``id`` and ``pagerank``.
    """
    print("Computing PageRank...")
    ranked_graph = graph.pageRank(
        resetProbability=reset_probability, maxIter=max_iter
    )
    scores = ranked_graph.vertices.select("id", "pagerank")

    # Re-attach the vertex attributes so the printed table is readable.
    leaderboard = scores.join(graph.vertices, on="id")
    leaderboard = leaderboard.orderBy(F.desc("pagerank"))
    leaderboard = leaderboard.limit(20)

    print("Top 10 fighters by PageRank:")
    leaderboard.show(10, truncate=False)
    return scores
PageRank Interpretation
| PageRank Range | Interpretation |
|---|---|
| > 0.01 | Elite champion level |
| 0.005 - 0.01 | Top contender |
| 0.001 - 0.005 | Ranked fighter |
| < 0.001 | Unranked / regional |
Connected Components
Find isolated groups of fighters (rare in UFC):
def compute_connected_components(graph):
    """Label every fighter with its connected component and print component sizes.

    Args:
        graph: GraphFrame of fighters and fights.

    Returns:
        The DataFrame returned by ``graph.connectedComponents()``; it is
        grouped here by its ``component`` column.
    """
    # NOTE(review): GraphFrames' default algorithm is documented to need a
    # Spark checkpoint directory - confirm the session configures one.
    print("Computing Connected Components...")
    components = graph.connectedComponents()

    # Show the ten largest components first.
    (
        components
        .groupBy("component")
        .count()
        .orderBy(F.desc("count"))
        .show(10)
    )
    return components
Label Propagation (Community Detection)
Identifies "communities" - often corresponding to weight classes:
def compute_communities(graph, max_iter=10):
    """Detect fighter communities with label propagation and print their sizes.

    Args:
        graph: GraphFrame of fighters and fights.
        max_iter: Passed to GraphFrames as ``maxIter``.

    Returns:
        DataFrame with columns ``id`` and ``community`` (the propagated
        label, renamed).
    """
    print("Computing Communities via Label Propagation...")
    labelled = graph.labelPropagation(maxIter=max_iter)
    community_df = labelled.select("id", F.col("label").alias("community"))

    # Size of each community, reported largest first.
    sizes = community_df.groupBy("community").count()
    print(f"Found {sizes.count()} communities")
    sizes.orderBy(F.desc("count")).show(10)

    return community_df
Community Interpretation
Fighters in the same community typically:
- Share the same weight class
- Have fought common opponents
- Are part of the same era
Triangle Count
Measures clustering - fighters who share many common opponents:
def compute_triangle_count(graph):
    """Count the triangles each fighter participates in and print the top ten.

    Args:
        graph: GraphFrame of fighters and fights.

    Returns:
        DataFrame with columns ``id`` and ``triangle_count``.
    """
    print("Computing Triangle Count...")
    # The result is selected from directly and carries the per-vertex
    # total in its "count" column.
    counted = graph.triangleCount()
    triangle_df = counted.select(
        "id",
        F.col("count").alias("triangle_count"),
    )

    ranked = triangle_df.orderBy(F.desc("triangle_count"))
    ranked.show(10)
    return triangle_df
Network Statistics
def compute_network_stats(graph, fights_df):
    """Compute each fighter's total degree in the fight graph.

    Edge direction only reflects which fighter was listed first, so in- and
    out-degree are combined. The value counts fights rather than distinct
    opponents: a rematch contributes once per bout.

    Args:
        graph: GraphFrame of fighters and fights.
        fights_df: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns ``id`` and ``network_size``. Only fighters
        that appear in at least one edge are present.
    """
    print("Computing network statistics...")
    # GraphFrame.degrees already counts every edge touching a vertex,
    # which equals inDegree + outDegree without an outer join and fillna.
    return graph.degrees.select(
        "id",
        F.col("degree").alias("network_size"),
    )
Aggregating Graph Features
Per-Fighter Features
def create_fighter_graph_features(pagerank_df, community_df, network_df, nodes):
    """Combine the per-fighter graph metrics into one table keyed by name.

    Args:
        pagerank_df: DataFrame keyed by ``id`` supplying ``pagerank``.
        community_df: DataFrame keyed by ``id`` supplying ``community``.
        network_df: DataFrame keyed by ``id`` supplying ``network_size``.
        nodes: Vertex DataFrame with ``id`` and ``name``.

    Returns:
        DataFrame with columns ``fighter_name``, ``pagerank``,
        ``community`` and ``network_size``. Left joins keep every fighter
        in ``nodes``, with nulls where a metric is missing.
    """
    features = nodes.select("id", "name")
    for metric_df in (pagerank_df, community_df, network_df):
        features = features.join(metric_df, on="id", how="left")

    # Drop the surrogate id and expose the name under its feature-table alias.
    return features.select(
        F.col("name").alias("fighter_name"),
        "pagerank",
        "community",
        "network_size",
    )
Per-Fight Features
def create_fight_graph_features(fights_df, fighter_features):
    """Attach both fighters' graph metrics to each fight and derive pair features.

    Args:
        fights_df: DataFrame of fights with ``fighter1_name`` and
            ``fighter2_name`` columns.
        fighter_features: DataFrame with ``fighter_name``, ``pagerank``,
            ``community`` and ``network_size``.

    Returns:
        ``fights_df`` extended with the ``f1_*`` / ``f2_*`` metric columns
        plus ``combined_pagerank``, ``pagerank_diff``, ``combined_network``
        and ``same_community`` (integer flag).
    """

    def _side_features(side):
        # Fighter metrics renamed for one side of the bout (1 or 2).
        return fighter_features.select(
            F.col("fighter_name").alias(f"fighter{side}_name"),
            F.col("pagerank").alias(f"f{side}_pagerank"),
            F.col("community").alias(f"f{side}_community"),
            F.col("network_size").alias(f"f{side}_network"),
        )

    enriched = fights_df.join(_side_features(1), on="fighter1_name", how="left")
    enriched = enriched.join(_side_features(2), on="fighter2_name", how="left")

    pagerank_1, pagerank_2 = F.col("f1_pagerank"), F.col("f2_pagerank")
    same_community = F.col("f1_community") == F.col("f2_community")

    return (
        enriched
        .withColumn("combined_pagerank", pagerank_1 + pagerank_2)
        .withColumn("pagerank_diff", F.abs(pagerank_1 - pagerank_2))
        .withColumn("combined_network", F.col("f1_network") + F.col("f2_network"))
        .withColumn("same_community", same_community.cast(IntegerType()))
    )
Per-Event Aggregation
def aggregate_graph_to_events(fight_features):
    """Roll the per-fight graph features up to one row per event.

    Args:
        fight_features: Per-fight DataFrame with ``event_id``,
            ``combined_pagerank``, ``pagerank_diff``, ``combined_network``
            and ``same_community``.

    Returns:
        DataFrame keyed by ``event_id`` with the averaged, maximum and
        summed graph features.
    """
    aggregations = [
        F.avg("combined_pagerank").alias("avg_combined_pagerank"),
        F.max("combined_pagerank").alias("max_combined_pagerank"),
        F.avg("pagerank_diff").alias("avg_pagerank_diff"),
        F.avg("combined_network").alias("avg_network_size"),
        F.sum("same_community").alias("num_same_community_fights"),
    ]
    return fight_features.groupBy("event_id").agg(*aggregations)
Graph Features Output
| Feature | Description | Impact on Sell-Through |
|---|---|---|
| `avg_combined_pagerank` | Mean star power | Higher = more draws |
| `max_combined_pagerank` | Biggest star fight | Strong predictor |
| `avg_pagerank_diff` | Average mismatch | Lower = competitive |
| `avg_network_size` | Mean experience | More experience helps |
| `num_same_community_fights` | Division matchups | Division fights draw fans |
Running Graph Analysis
spark-submit \
--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
src/graph/fighter_network.py \
--data-dir ./data \
--output-dir ./data
Output Files
data/features/graph_features/
├── _SUCCESS
├── part-00000-*.parquet
└── ...
Example Output
Top 10 fighters by PageRank:
+-------------------+------------+
|name               |pagerank    |
+-------------------+------------+
|Conor McGregor     |0.0147      |
|Jon Jones          |0.0139      |
|Georges St-Pierre  |0.0128      |
|Amanda Nunes       |0.0115      |
|Khabib Nurmagomedov|0.0108      |
|Daniel Cormier     |0.0102      |
|Max Holloway       |0.0095      |
|Israel Adesanya    |0.0091      |
|Dustin Poirier     |0.0088      |
|Charles Oliveira   |0.0084      |
+-------------------+------------+