UFC Sell-Through Project - Graph Analytics

Overview

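The graph module builds a fighter network where each node is a fighter and each edge is a fight between two fighters (directed from the first-listed fighter to the second).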

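This enables computation of PageRank, connected components, label-propagation communities, triangle counts, and per-fighter network size (degree).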

Dependencies

# GraphFrames package for spark-submit
spark-submit \
    --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
    src/graph/fighter_network.py

Building the Fighter Graph

create_spark_session()

def create_spark_session():
    """Build (or reuse) the Spark session for the fighter-network job.

    The session is named "UFC-Fighter-Network", sets ``spark.driver.memory``
    to "4g", and lists the GraphFrames package under ``spark.jars.packages``.
    The Spark context's log level is set to WARN before returning.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    graphframes_pkg = "graphframes:graphframes:0.8.2-spark3.2-s_2.12"

    session = (
        SparkSession.builder
        .appName("UFC-Fighter-Network")
        .config("spark.driver.memory", "4g")
        .config("spark.jars.packages", graphframes_pkg)
        .getOrCreate()
    )

    # Only warnings and errors from Spark should reach the console.
    session.sparkContext.setLogLevel("WARN")
    return session

build_fighter_graph()

from graphframes import GraphFrame

def build_fighter_graph(fights_df, events_df=None):
    """Build a GraphFrame whose vertices are fighters and whose edges are fights.

    Args:
        fights_df: DataFrame of fights. It must either contain
            ``fighter1_name`` and ``fighter2_name`` columns, or a ``bout``
            column of the form "A vs B" / "A vs. B" from which both names
            are parsed.
        events_df: Not used by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        A tuple ``(graph, nodes)`` where ``graph`` is the GraphFrame and
        ``nodes`` is the vertex DataFrame with columns ``id`` and ``name``.
        Edges run from ``fighter1_name`` (``src``) to ``fighter2_name``
        (``dst``), one edge per row of ``fights_df`` whose two names both
        resolve to a vertex.
    """
    # Derive the two fighter names from the bout title when they are missing.
    if "fighter1_name" not in fights_df.columns and "bout" in fights_df.columns:
        fights_df = fights_df.withColumn(
            "fighter1_name",
            F.trim(F.regexp_extract(F.col("bout"), r"^(.+?)\s+vs\.?\s+", 1))
        )
        fights_df = fights_df.withColumn(
            "fighter2_name",
            F.trim(F.regexp_extract(F.col("bout"), r"\s+vs\.?\s+(.+)$", 1))
        )

    # Collect unique fighters from both sides of every bout.
    fighters1 = fights_df.select(F.col("fighter1_name").alias("name"))
    fighters2 = fights_df.select(F.col("fighter2_name").alias("name"))
    all_fighters = fighters1.union(fighters2).distinct()

    # regexp_extract yields "" when the pattern does not match, and source
    # data may contain nulls. Without this filter every unparseable bout
    # would hang off a single phantom "" fighter and distort the graph.
    # Edges touching a dropped name disappear in the inner joins below.
    all_fighters = all_fighters.filter(F.length(F.trim(F.col("name"))) > 0)

    # Assign a 64-bit surrogate ID to each fighter.
    # monotonically_increasing_id() depends on partitioning and row order, so
    # it is not stable across re-evaluations of the plan. The ID frame is
    # used three times (vertices, src join, dst join); caching it keeps all
    # three views consistent instead of recomputing the IDs per use.
    fighters_with_ids = all_fighters.withColumn(
        "id", F.monotonically_increasing_id()
    ).cache()
    nodes = fighters_with_ids.select("id", "name")
    name_to_id = nodes.select("name", "id")

    # Create edges: one (src, dst) pair per fight.
    fight_pairs = fights_df.select(
        F.col("fighter1_name").alias("src_name"),
        F.col("fighter2_name").alias("dst_name")
    )

    edges = fight_pairs.join(
        name_to_id.withColumnRenamed("id", "src").withColumnRenamed("name", "src_name"),
        on="src_name"
    ).join(
        name_to_id.withColumnRenamed("id", "dst").withColumnRenamed("name", "dst_name"),
        on="dst_name"
    ).select("src", "dst")

    # Create GraphFrame
    graph = GraphFrame(nodes, edges)

    print(f"Graph created: {nodes.count()} fighters, {edges.count()} fights")

    return graph, nodes

PageRank

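PageRank measures fighter "importance" based on the structure of the fight network: a fighter scores higher when linked to many opponents, especially opponents who themselves score highly.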

def compute_pagerank(graph, reset_probability=0.15, max_iter=20):
    """Run PageRank on the fighter graph and print the highest-ranked fighters.

    Args:
        graph: GraphFrame to rank.
        reset_probability: Passed to ``pageRank`` as ``resetProbability``.
        max_iter: Passed to ``pageRank`` as ``maxIter``.

    Returns:
        DataFrame with columns ``id`` and ``pagerank``, one row per vertex
        of the ranked graph.
    """
    print("Computing PageRank...")

    ranked_graph = graph.pageRank(
        resetProbability=reset_probability, maxIter=max_iter
    )

    # The scores live on the vertices of the graph that pageRank() returns.
    scores = ranked_graph.vertices.select("id", "pagerank")

    # Attach the original vertex columns so the printed table is readable,
    # then keep the 20 highest scores.
    with_vertex_info = scores.join(graph.vertices, on="id")
    leaderboard = with_vertex_info.orderBy(F.desc("pagerank")).limit(20)

    print("Top 10 fighters by PageRank:")
    leaderboard.show(10, truncate=False)

    return scores

PageRank Interpretation

| PageRank Range | Interpretation |
|----------------|----------------|
| > 0.01 | Elite champion level |
| 0.005 - 0.01 | Top contender |
| 0.001 - 0.005 | Ranked fighter |
| < 0.001 | Unranked / regional |

Connected Components

Find isolated groups of fighters (rare in UFC):

def compute_connected_components(graph):
    """Assign each fighter to a connected component and print the largest ones.

    GraphFrames' default connected-components algorithm checkpoints
    intermediate results and fails if the SparkContext has no checkpoint
    directory. If none is configured, a temporary local directory is set
    before running; an already-configured directory is left untouched.

    Args:
        graph: GraphFrame of fighters and fights.

    Returns:
        The DataFrame returned by ``graph.connectedComponents()``, which
        includes a ``component`` column.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile

    from pyspark.sql import SparkSession

    print("Computing Connected Components...")

    # Reuse the session the graph was built on; getOrCreate() returns the
    # existing one rather than starting a new session.
    sc = SparkSession.builder.getOrCreate().sparkContext
    if sc.getCheckpointDir() is None:
        # NOTE: a local temp dir is only suitable for local mode. On a
        # cluster, call sc.setCheckpointDir() with a shared (e.g. HDFS)
        # path before invoking this function.
        sc.setCheckpointDir(tempfile.mkdtemp(prefix="graphframes-checkpoint-"))

    cc = graph.connectedComponents()

    # Count fighters per component and show the ten largest.
    component_sizes = cc.groupBy("component").count()
    component_sizes.orderBy(F.desc("count")).show(10)

    return cc

Label Propagation (Community Detection)

Identifies "communities" - often corresponding to weight classes:

def compute_communities(graph, max_iter=10):
    """Detect fighter communities with label propagation and print a summary.

    Args:
        graph: GraphFrame of fighters and fights.
        max_iter: Passed to ``labelPropagation`` as ``maxIter``.

    Returns:
        DataFrame with columns ``id`` and ``community`` (the propagated
        ``label`` renamed).
    """
    print("Computing Communities via Label Propagation...")

    labelled = graph.labelPropagation(maxIter=max_iter)

    # Expose the propagated label under a domain-specific column name.
    community_df = labelled.select("id", F.col("label").alias("community"))

    # Size of every community; report how many exist and show the ten largest.
    sizes = community_df.groupBy("community").count()
    n_communities = sizes.count()
    print(f"Found {n_communities} communities")
    sizes.orderBy(F.desc("count")).show(10)

    return community_df

Community Interpretation

Fighters in the same community typically have fought one another or share many common opponents.

Triangle Count

Measures clustering - fighters who share many common opponents:

def compute_triangle_count(graph):
    """Count the triangles each fighter participates in and print the top ten.

    Args:
        graph: GraphFrame of fighters and fights.

    Returns:
        DataFrame with columns ``id`` and ``triangle_count``.
    """
    print("Computing Triangle Count...")

    # The result is used directly as a DataFrame carrying "id" and "count".
    counted = graph.triangleCount()
    triangle_df = counted.select(
        "id",
        F.col("count").alias("triangle_count"),
    )

    # Fighters with the most triangles first.
    ranked = triangle_df.orderBy(F.desc("triangle_count"))
    ranked.show(10)

    return triangle_df

Network Statistics

def compute_network_stats(graph, fights_df):
    """Compute each fighter's total degree in the fight graph.

    A fight is stored as a single directed edge, but direction carries no
    meaning here, so a fighter's network size is the number of edges that
    touch them in either direction. This counts fights, not unique
    opponents: a rematch contributes one edge per bout.

    Args:
        graph: GraphFrame of fighters and fights.
        fights_df: Not used by this function; kept for call-site
            compatibility.

    Returns:
        DataFrame with columns ``id`` and ``network_size``, one row per
        vertex that appears in at least one edge.
    """
    print("Computing network statistics...")

    # GraphFrame.degrees already reports in-degree + out-degree per vertex,
    # which replaces joining inDegrees with outDegrees and summing by hand.
    return graph.degrees.select(
        "id",
        F.col("degree").alias("network_size"),
    )

Aggregating Graph Features

Per-Fighter Features

def create_fighter_graph_features(pagerank_df, community_df, network_df, nodes):
    """Combine the per-fighter graph metrics into one feature table.

    Args:
        pagerank_df: DataFrame keyed by ``id`` providing ``pagerank``.
        community_df: DataFrame keyed by ``id`` providing ``community``.
        network_df: DataFrame keyed by ``id`` providing ``network_size``.
        nodes: Vertex DataFrame with ``id`` and ``name``.

    Returns:
        DataFrame with columns ``fighter_name``, ``pagerank``,
        ``community`` and ``network_size``. Left joins are used, so every
        fighter in ``nodes`` is kept even when a metric is missing.
    """
    features = nodes.select("id", "name")

    # Left-join each metric in turn so no fighter is dropped.
    for metric_df in (pagerank_df, community_df, network_df):
        features = features.join(metric_df, on="id", how="left")

    # Drop the surrogate id and expose the name under a clearer label.
    return features.select(
        F.col("name").alias("fighter_name"),
        "pagerank",
        "community",
        "network_size",
    )

Per-Fight Features

def create_fight_graph_features(fights_df, fighter_features):
    """Attach both fighters' graph features to every fight and derive combined ones.

    Args:
        fights_df: DataFrame of fights. It must either contain
            ``fighter1_name`` and ``fighter2_name`` columns, or a ``bout``
            column of the form "A vs B" / "A vs. B" from which both names
            are parsed (the same rule ``build_fighter_graph`` applies).
        fighter_features: DataFrame with columns ``fighter_name``,
            ``pagerank``, ``community`` and ``network_size``.

    Returns:
        ``fights_df`` with the per-fighter columns ``f1_*`` / ``f2_*``
        (``pagerank``, ``community``, ``network``) plus
        ``combined_pagerank``, ``pagerank_diff``, ``combined_network`` and
        ``same_community`` (1/0). Left joins are used, so a fight whose
        fighter has no features keeps nulls in the affected columns.
    """
    # build_fighter_graph parses names from "bout" only on its own local
    # copy, so a raw fights_df may arrive here without the name columns.
    # Apply the same extraction so the joins below have their keys.
    if "fighter1_name" not in fights_df.columns and "bout" in fights_df.columns:
        fights_df = fights_df.withColumn(
            "fighter1_name",
            F.trim(F.regexp_extract(F.col("bout"), r"^(.+?)\s+vs\.?\s+", 1))
        )
        fights_df = fights_df.withColumn(
            "fighter2_name",
            F.trim(F.regexp_extract(F.col("bout"), r"\s+vs\.?\s+(.+)$", 1))
        )

    # Join the feature table once per corner, prefixing its columns f1_/f2_.
    df = fights_df
    for corner in ("1", "2"):
        name_col = f"fighter{corner}_name"
        prefix = f"f{corner}"
        corner_feats = fighter_features.select(
            F.col("fighter_name").alias(name_col),
            F.col("pagerank").alias(f"{prefix}_pagerank"),
            F.col("community").alias(f"{prefix}_community"),
            F.col("network_size").alias(f"{prefix}_network"),
        )
        df = df.join(corner_feats, on=name_col, how="left")

    # Calculate combined features
    df = df.withColumn("combined_pagerank", F.col("f1_pagerank") + F.col("f2_pagerank"))
    df = df.withColumn("pagerank_diff", F.abs(F.col("f1_pagerank") - F.col("f2_pagerank")))
    df = df.withColumn("combined_network", F.col("f1_network") + F.col("f2_network"))
    df = df.withColumn(
        "same_community",
        (F.col("f1_community") == F.col("f2_community")).cast(IntegerType()),
    )

    return df

Per-Event Aggregation

def aggregate_graph_to_events(fight_features):
    """Roll the per-fight graph features up to one row per event.

    Args:
        fight_features: DataFrame with ``event_id`` plus the columns
            ``combined_pagerank``, ``pagerank_diff``, ``combined_network``
            and ``same_community``.

    Returns:
        DataFrame keyed by ``event_id`` with ``avg_combined_pagerank``,
        ``max_combined_pagerank``, ``avg_pagerank_diff``,
        ``avg_network_size`` and ``num_same_community_fights``.
    """
    aggregations = [
        F.avg("combined_pagerank").alias("avg_combined_pagerank"),
        F.max("combined_pagerank").alias("max_combined_pagerank"),
        F.avg("pagerank_diff").alias("avg_pagerank_diff"),
        F.avg("combined_network").alias("avg_network_size"),
        # same_community is 1/0, so its sum is the number of such fights.
        F.sum("same_community").alias("num_same_community_fights"),
    ]

    per_event = fight_features.groupBy("event_id")
    return per_event.agg(*aggregations)

Graph Features Output

| Feature | Description | Impact on Sell-Through |
|---------|-------------|------------------------|
| avg_combined_pagerank | Mean star power | Higher = more draws |
| max_combined_pagerank | Biggest star fight | Strong predictor |
| avg_pagerank_diff | Average mismatch | Lower = competitive |
| avg_network_size | Mean experience | More experience helps |
| num_same_community_fights | Division matchups | Division fights draw fans |

Running Graph Analysis

spark-submit \
    --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
    src/graph/fighter_network.py \
    --data-dir ./data \
    --output-dir ./data

Output Files

data/features/graph_features/
├── _SUCCESS
├── part-00000-*.parquet
└── ...

Example Output

Top 10 fighters by PageRank:
+------------------+------------+
|name              |pagerank    |
+------------------+------------+
|Conor McGregor    |0.0147      |
|Jon Jones         |0.0139      |
|Georges St-Pierre |0.0128      |
|Amanda Nunes      |0.0115      |
|Khabib Nurmagomedov|0.0108     |
|Daniel Cormier    |0.0102      |
|Max Holloway      |0.0095      |
|Israel Adesanya   |0.0091      |
|Dustin Poirier    |0.0088      |
|Charles Oliveira  |0.0084      |
+------------------+------------+