UFC Sell-Through Project - HPC Deployment

Overview

The project runs on Penn State's ICDS Roar cluster using Anaconda (Python 3.10), Spark 3.4.1, and SLURM batch jobs.

Connecting to Roar

# SSH to submit node
ssh YOUR_USERNAME@submit.hpc.psu.edu

# Transfer files
scp -r UFC_SellThrough_Project/ YOUR_USERNAME@submit.hpc.psu.edu:~/
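For larger data directories, rsync is a reasonable alternative to scp because it can resume interrupted transfers and skips files that are already up to date; the paths below simply mirror the scp example.

# Resumable transfer that only copies new or changed files
rsync -avz --partial --progress \
    UFC_SellThrough_Project/ \
    YOUR_USERNAME@submit.hpc.psu.edu:~/UFC_SellThrough_Project/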

Environment Setup

One-Time Setup

# Load modules
module load anaconda3
module load spark/3.4.1

# Create conda environment (pytrends and textblob install from conda-forge)
conda create -n ds410 -c conda-forge python=3.10 \
    pyspark pandas numpy \
    beautifulsoup4 requests lxml \
    pytrends textblob \
    matplotlib seaborn -y

# Activate
conda activate ds410

# Install additional packages
pip install graphframes
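A quick sanity check after the one-time setup confirms that the key imports resolve inside the ds410 environment (a minimal sketch; the package names match the install commands above):

# Verify the core Python dependencies import cleanly
python -c "import pyspark, pandas, numpy, bs4, requests, lxml, pytrends, textblob, graphframes; print('ds410 environment OK')"
python -c "import pyspark; print('PySpark', pyspark.__version__)"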

Session Setup

# Every time you log in
module load anaconda3
module load spark/3.4.1
conda activate ds410
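To avoid retyping these commands, they can be collected in a small helper script (env.sh is a hypothetical name, not part of the repo) and sourced at the start of each session:

# env.sh -- one-shot session setup
module load anaconda3
module load spark/3.4.1
conda activate ds410

Run it with `source env.sh` so the module and conda changes apply to the current shell.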

SLURM Job Scripts

Full Pipeline (run_pipeline.slurm)

#!/bin/bash
#SBATCH --job-name=ufc_pipeline
#SBATCH --output=logs/pipeline_%j.out
#SBATCH --error=logs/pipeline_%j.err
#SBATCH --time=04:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=32GB
#SBATCH --partition=open

# Load modules
module load anaconda3
module load spark/3.4.1
conda activate ds410

# Set up directories
DATA_DIR="./data"
mkdir -p $DATA_DIR/raw $DATA_DIR/processed $DATA_DIR/features $DATA_DIR/models $DATA_DIR/external
mkdir -p logs

echo "======================================"
echo "UFC Sell-Through Pipeline - $(date)"
echo "======================================"

# Step 1: Download data
echo "Step 1: Downloading data..."
python src/etl/ingest.py --data-dir $DATA_DIR

# Step 2: Collect external data
echo "Step 2: Collecting external data..."
python src/etl/scrape_betting_odds.py --data-dir $DATA_DIR --output-dir $DATA_DIR
python src/etl/fetch_google_trends.py --data-dir $DATA_DIR --output-dir $DATA_DIR
python src/etl/scrape_reddit_sentiment.py --data-dir $DATA_DIR --output-dir $DATA_DIR

# Step 3: Run Spark ETL
echo "Step 3: Running Spark ETL..."
spark-submit \
    --driver-memory 8g \
    --executor-memory 8g \
    src/etl/spark_etl.py \
    --data-dir $DATA_DIR \
    --output-dir $DATA_DIR

# Step 4: Graph analysis
echo "Step 4: Running graph analysis..."
spark-submit \
    --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
    --driver-memory 8g \
    src/graph/fighter_network.py \
    --data-dir $DATA_DIR \
    --output-dir $DATA_DIR

# Step 5: Feature engineering
echo "Step 5: Feature engineering..."
spark-submit \
    --driver-memory 8g \
    src/features/feature_engineering.py \
    --data-dir $DATA_DIR \
    --output-dir $DATA_DIR

# Step 6: Train model
echo "Step 6: Training model..."
spark-submit \
    --driver-memory 8g \
    src/models/train_improved.py \
    --data-dir $DATA_DIR \
    --output-dir $DATA_DIR \
    --test-year 2024

echo "======================================"
echo "Pipeline Complete! $(date)"
echo "======================================"

# Print results
echo "Model metrics:"
cat $DATA_DIR/models/metrics_improved.json

echo "Data sizes:"
du -sh $DATA_DIR/raw $DATA_DIR/external $DATA_DIR/processed $DATA_DIR/features $DATA_DIR/models
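As written, the pipeline keeps going even if an early step fails, which can waste the remaining walltime on steps that depend on missing outputs. One option (a sketch, not part of the script above) is to enable strict error handling near the top of the script so the job aborts at the first failure:

# Add after the #SBATCH directives: abort on any failed command,
# undefined variable, or failure inside a pipe
set -euo pipefail

# Optional: report which line failed before exiting
trap 'echo "Pipeline failed at line $LINENO" >&2' ERR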

Visualization Job (run_visualization.slurm)

#!/bin/bash
#SBATCH --job-name=ufc_viz
#SBATCH --output=logs/viz_%j.out
#SBATCH --error=logs/viz_%j.err
#SBATCH --time=00:30:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=8GB
#SBATCH --partition=open

module load anaconda3
conda activate ds410

python src/visualization/create_plots.py \
    --data-dir ./data \
    --output-dir ./visualizations

echo "Visualizations complete!"
ls -la ./visualizations/

Submitting Jobs

# Submit full pipeline
sbatch scripts/run_pipeline.slurm

# Check job status
squeue -u $USER

# View job output
tail -f logs/pipeline_*.out

# Cancel job
scancel JOB_ID
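Because the visualization job reads the pipeline's outputs, the two can be chained with a SLURM dependency so the second job only starts after the first completes successfully:

# Submit the pipeline, capture its job ID, then queue the visualization job
PIPE_ID=$(sbatch --parsable scripts/run_pipeline.slurm)
sbatch --dependency=afterok:$PIPE_ID scripts/run_visualization.slurm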

Resource Allocation

Step                   CPUs   Memory   Time
Data Download          1      4 GB     10 min
External Data          2      8 GB     30 min
Spark ETL              8      16 GB    30 min
Graph Analysis         8      16 GB    20 min
Feature Engineering    8      16 GB    15 min
Model Training         8      16 GB    30 min
Total Pipeline         8      32 GB    ~2.5 hours
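These figures are estimates; actual usage for a finished job can be checked with sacct and used to tighten (or raise) the requests in the SLURM scripts:

# Peak memory, elapsed time, and CPU time for a completed job
sacct -j JOB_ID --format=JobID,JobName,Elapsed,TotalCPU,MaxRSS,State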

SLURM Partitions

Partition   Max Time   Priority   Use Case
open        48 hours   Normal     Development, testing
burst       4 hours    High       Quick jobs
sla-prio    7 days     High       Production (requires allocation)
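Partition limits occasionally change; the current settings can be confirmed on the cluster itself:

# Show time limit, availability, and node count per partition
sinfo --partition=open,burst --format="%P %l %a %D"
scontrol show partition open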

Spark Configuration for Cluster

Local Mode (Testing)

spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

YARN Mode (Cluster)

spark-submit \
    --master yarn \
    --deploy-mode client \
    --driver-memory 8g \
    --executor-memory 8g \
    --num-executors 4 \
    --executor-cores 2 \
    src/etl/spark_etl.py
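If YARN is not available on the allocation, a common alternative is to keep spark-submit in local mode but size it to match the SLURM request. The sketch below is an assumption tied to the 8-CPU / 32 GB pipeline job: SLURM_CPUS_PER_TASK is set by SLURM inside the job, and the 24g driver heap leaves headroom under the 32 GB memory limit.

# Single-node Spark sized to the SLURM allocation (see run_pipeline.slurm)
spark-submit \
    --master "local[$SLURM_CPUS_PER_TASK]" \
    --driver-memory 24g \
    --conf spark.sql.shuffle.partitions=16 \
    src/etl/spark_etl.py \
    --data-dir ./data \
    --output-dir ./data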

File Storage

Scratch Storage

# Fast temporary storage (purged after 30 days)
/storage/scratch/$USER/

Work Storage

# Persistent storage
/storage/work/$USER/
└── ufc_project/
    ├── data/
    │   ├── raw/
    │   ├── processed/
    │   ├── features/
    │   ├── external/
    │   └── models/
    ├── src/
    ├── scripts/
    └── logs/
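One common pattern is to keep the project in work storage but run heavy intermediate I/O in scratch, syncing results back when the job finishes (a sketch; the paths assume the layout shown above):

# Stage the project into scratch for fast I/O during a job
rsync -a /storage/work/$USER/ufc_project/ /storage/scratch/$USER/ufc_project/
cd /storage/scratch/$USER/ufc_project

# ... run the pipeline here ...

# Copy results back to persistent work storage before the 30-day purge
rsync -a data/models/ /storage/work/$USER/ufc_project/data/models/
rsync -a logs/ /storage/work/$USER/ufc_project/logs/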

Troubleshooting

Common Issues

Job pending too long:

# Check queue status
squeue --partition=open

# Try different partition
#SBATCH --partition=burst

Out of memory:

# Increase memory
#SBATCH --mem=64GB

# Or reduce Spark memory
--driver-memory 4g
--executor-memory 4g

Python module not found (ImportError):

# Ensure conda environment is active
conda activate ds410

# Check installed packages
pip list | grep pyspark

Spark timeout:

# Increase timeout in Spark config
--conf spark.network.timeout=600s
--conf spark.executor.heartbeatInterval=60s

Collecting Results

# Download results to local machine
scp -r YOUR_USERNAME@submit.hpc.psu.edu:~/ufc_project/data/models ./results/
scp -r YOUR_USERNAME@submit.hpc.psu.edu:~/ufc_project/visualizations ./results/

Multi-Year Validation

# Test on multiple years
for year in 2022 2023 2024; do
    echo "Testing on $year..."
    spark-submit src/models/train_improved.py \
        --data-dir ./data \
        --test-year $year \
        --no-cv
done
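If train_improved.py writes a single metrics file per run (an assumption based on the metrics_improved.json path used earlier), snapshotting it inside the loop keeps each year's results from being overwritten:

# Hypothetical variant of the loop that preserves per-year metrics
for year in 2022 2023 2024; do
    echo "Testing on $year..."
    spark-submit src/models/train_improved.py \
        --data-dir ./data \
        --test-year $year \
        --no-cv
    # Assumes each run rewrites data/models/metrics_improved.json
    cp ./data/models/metrics_improved.json ./data/models/metrics_$year.json
done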