# Spaces: fireworks-ai/catalog-extract — Running — File size: 6,814 Bytes

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Select the non-interactive "Agg" backend so figures are rendered off-screen
# (no GUI window / display is needed to build and return the figures below).
matplotlib.use("Agg")


def load_evaluation_data(data_path) -> "pd.DataFrame | None":
    """
    Load evaluation results from CSV

    Args:
        data_path: Directory containing "evaluation_results.csv". May be a
            ``pathlib.Path`` or anything ``Path()`` accepts (e.g. a ``str``).

    Returns:
        DataFrame parsed from the CSV, or None when the file is not present.
    """
    # Local import keeps this function self-contained; the module itself does
    # not import pathlib.
    from pathlib import Path

    # Coerce so that plain string paths work with the "/" operator too.
    eval_file = Path(data_path) / "evaluation_results.csv"

    # is_file() (rather than exists()) avoids handing a directory to read_csv.
    if eval_file.is_file():
        return pd.read_csv(eval_file)
    return None


def get_model_style(model_name):
    """
    Get color and hatch pattern for a model

    Color scheme (matching is case-sensitive substring matching, checked in
    the order listed; the first match wins):
    - GPT models ("GPT" / "gpt"): Gray (#808080), never hatched
    - Qwen3 ("Qwen3" / "qwen3"): Deep purple (#6B4DB8)
    - Qwen2.5-VL-32B ("Qwen2.5" / "qwen2p5" / "32B"): Light purple (#9B87E8)
    - Qwen2-VL-72B ("Qwen2" / "72B"): Medium blue (#5B7FD8)
    - Anything else: #6B4DC1, never hatched

    For the Qwen families, BASE models are drawn solid and models whose name
    contains "SFT" are drawn with the "///" hatch pattern.

    Args:
        model_name: Model name string to classify.

    Returns:
        tuple: (color, hatch_pattern) where hatch_pattern is "///" or None
    """
    if "GPT" in model_name or "gpt" in model_name:
        return "#808080", None

    # Fine-tuned variants share their family color but get a hatch overlay.
    hatch = "///" if "SFT" in model_name else None

    # Qwen3 must be matched by family name BEFORE the size-hint branches:
    # otherwise a name such as "Qwen3-32B" is captured by the "32B" hint of
    # the Qwen2.5 branch and rendered in the wrong color.
    if "Qwen3" in model_name or "qwen3" in model_name:
        return "#6B4DB8", hatch

    if "Qwen2.5" in model_name or "qwen2p5" in model_name or "32B" in model_name:
        return "#9B87E8", hatch

    # Reached only when "Qwen2.5" did not match, so "Qwen2" here means the
    # plain Qwen2 family.
    if "Qwen2" in model_name or "72B" in model_name:
        return "#5B7FD8", hatch

    return "#6B4DC1", None


def create_accuracy_plot(
    eval_df: pd.DataFrame,
    selected_models: list = None,
    selected_categories: list = None,
):
    """
    Create bar chart of accuracy by category, colored by model

    Reads the "model", "category" and "accuracy" columns of eval_df and draws
    one group of bars per category with one bar per model. If a model has
    several rows for a category the first one is used; if it has none, no bar
    is drawn for that (model, category) pair.

    Args:
        eval_df: DataFrame with evaluation results
        selected_models: List of models to display (None or empty for all)
        selected_categories: List of categories to display (None or empty for all)

    Returns:
        matplotlib figure, or None when eval_df is None or no rows remain
        after filtering. The figure is not closed here.
    """
    if eval_df is None:
        return None

    # Filter data (boolean indexing yields new frames; eval_df is not mutated)
    df_filtered = eval_df
    if selected_models:
        df_filtered = df_filtered[df_filtered["model"].isin(selected_models)]
    if selected_categories:
        df_filtered = df_filtered[df_filtered["category"].isin(selected_categories)]

    # Nothing to plot: bail out before creating a figure. This also avoids a
    # ZeroDivisionError in the bar-width computation below.
    if df_filtered.empty:
        return None

    # Get unique categories and models (in order of first appearance)
    categories = df_filtered["category"].unique()
    models = df_filtered["model"].unique()

    # Create figure
    fig, ax = plt.subplots(figsize=(12, 6))

    # Set up bar positions: each category group spans 0.8 of its slot
    x = range(len(categories))
    width = 0.8 / len(models)

    for i, model in enumerate(models):
        model_data = df_filtered[df_filtered["model"] == model]
        # First value per category; categories this model lacks become NaN,
        # which matplotlib renders as a missing bar instead of raising.
        accuracies = (
            model_data.drop_duplicates("category")
            .set_index("category")["accuracy"]
            .reindex(categories)
            .tolist()
        )

        color, hatch = get_model_style(model)

        # Center the group of bars on the category tick
        offset = (i - len(models) / 2) * width + width / 2
        ax.bar(
            [xi + offset for xi in x],
            accuracies,
            width,
            label=model,
            color=color,
            hatch=hatch,
            alpha=0.8,
            edgecolor="white",
            linewidth=1.2,
        )

    # Customize plot
    ax.set_xlabel("Category", fontsize=12, fontweight="bold")
    ax.set_ylabel("Accuracy", fontsize=12, fontweight="bold")
    ax.set_title("Model Accuracy by Category", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels(categories, rotation=0)
    ax.set_ylim(0, 1.0)
    ax.legend(loc="lower right", framealpha=0.9)
    ax.grid(axis="y", alpha=0.3, linestyle="--")

    plt.tight_layout()
    return fig


def _draw_metric_panel(ax, df_filtered, categories, models, metric, label):
    """Draw grouped bars of one metric column (one bar per model) on ax and style the axes."""
    # Set up bar positions: each category group spans 0.8 of its slot
    x = range(len(categories))
    width = 0.8 / len(models)

    for i, model in enumerate(models):
        model_data = df_filtered[df_filtered["model"] == model]
        # First value per category; categories this model lacks become NaN,
        # which matplotlib renders as a missing bar instead of raising.
        values = (
            model_data.drop_duplicates("category")
            .set_index("category")[metric]
            .reindex(categories)
            .tolist()
        )

        # Get color and pattern for this model
        color, hatch = get_model_style(model)

        # Center the group of bars on the category tick
        offset = (i - len(models) / 2) * width + width / 2
        ax.bar(
            [xi + offset for xi in x],
            values,
            width,
            label=model,
            color=color,
            hatch=hatch,
            alpha=0.8,
            edgecolor="white",
            linewidth=1.2,
        )

    ax.set_xlabel("Category", fontsize=12, fontweight="bold")
    ax.set_ylabel(label, fontsize=12, fontweight="bold")
    ax.set_title(f"Model {label} by Category", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels(categories, rotation=0)
    ax.set_ylim(0, 1.0)
    ax.legend(loc="lower right", framealpha=0.9)
    ax.grid(axis="y", alpha=0.3, linestyle="--")


def create_precision_recall_plot(
    eval_df: pd.DataFrame,
    selected_models: list = None,
    selected_categories: list = None,
):
    """
    Create subplot with precision and recall by category, colored by model

    Reads the "model", "category", "precision" and "recall" columns of
    eval_df. The left panel shows precision and the right panel recall, each
    as one group of bars per category with one bar per model. If a model has
    several rows for a category the first one is used; if it has none, no bar
    is drawn for that (model, category) pair.

    Args:
        eval_df: DataFrame with evaluation results
        selected_models: List of models to display (None or empty for all)
        selected_categories: List of categories to display (None or empty for all)

    Returns:
        matplotlib figure, or None when eval_df is None or no rows remain
        after filtering. The figure is not closed here.
    """
    if eval_df is None:
        return None

    # Filter data (boolean indexing yields new frames; eval_df is not mutated)
    df_filtered = eval_df
    if selected_models:
        df_filtered = df_filtered[df_filtered["model"].isin(selected_models)]
    if selected_categories:
        df_filtered = df_filtered[df_filtered["category"].isin(selected_categories)]

    # Nothing to plot: bail out before creating a figure. This also avoids a
    # ZeroDivisionError in the bar-width computation.
    if df_filtered.empty:
        return None

    # Get unique categories and models (in order of first appearance)
    categories = df_filtered["category"].unique()
    models = df_filtered["model"].unique()

    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Both panels share layout and styling; only the metric column differs.
    _draw_metric_panel(ax1, df_filtered, categories, models, "precision", "Precision")
    _draw_metric_panel(ax2, df_filtered, categories, models, "recall", "Recall")

    plt.tight_layout()
    return fig