from pathlib import Path
from typing import Optional

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Force the non-interactive Agg backend so figures can be rendered without a display.
matplotlib.use("Agg")


def load_evaluation_data(data_path) -> Optional[pd.DataFrame]:
    """
    Load evaluation results from CSV

    Args:
        data_path: Directory (``str`` or ``pathlib.Path``) expected to contain
            ``evaluation_results.csv``

    Returns:
        DataFrame with the CSV contents, or None if the file does not exist
    """
    # Coerce to Path so plain string directories work with the "/" operator.
    eval_file = Path(data_path) / "evaluation_results.csv"
    if eval_file.exists():
        return pd.read_csv(eval_file)
    return None


def get_model_style(model_name):
    """
    Get color and hatch pattern for a model

    Color scheme:
    - GPT models: Gray (#808080), never hatched
    - Qwen3: Deep purple (#6B4DB8) - BASE solid, SFT with pattern
    - Qwen2.5-VL-32B: Light purple (#9B87E8) - BASE solid, SFT with pattern
    - Qwen2-VL-72B: Medium blue (#5B7FD8) - BASE solid, SFT with pattern
    - Anything else: fallback purple (#6B4DC1), never hatched

    Args:
        model_name: Model identifier string

    Returns:
        tuple: (color, hatch_pattern)
    """
    if "GPT" in model_name or "gpt" in model_name:
        return "#808080", None

    # SFT variants of every Qwen family share the same hatch pattern.
    hatch = "///" if "SFT" in model_name else None

    # Match the explicit Qwen3 name BEFORE the size-based fallbacks below;
    # otherwise a Qwen3 32B/72B model is claimed by the "32B"/"72B" tests of
    # the older families and gets the wrong color.
    if "Qwen3" in model_name or "qwen3" in model_name:
        return "#6B4DB8", hatch

    # Must precede the Qwen2 test: "Qwen2" is a substring of "Qwen2.5".
    if "Qwen2.5" in model_name or "qwen2p5" in model_name or "32B" in model_name:
        return "#9B87E8", hatch

    if "Qwen2" in model_name or "72B" in model_name:
        return "#5B7FD8", hatch

    return "#6B4DC1", None


def _filter_results(eval_df, selected_models, selected_categories):
    """Return the rows of eval_df matching the optional model/category filters."""
    filtered = eval_df
    # Empty or None selections mean "no filtering" for that dimension.
    if selected_models:
        filtered = filtered[filtered["model"].isin(selected_models)]
    if selected_categories:
        filtered = filtered[filtered["category"].isin(selected_categories)]
    return filtered


def _draw_metric_bars(ax, df, metric, label):
    """Draw grouped bars of df[metric] per category (one bar per model) on ax."""
    categories = list(df["category"].unique())
    models = list(df["model"].unique())

    positions = list(range(len(categories)))
    # Callers guarantee df is non-empty, so there is at least one model.
    width = 0.8 / len(models)

    # First recorded value for each (model, category) pair; later duplicates
    # are ignored.
    first_value = {}
    for model, category, value in zip(df["model"], df["category"], df[metric]):
        first_value.setdefault((model, category), value)

    for i, model in enumerate(models):
        # Center the group of bars on each category tick.
        offset = (i - len(models) / 2) * width + width / 2

        # A model may lack results for some categories: skip those bars
        # instead of failing on a missing row.
        bar_x = []
        bar_height = []
        for pos, category in zip(positions, categories):
            key = (model, category)
            if key in first_value:
                bar_x.append(pos + offset)
                bar_height.append(first_value[key])

        color, hatch = get_model_style(model)
        ax.bar(
            bar_x,
            bar_height,
            width,
            label=model,
            color=color,
            hatch=hatch,
            alpha=0.8,
            edgecolor="white",
            linewidth=1.2,
        )

    # Customize plot
    ax.set_xlabel("Category", fontsize=12, fontweight="bold")
    ax.set_ylabel(label, fontsize=12, fontweight="bold")
    ax.set_title(f"Model {label} by Category", fontsize=14, fontweight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels(categories, rotation=0)
    ax.set_ylim(0, 1.0)
    ax.legend(loc="lower right", framealpha=0.9)
    ax.grid(axis="y", alpha=0.3, linestyle="--")


def create_accuracy_plot(
    eval_df: pd.DataFrame,
    selected_models: Optional[list] = None,
    selected_categories: Optional[list] = None,
):
    """
    Create bar chart of accuracy by category, colored by model

    Args:
        eval_df: DataFrame with evaluation results; reads the "model",
            "category" and "accuracy" columns
        selected_models: List of models to display (None for all)
        selected_categories: List of categories to display (None for all)

    Returns:
        matplotlib figure, or None when eval_df is None or the filters
        leave no rows to plot
    """
    if eval_df is None:
        return None

    filtered = _filter_results(eval_df, selected_models, selected_categories)
    # Nothing to draw; also avoids dividing the bar width by zero models.
    if filtered.empty:
        return None

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        _draw_metric_bars(ax, filtered, "accuracy", "Accuracy")
        fig.tight_layout()
    except Exception:
        # Do not leave a half-built figure registered in pyplot.
        plt.close(fig)
        raise
    return fig


def create_precision_recall_plot(
    eval_df: pd.DataFrame,
    selected_models: Optional[list] = None,
    selected_categories: Optional[list] = None,
):
    """
    Create subplot with precision and recall by category, colored by model

    Args:
        eval_df: DataFrame with evaluation results; reads the "model",
            "category", "precision" and "recall" columns
        selected_models: List of models to display (None for all)
        selected_categories: List of categories to display (None for all)

    Returns:
        matplotlib figure with precision on the left axes and recall on the
        right, or None when eval_df is None or the filters leave no rows
        to plot
    """
    if eval_df is None:
        return None

    filtered = _filter_results(eval_df, selected_models, selected_categories)
    # Nothing to draw; also avoids dividing the bar width by zero models.
    if filtered.empty:
        return None

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    try:
        _draw_metric_bars(ax1, filtered, "precision", "Precision")
        _draw_metric_bars(ax2, filtered, "recall", "Recall")
        fig.tight_layout()
    except Exception:
        # Do not leave a half-built figure registered in pyplot.
        plt.close(fig)
        raise
    return fig