"""Streamlit app that compares Twinkle Eval result files (.json / .jsonl).

Each uploaded file is parsed into per-category accuracy rows, and the rows of
all files are drawn as grouped bar charts (one page of categories per chart)
with a pivot table and a CSV download for every page.
"""

import json
import io
from typing import List, Dict, Optional, Tuple
from pathlib import PurePosixPath

import pandas as pd
import numpy as np
import altair as alt
import streamlit as st

st.set_page_config(page_title="Twinkle Eval Analyzer", page_icon=":star2:", layout="wide")
st.title("✨ Twinkle Eval Analyzer (.json / .jsonl)")


# ----------------- Helpers -----------------
def _decode_bytes_to_text(b: bytes) -> str:
    """Decode uploaded bytes to text, guessing among UTF-8, UTF-16 and Big5.

    A byte-order mark, or the NUL pattern of the first two bytes (JSON text
    always starts with an ASCII character), selects the first candidate.
    Otherwise UTF-8 is tried, then Big5/CP950, then the UTF-16 variants.
    If every candidate fails, the bytes are decoded as UTF-8 with invalid
    sequences dropped.
    """
    if b.startswith(b"\xef\xbb\xbf"):
        # "utf-8-sig" strips the BOM; plain "utf-8" would keep U+FEFF in the
        # text, which json.loads rejects.
        preferred: Tuple[str, ...] = ("utf-8-sig",)
    elif b.startswith((b"\xff\xfe", b"\xfe\xff")):
        preferred = ("utf-16",)
    elif b[1:2] == b"\x00" and b[:1] != b"\x00":
        # ASCII first character followed by NUL: BOM-less little-endian UTF-16.
        preferred = ("utf-16le",)
    elif b[:1] == b"\x00" and b[1:2] not in (b"", b"\x00"):
        # NUL followed by an ASCII character: BOM-less big-endian UTF-16.
        preferred = ("utf-16be",)
    else:
        preferred = ()

    # Big5/CP950 come before the BOM-less UTF-16 fallbacks because "utf-16"
    # accepts almost any even-length byte string and would otherwise turn
    # Big5 text into garbage.
    candidates = preferred + ("utf-8", "big5", "cp950", "utf-16", "utf-16le", "utf-16be")
    for enc in candidates:
        try:
            return b.decode(enc)
        except UnicodeDecodeError:
            continue
    return b.decode("utf-8", errors="ignore")


def _to_float(value) -> Optional[float]:
    """Return ``float(value)``, or ``None`` when the value is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def read_twinkle_doc(file) -> Dict:
    """Read one Twinkle Eval document from a file-like object.

    The whole content is parsed as JSON first. If that fails, the content is
    treated as JSON Lines and the first line that parses is used.

    Args:
        file: Object with a ``read()`` method returning ``bytes`` or ``str``.

    Returns:
        The parsed document as a dict.

    Raises:
        ValueError: If no JSON object could be parsed, or the object lacks
            the "timestamp", "config" or "dataset_results" key.
    """
    # Rewind first so a buffer that was already consumed (for example on a
    # previous script run) is read from the beginning again.
    seek = getattr(file, "seek", None)
    if callable(seek):
        try:
            seek(0)
        except (OSError, ValueError):
            pass  # Non-seekable stream: read from the current position.

    raw = file.read()
    text = _decode_bytes_to_text(raw) if isinstance(raw, bytes) else raw
    text = text.strip()

    # Stays None when nothing parses, so the type check below reports the
    # problem instead of an unbound-variable error.
    obj = None
    try:
        obj = json.loads(text)
    except ValueError:
        for line in text.splitlines():
            # Tolerate a trailing comma after each record.
            line = line.strip().rstrip(",")
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                continue
            break

    if not isinstance(obj, dict):
        raise ValueError("檔案不是有效的 Twinkle Eval JSON 物件。")
    if "timestamp" not in obj or "config" not in obj or "dataset_results" not in obj:
        raise ValueError("缺少必要欄位")
    return obj


def extract_records(doc: Dict) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """Flatten a Twinkle Eval document into per-file accuracy rows.

    Args:
        doc: Parsed document as returned by ``read_twinkle_doc``.

    Returns:
        A pair ``(frame, averages)``. ``frame`` has one row per result entry
        with the columns "dataset", "category", "file", "accuracy_mean" and
        "source_label". ``averages`` maps each dataset name to its
        "average_accuracy" value, or, when that is absent, to the mean of
        the dataset's numeric "accuracy_mean" values.
    """
    config = doc.get("config")
    model_cfg = config.get("model") if isinstance(config, dict) else None
    model = model_cfg.get("name", "") if isinstance(model_cfg, dict) else ""
    timestamp = doc.get("timestamp", "")
    source_label = f"{model} @ {timestamp}"

    dataset_results = doc.get("dataset_results")
    if not isinstance(dataset_results, dict):
        dataset_results = {}

    rows = []
    avg_map = {}
    for ds_path, ds_payload in dataset_results.items():
        ds_name = ds_path.split("datasets/")[-1].strip("/") if ds_path.startswith("datasets/") else ds_path

        payload = ds_payload if isinstance(ds_payload, dict) else {}
        avg_meta = payload.get("average_accuracy")
        results = payload.get("results", [])
        if not isinstance(results, list):
            results = []

        # Numeric accuracies of every well-formed entry, including entries
        # without a "file"; used only for the fallback average below.
        accuracies = []
        for item in results:
            if not isinstance(item, dict):
                continue
            accuracy = _to_float(item.get("accuracy_mean"))
            if accuracy is None:
                continue
            accuracies.append(accuracy)

            file_path = item.get("file")
            if file_path is None:
                continue
            fname = PurePosixPath(str(file_path)).name
            rows.append({
                "dataset": ds_name,
                # The category is the file name without its last extension.
                "category": fname.rsplit(".", 1)[0],
                "file": fname,
                "accuracy_mean": accuracy,
                "source_label": source_label,
            })

        if avg_meta is None and accuracies:
            avg_meta = float(np.mean(accuracies))
        if avg_meta is not None:
            avg_map[ds_name] = avg_meta

    return pd.DataFrame(rows), avg_map


def load_all(files) -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
    """Parse every uploaded file and combine the results.

    A file that cannot be read or parsed is reported with ``st.error`` and
    skipped; the remaining files are still processed.

    Args:
        files: Iterable of file-like objects, or ``None``.

    Returns:
        A pair ``(frame, meta)``. ``frame`` is the concatenation of all
        per-file rows (an empty frame with the expected columns when nothing
        was loaded). ``meta`` maps each source label to that file's
        per-dataset averages.
    """
    frames = []
    meta = {}
    for f in files or []:
        try:
            doc = read_twinkle_doc(f)
            # Kept inside the try so a malformed document is reported as an
            # error message instead of aborting the whole app.
            df, avg_map = extract_records(doc)
        except Exception as e:
            st.error(f"❌ 無法讀取 {getattr(f, 'name', '檔案')}:{e}")
            continue
        if not df.empty:
            frames.append(df)
            src = df["source_label"].iloc[0]
            meta[src] = avg_map

    if not frames:
        return pd.DataFrame(columns=["dataset", "category", "file", "accuracy_mean", "source_label"]), {}
    return pd.concat(frames, ignore_index=True), meta


# ----------------- Sidebar -----------------
# NOTE(review): the original indentation of this script was lost; the extent
# of the sidebar block below is reconstructed -- confirm against the intended
# layout.
with st.sidebar:
    files = st.file_uploader("選擇 Twinkle Eval 檔案", type=["json", "jsonl"], accept_multiple_files=True)
    df_all, meta_all = load_all(files)
    normalize_0_100 = st.checkbox("以 0–100 顯示", value=False)
    page_size = st.selectbox("每張圖顯示幾個類別", [10, 20, 30, 50, 100], index=1)
    sort_mode = st.selectbox("排序方式", ["依整體平均由高到低", "依整體平均由低到高", "依字母排序"])

if df_all.empty:
    st.info("請上傳 Twinkle Eval 檔案")
    st.stop()

all_datasets = sorted(df_all["dataset"].unique().tolist())
selected_dataset = st.selectbox("選擇資料集", options=all_datasets)

work = df_all[df_all["dataset"] == selected_dataset].copy()
metric_plot = "accuracy_mean" + (" (x100)" if normalize_0_100 else "")
work[metric_plot] = work["accuracy_mean"] * (100.0 if normalize_0_100 else 1.0)

# Category order: by mean score across all sources, or alphabetical.
order_df = work.groupby("category")[metric_plot].mean().reset_index()
if sort_mode == "依整體平均由高到低":
    order_df = order_df.sort_values(metric_plot, ascending=False)
elif sort_mode == "依整體平均由低到高":
    order_df = order_df.sort_values(metric_plot, ascending=True)
else:
    order_df = order_df.sort_values("category", ascending=True)

cat_order = order_df["category"].tolist()
# An ordered categorical makes the pivot table rows follow the chosen order.
work["category"] = pd.Categorical(work["category"], categories=cat_order, ordered=True)

n = len(cat_order)
pages = int(np.ceil(n / page_size))

# Dataset names may contain "/" (nested paths), which is not usable inside a
# download file name.
safe_dataset = str(selected_dataset).replace("/", "_")

for p in range(pages):
    start, end = p * page_size, min((p + 1) * page_size, n)
    subset_cats = cat_order[start:end]
    sub = work[work["category"].isin(subset_cats)]

    st.subheader(f"📊 {selected_dataset}|類別 {start+1}-{end} / {n}")

    base = alt.Chart(sub).encode(
        x=alt.X("category:N", sort=subset_cats),
        y=alt.Y(f"{metric_plot}:Q"),
        color=alt.Color("source_label:N"),
        tooltip=["source_label", "file", alt.Tooltip(metric_plot, format=".3f")],
    )
    # xOffset places the bars of different sources side by side.
    bars = base.mark_bar().encode(xOffset="source_label")
    st.altair_chart(bars.properties(height=420), use_container_width=True)

    # observed=True keeps only this page's categories; the previous implicit
    # default produced the same table but is deprecated for categorical keys.
    pivot = sub.pivot_table(
        index="category",
        columns="source_label",
        values=metric_plot,
        observed=True,
    )
    st.dataframe(pivot, use_container_width=True)

    st.download_button(
        label=f"下載此頁 CSV ({start+1}-{end})",
        data=pivot.reset_index().to_csv(index=False).encode("utf-8"),
        file_name=f"twinkle_{safe_dataset}_{start+1}_{end}.csv",
        mime="text/csv",
    )