m00916919 committed on
Commit 8d50beb · 1 Parent(s): e321b5c

launch leaderboard

Files changed (6)
  1. .gitignore +1 -0
  2. Dockerfile +36 -0
  3. benchmark.json +202 -0
  4. index.html +598 -0
  5. requirements.txt +13 -0
  6. server.py +299 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,36 @@
+ # --- Base image ---
+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=on
+
+ # OS deps (git is useful for HF repos; curl is needed by the HEALTHCHECK)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git curl && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ # Python deps
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --upgrade pip && pip install -r requirements.txt
+
+ # App source
+ COPY --chown=user . /app
+
+ EXPOSE 7860
+
+ # Healthcheck pings the FastAPI health endpoint
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=20s --retries=3 \
+     CMD curl -fsS http://localhost:7860/api/health || exit 1
+
+ # Production server (Gunicorn + Uvicorn worker); a plain Uvicorn alternative is kept commented out for local runs
+ # CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
+
+ CMD ["gunicorn", "-k", "uvicorn.workers.UvicornWorker", "-w", "2", "-b", "0.0.0.0:7860", "server:app"]
benchmark.json ADDED
@@ -0,0 +1,202 @@
+ [
+   {
+     "provider": "OpenAI",
+     "name": "GPT-OSS-120B",
+     "repo": "openai/gpt-oss-120b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 78.51, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 60.40, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 44.70, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen3-32B",
+     "repo": "qwen/qwen3-32b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 69.51, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 33.77, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "OpenAI",
+     "name": "GPT-OSS-20B",
+     "repo": "openai/gpt-oss-20b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 75.79, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 53.80, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 40.10, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "DeepSeek",
+     "name": "R1-Distill-Llama-70B",
+     "repo": "deepseek/r1-distill-llama-70b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 53.21, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 29.42, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "ByteDance",
+     "name": "Seed-OSS-36B",
+     "repo": "bytedance/seed-oss-36b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 75.67, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 56.05, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 57.00, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "QwQ-32B",
+     "repo": "qwen/qwq-32b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 33.62, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-72B",
+     "repo": "qwen/qwen2.5-72b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 76.50, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Meta",
+     "name": "Llama-3.3-70B-Instruct",
+     "repo": "meta/llama-3.3-70b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 74.98, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 36.23, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "OpenAI",
+     "name": "GPT-4",
+     "repo": "openai/gpt-4",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 74.91, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen3-8B",
+     "repo": "qwen/qwen3-8b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 73.21, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-7B-Instruct",
+     "repo": "qwen/qwen2.5-7b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 69.31, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 12.05, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Meta",
+     "name": "Llama-3.1-8B-Instruct",
+     "repo": "meta/llama-3.1-8b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 68.03, "energy_consumed": "", "co2_consumed": ""},
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 13.56, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "OpenAI",
+     "name": "GPT-3.5",
+     "repo": "openai/gpt-3.5",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 67.29, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Mistral",
+     "name": "Mixtral",
+     "repo": "mistral/mixtral",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 67.74, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Mistral",
+     "name": "Mistral-7B",
+     "repo": "mistral/mistral-7b",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleQna", "metric_type": "raw", "score": 47.07, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Microsoft",
+     "name": "Phi-4-Reasoning+",
+     "repo": "microsoft/phi-4-reasoning+",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 53.56, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen3-4B-Instruct",
+     "repo": "qwen/qwen3-4b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 45.62, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-Math-72B-Instruct",
+     "repo": "qwen/qwen2.5-math-72b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 39.99, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-Math-7B-Instruct",
+     "repo": "qwen/qwen2.5-math-7b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleMath", "metric_type": "raw", "score": 22.38, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-32B-Instruct",
+     "repo": "qwen/qwen2.5-32b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 18.85, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   },
+   {
+     "provider": "Qwen",
+     "name": "Qwen2.5-1.5B-Instruct",
+     "repo": "qwen/qwen2.5-1.5b-instruct",
+     "updated_at": "2025-09-16T00:00:00Z",
+     "scores": [
+       {"dataset_name": "TeleLogs", "metric_type": "raw", "score": 11.25, "energy_consumed": "", "co2_consumed": ""}
+     ]
+   }
+ ]
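Each benchmark.json entry follows the ModelResult/Score shape defined in server.py below, with `scores` holding one object per dataset. A minimal Python sketch of reading the file and printing each model's mean score over the datasets it was evaluated on (it assumes benchmark.json sits in the working directory):

# benchmark_summary.py -- minimal sketch: summarize benchmark.json per model.
# Assumes benchmark.json is in the working directory and uses the fields shown above.
import json

with open("benchmark.json", "r", encoding="utf-8") as f:
    models = json.load(f)  # list of {"provider", "name", "repo", "updated_at", "scores": [...]}

for m in models:
    numeric = [s["score"] for s in m["scores"] if isinstance(s["score"], (int, float))]
    mean = sum(numeric) / len(numeric) if numeric else float("nan")
    covered = ", ".join(s["dataset_name"] for s in m["scores"])
    print(f'{m["provider"]:<10} {m["name"]:<28} mean={mean:6.2f}  ({covered})')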
index.html ADDED
@@ -0,0 +1,598 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+ <title>GSMA Open-Telco LLM Benchmarks</title>
+
+ <!-- Tailwind + Chart.js -->
+ <script src="https://cdn.tailwindcss.com"></script>
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+
+ <!-- Fonts -->
+ <link rel="preconnect" href="https://fonts.googleapis.com">
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
+
+ <style>
+ :root{
+ color-scheme: light dark;
+ --text: #0f172a;
+ --bg: #f8fafc;
+ --card-bg: rgba(255,255,255,.75);
+ --border: #e2e8f0;
+ --header-bg: rgba(241,245,249,.85);
+ --sticky-bg: rgba(248,250,252,.92);
+ --chip-bg:#f1f5f9;
+ }
+ .dark{
+ --text: #e2e8f0;
+ --bg: #020617;
+ --card-bg: rgba(2,6,23,.6);
+ --border:#334155;
+ --header-bg: rgba(30,41,59,.75);
+ --sticky-bg: rgba(2,6,23,.92);
+ --chip-bg: rgba(15,23,42,.6);
+ }
+ html { font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, "Helvetica Neue", Arial, "Noto Sans"; }
+ body { color: var(--text); background: var(--bg); }
+
+ .card{ border-radius:1rem; box-shadow:0 10px 25px rgba(2,6,23,.08); background:var(--card-bg); backdrop-filter: blur(8px); border:1px solid var(--border); }
+ .btn{ display:inline-flex; align-items:center; justify-content:center; gap:.5rem; border-radius:.8rem; padding:.625rem 1rem; font-weight:700; transition: transform .05s ease; }
+ .btn:active{ transform: scale(.98); }
+ .btn-primary{ background:#4f46e5; color:#fff; } .btn-primary:hover{ background:#6366f1; }
+ .btn-ghost{ background:transparent; border:1px solid var(--border); } .btn-ghost:hover{ background:var(--chip-bg); }
+ .btn-outline{ border:1px solid #4f46e5; color:#4338ca; } .dark .btn-outline{ color:#a5b4fc; }
+ .btn-outline:hover{ background:#eef2ff; } .dark .btn-outline:hover{ background: rgba(30,27,75,.5); }
+ .input{ width:100%; border:1px solid var(--border); border-radius:.8rem; padding:.6rem .8rem; background:#fff; color:#0f172a; }
+ .dark .input{ background:#0b1220; color:#e2e8f0; }
+ .input:focus{ outline:none; box-shadow:0 0 0 2px rgba(99,102,241,.6); }
+ .label{ font-size:.875rem; font-weight:600; color:#334155; } .dark .label{ color:#cbd5e1; }
+ .tab{ padding:.6rem 1rem; border-radius:.8rem; cursor:pointer; font-weight:700; }
+ .tab-active{ background:#fff; border:1px solid var(--border); box-shadow:0 1px 2px rgba(0,0,0,.04); } .dark .tab-active{ background:#0b1220; }
+ .pill{ display:inline-flex; align-items:center; padding:.1rem .5rem; border-radius:999px; font-size:.53rem; font-weight:700; }
+ .metric-badge{ background:#ecfdf5; color:#065f46; } .dark .metric-badge{ background:rgba(16,185,129,.18); color:#d1fae5; }
+ .metric-badge-judge{ background:#e0f2fe; color:#075985; } .dark .metric-badge-judge{ background:rgba(56,189,248,.18); color:#bae6fd; }
+ .kpi{ color:#475569; font-size:.875rem; } .dark .kpi{ color:#94a3b8; }
+
+ /* Column widths */
+ /* .rank-col{ width:1.5rem; }
+ .provider-col{ width:4rem; }
+ .model-col{ width:10rem; white-space:nowrap; }
+ .mean-col{ width:3rem; white-space:nowrap; }
+ .ds-col{ min-width:9.5rem; white-space:nowrap; } */
+
+ .rank-col{ }
+ .provider-col{ }
+ .model-col{ white-space:nowrap; }
+ .mean-col{ white-space:nowrap; }
+ .ds-col{ white-space:nowrap; }
+
+ /* Sticky columns: use calc so offsets update easily */
+ .sticky-rank{ position: sticky; left: 0; z-index: 3; background: var(--sticky-bg); }
+ .sticky-provider{ position: sticky; left: 2.3rem; z-index: 3; background: var(--sticky-bg); }
+ .sticky-model{ position: sticky; left: calc(1.85rem + 6rem); z-index: 3; background: var(--sticky-bg); }
+ .sticky-mean{ position: sticky; left: calc(2.3rem + 4rem + 10rem + 0.5rem); z-index: 3; background: var(--sticky-bg); }
+
+ /* Mobile: hide rank & dataset columns; update sticky offsets */
+ @media (max-width: 768px){
+ .rank-col, .ds-col, .ds-head { display:none !important; }
+ .sticky-provider{ left: 0; }
+ .sticky-model{ left: 14rem; }
+ .sticky-mean{ left: calc(14rem + 18rem); }
+ }
+
+ /* Header background */
+ thead th{ background: var(--header-bg); }
+
+ /* Global gradient */
+ .gradient-bg{
+ background:
+ radial-gradient(1200px 600px at 20% -10%, rgba(99,102,241,.25), rgba(99,102,241,0) 60%),
+ radial-gradient(1200px 600px at 80% -10%, rgba(20,184,166,.2), rgba(20,184,166,0) 60%);
+ }
+
+ /* Scrollbar for horizontal overflow */
+ .scrollbar-thin::-webkit-scrollbar{ height:10px; }
+ .scrollbar-thin::-webkit-scrollbar-thumb{ background:#c7d2fe; border-radius:999px; }
+ .scrollbar-thin::-webkit-scrollbar-track{ background:transparent; }
+
+ /* Chart container: responsive height */
+ #chartWrap{ height: clamp(260px, 42vh, 460px); }
+ </style>
+ </head>
+ <body class="min-h-screen dark:bg-gray-800">
+ <!-- Header -->
+ <header class="sticky top-0 z-40 backdrop-blur dark:bg-white-800 border-b" style="border-color: var(--border)">
+ <div class="mx-auto max-w-7xl px-4 md:px-6 py-3 md:py-4 flex items-center justify-between">
+ <div class="flex items-center gap-3 md:gap-4">
+ <div class="h-10 w-10 rounded-xl bg-indigo-600 text-white grid place-items-center shadow-lg">📡</div>
+ <div>
+ <h1 class="text-lg md:text-xl font-extrabold tracking-tight" style="color: var(--text)">GSMA Open-Telco LLM Benchmarks</h1>
+ <p class="text-xs md:text-sm">Benchmarking models across telecom datasets</p>
+ </div>
+ </div>
+ <div class="md:flex items-center gap-2 hidden">
+ <button id="refreshBtn" class="btn btn-ghost" title="Refresh results">⟲ Refresh</button>
+ <!--button id="exportCsvBtn" class="btn btn-outline" title="Export current view to CSV">⭳ Export CSV</button-->
+ <button id="themeToggle" class="btn btn-ghost" title="Toggle dark mode">🌙</button>
+ </div>
+ </div>
+ </header>
+
+ <!-- Main -->
+ <main class="mx-auto max-w-7xl px-4 md:px-6 py-6 md:py-8 space-y-6 md:space-y-8">
+ <!-- Tabs -->
+ <div class="flex gap-2 md:gap-3">
+ <button class="tab tab-active" data-tab="leaderboard">🏆 Leaderboard</button>
+ <button class="tab" data-tab="submit">📤 Submit Model</button>
+ <button class="tab" data-tab="about">ℹ️ About</button>
+ </div>
+
+ <!-- Leaderboard Tab -->
+ <section id="tab-leaderboard" class="space-y-6 md:space-y-8">
+ <!-- Controls -->
+ <div class="grid lg:grid-cols-4 gap-4 md:gap-6">
+ <div class="card p-4 md:p-6 lg:col-span-2">
+ <div class="flex items-center justify-between">
+ <h2 class="font-semibold text-base md:text-lg" style="color: var(--text)">Select Datasets</h2>
+ <div class="flex gap-2">
+ <button id="selectAllBtn" class="btn btn-ghost text-xs md:text-sm px-3">Select all</button>
+ <button id="clearAllBtn" class="btn btn-ghost text-xs md:text-sm px-3">Clear</button>
+ </div>
+ </div>
+ <div id="datasetFilters" class="mt-3 md:mt-4 flex flex-wrap gap-2.5"></div> <!--grid-cols-1 sm:grid-cols-2 lg:grid-cols-3-->
+ </div>
+
+ <div class="card p-4 md:p-6">
+ <h2 class="font-semibold text-base md:text-lg" style="color: var(--text)">Search & Filter</h2>
+ <div class="mt-2 md:mt-3 space-y-2.5">
+ <input id="searchInput" class="input" placeholder="Search provider/model…"/>
+ <select id="providerSelect" class="input">
+ <option value="">All providers</option>
+ </select>
+ </div>
+ </div>
+
+ <div class="card p-4 md:p-6">
+ <h2 class="font-semibold text-base md:text-lg" style="color: var(--text)">KPI</h2>
+ <div class="mt-2 grid grid-cols-2 gap-3">
+ <div>
+ <div class="text-2xl md:text-3xl font-extrabold" id="kpiModels">—</div>
+ <div class="kpi">Models</div>
+ </div>
+ <div>
+ <div class="text-2xl md:text-3xl font-extrabold" id="kpiDatasets">—</div>
+ <div class="kpi">Datasets</div>
+ </div>
+ <div class="col-span-2 text-xs text-slate-500 dark:text-slate-400" id="lastUpdated">Last updated —</div>
+ </div>
+ </div>
+ </div>
+
+ <!-- Visualization -->
+ <div class="card p-5 md:p-6 lg:p-8">
+ <div class="flex items-center justify-between gap-3">
+ <h2 class="font-semibold text-base md:text-lg" style="color: var(--text)">Model Comparison</h2>
+ <div class="text-xs md:text-sm text-slate-500">Click ⭐ on rows to compare (max 3)</div>
+ </div>
+ <div id="chartWrap" class="mt-3 md:mt-4">
+ <canvas id="scoresChart"></canvas>
+ </div>
+ </div>
+
+ <!-- Table (always horizontally scrollable) -->
+ <div class="card overflow-hidden">
+ <div class="px-4 md:px-6 py-3 md:py-4 flex items-center justify-between">
+ <div class="font-semibold" style="color: var(--text)">Leaderboard</div>
+ <div class="text-xs md:text-sm text-slate-500">Sorted by mean score across selected datasets</div>
+ </div>
+
+ <div class="overflow-x-auto overscroll-x-contain scrollbar-thin pb-2">
+ <table class="min-w-[64rem] w-full text-sm">
+ <thead class="border-t border-b" style="border-color: var(--border)">
+ <tr id="tableHeaderRow">
+ <th class="p-3 text-left rank-col sticky-rank">#</th>
+ <th class="p-3 text-left provider-col sticky-provider border-l">Provider</th>
+ <th class="p-3 text-left model-col sticky-model border-l">Model</th>
+ <th class="p-3 text-left mean-col sticky-mean border-l">
+ Mean
+ <div class="text-[11px] text-slate-500">on selected</div>
+ </th>
+ <!-- Dataset columns injected here -->
+ </tr>
+ </thead>
+ <tbody id="tableBody"></tbody>
+ </table>
+ </div>
+
+ <div class="flex items-center gap-2 justify-end p-3">
+ <button id="refreshBtn" class="btn btn-ghost md:hidden" title="Refresh results">⟲ Refresh</button>
+ <button id="exportCsvBtn" class="btn btn-outline" title="Export current view to CSV">⭳ Export CSV</button>
+ <button id="themeToggle" class="btn btn-ghost md:hidden" title="Toggle dark mode">🌙</button>
+ </div>
+ </div>
+ </section>
+
+ <!-- Submit Tab -->
+ <section id="tab-submit" class="hidden">
+ <div class="card p-5 md:p-6 lg:p-8">
+ <h2 class="text-lg md:text-xl font-bold" style="color: var(--text)">Submit a Model for Evaluation</h2>
+ <p class="text-sm text-slate-600 dark:text-slate-300 mt-1">
+ Provide your model details. Submissions are queued (status: <span class="font-semibold">pending</span>) and evaluated automatically. Results will appear on the leaderboard when ready.
+ </p>
+
+ <form id="submitForm" class="mt-4 grid md:grid-cols-2 gap-4 md:gap-6">
+ <div>
+ <label class="label" for="modelProvider">Model Provider</label>
+ <input class="input" id="modelProvider" name="model_provider" required placeholder="e.g., TelcoAI Labs"/>
+ </div>
+ <div>
+ <label class="label" for="modelName">Model Name</label>
+ <input class="input" id="modelName" name="model_name" required placeholder="e.g., T-LLM-7B"/>
+ </div>
+ <div>
+ <label class="label" for="hfRepo">Hugging Face Repo</label>
+ <input class="input" id="hfRepo" name="hf_repo" required placeholder="e.g., telcoai/t-llm-7b"/>
+ </div>
+ <div>
+ <label class="label" for="contactEmail">Contact Email</label>
+ <input class="input" id="contactEmail" name="contact_email" required type="email" placeholder="[email protected]"/>
+ </div>
+ <div class="md:col-span-2">
+ <label class="label" for="notes">Notes (optional)</label>
+ <textarea class="input" id="notes" name="notes" rows="3" placeholder="Anything we should know about your model or expected behavior"></textarea>
+ </div>
+ <div class="md:col-span-2 flex items-center gap-2">
+ <input id="agree" type="checkbox" required class="h-4 w-4 accent-indigo-600"/>
+ <label for="agree" class="text-sm">I agree to have my model evaluated and results published.</label>
+ </div>
+ <div class="md:col-span-2 flex items-center gap-3">
+ <button class="btn btn-primary" type="submit">Submit to Queue</button>
+ <button class="btn btn-ghost" type="reset">Reset</button>
+ <span id="submitStatus" class="text-sm"></span>
+ </div>
+ </form>
+ </div>
+ </section>
+
+ <!-- About Tab -->
+ <section id="tab-about" class="hidden">
+ <div class="card p-5 md:p-6 lg:p-8 space-y-3">
+ <h2 class="text-lg md:text-xl font-bold" style="color: var(--text)">About this Leaderboard</h2>
+ <p class="text-sm text-slate-600 dark:text-slate-300">
+ This dashboard ranks LLMs on telecom-focused datasets. Each cell shows the score and the metric type used
+ (<span class="pill metric-badge">raw</span> or <span class="pill metric-badge-judge">llm-as-judge</span>). Energy & CO₂ appear on hover.
+ </p>
+ </div>
+ </section>
+ </main>
+
+ <div id="toast" class="fixed bottom-4 left-1/2 -translate-x-1/2 hidden">
+ <div class="rounded-xl bg-slate-900 text-white px-4 py-2 shadow-lg">✅ <span id="toastMsg">Done</span></div>
+ </div>
+
+ <script>
+ // ===== Config =====
+ const API_BASE = "";
+ const USE_MOCK_FALLBACK = false;
+ const MAX_COMPARE = 3;
+
+ // ===== State =====
+ const state = {
+ datasets: [],
+ models: [],
+ selectedDatasets: new Set(),
+ providerFilter: "",
+ search: "",
+ compare: new Set(),
+ dark: window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches,
+ };
+
+ // ===== Utils =====
+ const $ = (s)=>document.querySelector(s);
+ const $$ = (s)=>Array.from(document.querySelectorAll(s));
+ const showToast=(m)=>{const t=$("#toast");$("#toastMsg").textContent=m;t.classList.remove("hidden");setTimeout(()=>t.classList.add("hidden"),2200);};
+ const fmt=(n,d=2)=>(n==null||Number.isNaN(n))?"—":Number(n).toFixed(d);
+ const slug=(s)=>s.toLowerCase().replace(/[^a-z0-9]+/g,'-');
+
+ function savePrefs(){
+ localStorage.setItem('llm_lb_prefs', JSON.stringify({
+ selectedDatasets:[...state.selectedDatasets],
+ datasets:[...state.datasets],
+ providerFilter:state.providerFilter,
+ search:state.search, dark:state.dark,
+ compare:[...state.compare],
+ }));
+ }
+ function loadPrefs(){
+ try{
+ const p=JSON.parse(localStorage.getItem('llm_lb_prefs')||'{}');
+ if(p.selectedDatasets) state.selectedDatasets=new Set(p.selectedDatasets);
+ if(p.providerFilter) state.providerFilter=p.providerFilter;
+ if(p.search) state.search=p.search;
+ if(typeof p.dark==='boolean') state.dark=p.dark;
+ if(p.compare) state.compare=new Set(p.compare);
+ }catch{}
+ }
+
+ // ===== Mock =====
+ function mockDatasets(){ return [
+ "3GPP-TSG", "NetBench", "TeleQna", "TeleLogs", "TeleMath",
+ ]; }
+ function mockResults(){
+ const providers=["Qwen","Qwen","OpenAI","OpenAI","DeepSeek","ByteDance", "LLama"];
+ const models=["Qwen3-32B","QwQ-32B","GPT-OSS-120B","GPT-OSS-20B","R1-Distill-Llama-70B","Seed-OSS-36B", "Llama-8B"];
+ const repos=["qwen/qwen3-32b","qwen/qwq-32b","openai/gpt-oss-120b","openai/gpt-oss-20b","deepseek/r1-distill-llama-70b","bytedance/seed-oss-36b", "llama/llama-8B-instruct"];
+ const ds=mockDatasets(); const now=new Date().toISOString();
+ const mtypes=["raw","llm-as-judge"];
+ const rnd=(a)=>a[Math.floor(Math.random()*a.length)];
+ const rScore=()=>Math.round((45+Math.random()*50)*100)/100;
+ return {models: providers.map((p,i)=>({
+ provider:p, name:models[i], repo:repos[i], updated_at:now,
+ scores: ds.map(d=>({dataset_name:d, metric_type:rnd(mtypes), score:rScore(),
+ energy_consumed:+(0.1+Math.random()*2.4).toFixed(3),
+ co2_consumed:+(0.05+Math.random()*0.95).toFixed(3)}))
+ }))};
+ }
+
+ // ===== API =====
+ async function apiGet(path){
+ try{ const r=await fetch(`${API_BASE}${path}`); if(!r.ok) throw new Error(r.status); return await r.json(); }
+ catch(e){ if(USE_MOCK_FALLBACK){ if(path==='/api/datasets') return {datasets:mockDatasets()}; if(path==='/api/results') return mockResults(); } throw e; }
+ }
+ async function apiPost(path, body){
+ try{ const r=await fetch(`${API_BASE}${path}`,{method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(body)}); if(!r.ok) throw new Error(r.status); return await r.json(); }
+ catch(e){ if(USE_MOCK_FALLBACK) return {status:'pending', id:`mock-${Date.now()}`}; throw e; }
+ }
+
+ // ===== Rendering =====
+ function visibleDatasets(){return state.selectedDatasets.size ? state.datasets.filter(d=>state.selectedDatasets.has(d)) : []; }
+ function modelRowMean(model, dsList){
+ const map=Object.fromEntries(model.scores.map(s=>[s.dataset_name,s]));
+ const vals=dsList.map(d=>map[d]?.score).filter(v=>typeof v==='number');
+ return vals.length? vals.reduce((a,b)=>a+b,0)/vals.length : null;
+ }
+
+ function renderDatasetFilters(){
+ const c=$("#datasetFilters"); c.innerHTML='';
+ state.datasets.forEach(d=>{
+ const checked=state.selectedDatasets.has(d);
+ c.insertAdjacentHTML('beforeend', `
+ <label class="flex flex-nowrap items-center gap-2 rounded-lg border" style="border-color: var(--border); padding:.5rem .6rem;">
+ <input type="checkbox" class="h-4 w-4 accent-indigo-600" data-dataset="${d}" ${checked?'checked':''}/>
+ <span class="text-sm" style="color: var(--text)">${d}</span>
+ </label>
+ `);
+ });
+ c.querySelectorAll('input[type="checkbox"]').forEach(cb=>{
+ cb.addEventListener('change', ()=>{
+ if(cb.checked) state.selectedDatasets.add(cb.dataset.dataset);
+ else state.selectedDatasets.delete(cb.dataset.dataset);
+ if([...c.querySelectorAll('input:checked')].length===0) state.selectedDatasets=new Set();
+ savePrefs(); renderTable(); updateChart();
+ });
+ });
+ $("#kpiDatasets").textContent = state.datasets.length;
+ }
+
+ function renderProviders(){
+ const provs=[...new Set(state.models.map(m=>m.provider))].sort();
+ const sel=$("#providerSelect");
+ sel.innerHTML='<option value="">All providers</option>'+provs.map(p=>`<option>${p}</option>`).join('');
+ if(state.providerFilter) sel.value=state.providerFilter;
+ }
+
+ function renderHeader(){
+ const head=$("#tableHeaderRow");
+ head.querySelectorAll('th[data-ds]').forEach(el=>el.remove());
+ visibleDatasets().forEach(d=>{
+ const th=document.createElement('th');
+ th.dataset.ds=d; th.className='p-3 text-left ds-col ds-head';
+ th.innerHTML = `<div class="font-semibold" style="color: var(--text)">${d}</div><div class="text-[11px] text-slate-500">score • metric</div>`;
+ head.appendChild(th);
+ });
+ }
+
+ function renderTable(){
+ renderHeader();
+ const ds=visibleDatasets();
+ const tb=$("#tableBody"); tb.innerHTML='';
+
+ let models=state.models.filter(m=>{
+ const text=(m.provider+" "+m.name+" "+m.repo).toLowerCase();
+ const okProvider=!state.providerFilter || m.provider===state.providerFilter;
+ const okSearch=!state.search || text.includes(state.search.toLowerCase());
+ return okProvider && okSearch;
+ }).map(m=>({...m, mean:modelRowMean(m, ds)}))
+ .sort((a,b)=>(b.mean??-1)-(a.mean??-1));
+
+ models.forEach((m, i)=>{
+ const sMap=Object.fromEntries(m.scores.map(s=>[s.dataset_name, s]));
+ const id=slug(m.provider+"-"+m.name);
+ const favOn=state.compare.has(id);
+
+ const row=document.createElement('tr');
+ row.className='border-b'; row.style.borderColor=getComputedStyle(document.documentElement).getPropertyValue('--border');
+ row.innerHTML = `
+ <td class="p-3 rank-col sticky-rank">${i+1}</td>
+ <td class="p-3 provider-col sticky-provider border-l">
+ <div class="font-medium">${m.provider}</div>
+ </td>
+ <td class="p-3 model-col sticky-model border-l">
+ <div class="flex items-center gap-2">
+ <button class="text-lg" data-fav="${id}" title="Add to compare">${favOn?'⭐':'☆'}</button>
+ <div class="font-semibold" style="color: var(--text)">${m.name}</div>
+ </div>
+ <div class="text-xs text-slate-500">${m.repo}</div>
+ </td>
+ <td class="p-3 mean-col sticky-mean border-l">
+ <div class="text-base font-semibold text-blue-500">${m.mean==null?'—':fmt(m.mean,2)}</div>
+ <!--div class="text-[11px] text-slate-500">mean across selected</div-->
+ </td>
+ `;
+ ds.forEach(d=>{
+ const s=sMap[d]; const mt=s?.metric_type;
+ // const badge= mt==='llm-as-judge' ? 'metric-badge-judge' : 'metric-badge';
+ const title= s ? `Energy: ${fmt(s.energy_consumed,3)} kWh\nCO₂: ${fmt(s.co2_consumed,3)} kg` : '';
+ const cell=document.createElement('td');
+ cell.className='p-3 ds-col'; cell.title=title;
+ cell.innerHTML = s ? (
+ mt==='llm-as-judge' ? `
+ <div class="flex items-center">
+ <div class="text-base font-semibold" style="color: var(--text)">${fmt(s.score,2)}</div>
+ <div class="pill metric-badge-judge ms-2">${mt}</div>
+ </div>
+ ` :
+ `
+ <div class="flex items-center">
+ <div class="text-base font-semibold" style="color: var(--text)">${fmt(s.score,2)}</div>
+ </div>
+ `
+ ) : '—';
+ row.appendChild(cell);
+ });
+ tb.appendChild(row);
+ });
+
+ $("#kpiModels").textContent=models.length;
+
+ $$('button[data-fav]').forEach(b=>{
+ b.onclick=()=>{
+ const id=b.dataset.fav;
+ if(state.compare.has(id)) state.compare.delete(id);
+ else{
+ if(state.compare.size>=MAX_COMPARE){ showToast(`You can compare up to ${MAX_COMPARE} models.`); return; }
+ state.compare.add(id);
+ }
+ savePrefs(); renderTable(); updateChart();
+ };
+ });
+ }
+
+ // ===== Chart =====
+ let chart;
+ function updateChart(){
+ const ds=visibleDatasets();
+ const labels=ds;
+ const datasets=[];
+ const chosen=state.models.filter(m=>state.compare.has(slug(m.provider+"-"+m.name)));
+ chosen.forEach(m=>{
+ const map=Object.fromEntries(m.scores.map(s=>[s.dataset_name, s.score]));
+ datasets.push({
+ label:`${m.provider} / ${m.name}`,
+ data: labels.map(d=>map[d] ?? null),
+ tension:.25, spanGaps:true, borderWidth:3, pointRadius:3, pointHoverRadius:5,
+ });
+ });
+
+ const ctx=document.getElementById('scoresChart').getContext('2d');
+ if(!chart){
+ chart=new Chart(ctx,{
+ type:'line',
+ data:{labels, datasets},
+ options:{
+ responsive:true,
+ maintainAspectRatio:false, // uses #chartWrap height (clamp -> responsive)
+ plugins:{
+ legend:{ display:true, labels:{ boxWidth:18, usePointStyle:true }},
+ tooltip:{ mode:'index', intersect:false }
+ },
+ interaction:{ mode:'nearest', intersect:false },
+ scales:{
+ y:{ beginAtZero:true, max:100, title:{display:true, text:'Score'}, grid:{ drawBorder:false }},
+ x:{ grid:{ display:false } }
+ },
+ layout:{ padding:0 }
+ }
+ });
+ }else{
+ chart.data.labels=labels;
+ chart.data.datasets=datasets;
+ chart.update();
+ }
+ }
+
+ // ===== Tabs / Theme / Export =====
+ function bindTabs(){
+ $$(".tab").forEach(btn=>{
+ btn.addEventListener('click', ()=>{
+ const t=btn.dataset.tab;
+ $$(".tab").forEach(b=>b.classList.remove('tab-active')); btn.classList.add('tab-active');
+ ["leaderboard","submit","about"].forEach(x=>{
+ const el=document.getElementById(`tab-${x}`); (x===t)?el.classList.remove('hidden'):el.classList.add('hidden');
+ });
+ });
+ });
+ }
+ function applyTheme(){
+ document.documentElement.classList.toggle('dark', state.dark);
+ $$("#themeToggle").forEach(b=>b.textContent= state.dark ? '☀️' : '🌙'); savePrefs();
+ }
+
+ function exportCSV(){
+ const ds=visibleDatasets();
+ const headers=['Rank','Provider','Model','Repo','Mean',...ds];
+ const rows=[];
+ let models=state.models.map(m=>({...m, mean:modelRowMean(m, ds)})).sort((a,b)=>(b.mean??-1)-(a.mean??-1));
+ models=models.filter(m=>{
+ const text=(m.provider+" "+m.name+" "+m.repo).toLowerCase();
+ const okProvider=!state.providerFilter || m.provider===state.providerFilter;
+ const okSearch=!state.search || text.includes(state.search.toLowerCase());
+ return okProvider && okSearch;
+ });
+ models.forEach((m,i)=>{
+ const map=Object.fromEntries(m.scores.map(s=>[s.dataset_name, s]));
+ const row=[i+1, m.provider, m.name, m.repo, fmt(m.mean,2)];
+ ds.forEach(d=>{ const s=map[d]; row.push(s? `${fmt(s.score,2)} (${s.metric_type})`:''); });
+ rows.push(row);
+ });
+ const csv=[headers, ...rows].map(r=> r.map(x=>'"'+String(x).replaceAll('"','""')+'"').join(',')).join('\n');
+ const blob=new Blob([csv],{type:'text/csv;charset=utf-8;'}); const url=URL.createObjectURL(blob);
+ const a=document.createElement('a'); a.href=url; a.download='telecom-llm-leaderboard.csv'; a.click(); URL.revokeObjectURL(url);
+ }
+
+ // ===== Controls & Submit =====
+ function bindControls(){
+ $("#selectAllBtn").onclick=()=>{ state.selectedDatasets=new Set(state.datasets); renderDatasetFilters(); renderTable(); updateChart(); savePrefs(); };
+ $("#clearAllBtn").onclick=()=>{ state.selectedDatasets=new Set(); renderDatasetFilters(); renderTable(); updateChart(); savePrefs(); };
+ $("#searchInput").addEventListener('input', (e)=>{ state.search=e.target.value; savePrefs(); renderTable(); });
+ $("#providerSelect").addEventListener('change', (e)=>{ state.providerFilter=e.target.value; savePrefs(); renderTable(); });
+ $$("#refreshBtn").forEach(b=>b.onclick=init); // bind both copies (header + mobile footer)
+ $$("#exportCsvBtn").forEach(b=>b.onclick=exportCSV);
+ $$("#themeToggle").forEach(b=>b.onclick=()=>{ state.dark=!state.dark; applyTheme(); });
+
+ $("#submitForm").addEventListener('submit', async (e)=>{
+ e.preventDefault();
+ const payload={
+ model_provider: $("#modelProvider").value.trim(),
+ model_name: $("#modelName").value.trim(),
+ hf_repo: $("#hfRepo").value.trim(),
+ contact_email: $("#contactEmail").value.trim(),
+ notes: $("#notes").value.trim(),
+ };
+ $("#submitStatus").textContent='Submitting…';
+ try{
+ const res=await apiPost('/api/submit', payload);
+ $("#submitStatus").innerHTML=`Status: <span class="font-semibold">${res.status}</span> (id: ${res.id})`;
+ showToast('Submission received — queued as pending'); e.target.reset();
+ }catch(err){ $("#submitStatus").textContent='Submission failed. See console.'; }
+ });
+ }
+
+ // ===== Init =====
+ async function init(){
+ loadPrefs(); applyTheme(); bindTabs(); bindControls();
+ try{ state.datasets=(await apiGet('/api/datasets')).datasets; } catch{ state.datasets=mockDatasets(); }
+ try{ state.models=(await apiGet('/api/results')).models; }catch{ state.models=mockResults().models; }
+ const last=state.models.map(m=>new Date(m.updated_at)).sort((a,b)=>b-a)[0];
+ if(last) $("#lastUpdated").textContent='Last updated '+last.toLocaleString();
+ if (!state.selectedDatasets.size) {
+ state.selectedDatasets = new Set(state.datasets);
+ }
+ renderDatasetFilters(); renderProviders(); renderTable(); updateChart();
+ }
+ init();
+ </script>
+ </body>
+ </html>
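The ranking shown in the table is computed client-side: modelRowMean() averages a model's scores over the currently selected datasets, and rows sort by that mean with unscored models last. A minimal Python sketch of the same aggregation, assuming a list of models shaped like the /api/results payload (benchmark.json works directly):

# rank_models.py -- minimal sketch of the page's client-side ranking (mean over selected datasets).
# Assumes `models` is the list from the /api/results payload, e.g. the contents of benchmark.json.
from typing import Iterable, Optional

def row_mean(model: dict, selected: Iterable[str]) -> Optional[float]:
    by_ds = {s["dataset_name"]: s for s in model["scores"]}
    vals = [by_ds[d]["score"] for d in selected
            if d in by_ds and isinstance(by_ds[d]["score"], (int, float))]
    return sum(vals) / len(vals) if vals else None

def rank(models: list, selected: Iterable[str]) -> list:
    selected = list(selected)
    rows = [(f'{m["provider"]} / {m["name"]}', row_mean(m, selected)) for m in models]
    # Unscored models sort last, mirroring (b.mean ?? -1) in the page script.
    return sorted(rows, key=lambda r: r[1] if r[1] is not None else -1.0, reverse=True)

if __name__ == "__main__":
    import json
    models = json.load(open("benchmark.json", encoding="utf-8"))
    for name, mean in rank(models, ["TeleQna", "TeleMath", "TeleLogs"]):
        print(f"{name:<40} {mean if mean is not None else '—'}")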
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # FastAPI + server
+ fastapi>=0.110
+ uvicorn[standard]>=0.29
+ gunicorn>=21.2
+
+ # Data validation (the "email" extra provides EmailStr, used by server.py)
+ pydantic[email]>=2.5
+
+ # Hugging Face queue integration used by server.py
+ huggingface_hub>=0.20
+
+ # load .env if you add it later; otherwise safe to remove
+ python-dotenv>=1.0
server.py ADDED
@@ -0,0 +1,299 @@
+ # server.py
+ # FastAPI backend for the Telecom LLM Leaderboard
+ # ------------------------------------------------
+ # Developer:
+ #   Name: Mohamed SANA
+ #   Email: [email protected]
+ #
+ # Quickstart:
+ #   pip install fastapi uvicorn "pydantic[email]>=2" huggingface_hub python-dotenv
+ #   uvicorn server:app --reload --port 8000
+ #
+ # Configure (optional) environment variables for real HF queue uploads:
+ #   HF_TOKEN=<your token>
+ #   QUEUE_REPO=<org-or-user/submission-queue-dataset>
+ #   RESULTS_REPO=<org-or-user/results-dataset>   # (not used in this fake generator)
+ #   HF_HOME=<custom cache dir>                   # optional
+ #   USE_LOCAL_DATA=<set true to load data from a local file>
+ #   LOCAL_DATA_FILE=<the filename to load>
+ #
+ # The frontend expects:
+ #   GET  /api/datasets
+ #   GET  /api/results
+ #   POST /api/submit
+
+ from fastapi import FastAPI, Body, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse
+ from pydantic import BaseModel, EmailStr, Field
+ from datetime import datetime
+ from typing import List, Literal, Optional, Dict, Any
+ import os, json, random
+
+ # Optional: Hugging Face Hub for queue upload
+ HF_AVAILABLE = False
+ try:
+     from huggingface_hub import HfApi
+     HF_AVAILABLE = True
+ except Exception:
+     HF_AVAILABLE = False
+
+ # ---------------- Config ----------------
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ QUEUE_REPO = os.getenv("QUEUE_REPO", "msana/test-submission")    # e.g., "your-org/telecom-eval-queue"
+ RESULTS_REPO = os.getenv("RESULTS_REPO", "msana/test-results")   # future: read results remotely if desired
+ CACHE_PATH = os.getenv("HF_HOME", ".")                           # cache dir
+ USE_LOCAL_DATA = os.getenv("USE_LOCAL_DATA", "true").lower() in ("1", "true", "yes")  # set to "false" to load data from the results repo
+ LOCAL_DATA_FILE = os.getenv("LOCAL_DATA_FILE", "benchmark.json") # the local data file to load
+ DATASETS = os.getenv("DATASETS", "3GPP-TSG;NetBench;TeleQna;TeleLogs;TeleMath")
+
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "submission-queue")
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results-queue")
+
+ # ---------------- App ----------------
+ app = FastAPI(title="Telecom LLM Leaderboard API", version="1.0.0")
+ app.add_middleware(
+     CORSMiddleware, allow_origins=["*"], allow_credentials=True,
+     allow_methods=["*"], allow_headers=["*"]
+ )
+
+ # ---------------- Data Models ----------------
+ MetricType = Literal["raw", "llm-as-judge"]
+
+ class Score(BaseModel):
+     dataset_name: str
+     metric_type: MetricType
+     score: float
+     energy_consumed: float
+     co2_consumed: float
+
+ class ModelResult(BaseModel):
+     provider: str
+     name: str
+     repo: str
+     updated_at: str
+     scores: List[Score]
+
+ class SubmitPayload(BaseModel):
+     model_provider: str = Field(..., examples=["TelcoAI"])
+     model_name: str = Field(..., examples=["T-LLM-7B"])
+     hf_repo: str = Field(..., examples=["telcoai/t-llm-7b"])
+     contact_email: EmailStr
+     notes: Optional[str] = ""
+
+
+
+ if isinstance(DATASETS, str):
+     if ';' in DATASETS:
+         DATASETS = DATASETS.split(";")
+     elif '|' in DATASETS:
+         DATASETS = DATASETS.split("|")
+     elif ',' in DATASETS:
+         DATASETS = DATASETS.split(",")
+     else:
+         DATASETS = DATASETS.split(" ")
+
+ DATASETS = list(map(lambda s: s.strip(), DATASETS))
+
+
+
+ # ---------- fake data for test purposes
+ FAKE_MODELS = [
+     ("Qwen", "Qwen3-32B", "qwen/qwen3-32b"),
+     ("Qwen", "QwQ-32B", "qwen/qwq-32b"),
+     ("OpenAI", "GPT-OSS-120B", "openai/gpt-oss-120b"),
+     ("OpenAI", "GPT-OSS-20B", "openai/gpt-oss-20b"),
+     ("DeepSeek", "R1-Distill-Llama-70B", "deepseek/r1-distill-llama-70b"),
+     ("ByteDance", "Seed-OSS-36B", "bytedance/seed-oss-36b"),
+     ("LLama", "Llama-8B", "llama/llama-8B-instruct"),
+ ]
+
+ def _random_score() -> float:
+     return round(random.uniform(45.0, 62.0), 2)
+
+ def generate_fake_model(provider: str, name: str, repo: str) -> ModelResult:
+     metric_types = ["raw", "llm-as-judge"]
+     now = datetime.utcnow().isoformat() + "Z"
+     scores = []
+     for d in DATASETS:
+         mt = random.choice(metric_types)
+         scores.append(Score(
+             dataset_name=d,
+             metric_type=mt, score=_random_score(),
+             energy_consumed=round(random.uniform(0.1, 2.5), 3),
+             co2_consumed=round(random.uniform(0.05, 1.0), 3),
+         ))
+     return ModelResult(provider=provider, name=name, repo=repo, updated_at=now, scores=scores)
+
+ def generate_fake_results() -> Dict[str, Any]:
+     models = [generate_fake_model(p, n, r) for (p, n, r) in FAKE_MODELS]
+     return {"models": [m.model_dump() for m in models]}
+
+
+
+ def load_benchmark() -> List[Dict[str, Any]]:
+     with open(LOCAL_DATA_FILE, 'r') as f:
+         return json.load(f)
+
+ # ---------------- HF Upload Helper ----------------
+ def push_submission_to_queue(eval_entry: Dict[str, Any]) -> str:
+     """
+     Writes a JSON file locally under HF_HOME/submission-queue/<provider>/<model>_submission_<version>.json
+     and uploads it to the dataset repo specified by QUEUE_REPO (if configured).
+     Returns a submission id string.
+     """
+     model_provider = eval_entry.get("model_provider", "unknown-provider")
+     model_name = eval_entry.get("model_name", "unknown-model")
+     version = eval_entry.get("version", "v1")
+
+     OUT_DIR = os.path.join(EVAL_REQUESTS_PATH, model_provider)
+     os.makedirs(OUT_DIR, exist_ok=True)
+     out_path = os.path.join(OUT_DIR, f"{model_name}_submission_{version}.json")
+
+     with open(out_path, "w", encoding="utf-8") as f:
+         f.write(json.dumps(eval_entry, ensure_ascii=False, indent=2))
+
+     upload_ok = False
+     if HF_AVAILABLE and HF_TOKEN and QUEUE_REPO:
+         api = HfApi(token=HF_TOKEN)
+         # path in repo: <provider>/<file>
+         print("-------------", out_path.split("submission-queue/"), out_path)
+         if '\\' in out_path:
+             out_path = out_path.replace('\\', '/')
+         path_in_repo = out_path.split("submission-queue/")[1]
+
+         api.upload_file(
+             path_or_fileobj=out_path,
+             path_in_repo=path_in_repo,
+             repo_id=QUEUE_REPO,
+             repo_type="dataset",
+             commit_message=f"Add {model_name} to eval queue",
+         )
+         upload_ok = True
+
+     return f"{model_provider}-{model_name}-{version}{'' if upload_ok else '-local'}"
+
+ # ---------------- Routes ----------------
+
+ @app.get("/", include_in_schema=False)
+ async def index():
+     return FileResponse("index.html")
+
+
+ @app.get("/api/health")
+ def health():
+     return {"ok": True, "huggingface_available": HF_AVAILABLE, "queue_repo": QUEUE_REPO, "result_repo": RESULTS_REPO}
+
+ @app.get("/api/datasets")
+ def get_datasets():
+     return {"datasets": DATASETS}
+
+ @app.get("/api/models")
+ def get_submitted_models():
+     models = []
+
+     if HF_AVAILABLE and HF_TOKEN and QUEUE_REPO:
+         api = HfApi(token=HF_TOKEN)
+         # path in repo: <provider>/<file>
+         api.snapshot_download(
+             repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+         )
+
+     models_filepaths = []
+
+     for root, _, files in os.walk(EVAL_REQUESTS_PATH):
+         # We should only have json files in model results
+         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+             continue
+
+         for file in files:
+             models_filepaths.append(os.path.join(root, file))
+
+     for filepath in models_filepaths:
+         try:
+             with open(filepath, 'r', encoding="utf-8") as f:
+                 model: dict = json.load(f)
+
+             # --- privacy: never expose the submitter's email
+             model.pop("contact_email", None)
+
+             models.append(model)
+         except Exception:
+             continue
+
+     return {"models": models}
+
+
+ @app.get("/api/results")
+ def get_results():
+     results = []
+
+     if USE_LOCAL_DATA:
+         return {"models": load_benchmark()}
+
+     if HF_AVAILABLE and HF_TOKEN and RESULTS_REPO:
+         api = HfApi(token=HF_TOKEN)
+         # path in repo: <provider>/<file>
+         api.snapshot_download(
+             repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+         )
+
+     model_result_filepaths = []
+
+     for root, _, files in os.walk(EVAL_RESULTS_PATH):
+         # We should only have json files in model results
+         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+             continue
+
+         for file in files:
+             model_result_filepaths.append(os.path.join(root, file))
+
+     for filepath in model_result_filepaths:
+         try:
+             with open(filepath, 'r', encoding="utf-8") as f:
+                 results.append(json.load(f))
+         except Exception:
+             continue
+
+     # print("----------------", results)
+
+     return {"models": results}
+
+
+ # The reader above loads per-model JSON files from RESULTS_REPO and aggregates
+ # them into the shape the frontend expects. For quick demos you can instead
+ # return generated fake data:
+ # return generate_fake_results()
+
+ @app.post("/api/submit")
+ def submit_model(payload: SubmitPayload = Body(...)):
+     ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
+     eval_entry = {
+         "model_provider": payload.model_provider,
+         "model_name": payload.model_name,
+         "hf_repo": payload.hf_repo,
+         "contact_email": payload.contact_email,
+         "notes": payload.notes or "",
+         "status": "pending",
+         "version": ts,
+         "submitted_at": datetime.utcnow().isoformat() + "Z",
+     }
+
+     # Is the model info correctly filled? Reject repos that cannot be resolved on the Hub.
+     try:
+         _ = HfApi(token=HF_TOKEN).model_info(repo_id=payload.hf_repo)
+     except Exception:
+         # The repo could not be found or is not accessible with the configured token
+         raise HTTPException(status_code=500, detail="Could not get your model information. Please check the Hugging Face repo id.")
+
+     try:
+         submission_id = push_submission_to_queue(eval_entry)
+         return {"status": "pending", "id": submission_id}
+     except Exception as e:
+         # If queue upload fails, still persist locally and report an error
+         raise HTTPException(status_code=500, detail=f"Failed to queue submission: {e}")
+
+ # ---------------- Optional: run via `python server.py` ----------------
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("server:app", host="0.0.0.0", port=7860, reload=True)  # match the port exposed by the Dockerfile
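As a quick sanity check of the three routes the frontend depends on, here is a minimal Python sketch using only the standard library. It assumes the server is running locally on port 7860; the submission values are placeholders, and /api/submit is expected to fail with HTTP 500 unless the given Hugging Face repo actually resolves on the Hub.

# api_smoke_test.py -- minimal sketch of calling the leaderboard API from Python.
# Assumes the server is up at http://localhost:7860; submission fields below are placeholders.
import json
import urllib.request

BASE = "http://localhost:7860"

def get(path: str) -> dict:
    with urllib.request.urlopen(BASE + path, timeout=10) as resp:
        return json.load(resp)

def post(path: str, body: dict) -> dict:
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.load(resp)

print(get("/api/datasets"))                                  # {"datasets": ["3GPP-TSG", ...]}
print(len(get("/api/results")["models"]), "models listed")   # entries shaped like benchmark.json
# Expect an HTTPError (500) unless the hf_repo below actually exists on the Hub.
print(post("/api/submit", {
    "model_provider": "ExampleOrg",          # placeholder
    "model_name": "example-7b",              # placeholder
    "hf_repo": "example-org/example-7b",     # placeholder
    "contact_email": "user@example.com",     # placeholder
    "notes": "",
}))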