"""Gradio demo for SydsGPT v2 164M finetuned on Alpaca.

Single-turn instruction-following demo served via Gradio Blocks. Heavy
runtime initialization (CUDA device selection, tokenizer, model weights)
is deferred to the first inference call so module import never touches
CUDA — required on Hugging Face Spaces stateless GPUs.
"""
import json

import gradio as gr
import tiktoken
import torch
from huggingface_hub import hf_hub_download

from model.SydsGPTv2 import SydsGPTv2
from modules.Generate import generate

try:
    import spaces

    SPACES_AVAILABLE = True

    # Declare GPU requirement for Spaces without touching CUDA at import time.
    @spaces.GPU
    def requires_gpu():
        return True
except ImportError:
    SPACES_AVAILABLE = False

repo_id = "siddsachar/sydsgpt-v2-164m-finetuned-alpaca"
device = None  # defer CUDA init to avoid Spaces stateless GPU error

cfg_path = hf_hub_download(repo_id, filename="config.json")
ckpt_path = hf_hub_download(repo_id, filename="pytorch_model.bin")
with open(cfg_path) as f:
    cfg = json.load(f)

# Deferred-initialization singletons, populated by init_runtime().
model = None
tokenizer = None
eos_id = None


def init_runtime():
    """Lazily initialize device, tokenizer, and model (idempotent).

    Keeping this out of module scope avoids CUDA initialization in the
    main process, which Spaces stateless GPUs forbid.
    """
    global device, model, tokenizer, eos_id
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")
        eos_id = tokenizer.eot_token
    if model is None:
        mdl = SydsGPTv2(cfg)
        # Load weights on CPU first, then move to device.
        state = torch.load(ckpt_path, map_location='cpu')
        mdl.load_state_dict(state)
        # BUG FIX: the model was loaded on CPU but never moved to `device`,
        # leaving it on CPU even when CUDA is available.
        mdl.to(device)
        mdl.eval()
        model = mdl


def format_prompt(instruction, inp):
    """Build the single-turn chat prompt using the model's role tags."""
    base = f"<|user|>\n{instruction}"
    if inp:
        base += f"\n{inp}"
    return base + "\n\n<|assistant|>\n"


def infer(instruction, inp, max_new_tokens, temperature, top_k, top_p, repetition_penalty):
    """Run one generation round and return only the assistant's reply text."""
    init_runtime()
    prompt = format_prompt(instruction, inp)
    # BUG FIX: move the input ids onto the same device as the model;
    # previously the tensor stayed on CPU.
    input_tokens = (
        torch.tensor(tokenizer.encode(prompt), dtype=torch.long)
        .unsqueeze(0)
        .to(device)
    )
    with torch.inference_mode():  # no autograd state needed for generation
        out = generate(
            model,
            input_tokens,
            max_new_tokens=int(max_new_tokens),
            context_size=cfg["context_length"],
            temperature=float(temperature),
            top_k=int(top_k),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            eos_id=eos_id,
        )
    # Decode only the newly generated tokens. Slicing by token count is
    # exact, whereas the old `full_text[len(prompt):]` assumed the BPE
    # round-trip decode(encode(prompt)) reproduces the prompt byte-for-byte.
    new_tokens = out[0][input_tokens.shape[1]:].tolist()
    reply = tokenizer.decode(new_tokens).replace("<|assistant|>", "").strip()
    return reply


with gr.Blocks(title="SydsGPT v2 164M (Finetuned Alpaca)") as demo:
    gr.Markdown("Single-turn demo. Tags: `<|user|>`, `<|assistant|>`. Enter instruction and optional input.")
    instruction = gr.Textbox(label="Instruction", lines=4)
    inp = gr.Textbox(label="Input (optional)", lines=4)
    max_new = gr.Slider(16, 2048, value=200, step=1, label="Max New Tokens")
    temp = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="Temperature")
    topk = gr.Slider(1, 100, value=40, step=1, label="Top-k")
    topp = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p (nucleus)")
    rep_pen = gr.Slider(1.0, 2.0, value=1.1, step=0.01, label="Repetition Penalty")
    generate_btn = gr.Button("Generate")
    response_box = gr.Textbox(label="Response", lines=8)
    generate_btn.click(
        infer,
        inputs=[instruction, inp, max_new, temp, topk, topp, rep_pen],
        outputs=[response_box],
    )

if __name__ == "__main__":
    demo.launch()