"""Gradio demo for SydsGPT v2 164M finetuned on Alpaca.

Single-turn instruction-following demo served via Gradio Blocks. Heavy
runtime initialization (CUDA device selection, tokenizer, model weights)
is deferred to the first inference call so module import never touches
CUDA — required on Hugging Face Spaces stateless GPUs.
"""
import json

import gradio as gr
import tiktoken
import torch
from huggingface_hub import hf_hub_download

from model.SydsGPTv2 import SydsGPTv2
from modules.Generate import generate

try:
    import spaces

    SPACES_AVAILABLE = True

    # Declare GPU requirement for Spaces without touching CUDA at import time.
    @spaces.GPU
    def requires_gpu():
        return True
except ImportError:
    SPACES_AVAILABLE = False

repo_id = "siddsachar/sydsgpt-v2-164m-finetuned-alpaca"
device = None  # defer CUDA init to avoid Spaces stateless GPU error

cfg_path = hf_hub_download(repo_id, filename="config.json")
ckpt_path = hf_hub_download(repo_id, filename="pytorch_model.bin")
with open(cfg_path) as f:
    cfg = json.load(f)

# Deferred-initialization singletons, populated by init_runtime().
model = None
tokenizer = None
eos_id = None


def init_runtime():
    """Lazily initialize device, tokenizer, and model (idempotent).

    Keeping this out of module scope avoids CUDA initialization in the
    main process, which Spaces stateless GPUs forbid.
    """
    global device, model, tokenizer, eos_id
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")
        eos_id = tokenizer.eot_token
    if model is None:
        mdl = SydsGPTv2(cfg)
        # Load weights on CPU first, then move to device.
        state = torch.load(ckpt_path, map_location='cpu')
        mdl.load_state_dict(state)
        # BUG FIX: the model was loaded on CPU but never moved to `device`,
        # leaving it on CPU even when CUDA is available.
        mdl.to(device)
        mdl.eval()
        model = mdl


def format_prompt(instruction, inp):
    """Build the single-turn chat prompt using the model's role tags."""
    base = f"<|user|>\n{instruction}"
    if inp:
        base += f"\n{inp}"
    return base + "\n\n<|assistant|>\n"


def infer(instruction, inp, max_new_tokens, temperature, top_k, top_p, repetition_penalty):
    """Run one generation round and return only the assistant's reply text."""
    init_runtime()
    prompt = format_prompt(instruction, inp)
    # BUG FIX: move the input ids onto the same device as the model;
    # previously the tensor stayed on CPU.
    input_tokens = (
        torch.tensor(tokenizer.encode(prompt), dtype=torch.long)
        .unsqueeze(0)
        .to(device)
    )
    with torch.inference_mode():  # no autograd state needed for generation
        out = generate(
            model,
            input_tokens,
            max_new_tokens=int(max_new_tokens),
            context_size=cfg["context_length"],
            temperature=float(temperature),
            top_k=int(top_k),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            eos_id=eos_id,
        )
    # Decode only the newly generated tokens. Slicing by token count is
    # exact, whereas the old `full_text[len(prompt):]` assumed the BPE
    # round-trip decode(encode(prompt)) reproduces the prompt byte-for-byte.
    new_tokens = out[0][input_tokens.shape[1]:].tolist()
    reply = tokenizer.decode(new_tokens).replace("<|assistant|>", "").strip()
    return reply


with gr.Blocks(title="SydsGPT v2 164M (Finetuned Alpaca)") as demo:
    gr.Markdown("Single-turn demo. Tags: `<|user|>`, `<|assistant|>`. Enter instruction and optional input.")
    instruction = gr.Textbox(label="Instruction", lines=4)
    inp = gr.Textbox(label="Input (optional)", lines=4)
    max_new = gr.Slider(16, 2048, value=200, step=1, label="Max New Tokens")
    temp = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="Temperature")
    topk = gr.Slider(1, 100, value=40, step=1, label="Top-k")
    topp = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p (nucleus)")
    rep_pen = gr.Slider(1.0, 2.0, value=1.1, step=0.01, label="Repetition Penalty")
    generate_btn = gr.Button("Generate")
    response_box = gr.Textbox(label="Response", lines=8)
    generate_btn.click(
        infer,
        inputs=[instruction, inp, max_new, temp, topk, topp, rep_pen],
        outputs=[response_box],
    )

if __name__ == "__main__":
    demo.launch()