handler.py · ZennyKenny/novoyaz-20b at main

novoyaz-20b / handler.py

update prompt prefix and suffix

093ee81 verified about 1 month ago

6.12 kB

	# handler.py - PRODUCTION VERSION FOR INFERENCE ENDPOINTS
	from __future__ import annotations

	import os
	from typing import Any, Dict, List, Union

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# Task instruction (in Russian) placed before the user's text. In English it
	# reads roughly: "You are a model that strictly rewrites pre-reform Russian
	# text into modern orthography without changing meaning or punctuation. Do not
	# add comments and do not translate the text.", followed by a "Text:" header.
	# NOTE(review): "переводь" looks like a typo for "переводи"; it is left as-is
	# because the prompt may have to match the fine-tuning data -- confirm.
	PROMPT_PREFIX = (
	    "Ты – модель, которая строго переписывает "
	    "дореформенный русский текст в современную орфографию, "
	    "не меняя смысл и пунктуацию. "
	    "Не добавляй комментарии и не переводь текст."
	    "\n\nТекст:\n"
	)

	# Appended after the user's text; in English: "Modern orthographic variant:".
	PROMPT_SUFFIX = "\n\nСовременный орфографический вариант:"


	def _as_list(x: Union[str, List[str]]) -> List[str]:
	return [x] if isinstance(x, str) else [str(t) for t in x]


	# Checkpoint selection, driven by the USE_BASE_MODEL environment variable
	# (default "true", compared case-insensitively):
	#   * true  -> base openai checkpoint, loaded with trust_remote_code enabled
	#   * false -> unsloth's bnb-4bit checkpoint, loaded with trust_remote_code
	#              disabled
	USE_BASE_MODEL = os.getenv("USE_BASE_MODEL", "true").lower() == "true"

	MODEL_ID, TRUST_REMOTE_CODE = (
	    ("openai/gpt-oss-20b", True)
	    if USE_BASE_MODEL
	    else ("unsloth/gpt-oss-20b-bnb-4bit", False)
	)

	# Generation settings: sampling is off and a single beam is used, so
	# decoding is deterministic. The new-token budget can be overridden with
	# the GEN_MAX_NEW_TOKENS environment variable (default 512).
	GEN_KW = {
	    "do_sample": False,
	    "temperature": 0.0,
	    "num_beams": 1,
	    "max_new_tokens": int(os.getenv("GEN_MAX_NEW_TOKENS", "512")),
	    "repetition_penalty": 1.0,
	}


	class EndpointHandler:
	    """Inference Endpoints handler that modernizes pre-reform Russian text.

	    The tokenizer and causal LM named by ``MODEL_ID`` are loaded once in
	    ``__init__``. Each request wraps the input text(s) in the task prompt
	    (``PROMPT_PREFIX`` / ``PROMPT_SUFFIX``) and generates a continuation with
	    the ``GEN_KW`` settings.
	    """

	    def __init__(self, model_dir: str):
	        """Load the tokenizer and model.

	        Args:
	            model_dir: Directory supplied by the serving runtime. It is only
	                logged; the tokenizer and weights are loaded from
	                ``MODEL_ID`` on the Hugging Face Hub instead.
	        """
	        print(f"[handler] Model directory provided: {model_dir}")
	        print(f"[handler] Loading model from: {MODEL_ID}")
	        print(f"[handler] Trust remote code: {TRUST_REMOTE_CODE}")

	        has_cuda = torch.cuda.is_available()
	        self.device = "cuda" if has_cuda else "cpu"

	        # Load tokenizer
	        self.tokenizer = AutoTokenizer.from_pretrained(
	            MODEL_ID,
	            use_fast=True,
	            trust_remote_code=TRUST_REMOTE_CODE,
	        )

	        # On GPU let the library pick the checkpoint's own dtype ("auto");
	        # on CPU load in float32.
	        dtype = "auto" if has_cuda else torch.float32

	        # Arguments shared by the first attempt and the fallback load.
	        load_kwargs = {
	            "device_map": "auto" if has_cuda else None,
	            "trust_remote_code": TRUST_REMOTE_CODE,
	            "low_cpu_mem_usage": True,
	        }

	        try:
	            self.model = AutoModelForCausalLM.from_pretrained(
	                MODEL_ID,
	                torch_dtype=dtype,
	                **load_kwargs,
	            )
	        except Exception as e:
	            print(f"[handler] Error loading with MXFP4/auto dtype: {e}")
	            print("[handler] Falling back to bfloat16...")
	            # Retry with an explicit dtype (bfloat16 on GPU, float32 on CPU).
	            self.model = AutoModelForCausalLM.from_pretrained(
	                MODEL_ID,
	                torch_dtype=torch.bfloat16 if has_cuda else torch.float32,
	                **load_kwargs,
	            )

	        # Batches are padded in _encode, so a pad token must exist.
	        if self.tokenizer.pad_token is None:
	            self.tokenizer.pad_token = self.tokenizer.eos_token

	        # Decoder-only generation continues from the last position of each
	        # row, so padding has to go on the left; __call__ also relies on
	        # this when it strips the (padded) prompt from every output row.
	        self.tokenizer.padding_side = "left"

	        # Disable caching on CPU
	        if not has_cuda:
	            self.model.config.use_cache = False

	        self.model.eval()

	        print("[handler] ✓ Model loaded successfully")
	        print(f"[handler] Device: {self.model.device}")
	        print(f"[handler] Dtype: {self.model.dtype}")
	        print(f"[handler] Model architecture: {self.model.config.architectures}")

	    def _encode(self, texts: List[str]) -> Dict[str, Any]:
	        """Wrap each text in the task prompt and tokenize the batch.

	        Args:
	            texts: Raw input texts.

	        Returns:
	            The tokenizer's output tensors, moved to the model's device.
	            Prompts are padded to a common length and truncated to 2048
	            tokens.
	        """
	        prompts = [f"{PROMPT_PREFIX}{t}{PROMPT_SUFFIX}" for t in texts]
	        toks = self.tokenizer(
	            prompts,
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	            max_length=2048,  # Prevent overly long inputs
	        )
	        return {k: v.to(self.model.device) for k, v in toks.items()}

	    @torch.inference_mode()
	    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
	        """Process an inference request.

	        Expected input format::

	            {"inputs": "pre-reform text"}
	            {"inputs": ["text 1", "text 2"]}

	        Args:
	            data: Request payload; must contain an ``"inputs"`` field.

	        Returns:
	            One ``{"generated_text": ...}`` dict per input text, in input
	            order. On a validation or generation failure, a single-element
	            list ``[{"error": ...}]`` is returned instead of raising.
	        """
	        if "inputs" not in data:
	            return [{"error": "missing 'inputs' field"}]

	        texts = _as_list(data["inputs"])

	        if not texts or all(not t.strip() for t in texts):
	            return [{"error": "empty input text"}]

	        try:
	            inputs = self._encode(texts)
	            # Both dicts must be unpacked: generate() takes the encoded
	            # tensors and the generation settings as keyword arguments.
	            outputs = self.model.generate(**inputs, **GEN_KW)

	            # Every row of the padded batch has the same prompt length, so
	            # the slice point is shared by all output sequences.
	            prompt_len = inputs["input_ids"].shape[-1]

	            results: List[Dict[str, str]] = []
	            for seq in outputs:
	                # Remove input tokens from output
	                text = self.tokenizer.decode(
	                    seq[prompt_len:], skip_special_tokens=True
	                ).strip()
	                results.append({"generated_text": text})

	            return results

	        except Exception as e:
	            # Top-level request boundary: log the traceback and report the
	            # failure to the caller instead of raising.
	            print(f"[handler] Error during generation: {e}")
	            import traceback

	            traceback.print_exc()
	            return [{"error": f"generation failed: {str(e)}"}]