Text Generation
Transformers
Safetensors
nemotron
reasoning
dual-mode
thinking
tool-calling
agentic
multilingual
conversational
Instructions to use domyn/Domyn-Small-v1.0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use domyn/Domyn-Small-v1.0 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="domyn/Domyn-Small-v1.0")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("domyn/Domyn-Small-v1.0")
model = AutoModelForCausalLM.from_pretrained("domyn/Domyn-Small-v1.0")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use domyn/Domyn-Small-v1.0 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "domyn/Domyn-Small-v1.0"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "domyn/Domyn-Small-v1.0",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/domyn/Domyn-Small-v1.0
- SGLang
How to use domyn/Domyn-Small-v1.0 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "domyn/Domyn-Small-v1.0" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "domyn/Domyn-Small-v1.0",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "domyn/Domyn-Small-v1.0" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "domyn/Domyn-Small-v1.0",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use domyn/Domyn-Small-v1.0 with Docker Model Runner:
docker model run hf.co/domyn/Domyn-Small-v1.0
| """ | |
| Custom vLLM tool parser plugin for models that use <tool_call> XML tags. | |
| The model outputs tool calls in this format: | |
| <tool_call> | |
| {"name": "function_name", "arguments": {"arg1": "val1"}} | |
| </tool_call> | |
| Multiple tool calls can appear in a single response (parallel tool calling). | |
| Usage: | |
| vllm serve <model> \ | |
| --enable-auto-tool-choice \ | |
| --tool-parser-plugin /absolute/path/to/tool_parser_plugin.py \ | |
| --tool-call-parser xml_tool_call \ | |
| --chat-template /absolute/path/to/tool_chat_template.jinja | |
| """ | |
| import ast | |
| import json | |
| import re | |
| import uuid | |
| from typing import Sequence, Union | |
| # --------------------------------------------------------------------------- | |
| # Import compatibility: newer vLLM releases expose the tool parsers under | |
| # vllm.tool_parsers and split the OpenAI protocol module into sub-packages; | |
| # older versions keep both under vllm.entrypoints.openai. | |
| # --------------------------------------------------------------------------- | |
| try: | |
| # Newer vLLM, roughly 0.15+ | |
| from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest | |
| from vllm.entrypoints.openai.engine.protocol import ( | |
| DeltaFunctionCall, | |
| DeltaMessage, | |
| DeltaToolCall, | |
| ExtractedToolCallInformation, | |
| FunctionCall, | |
| ToolCall, | |
| ) | |
| except ImportError: | |
| # Older vLLM | |
| from vllm.entrypoints.openai.protocol import ( | |
| ChatCompletionRequest, | |
| DeltaFunctionCall, | |
| DeltaMessage, | |
| DeltaToolCall, | |
| ExtractedToolCallInformation, | |
| FunctionCall, | |
| ToolCall, | |
| ) | |
| try: | |
| from vllm.tool_parsers.abstract_tool_parser import ToolParser, ToolParserManager | |
| except ImportError: | |
| from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( | |
| ToolParser, | |
| ToolParserManager, | |
| ) | |
| from vllm.logger import init_logger | |
| logger = init_logger(__name__) | |
| def _generate_tool_call_id() -> str: | |
| """Generate a unique tool-call ID in the format expected by OpenAI.""" | |
| return f"call_{uuid.uuid4().hex[:24]}" | |
| # --------------------------------------------------------------------------- | |
| # Register the parser so it can be referenced via --tool-call-parser | |
| # --------------------------------------------------------------------------- | |
# Register under the name passed to ``--tool-call-parser xml_tool_call``.
@ToolParserManager.register_module("xml_tool_call")
class XMLToolCallParser(ToolParser):
    """
    Parse tool calls wrapped in <tool_call>...</tool_call> XML tags.

    Each block is expected to hold one JSON object of the form
    ``{"name": "...", "arguments": {...}}``. Several blocks may appear in
    one response (parallel tool calling). Both non-streaming and streaming
    extraction are supported.
    """

    # Matches one complete <tool_call>...</tool_call> block; group 1 is the
    # payload with surrounding whitespace trimmed.
    TOOL_CALL_RE = re.compile(
        r"<tool_call>\s*(.*?)\s*</tool_call>",
        re.DOTALL,
    )
    # Also matches an incomplete (still-streaming) block. Not used by the
    # methods below; kept so the class attributes stay unchanged.
    TOOL_CALL_OPEN_RE = re.compile(
        r"<tool_call>\s*(.*?)(?:</tool_call>|$)",
        re.DOTALL,
    )
    TOOL_CALL_START = "<tool_call>"
    TOOL_CALL_END = "</tool_call>"

    def __init__(self, tokenizer, tools=None):
        """Initialise the base parser and reset the streaming bookkeeping.

        Args:
            tokenizer: Tokenizer forwarded to ``ToolParser.__init__``.
            tools: Optional tool definitions; stored as ``self.tools``
                (an empty list when falsy).
        """
        # vLLM newer versions: ToolParser.__init__(tokenizer, tools)
        # vLLM older versions: ToolParser.__init__(tokenizer)
        try:
            super().__init__(tokenizer, tools)
        except TypeError:
            super().__init__(tokenizer)
        self.tools = tools or []
        # ---- streaming state ----
        # Index of the most recently emitted tool call (-1 = none yet).
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        # Parsed payload and emitted argument string of every tool call
        # streamed so far, in emission order.
        # NOTE(review): presumably also read by the vLLM serving layer —
        # confirm against the vLLM version in use.
        self.prev_tool_call_arr: list[dict] = []
        self.streamed_args_for_tool: list[str] = []

    # ------------------------------------------------------------------
    # Payload helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_tool_json(raw: str) -> dict | None:
        """Decode one tool-call payload into a dict.

        Standard JSON is tried first; Python-literal syntax (for example
        single-quoted strings) is accepted as a fallback through
        ``ast.literal_eval``.

        Returns:
            The decoded dict, or ``None`` when the payload cannot be decoded
            or does not decode to a dict (callers rely on ``.get``).
        """
        try:
            decoded = json.loads(raw)
        except (json.JSONDecodeError, ValueError, RecursionError):
            try:
                decoded = ast.literal_eval(raw)
            except (
                ValueError,
                TypeError,
                SyntaxError,
                MemoryError,
                RecursionError,
            ):
                return None
        return decoded if isinstance(decoded, dict) else None

    @staticmethod
    def _arguments_to_json(fn_args) -> str:
        """Return ``fn_args`` as the JSON string used for ``arguments``.

        * ``None`` becomes ``"{}"``.
        * A string that is already valid JSON is passed through unchanged.
        * A string holding a Python-literal dict is re-encoded as JSON; any
          other undecodable string becomes ``"{}"``.
        * Every other value is JSON-encoded; values that cannot be encoded
          become ``"{}"`` so the result is always parseable JSON.
        """
        if fn_args is None:
            return "{}"
        if isinstance(fn_args, str):
            try:
                json.loads(fn_args)
            except (json.JSONDecodeError, ValueError, RecursionError):
                pass
            else:
                return fn_args
            try:
                fn_args = ast.literal_eval(fn_args)
            except (
                ValueError,
                TypeError,
                SyntaxError,
                MemoryError,
                RecursionError,
            ):
                return "{}"
            if not isinstance(fn_args, dict):
                return "{}"
        try:
            # ensure_ascii=False keeps non-ASCII argument text as-is rather
            # than \u-escaping it.
            return json.dumps(fn_args, ensure_ascii=False)
        except (TypeError, ValueError):
            # e.g. sets or tuple keys coming out of ast.literal_eval
            return "{}"

    def _safe_content_end(self, text: str) -> int:
        """Return how many leading characters of ``text`` are plain content.

        That is the offset of the first ``<tool_call>`` tag when one is
        present. Otherwise it is the text length minus any trailing fragment
        that could still grow into the start tag (for example ``"<tool"``).
        """
        tag_at = text.find(self.TOOL_CALL_START)
        if tag_at != -1:
            return tag_at
        start_tag = self.TOOL_CALL_START
        for size in range(min(len(start_tag) - 1, len(text)), 0, -1):
            if text.endswith(start_tag[:size]):
                return len(text) - size
        return len(text)

    # ------------------------------------------------------------------
    # Optional: adjust the request before inference
    # ------------------------------------------------------------------
    def adjust_request(
        self, request: ChatCompletionRequest
    ) -> ChatCompletionRequest:
        """Return ``request`` unchanged; this parser needs no adjustments."""
        return request

    # ------------------------------------------------------------------
    # NON-STREAMING extraction
    # ------------------------------------------------------------------
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Parse all <tool_call>...</tool_call> blocks from the full model
        output and convert them to OpenAI ToolCall objects.

        Blocks whose payload cannot be decoded are logged and skipped. When
        no block yields a tool call, the output is returned untouched as
        content with ``tools_called=False``. Otherwise the content is the
        output with every complete block removed and whitespace stripped
        (``None`` when nothing is left).
        """
        tool_calls: list[ToolCall] = []
        for raw_json in self.TOOL_CALL_RE.findall(model_output):
            parsed = self._parse_tool_json(raw_json)
            if parsed is None:
                logger.warning(
                    "Failed to parse tool call JSON: %s", raw_json
                )
                continue
            tool_calls.append(
                ToolCall(
                    id=_generate_tool_call_id(),
                    type="function",
                    function=FunctionCall(
                        name=parsed.get("name", ""),
                        arguments=self._arguments_to_json(
                            parsed.get("arguments", {})
                        ),
                    ),
                )
            )

        if not tool_calls:
            # Nothing usable was found — return the text as-is rather than
            # claiming tools were called with an empty list.
            return ExtractedToolCallInformation(
                tools_called=False,
                tool_calls=[],
                content=model_output,
            )

        # Strip tool-call blocks from content to get any surrounding text
        remaining_content = self.TOOL_CALL_RE.sub("", model_output).strip()
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=remaining_content or None,
        )

    # ------------------------------------------------------------------
    # STREAMING extraction
    # ------------------------------------------------------------------
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Incrementally parse tool calls from the streaming token output.

        Strategy:
        - Text before the first <tool_call> is streamed as regular content.
          A trailing fragment that may be the beginning of the start tag is
          held back and released on a later call if it turns out not to be
          a tag.
        - Once <tool_call> is seen, the block is buffered until its
          </tool_call> arrives; each completed block is emitted as one
          complete tool-call delta.
        - All blocks completed since ``previous_text`` are emitted together;
          undecodable blocks are logged and skipped.
        - Text after the first <tool_call> that is not inside a completed
          block is not streamed.

        This assumes ``previous_text`` is the ``current_text`` of the
        preceding call and a prefix of ``current_text``.

        Returns:
            A DeltaMessage carrying new content and/or tool calls, or
            ``None`` when there is nothing to emit for this step.
        """
        # Content that became safe to emit during this step.
        content_from = self._safe_content_end(previous_text)
        content_to = self._safe_content_end(current_text)
        pending_content = current_text[content_from:content_to]

        new_calls = []
        if self.TOOL_CALL_START in current_text:
            # Blocks already complete in previous_text were handled by
            # earlier calls, whether they decoded or not.
            handled = len(self.TOOL_CALL_RE.findall(previous_text))
            for new_raw in self.TOOL_CALL_RE.findall(current_text)[handled:]:
                parsed = self._parse_tool_json(new_raw)
                if parsed is None:
                    logger.warning(
                        "Streaming: failed to parse tool call JSON: %s",
                        new_raw,
                    )
                    continue
                fn_args_str = self._arguments_to_json(
                    parsed.get("arguments", {})
                )
                self.current_tool_id += 1
                self.prev_tool_call_arr.append(parsed)
                self.streamed_args_for_tool.append(fn_args_str)
                self.current_tool_name_sent = True
                new_calls.append(
                    DeltaToolCall(
                        index=self.current_tool_id,
                        id=_generate_tool_call_id(),
                        type="function",
                        function=DeltaFunctionCall(
                            name=parsed.get("name", ""),
                            arguments=fn_args_str,
                        ),
                    )
                )

        if new_calls and pending_content:
            return DeltaMessage(content=pending_content, tool_calls=new_calls)
        if new_calls:
            return DeltaMessage(tool_calls=new_calls)
        if pending_content:
            return DeltaMessage(content=pending_content)
        # Still buffering inside a block, holding back a possible start tag,
        # or past the tool calls: nothing to emit.
        return None