Text Generation
Transformers
Safetensors
English
code
helion-osc
mathematics
reasoning
algorithm
causal-lm
conversational
bitsandbytes
Instructions to use DeepXR/Helion-OSC with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-OSC with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-OSC")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-OSC", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DeepXR/Helion-OSC with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-OSC"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-OSC",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
Use Docker
docker model run hf.co/DeepXR/Helion-OSC
- SGLang
How to use DeepXR/Helion-OSC with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-OSC" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-OSC",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeepXR/Helion-OSC" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-OSC",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
- Docker Model Runner
How to use DeepXR/Helion-OSC with Docker Model Runner:
docker model run hf.co/DeepXR/Helion-OSC
| """ | |
| Helion-OSC Inference Script | |
| DeepXR/Helion-OSC - Mathematical Coding Language Model | |
| This module provides comprehensive inference capabilities for the Helion-OSC model, | |
| including specialized methods for different programming and mathematical tasks. | |
| """ | |
| import torch | |
| import json | |
| import logging | |
| from typing import Optional, Dict, Any, List, Union | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForCausalLM, | |
| GenerationConfig, | |
| StoppingCriteria, | |
| StoppingCriteriaList | |
| ) | |
| from dataclasses import dataclass | |
| import warnings | |
# Root logging setup: INFO and above, each record prefixed with its
# timestamp, logger name and severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
@dataclass
class GenerationParameters:
    """Bundle of decoding settings used for one generation call.

    The class is a dataclass so that it can be instantiated with keyword
    overrides, e.g. ``GenerationParameters(max_length=4096, do_sample=False)``;
    without the decorator the annotated defaults below would be plain class
    attributes and any keyword argument would raise ``TypeError``.

    Each field is forwarded under the same name to ``model.generate``.
    """

    # Value passed as ``max_length`` to ``model.generate``.
    max_length: int = 2048
    # Sampling temperature.
    temperature: float = 0.7
    # Nucleus-sampling probability mass.
    top_p: float = 0.95
    # Top-k sampling cutoff.
    top_k: int = 50
    # Penalty applied to repeated tokens (1.0 disables it).
    repetition_penalty: float = 1.05
    # Length penalty forwarded to ``model.generate``.
    length_penalty: float = 1.0
    # Sample from the distribution (True) or decode deterministically (False).
    do_sample: bool = True
    # Number of sequences requested per prompt.
    num_return_sequences: int = 1
    # Early-stopping flag forwarded to ``model.generate``.
    early_stopping: bool = False
class CodeStoppingCriteria(StoppingCriteria):
    """Stop generation once a stop sequence shows up in the generated text.

    Only the tokens produced *after* the prompt are inspected. Scanning the
    prompt as well would end generation immediately whenever the prompt
    itself contains a stop sequence (for instance a blank line inside a code
    snippet that is being completed).

    Args:
        stop_sequences: Strings that terminate generation when they appear in
            the decoded continuation. Empty strings are ignored because they
            would match any text.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=True)``.
        prompt_length: Number of leading tokens that belong to the prompt.
            When omitted it is inferred on the first call (see ``__call__``);
            in that case use a fresh instance for every generation call.
    """

    def __init__(
        self,
        stop_sequences: List[str],
        tokenizer,
        prompt_length: Optional[int] = None,
    ):
        self.stop_sequences = [seq for seq in stop_sequences if seq]
        self.tokenizer = tokenizer
        self.prompt_length = prompt_length

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the first sequence's continuation contains a stop string.

        NOTE: only row 0 of ``input_ids`` is examined, so in a batch the
        first prompt decides when generation stops for every row.
        """
        first_row = input_ids[0]
        if self.prompt_length is None:
            # NOTE(review): assumes the generation loop first invokes the
            # criteria right after appending the first new token, so
            # everything except the last position is prompt - confirm against
            # the installed transformers version.
            self.prompt_length = max(len(first_row) - 1, 0)
        decoded = self.tokenizer.decode(
            first_row[self.prompt_length:], skip_special_tokens=True
        )
        return any(seq in decoded for seq in self.stop_sequences)
class HelionOSCInference:
    """
    Inference wrapper around the Helion-OSC causal language model.

    Loads the tokenizer and model once and exposes task-oriented helpers
    built on top of :meth:`generate`:
    - Code generation
    - Mathematical reasoning
    - Algorithm design
    - Code debugging
    - Code completion
    - Batched generation
    """

    # GenerationParameters fields read by generate(); these are also the only
    # names accepted as per-call overrides through **kwargs.
    _GENERATION_FIELDS = (
        "max_length",
        "temperature",
        "top_p",
        "top_k",
        "repetition_penalty",
        "length_penalty",
        "do_sample",
        "num_return_sequences",
        "early_stopping",
    )
    # Fields that are only forwarded to the model when sampling is enabled.
    _SAMPLING_FIELDS = ("temperature", "top_p", "top_k")

    def __init__(
        self,
        model_name: str = "DeepXR/Helion-OSC",
        device: Optional[str] = None,
        load_in_8bit: bool = False,
        load_in_4bit: bool = False,
        use_flash_attention: bool = True,
        trust_remote_code: bool = True
    ):
        """
        Load tokenizer, model and per-task generation presets.

        Args:
            model_name: HuggingFace model identifier
            device: Device to load model on (cuda/cpu/mps); auto-detected
                when omitted
            load_in_8bit: Load model in 8-bit precision (takes precedence
                over ``load_in_4bit`` when both are set)
            load_in_4bit: Load model in 4-bit precision
            use_flash_attention: Accepted for interface compatibility and
                handed to ``_load_model``, which currently does not act on it
            trust_remote_code: Trust remote code from model repository
        """
        self.model_name = model_name
        self.device = self._get_device(device)
        self.load_in_8bit = load_in_8bit
        self.load_in_4bit = load_in_4bit

        logger.info(f"Initializing Helion-OSC on {self.device}...")

        self.tokenizer = self._load_tokenizer(trust_remote_code)
        self.model = self._load_model(
            use_flash_attention=use_flash_attention,
            trust_remote_code=trust_remote_code
        )
        self.generation_configs = self._load_generation_configs()

        logger.info("Model loaded successfully!")
        self._print_model_info()

    def _get_device(self, device: Optional[str]) -> str:
        """Return ``device`` if given, else the first available of cuda, mps, cpu."""
        if device:
            return device
        if torch.cuda.is_available():
            return "cuda"
        # torch builds without the MPS backend do not expose torch.backends.mps.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        return "cpu"

    def _load_tokenizer(self, trust_remote_code: bool):
        """Load the tokenizer with left padding and a guaranteed pad token."""
        logger.info("Loading tokenizer...")
        # Left padding keeps every prompt flush against the generated tokens,
        # which generate() relies on when it cuts the prompt off by length.
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=trust_remote_code,
            padding_side="left"
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    def _load_model(self, use_flash_attention: bool, trust_remote_code: bool):
        """
        Load the model in the requested precision and put it in eval mode.

        Args:
            use_flash_attention: Currently unused; kept so the signature
                stays stable for callers
            trust_remote_code: Forwarded to ``from_pretrained``

        Returns:
            The loaded model, in evaluation mode
        """
        logger.info("Loading model...")
        quantized = self.load_in_8bit or self.load_in_4bit
        on_cuda = self.device == "cuda"

        model_kwargs = {
            "trust_remote_code": trust_remote_code,
            "low_cpu_mem_usage": True
        }

        if quantized:
            # Imported lazily: only the quantized paths need it.
            from transformers import BitsAndBytesConfig

            if self.load_in_8bit:
                if self.load_in_4bit:
                    logger.warning(
                        "Both load_in_8bit and load_in_4bit are set; using 8-bit"
                    )
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True
                )
                logger.info("Loading in 8-bit precision")
            else:
                # The bnb_4bit_* options belong to the quantization config,
                # not to from_pretrained itself.
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                )
                logger.info("Loading in 4-bit precision")
        else:
            model_kwargs["torch_dtype"] = torch.bfloat16 if on_cuda else torch.float32
            if on_cuda:
                model_kwargs["device_map"] = "auto"

        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            **model_kwargs
        )

        # Unquantized CUDA loads are already placed through device_map, and
        # quantized models are not moved by hand; everything else is moved
        # to the selected device explicitly.
        if not on_cuda and not quantized:
            model = model.to(self.device)

        # Inference only: gradient checkpointing is a training-time memory
        # trade-off, so it is intentionally not enabled here.
        model.eval()
        return model

    def _load_generation_configs(self) -> Dict[str, "GenerationParameters"]:
        """Return the built-in generation presets keyed by task type."""
        return {
            "code_generation": GenerationParameters(
                max_length=4096,
                temperature=0.7,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.05,
                do_sample=True
            ),
            "mathematical_reasoning": GenerationParameters(
                max_length=2048,
                temperature=0.3,
                top_p=0.9,
                top_k=40,
                repetition_penalty=1.0,
                do_sample=False
            ),
            "code_completion": GenerationParameters(
                max_length=1024,
                temperature=0.6,
                top_p=0.92,
                top_k=45,
                repetition_penalty=1.03,
                do_sample=True
            ),
            "algorithm_design": GenerationParameters(
                max_length=3072,
                temperature=0.5,
                top_p=0.93,
                top_k=50,
                repetition_penalty=1.08,
                do_sample=True
            ),
            "debugging": GenerationParameters(
                max_length=2048,
                temperature=0.4,
                top_p=0.88,
                repetition_penalty=1.0,
                do_sample=False
            )
        }

    def _print_model_info(self):
        """Log parameter count, dtype and device; failures are logged, not raised."""
        try:
            num_params = sum(p.numel() for p in self.model.parameters())
            logger.info(f"Model parameters: {num_params:,}")
            logger.info(f"Model dtype: {next(self.model.parameters()).dtype}")
            logger.info(f"Device: {self.device}")
        except Exception as e:
            # Purely informational, so never let it abort initialization.
            logger.warning(f"Could not get model info: {e}")

    def generate(
        self,
        prompt: Union[str, List[str]],
        task_type: str = "code_generation",
        custom_params: Optional["GenerationParameters"] = None,
        stop_sequences: Optional[List[str]] = None,
        return_full_text: bool = False,
        **kwargs
    ) -> Union[str, List[str]]:
        """
        Generate text for one prompt or a list of prompts.

        Args:
            prompt: Input prompt or list of prompts
            task_type: Key into the task presets (code_generation,
                mathematical_reasoning, ...); unknown keys fall back to
                default parameters with a warning
            custom_params: Parameters used instead of the task preset
            stop_sequences: Strings that end generation when produced
            return_full_text: Return prompt plus continuation instead of
                only the continuation
            **kwargs: Per-call overrides for any generation parameter field
                (e.g. ``max_length``, ``temperature``); other names are
                ignored with a warning

        Returns:
            A single string for a single prompt with one return sequence,
            otherwise a list of strings
        """
        if custom_params is not None:
            params = custom_params
        elif task_type in self.generation_configs:
            params = self.generation_configs[task_type]
        else:
            logger.warning(f"Unknown task type '{task_type}', using default parameters")
            params = GenerationParameters()

        # Work on a per-call copy of the values. Writing overrides back onto
        # `params` would permanently alter the shared task preset (or the
        # caller's custom_params object) for every later call.
        settings = {name: getattr(params, name) for name in self._GENERATION_FIELDS}
        ignored = sorted(key for key in kwargs if key not in settings)
        if ignored:
            logger.warning(
                "Ignoring unsupported generation arguments: %s", ", ".join(ignored)
            )
        settings.update(
            {key: value for key, value in kwargs.items() if key in settings}
        )

        is_batch = isinstance(prompt, list)
        if is_batch and not prompt:
            return []

        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.model.config.max_position_embeddings
        ).to(self.device)
        # Every row has this many prompt positions (padding included).
        prompt_token_count = inputs["input_ids"].shape[1]

        generate_kwargs = {
            name: settings[name]
            for name in self._GENERATION_FIELDS
            if name not in self._SAMPLING_FIELDS
        }
        if settings["do_sample"]:
            # Sampling knobs have no effect on deterministic decoding, so
            # they are only forwarded when sampling is on.
            generate_kwargs.update(
                {name: settings[name] for name in self._SAMPLING_FIELDS}
            )
        generate_kwargs["pad_token_id"] = self.tokenizer.pad_token_id
        generate_kwargs["eos_token_id"] = self.tokenizer.eos_token_id
        if stop_sequences:
            generate_kwargs["stopping_criteria"] = StoppingCriteriaList([
                CodeStoppingCriteria(stop_sequences, self.tokenizer)
            ])

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generate_kwargs)

        if not return_full_text:
            # Drop the prompt by token count rather than by character count:
            # decoded text does not have to reproduce the prompt string
            # exactly, and this works for batches as well.
            outputs = outputs[:, prompt_token_count:]

        generated_texts = [
            self.tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in outputs
        ]
        if not return_full_text:
            generated_texts = [text.strip() for text in generated_texts]

        if is_batch or settings["num_return_sequences"] > 1:
            return generated_texts
        return generated_texts[0]

    def code_generation(
        self,
        prompt: str,
        language: Optional[str] = None,
        max_length: int = 4096,
        **kwargs
    ) -> str:
        """
        Generate code for a given prompt.

        Args:
            prompt: Code generation prompt
            language: Programming language; when given, a
                ``Language: <language>`` line is put in front of the prompt
            max_length: Maximum length forwarded to generation
            **kwargs: Additional generation parameters

        Returns:
            Generated code
        """
        if language:
            prompt = f"Language: {language}\n{prompt}"
        return self.generate(
            prompt,
            task_type="code_generation",
            max_length=max_length,
            **kwargs
        )

    def mathematical_reasoning(
        self,
        prompt: str,
        max_length: int = 2048,
        **kwargs
    ) -> str:
        """
        Solve a mathematical problem using the mathematical_reasoning preset.

        Args:
            prompt: Mathematical problem
            max_length: Maximum length forwarded to generation
            **kwargs: Additional generation parameters

        Returns:
            Mathematical solution with reasoning
        """
        return self.generate(
            prompt,
            task_type="mathematical_reasoning",
            max_length=max_length,
            **kwargs
        )

    def algorithm_design(
        self,
        prompt: str,
        include_complexity: bool = True,
        max_length: int = 3072,
        **kwargs
    ) -> str:
        """
        Design an algorithm, optionally asking for complexity analysis.

        Args:
            prompt: Algorithm design prompt
            include_complexity: Append a request for time and space
                complexity analysis to the prompt
            max_length: Maximum length forwarded to generation
            **kwargs: Additional generation parameters

        Returns:
            Algorithm design with analysis
        """
        if include_complexity:
            prompt += "\n\nPlease include time and space complexity analysis."
        return self.generate(
            prompt,
            task_type="algorithm_design",
            max_length=max_length,
            **kwargs
        )

    def debug_code(
        self,
        code: str,
        error_message: Optional[str] = None,
        max_length: int = 2048,
        **kwargs
    ) -> str:
        """
        Ask the model to debug a piece of code.

        Args:
            code: Code to debug; embedded in the prompt inside a fenced block
            error_message: Optional error message appended to the prompt
            max_length: Maximum length forwarded to generation
            **kwargs: Additional generation parameters

        Returns:
            Debugging analysis and fixes
        """
        prompt = f"Debug the following code:\n\n```\n{code}\n```"
        if error_message:
            prompt += f"\n\nError message: {error_message}"
        prompt += "\n\nProvide a detailed explanation and fixed code."
        return self.generate(
            prompt,
            task_type="debugging",
            max_length=max_length,
            **kwargs
        )

    def complete_code(
        self,
        code_context: str,
        max_length: int = 1024,
        **kwargs
    ) -> str:
        """
        Complete partial code.

        Args:
            code_context: Partial code to complete
            max_length: Maximum length forwarded to generation
            **kwargs: Additional generation parameters; ``stop_sequences``
                may be supplied to replace the default stop strings

        Returns:
            Code completion
        """
        # setdefault instead of a hard-coded keyword so that a caller-provided
        # stop_sequences does not collide with the default.
        kwargs.setdefault("stop_sequences", ["\n\n", "```", "###"])
        return self.generate(
            code_context,
            task_type="code_completion",
            max_length=max_length,
            **kwargs
        )

    def batch_generate(
        self,
        prompts: List[str],
        task_type: str = "code_generation",
        batch_size: int = 4,
        **kwargs
    ) -> List[str]:
        """
        Generate responses for multiple prompts, ``batch_size`` at a time.

        Args:
            prompts: List of prompts
            task_type: Type of task
            batch_size: Number of prompts per generate() call; must be >= 1
            **kwargs: Additional generation parameters

        Returns:
            List of generated responses in prompt order

        Raises:
            ValueError: If ``batch_size`` is smaller than 1
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        results = []
        for start in range(0, len(prompts), batch_size):
            chunk_results = self.generate(
                prompts[start:start + batch_size],
                task_type=task_type,
                **kwargs
            )
            if isinstance(chunk_results, str):
                chunk_results = [chunk_results]
            results.extend(chunk_results)
        return results
def main():
    """Run the demo: load the model and print one example per task helper.

    Loads ``HelionOSCInference`` with default (unquantized) settings and
    prints the prompt and model output for code generation, mathematical
    reasoning, algorithm design, debugging and batched generation.
    """
    rule = "=" * 80

    def section(title: str) -> None:
        # Banner printed in front of every numbered example.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("Helion-OSC Inference Examples")
    print(rule)

    # Initialize model
    helion = HelionOSCInference(
        load_in_8bit=False,  # Set to True for lower memory usage
        load_in_4bit=False   # Set to True for even lower memory usage
    )

    # Example 1: Code Generation
    section("Example 1: Code Generation")
    code_prompt = """Write a Python function to implement a binary search tree with the following methods:
- insert(value): Insert a new value
- search(value): Search for a value
- delete(value): Delete a value
- inorder_traversal(): Return inorder traversal
Include proper documentation and type hints."""
    print(f"\nPrompt:\n{code_prompt}")
    print("\nGenerating...")
    result = helion.code_generation(code_prompt, language="python")
    print(f"\nGenerated Code:\n{result}")

    # Example 2: Mathematical Reasoning
    section("Example 2: Mathematical Reasoning")
    math_prompt = """Prove that the sum of the first n natural numbers equals n(n+1)/2 using mathematical induction."""
    print(f"\nPrompt:\n{math_prompt}")
    print("\nGenerating...")
    result = helion.mathematical_reasoning(math_prompt)
    print(f"\nSolution:\n{result}")

    # Example 3: Algorithm Design
    section("Example 3: Algorithm Design")
    algo_prompt = """Design an efficient algorithm to find the longest palindromic substring in a given string."""
    print(f"\nPrompt:\n{algo_prompt}")
    print("\nGenerating...")
    result = helion.algorithm_design(algo_prompt, include_complexity=True)
    print(f"\nAlgorithm:\n{result}")

    # Example 4: Code Debugging
    section("Example 4: Code Debugging")
    # The sample must be valid, properly indented Python: the point of the
    # demo is its exponential running time, not a syntax error.
    buggy_code = """
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

# This is too slow for large n
result = fibonacci(100)
"""
    print(f"\nBuggy Code:\n{buggy_code}")
    print("\nGenerating debugging analysis...")
    result = helion.debug_code(buggy_code, error_message="Takes too long to compute")
    print(f"\nDebug Analysis:\n{result}")

    # Example 5: Batch Processing
    section("Example 5: Batch Code Generation")
    batch_prompts = [
        "Write a Python function to reverse a linked list",
        "Write a JavaScript function to debounce API calls",
        "Write a Rust function to parse JSON safely"
    ]
    print("\nProcessing batch prompts...")
    results = helion.batch_generate(batch_prompts, batch_size=2)
    for i, (prompt, result) in enumerate(zip(batch_prompts, results), 1):
        print(f"\nPrompt {i}: {prompt}")
        print(f"Result {i}:\n{result}\n")

    print(rule)
    print("Examples completed!")
    print(rule)


if __name__ == "__main__":
    main()