Text Generation · Transformers · Safetensors · Korean

Tags: hanforge, korean, causal-lm, pretraining, small-language-model, custom_code
Instructions for using drlee1/HanForge-base with libraries, inference providers, notebooks, and local apps. The sections below show how to get started.
- Libraries
- Transformers
How to use drlee1/HanForge-base with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="drlee1/HanForge-base", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("drlee1/HanForge-base", trust_remote_code=True, dtype="auto")
```
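With the pipeline constructed, generation is a single call. A minimal usage sketch — the Korean prompt ("Once upon a time,") and sampling settings are illustrative, not from the model card:

```python
# The base model does plain next-token continuation, so prompt it with
# the start of a passage.
outputs = pipe("옛날 옛적에,", max_new_tokens=64, do_sample=True, temperature=0.7)
print(outputs[0]["generated_text"])
```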
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use drlee1/HanForge-base with vLLM:
Install from pip and serve the model:
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "drlee1/HanForge-base"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "drlee1/HanForge-base",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
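Because the server speaks the OpenAI completions API, the same request can be issued from Python with the openai client. A minimal sketch — the api_key value is a placeholder, since a local server does not check it; note also that a repository tagged custom_code may need vLLM's --trust-remote-code flag at serve time:

```python
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="drlee1/HanForge-base",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```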
Use Docker:

```bash
docker model run hf.co/drlee1/HanForge-base
```
- SGLang
How to use drlee1/HanForge-base with SGLang:
Install from pip and serve the model:
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "drlee1/HanForge-base" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "drlee1/HanForge-base",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
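The curl call above targets SGLang's OpenAI-compatible route; the server also exposes a native /generate endpoint. A minimal sketch using requests, with the payload shape taken from SGLang's native API (worth verifying against your installed version):

```python
import requests

# SGLang's native endpoint; sampling_params mirrors the fields used in
# the curl example above.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Once upon a time,",
        "sampling_params": {"max_new_tokens": 512, "temperature": 0.5},
    },
)
print(response.json()["text"])
```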
Use Docker images:

```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "drlee1/HanForge-base" \
    --host 0.0.0.0 \
    --port 30000

# Then call the server with the same curl command shown above.
```

- Docker Model Runner
How to use drlee1/HanForge-base with Docker Model Runner:
```bash
docker model run hf.co/drlee1/HanForge-base
```
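Once the model is running, Docker Model Runner also exposes an OpenAI-compatible API. A sketch under stated assumptions: the /engines/v1 path and port 12434 are Docker's documented defaults for TCP host access, but the exact endpoint depends on how Model Runner is enabled on your machine.

```python
from openai import OpenAI

# Assumed default host endpoint for Docker Model Runner's
# OpenAI-compatible API; adjust if your setup differs.
client = OpenAI(base_url="http://localhost:12434/engines/v1", api_key="not-needed")

completion = client.completions.create(
    model="hf.co/drlee1/HanForge-base",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```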
The custom_code tag refers to the repository's configuration class below, which Transformers loads when trust_remote_code=True is set:

```python
from __future__ import annotations

from transformers import PretrainedConfig


class HanForgeConfig(PretrainedConfig):
    model_type = "hanforge"

    # <<< disabled (refactor 20260423, §4.1 hybrid local/global attention unused)
    # Preserved design assets: sliding_window / global_layer_interval / is_global_layer.
    # This refactor uses full causal attention only.
    # sliding_window: int = 256
    # global_layer_interval: int = 4
    # def is_global_layer(self, layer_idx: int) -> bool:
    #     return layer_idx % self.global_layer_interval == 0
    # >>> end disabled

    # <<< disabled (refactor 20260423, §4.2 YaRN unused)
    # rope_scaling / original_max_position_embeddings were reserved for the YaRN extension.
    # Plain RoPE is sufficient for from-scratch 4k-context training.
    # original_max_position_embeddings: int = 4096
    # rope_scaling: dict | None = None
    # >>> end disabled

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 384,
        intermediate_size: int = 1024,
        num_hidden_layers: int = 8,
        num_attention_heads: int = 6,
        num_key_value_heads: int = 2,
        max_position_embeddings: int = 4096,
        rope_theta: float = 50_000.0,
        rms_norm_eps: float = 1e-6,
        hidden_dropout_prob: float = 0.0,
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        unk_token_id: int = 3,
        use_cache: bool = False,
        **kwargs,
    ):
        # Back-compat: ignore disabled fields that older scripts/checkpoints may still pass.
        kwargs.pop("sliding_window", None)
        kwargs.pop("global_layer_interval", None)
        kwargs.pop("original_max_position_embeddings", None)
        kwargs.pop("rope_scaling", None)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)

        # Sanity checks for the attention geometry (GQA: query heads are
        # divided evenly among the key/value heads).
        if hidden_size % num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError("num_attention_heads must be divisible by num_key_value_heads")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            unk_token_id=unk_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def head_dim(self) -> int:
        return self.hidden_size // self.num_attention_heads

    @property
    def num_key_value_groups(self) -> int:
        return self.num_attention_heads // self.num_key_value_heads
```
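A quick sketch of how the class might be exercised, assuming it lives in a configuration_hanforge.py module alongside the checkpoint (the import path is hypothetical). The derived values follow from the defaults above: head_dim = 384 // 6 = 64, and num_key_value_groups = 6 // 2 = 3, i.e. three query heads share each KV head.

```python
from configuration_hanforge import HanForgeConfig  # hypothetical module name

config = HanForgeConfig()
assert config.head_dim == 64              # 384 // 6
assert config.num_key_value_groups == 3   # 6 // 2

# Disabled fields passed by older scripts are dropped rather than stored.
legacy = HanForgeConfig(sliding_window=256, rope_scaling={"type": "yarn"})
assert not hasattr(legacy, "sliding_window")
assert not hasattr(legacy, "rope_scaling")

# Invalid attention geometry fails fast.
try:
    HanForgeConfig(num_attention_heads=6, num_key_value_heads=4)
except ValueError as err:
    print(err)  # num_attention_heads must be divisible by num_key_value_heads
```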