---
model:
  name: "DeepXR/Helion-2.5-Rnd"
  version: "2.5.0-research"
  type: "transformer"
  architecture: "llama"
  # NOTE(review): description says "multimodal" but the capabilities list is text-only — confirm.
  description: "Helion-2.5 Research & Development - Advanced multimodal language model"

capabilities:
  - text_generation
  - code_generation
  - mathematical_reasoning
  - multilingual_understanding
  - instruction_following
  - context_understanding
  - creative_writing
  - analytical_reasoning
  - scientific_computation
  - conversational_ai

model_parameters:
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  intermediate_size: 14336
  vocab_size: 128256
  max_position_embeddings: 131072
  rope_theta: 500000.0
  rope_scaling:
    type: "yarn"
    factor: 8.0
    original_max_position_embeddings: 16384
  attention_bias: false
  attention_dropout: 0.0
  mlp_bias: false

tokenizer:
  # NOTE(review): Llama-3-family tokenizers (vocab 128256) are tiktoken-style BPE, not
  # sentencepiece — confirm this value against the actual tokenizer files.
  type: "sentencepiece"
  model_max_length: 131072
  padding_side: "right"
  truncation_side: "right"
  chat_template: "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}{% endfor %}{{ '<|im_start|>assistant\n' }}"

training:
  # NOTE(review): a 70B base conflicts with model_parameters (4096 hidden / 32 layers is
  # 8B-class) — confirm which is correct.
  base_model: "meta-llama/Meta-Llama-3.1-70B"
  training_data:
    - "scientific_papers"
    - "code_repositories"
    - "mathematical_proofs"
    - "conversational_data"
    - "multilingual_corpus"
    - "technical_documentation"
  total_tokens: "2.5T"
  training_steps: 150000
  warmup_steps: 2000
  learning_rate: 2.0e-5
  weight_decay: 0.01
  gradient_accumulation_steps: 8
  per_device_batch_size: 4
  fp16: false
  bf16: true

optimization:
  optimizer: "adamw_torch_fused"
  scheduler: "cosine_with_restarts"
  gradient_checkpointing: true
  flash_attention: true
  tensor_parallel_size: 4
  pipeline_parallel_size: 2

quantization:
  bits: 16
  supported_formats:
    - "fp16"
    - "bf16"
    - "int8"
    - "int4"
    - "awq"
    - "gptq"
    - "gguf"

inference:
  default_parameters:
    temperature: 0.7
    top_p: 0.9
    top_k: 50
    repetition_penalty: 1.1
    max_new_tokens: 4096
    do_sample: true
    num_beams: 1

  generation_config:
    pad_token_id: 128001
    bos_token_id: 128000
    # NOTE(review): 128009 is <|eot_id|> in the Llama-3 vocab, while special_tokens.eos_token
    # is "<|end_of_text|>" (id 128001) — confirm which end-of-turn token is intended.
    eos_token_id: 128009
    use_cache: true
    output_attentions: false
    output_hidden_states: false
    return_dict_in_generate: true

  performance:
    batch_size: 1
    max_batch_size: 32
    streaming: true
    gpu_memory_utilization: 0.95
    tensor_parallel: true

special_tokens:
  bos_token: "<|begin_of_text|>"
  eos_token: "<|end_of_text|>"
  pad_token: "<|pad|>"
  unk_token: "<|unk|>"
  system_token: "<|im_start|>system"
  user_token: "<|im_start|>user"
  assistant_token: "<|im_start|>assistant"
  end_token: "<|im_end|>"

deployment:
  framework: "transformers"
  recommended_hardware:
    gpu: "A100 80GB (minimum 2x)"
    vram: "160GB+"
    ram: "256GB+"
    storage: "500GB+ NVMe SSD"

  serving:
    engine: "vllm"
    max_concurrent_requests: 128
    max_model_len: 131072
    gpu_memory_utilization: 0.9
    swap_space: 16

  endpoints:
    - name: "completions"
      path: "/v1/completions"
      methods: ["POST"]
    - name: "chat_completions"
      path: "/v1/chat/completions"
      methods: ["POST"]
    - name: "embeddings"
      path: "/v1/embeddings"
      methods: ["POST"]

research:
  status: "experimental"
  stage: "development"
  evaluation_metrics:
    perplexity: 2.34
    accuracy_mmlu: 0.847
    accuracy_gsm8k: 0.892
    accuracy_humaneval: 0.756
    accuracy_mbpp: 0.723

  benchmarks:
    reasoning:
      arc_challenge: 0.834
      hellaswag: 0.889
      winogrande: 0.823
    code:
      humaneval: 0.756
      mbpp: 0.723
      ds1000: 0.645
    mathematics:
      gsm8k: 0.892
      math: 0.567
      minerva: 0.534
    knowledge:
      mmlu: 0.847
      truthfulqa: 0.612

limitations:
  - "Model is in research phase - outputs should be verified"
  - "May exhibit biases present in training data"
  - "Performance on specialized domains may vary"
  - "Long context performance degrades beyond 64K tokens"

license: "Apache-2.0"
citation: |
  @misc{helion-2.5-rnd,
    title={Helion-2.5-Rnd: Advanced Research Language Model},
    author={DeepXR Team},
    year={2025},
    publisher={DeepXR},
    url={https://huggingface.co/DeepXR/Helion-2.5-Rnd}
  }

safety:
  content_filtering: true
  toxicity_threshold: 0.5
  pii_detection: true
  prompt_injection_protection: true

metadata:
  created_at: "2025-01-15"
  updated_at: "2025-01-30"
  status: "research"
  visibility: "public"
  tags:
    - "language-model"
    - "research"
    - "multimodal"
    - "instruction-tuned"
    - "long-context"