ehartford committed on
Commit
00718d5
·
verified ·
1 Parent(s): ac010c8

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/dflash_system.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: transformers
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - dflash
7
+ - speculative-decoding
8
+ - diffusion
9
+ - efficiency
10
+ - flash-decoding
11
+ - qwen
12
+ - diffusion-language-model
13
+ ---
14
+
15
+ # Qwen3.6-27B-DFlash
16
+ [**Paper**](https://arxiv.org/abs/2602.06036) | [**GitHub**](https://github.com/z-lab/dflash) | [**Blog**](https://z-lab.ai/projects/dflash/)
17
+
18
+ **This model is still being trained, and inference engine support may be incomplete because of architectural changes, including causal sliding-window attention (SWA) layers.**
19
+
20
+ **DFlash** is a novel speculative decoding method that utilizes a lightweight **block diffusion** model for drafting. It enables efficient, high-quality parallel drafting that pushes the limits of inference speed.
21
+
22
+ This model is the **drafter** component. It must be used in conjunction with the target model `Qwen/Qwen3.6-27B`.
23
+
24
+ <div align="center">
25
+ <img src="assets/dflash_system.png" alt="DFlash Architecture" width="100%">
26
+ </div>
27
+
28
+ ## Quick Start
29
+
30
+ ### Installation
31
+
32
+ vLLM (for now, install from vLLM PR #40898, which adds support for interleaved SWA and ensures correct handling of target hidden states for optimal performance):
33
+ ```bash
34
+ uv pip install vllm
35
+ uv pip install -U --torch-backend=auto "vllm @ git+https://github.com/vllm-project/vllm.git@refs/pull/40898/head"
36
+ ```
37
+
38
+ SGLang:
39
+ ```bash
40
+ uv pip install "git+https://github.com/sgl-project/sglang.git@refs/pull/23000/head#subdirectory=python"
41
+ ```
42
+
43
+ ### Launch Server
44
+
45
+ vLLM:
46
+ ```bash
47
+ vllm serve Qwen/Qwen3.6-27B \
48
+ --speculative-config '{"method": "dflash", "model": "z-lab/Qwen3.6-27B-DFlash", "num_speculative_tokens": 15}' \
49
+ --attention-backend flash_attn \
50
+ --max-num-batched-tokens 32768
51
+ ```
52
+
53
+ SGLang:
54
+ ```bash
55
+ # Optional: enable schedule overlapping (experimental, may not be stable)
56
+ # export SGLANG_ENABLE_SPEC_V2=1
57
+ # export SGLANG_ENABLE_DFLASH_SPEC_V2=1
58
+ # export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
59
+
60
+ python -m sglang.launch_server \
61
+ --model-path Qwen/Qwen3.6-27B \
62
+ --speculative-algorithm DFLASH \
63
+ --speculative-draft-model-path z-lab/Qwen3.6-27B-DFlash \
64
+ --speculative-num-draft-tokens 16 \
65
+ --tp-size 1 \
66
+ --attention-backend fa3 \
67
+ --mem-fraction-static 0.75 \
68
+ --mamba-scheduler-strategy extra_buffer \
69
+ --trust-remote-code
70
+ ```
71
+
72
+ ### Usage
73
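+ The example below targets port 30000, the SGLang default; when using the vLLM server, change the port to 8000 (the vLLM default) unless you passed `--port`.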
74
+ ```python
75
+ from openai import OpenAI
76
+
77
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
78
+
79
+ response = client.chat.completions.create(
80
+ model="Qwen/Qwen3.6-27B",
81
+ messages=[{"role": "user", "content": "Write a quicksort in Python."}],
82
+ max_tokens=4096,
83
+ temperature=0.0
84
+ )
85
+ print(response.choices[0].message.content)
86
+ ```
87
+
88
+ ## Benchmark Results
89
+
90
+ N/A
91
+
92
+ ## Acknowledgements
93
+
94
+ Special thanks to [David Wang](https://davidwa.ng/) for his outstanding engineering support on this project. We are also grateful to [Modal](https://modal.com/), [InnoMatrix](https://innomatrix.ai), and [Yotta Labs](https://www.yottalabs.ai/) for providing the compute resources used to train this draft model.
95
+
96
+ ## Citation
97
+
98
+ If you find DFlash useful, please cite our work. To share feedback on DFlash or request new model support, please fill out this form: [DFlash Feedback](https://forms.gle/4YNwfqb4nJdqn6hq9).
99
+
100
+ ```bibtex
101
+ @article{chen2026dflash,
102
+ title = {{DFlash: Block Diffusion for Flash Speculative Decoding}},
103
+ author = {Chen, Jian and Liang, Yesheng and Liu, Zhijian},
104
+ journal = {arXiv preprint arXiv:2602.06036},
105
+ year = {2026}
106
+ }
107
+ ```
assets/dflash_system.png ADDED

Git LFS Details

  • SHA256: bea1f82796909c1e4f7261ee3c08af743ec3c25057b83fca918808b76af4a7dc
  • Pointer size: 131 Bytes
  • Size of remote file: 338 kB
assets/speedup.png ADDED
config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DFlashDraftModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "dflash.DFlashDraftModel"
9
+ },
10
+ "block_size": 16,
11
+ "bos_token_id": null,
12
+ "dflash_config": {
13
+ "mask_token_id": 248070,
14
+ "target_layer_ids": [
15
+ 1,
16
+ 16,
17
+ 31,
18
+ 46,
19
+ 61
20
+ ]
21
+ },
22
+ "dtype": "bfloat16",
23
+ "eos_token_id": 248044,
24
+ "head_dim": 128,
25
+ "hidden_act": "silu",
26
+ "hidden_size": 5120,
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 17408,
29
+ "layer_types": [
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "full_attention"
35
+ ],
36
+ "max_position_embeddings": 262144,
37
+ "max_window_layers": 5,
38
+ "model_type": "qwen3",
39
+ "num_attention_heads": 32,
40
+ "num_hidden_layers": 5,
41
+ "num_key_value_heads": 8,
42
+ "num_target_layers": 64,
43
+ "pad_token_id": 248044,
44
+ "rms_norm_eps": 1e-06,
45
+ "sliding_window": 2048,
46
+ "tie_word_embeddings": false,
47
+ "transformers_version": "5.5.3",
48
+ "use_cache": true,
49
+ "use_sliding_window": true,
50
+ "vocab_size": 248320,
51
+ "rope_theta": 10000000,
52
+ "rope_scaling": null
53
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0c050b34798d32728a164d2c3f1681746ff85c11945701b0205b654e2f1fdbe
3
+ size 3460432504