Elea Zhong committed
Commit e64ed84 · 1 Parent(s): 454ba5e

add debug functions

.gitignore ADDED
@@ -0,0 +1,19 @@
+ *.egg-info
+ .env
+ **/__pycache__/*
+ wandb/*
+ *.log
+ venv/*
+ .venv/*
+ keyfile
+ **/.ipynb_checkpoints/
+ **/.DS_Store/*
+ .idea/*
+ .vscode/*
+ latentanalysis_data/*
+ latentanalysis_scripts/*
+ test-images/*
+ weights/*
+ latentmask/latentanalysis/*
+ cache/*
+ docs/automated-documentation/*
app.py CHANGED
@@ -13,11 +13,12 @@ from diffusers import FlowMatchEulerDiscreteScheduler
  from huggingface_hub import hf_hub_download
  from safetensors.torch import load_file

- from optimization import optimize_pipeline_
+ from qwenimage.debug import ftimed
+ from qwenimage.optimization import optimize_pipeline_
  from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
  from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
  from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
- from prompt import build_camera_prompt
+ from qwenimage.prompt import build_camera_prompt

  # --- Model Loading ---
  dtype = torch.bfloat16
@@ -53,7 +54,8 @@ optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB",
  MAX_SEED = np.iinfo(np.int32).max


- @spaces.GPU
+ # @spaces.GPU
+ @ftimed
  def infer_camera_edit(
      image,
      rotate_deg,
@@ -111,7 +113,7 @@ css = '''#col-container { max-width: 800px; margin: 0 auto; }
  #examples{max-width: 800px; margin: 0 auto; }'''

  def reset_all():
-     return [0, 0, 0, 0, False, True]
+     return [0, 0, 0, 0, False]

  def end_reset():
      return False
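For local profiling, the commit swaps the ZeroGPU decorator on `infer_camera_edit` for the new timing decorator. A minimal sketch (hypothetical stacking, not part of this commit) of how the two could coexist if GPU allocation on Spaces is still wanted, assuming `qwenimage.debug` is importable:

```python
# Hypothetical sketch: keep ZeroGPU allocation and add timing on top.
# ftimed only prints while DEBUG is True in qwenimage/debug.py.
import spaces
from qwenimage.debug import ftimed

@spaces.GPU  # re-enable GPU allocation on Spaces (commented out in this commit)
@ftimed      # prints "Time taken by infer_camera_edit: ... seconds"
def infer_camera_edit(image, rotate_deg, *args, **kwargs):
    ...
```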
pyproject.toml ADDED
@@ -0,0 +1,14 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "qwenimage"
+ version = "0.1"
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["qwenimage*"]
+
+ [tool.setuptools.dynamic]
+ dependencies = {file = ["requirements/requirements.txt"]}
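This packaging is what lets app.py and the pipeline switch from flat imports (`from optimization import …`) to package imports (`from qwenimage.optimization import …`). A minimal smoke test, assuming the repository has been installed into the active environment (for example with an editable `pip install -e .`, which is not shown in this commit):

```python
# Hypothetical check that setuptools package discovery picked up the qwenimage modules.
import importlib

for mod in ("qwenimage.debug", "qwenimage.optimization", "qwenimage.prompt"):
    importlib.import_module(mod)  # raises ModuleNotFoundError if the include pattern is off
    print(f"{mod} imported OK")
```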
qwenimage/debug.py ADDED
@@ -0,0 +1,178 @@
+ import os
+ from pathlib import Path
+ import time
+ import uuid
+ import warnings
+ from functools import wraps
+ from typing import Callable, Literal
+
+ import numpy as np
+ from PIL import Image
+ import torch
+ from torchvision.utils import save_image
+
+ DEBUG = True
+
+ def ftimed(func=None):
+
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             if not DEBUG:
+                 return func(*args, **kwargs)
+             else:
+                 start_time = time.perf_counter()
+                 result = func(*args, **kwargs)
+                 end_time = time.perf_counter()
+                 print(f"Time taken by {func.__qualname__}: {end_time - start_time} seconds")
+                 return result
+         return wrapper
+
+
+     if func is None:
+         return decorator
+     else:
+         return decorator(func)
+
+
+ class ctimed:
+     """
+     Context manager for timing lines of code. Use like:
+     ```
+     with ctimed(name="Model Forward"):
+         y = model(x)
+     ```
+     """
+     def __init__(self, name=None):
+         self.name = name
+         self.start_time = None
+
+     def __enter__(self):
+         if DEBUG:
+             self.start_time = time.perf_counter()
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if DEBUG:
+             end_time = time.perf_counter()
+             if self.name:
+                 print(f"Time taken by {self.name}: {end_time - self.start_time} seconds")
+             else:
+                 print(f"Time taken: {end_time - self.start_time} seconds")
+
+
+ def print_gpu_memory(clear_mem: Literal["pre", "post", None] = "pre"):
+     if not torch.cuda.is_available():
+         warnings.warn("Warning: CUDA device not available. Running on CPU.")
+         return
+     if clear_mem == "pre":
+         torch.cuda.empty_cache()
+     allocated = torch.cuda.memory_allocated()
+     reserved = torch.cuda.memory_reserved()
+     total = torch.cuda.get_device_properties(0).total_memory
+     print(f"Memory allocated: {allocated / (1024**2):.2f} MB")
+     print(f"Memory reserved: {reserved / (1024**2):.2f} MB")
+     print(f"Total memory: {total / (1024**2):.2f} MB")
+     if clear_mem == "post":
+         torch.cuda.empty_cache()
+
+ def cuda_empty_cache(func):
+     def wrapper(*args, **kwargs):
+         result = func(*args, **kwargs)
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return result
+     return wrapper
+
+ def print_first_param(module):
+     print(list(module.parameters())[0])
+
+ def fdebug(func=None, *, exclude=None):
+     if exclude is None:
+         exclude = []
+     elif isinstance(exclude, str):
+         exclude = [exclude]
+
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             arg_names = func.__code__.co_varnames[:func.__code__.co_argcount]
+             arg_vals = args[:len(arg_names)]
+             arg_vals = [
+                 (str(value)+str(value.shape) if isinstance(value, torch.Tensor) else value)
+                 for value in arg_vals
+             ]
+             args_pairs = ", ".join(f"{name}={value}" for name, value in zip(arg_names, arg_vals) if name not in exclude)
+             kwargs_pairs = ", ".join(f"{k}={v}" for k, v in kwargs.items() if k not in exclude)
+             all_args = ", ".join(filter(None, [args_pairs, kwargs_pairs]))
+             print(f"Calling {func.__name__}({all_args})")
+             result = func(*args, **kwargs)
+             print(f"{func.__name__} returned {str(result)+str(result.shape) if isinstance(result, torch.Tensor) else result}")
+             return result
+         return wrapper
+
+     if func is None:
+         return decorator
+     else:
+         return decorator(func)
+
+ class IncrementIndex:
+     def __init__(self, max:int=100):
+         self.retry_max = max
+         self.retries = 0
+
+     def __call__(self, index):
+         if self.retries > self.retry_max:
+             raise RuntimeError(f"Retried too many times, max:{self.retry_max}")
+         else:
+             self.retries += 1
+             index += 1
+             return index
+
+ _identity = lambda x: x
+
+ def fretry(func=None, *, exceptions=(Exception,), mod_args:tuple[Callable|None, ...]=tuple(), mod_kwargs:dict[str,Callable|None]=dict()):
+     def decorator(func):
+         @wraps(func)
+         def fretry_wrapper(*args, **kwargs):
+             try:
+                 out = func(*args, **kwargs)
+             except exceptions as e:
+                 new_args = []
+                 for i, arg in enumerate(args):
+                     if i < len(mod_args):
+                         mod_func = mod_args[i] or _identity
+                         new_args.append(mod_func(arg))
+                     else:
+                         new_args.append(arg)
+                 new_kwargs = {}
+                 for k, kwarg in kwargs.items():
+                     if k in mod_kwargs:
+                         mod_func = mod_kwargs[k] or _identity
+                         new_kwargs[k] = mod_func(kwarg)
+                 kwargs.update(new_kwargs)
+
+                 import traceback
+                 traceback.print_exc()
+                 warnings.warn(
+                     f"Function {func} failed due to {e} with inputs {args}, {kwargs}, "
+                     f"retrying with modified inputs {new_args}, {new_kwargs}"
+                 )
+                 out = fretry_wrapper(*new_args, **new_kwargs)
+             return out
+         return fretry_wrapper
+
+     if func is None:
+         return decorator
+     else:
+         return decorator(func)
+
+
+ def texam(t: torch.Tensor):
+     print(f"Shape: {tuple(t.shape)}")
+     if t.dtype.is_floating_point or t.dtype.is_complex:
+         mean_val = t.mean().item()
+     else:
+         mean_val = "N/A"
+     print(f"Min: {t.min().item()}, Max: {t.max().item()}, Mean: {mean_val}")
+     print(f"Device: {t.device}, Dtype: {t.dtype}, Requires Grad: {t.requires_grad}")
optimization.py → qwenimage/optimization.py RENAMED
@@ -1,5 +1,6 @@
  """
  """
+ import os

  from typing import Any
  from typing import Callable
@@ -66,5 +67,7 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
  )

  return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
+
+

  spaces.aoti_apply(compile_transformer(), pipeline.transformer)
qwenimage/pipeline_qwenimage_edit_plus.py CHANGED
@@ -18,7 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union

  import numpy as np
  import torch
- from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
+ # from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+ from transformers.models.qwen2 import Qwen2Tokenizer
+ from transformers.models.qwen2_vl import Qwen2VLProcessor

  from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
  from diffusers.loaders import QwenImageLoraLoaderMixin
@@ -29,6 +32,8 @@ from diffusers.utils.torch_utils import randn_tensor
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline
  from diffusers.pipelines.qwenimage.pipeline_output import QwenImagePipelineOutput

+ from qwenimage.debug import ctimed, ftimed
+

  if is_torch_xla_available():
      import torch_xla.core.xla_model as xm
@@ -284,6 +289,7 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
  return prompt_embeds, encoder_attention_mask

  # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.encode_prompt
+ @ftimed
  def encode_prompt(
  self,
  prompt: Union[str, List[str]],
@@ -627,265 +633,271 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
  [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
  returning a tuple, the first element is a list with the generated images.
  """
- image_size = image[-1].size if isinstance(image, list) else image.size
- calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1])
- height = height or calculated_height
- width = width or calculated_width
-
- multiple_of = self.vae_scale_factor * 2
- width = width // multiple_of * multiple_of
- height = height // multiple_of * multiple_of
-
- # 1. Check inputs. Raise error if not correct
- self.check_inputs(
- prompt,
- height,
- width,
- negative_prompt=negative_prompt,
- prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds,
- prompt_embeds_mask=prompt_embeds_mask,
- negative_prompt_embeds_mask=negative_prompt_embeds_mask,
- callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
- max_sequence_length=max_sequence_length,
- )

- self._guidance_scale = guidance_scale
- self._attention_kwargs = attention_kwargs
- self._current_timestep = None
- self._interrupt = False

- # 2. Define call parameters
- if prompt is not None and isinstance(prompt, str):
- batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
-
- device = self._execution_device
- # 3. Preprocess image
- if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
- if not isinstance(image, list):
- image = [image]
- condition_image_sizes = []
- condition_images = []
- vae_image_sizes = []
- vae_images = []
- for img in image:
- image_width, image_height = img.size
- condition_width, condition_height = calculate_dimensions(
- CONDITION_IMAGE_SIZE, image_width / image_height
  )
- vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, image_width / image_height)
- condition_image_sizes.append((condition_width, condition_height))
- vae_image_sizes.append((vae_width, vae_height))
- condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
- vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))
-
- has_neg_prompt = negative_prompt is not None or (
- negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
- )
-
- if true_cfg_scale > 1 and not has_neg_prompt:
- logger.warning(
- f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
- )
- elif true_cfg_scale <= 1 and has_neg_prompt:
- logger.warning(
- " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
- )

- do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
- prompt_embeds, prompt_embeds_mask = self.encode_prompt(
- image=condition_images,
- prompt=prompt,
- prompt_embeds=prompt_embeds,
- prompt_embeds_mask=prompt_embeds_mask,
- device=device,
- num_images_per_prompt=num_images_per_prompt,
- max_sequence_length=max_sequence_length,
- )
- if do_true_cfg:
- negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
  image=condition_images,
- prompt=negative_prompt,
- prompt_embeds=negative_prompt_embeds,
- prompt_embeds_mask=negative_prompt_embeds_mask,
  device=device,
  num_images_per_prompt=num_images_per_prompt,
  max_sequence_length=max_sequence_length,
  )

- # 4. Prepare latent variables
- num_channels_latents = self.transformer.config.in_channels // 4
- latents, image_latents = self.prepare_latents(
- vae_images,
- batch_size * num_images_per_prompt,
- num_channels_latents,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- )
- img_shapes = [
- [
- (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
- *[
- (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
- for vae_width, vae_height in vae_image_sizes
- ],
- ]
- ] * batch_size
-
- # 5. Prepare timesteps
- sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
- image_seq_len = latents.shape[1]
- mu = calculate_shift(
- image_seq_len,
- self.scheduler.config.get("base_image_seq_len", 256),
- self.scheduler.config.get("max_image_seq_len", 4096),
- self.scheduler.config.get("base_shift", 0.5),
- self.scheduler.config.get("max_shift", 1.15),
- )
- timesteps, num_inference_steps = retrieve_timesteps(
- self.scheduler,
- num_inference_steps,
- device,
- sigmas=sigmas,
- mu=mu,
- )
- num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
- self._num_timesteps = len(timesteps)
-
- # handle guidance
- if self.transformer.config.guidance_embeds and guidance_scale is None:
- raise ValueError("guidance_scale is required for guidance-distilled model.")
- elif self.transformer.config.guidance_embeds:
- guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
- guidance = guidance.expand(latents.shape[0])
- elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
- logger.warning(
- f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
  )
- guidance = None
- elif not self.transformer.config.guidance_embeds and guidance_scale is None:
- guidance = None
-
- if self.attention_kwargs is None:
- self._attention_kwargs = {}
-
- txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None
-
- image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=latents.device)
- if do_true_cfg:
- negative_txt_seq_lens = (
- negative_prompt_embeds_mask.sum(dim=1).tolist()
- if negative_prompt_embeds_mask is not None
- else None
  )
- uncond_image_rotary_emb = self.transformer.pos_embed(
- img_shapes, negative_txt_seq_lens, device=latents.device
  )
- else:
- uncond_image_rotary_emb = None
-
- # 6. Denoising loop
- self.scheduler.set_begin_index(0)
- with self.progress_bar(total=num_inference_steps) as progress_bar:
- for i, t in enumerate(timesteps):
- if self.interrupt:
- continue
-
- self._current_timestep = t
-
- latent_model_input = latents
- if image_latents is not None:
- latent_model_input = torch.cat([latents, image_latents], dim=1)
-
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- timestep = t.expand(latents.shape[0]).to(latents.dtype)
- with self.transformer.cache_context("cond"):
- noise_pred = self.transformer(
- hidden_states=latent_model_input,
- timestep=timestep / 1000,
- guidance=guidance,
- encoder_hidden_states_mask=prompt_embeds_mask,
- encoder_hidden_states=prompt_embeds,
- image_rotary_emb=image_rotary_emb,
- attention_kwargs=self.attention_kwargs,
- return_dict=False,
- )[0]
- noise_pred = noise_pred[:, : latents.size(1)]
-
- if do_true_cfg:
- with self.transformer.cache_context("uncond"):
- neg_noise_pred = self.transformer(
- hidden_states=latent_model_input,
- timestep=timestep / 1000,
- guidance=guidance,
- encoder_hidden_states_mask=negative_prompt_embeds_mask,
- encoder_hidden_states=negative_prompt_embeds,
- image_rotary_emb=uncond_image_rotary_emb,
- attention_kwargs=self.attention_kwargs,
- return_dict=False,
- )[0]
- neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
- comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
-
- cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
- noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
- noise_pred = comb_pred * (cond_norm / noise_norm)
-
- # compute the previous noisy sample x_t -> x_t-1
- latents_dtype = latents.dtype
- latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
- if latents.dtype != latents_dtype:
- if torch.backends.mps.is_available():
- # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
- latents = latents.to(latents_dtype)
-
- if callback_on_step_end is not None:
- callback_kwargs = {}
- for k in callback_on_step_end_tensor_inputs:
- callback_kwargs[k] = locals()[k]
- callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
- latents = callback_outputs.pop("latents", latents)
- prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
- progress_bar.update()
-
- if XLA_AVAILABLE:
- xm.mark_step()
-
- self._current_timestep = None
- if output_type == "latent":
- image = latents
- else:
- latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
- latents = latents.to(self.vae.dtype)
- latents_mean = (
- torch.tensor(self.vae.config.latents_mean)
- .view(1, self.vae.config.z_dim, 1, 1, 1)
- .to(latents.device, latents.dtype)
- )
- latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
- latents.device, latents.dtype
- )
- latents = latents / latents_std + latents_mean
- image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
- image = self.image_processor.postprocess(image, output_type=output_type)

- # Offload all models
- self.maybe_free_model_hooks()

- if not return_dict:
- return (image,)
+ with ctimed("Preprocessing"):
+ image_size = image[-1].size if isinstance(image, list) else image.size
+ calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1])
+ height = height or calculated_height
+ width = width or calculated_width
+
+ multiple_of = self.vae_scale_factor * 2
+ width = width // multiple_of * multiple_of
+ height = height // multiple_of * multiple_of
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ height,
+ width,
+ negative_prompt=negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ prompt_embeds_mask=prompt_embeds_mask,
+ negative_prompt_embeds_mask=negative_prompt_embeds_mask,
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+ max_sequence_length=max_sequence_length,
+ )

+ self._guidance_scale = guidance_scale
+ self._attention_kwargs = attention_kwargs
+ self._current_timestep = None
+ self._interrupt = False
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # 3. Preprocess image
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
+ if not isinstance(image, list):
+ image = [image]
+ condition_image_sizes = []
+ condition_images = []
+ vae_image_sizes = []
+ vae_images = []
+ for img in image:
+ image_width, image_height = img.size
+ condition_width, condition_height = calculate_dimensions(
+ CONDITION_IMAGE_SIZE, image_width / image_height
+ )
+ vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, image_width / image_height)
+ condition_image_sizes.append((condition_width, condition_height))
+ vae_image_sizes.append((vae_width, vae_height))
+ condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
+ vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))
+
+ has_neg_prompt = negative_prompt is not None or (
+ negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
+ )

+ if true_cfg_scale > 1 and not has_neg_prompt:
+ logger.warning(
+ f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
+ )
+ elif true_cfg_scale <= 1 and has_neg_prompt:
+ logger.warning(
+ " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
  )

+ with ctimed("Encode Prompt"):
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
+ prompt_embeds, prompt_embeds_mask = self.encode_prompt(
  image=condition_images,
+ prompt=prompt,
+ prompt_embeds=prompt_embeds,
+ prompt_embeds_mask=prompt_embeds_mask,
  device=device,
  num_images_per_prompt=num_images_per_prompt,
  max_sequence_length=max_sequence_length,
  )
+ if do_true_cfg:
+ negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
+ image=condition_images,
+ prompt=negative_prompt,
+ prompt_embeds=negative_prompt_embeds,
+ prompt_embeds_mask=negative_prompt_embeds_mask,
+ device=device,
+ num_images_per_prompt=num_images_per_prompt,
+ max_sequence_length=max_sequence_length,
+ )

+ with ctimed("Prep gen"):
+ # 4. Prepare latent variables
+ num_channels_latents = self.transformer.config.in_channels // 4
+ latents, image_latents = self.prepare_latents(
+ vae_images,
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
  )
+ img_shapes = [
+ [
+ (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
+ *[
+ (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
+ for vae_width, vae_height in vae_image_sizes
+ ],
+ ]
+ ] * batch_size
+
+ # 5. Prepare timesteps
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+ image_seq_len = latents.shape[1]
+ mu = calculate_shift(
+ image_seq_len,
+ self.scheduler.config.get("base_image_seq_len", 256),
+ self.scheduler.config.get("max_image_seq_len", 4096),
+ self.scheduler.config.get("base_shift", 0.5),
+ self.scheduler.config.get("max_shift", 1.15),
  )
+ timesteps, num_inference_steps = retrieve_timesteps(
+ self.scheduler,
+ num_inference_steps,
+ device,
+ sigmas=sigmas,
+ mu=mu,
  )
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+ self._num_timesteps = len(timesteps)
+
+ # handle guidance
+ if self.transformer.config.guidance_embeds and guidance_scale is None:
+ raise ValueError("guidance_scale is required for guidance-distilled model.")
+ elif self.transformer.config.guidance_embeds:
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+ guidance = guidance.expand(latents.shape[0])
+ elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
+ logger.warning(
+ f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
+ )
+ guidance = None
+ elif not self.transformer.config.guidance_embeds and guidance_scale is None:
+ guidance = None
+
+ if self.attention_kwargs is None:
+ self._attention_kwargs = {}
+
+ txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None
+
+ image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=latents.device)
+ if do_true_cfg:
+ negative_txt_seq_lens = (
+ negative_prompt_embeds_mask.sum(dim=1).tolist()
+ if negative_prompt_embeds_mask is not None
+ else None
+ )
+ uncond_image_rotary_emb = self.transformer.pos_embed(
+ img_shapes, negative_txt_seq_lens, device=latents.device
+ )
+ else:
+ uncond_image_rotary_emb = None
+
+ with ctimed("loop"):
+ # 6. Denoising loop
+ self.scheduler.set_begin_index(0)
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ with ctimed(f"loop {i}"):
+ if self.interrupt:
+ continue
+
+ self._current_timestep = t
+
+ latent_model_input = latents
+ if image_latents is not None:
+ latent_model_input = torch.cat([latents, image_latents], dim=1)
+
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
+ with self.transformer.cache_context("cond"):
+ noise_pred = self.transformer(
+ hidden_states=latent_model_input,
+ timestep=timestep / 1000,
+ guidance=guidance,
+ encoder_hidden_states_mask=prompt_embeds_mask,
+ encoder_hidden_states=prompt_embeds,
+ image_rotary_emb=image_rotary_emb,
+ attention_kwargs=self.attention_kwargs,
+ return_dict=False,
+ )[0]
+ noise_pred = noise_pred[:, : latents.size(1)]
+
+ if do_true_cfg:
+ with self.transformer.cache_context("uncond"):
+ neg_noise_pred = self.transformer(
+ hidden_states=latent_model_input,
+ timestep=timestep / 1000,
+ guidance=guidance,
+ encoder_hidden_states_mask=negative_prompt_embeds_mask,
+ encoder_hidden_states=negative_prompt_embeds,
+ image_rotary_emb=uncond_image_rotary_emb,
+ attention_kwargs=self.attention_kwargs,
+ return_dict=False,
+ )[0]
+ neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
+ comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
+
+ cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
+ noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
+ noise_pred = comb_pred * (cond_norm / noise_norm)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents_dtype = latents.dtype
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+ if latents.dtype != latents_dtype:
+ if torch.backends.mps.is_available():
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+ latents = latents.to(latents_dtype)
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+
+ if XLA_AVAILABLE:
+ xm.mark_step()
+
+ with ctimed("Post (vae)"):
+ self._current_timestep = None
+ if output_type == "latent":
+ image = latents
+ else:
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+ latents = latents.to(self.vae.dtype)
+ latents_mean = (
+ torch.tensor(self.vae.config.latents_mean)
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
+ .to(latents.device, latents.dtype)
+ )
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+ latents.device, latents.dtype
+ )
+ latents = latents / latents_std + latents_mean
+ image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
+ image = self.image_processor.postprocess(image, output_type=output_type)

+ # Offload all models
+ self.maybe_free_model_hooks()

+ if not return_dict:
+ return (image,)

  return QwenImagePipelineOutput(images=image)
prompt.py → qwenimage/prompt.py RENAMED
File without changes