import torch
import numpy as np
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline, StableDiffusionXLInpaintPipeline


def _normalize_mask(mask):
    """Return *mask* as an array whose values lie in the 0..1 range.

    Masks that already use 0/1 values are returned untouched.  Boolean masks
    are converted to ``uint8``, and masks with values above 1 (for example the
    0/255 convention) are thresholded at half of the 8-bit range.  Without
    this step ``mask * 255`` wraps around for ``uint8`` input that holds 255.
    """
    mask = np.asarray(mask)
    if mask.dtype == np.bool_:
        return mask.astype(np.uint8)
    if mask.size and mask.max() > 1:
        return (mask > 127).astype(np.uint8)
    return mask


def _mask_to_pil(mask):
    """Convert a 0..1 mask array into an 8-bit greyscale PIL image."""
    return Image.fromarray((mask * 255).astype(np.uint8)).convert('L')


def _scaled_size(width, height, target):
    """Scale ``(width, height)`` so the longer side is about *target* pixels.

    Each side is rounded down to a multiple of 8 and clamped to at least 8,
    so an extremely elongated image can never produce a zero-sized dimension.
    """
    scale = target / max(width, height)

    def snap(extent):
        scaled = int(extent * scale)
        return max(8, scaled - scaled % 8)

    return snap(width), snap(height)


def _place_pipeline(pipe, device):
    """Make *pipe* ready to run on *device* and return it.

    On CUDA only model CPU offloading is enabled; the pipeline is not moved
    to the GPU first, since offloading manages weight placement itself.
    """
    if device == "cuda":
        pipe.enable_model_cpu_offload()
        return pipe
    return pipe.to(device)


class SDInpainter:
    """In-painting wrapper around ``StableDiffusionInpaintPipeline``."""

    # Length, in pixels, that the longer image side is resized to.
    TARGET_SIZE = 512

    def __init__(self, model_id: str = "runwayml/stable-diffusion-inpainting"):
        """Load the pipeline for *model_id* (float16 on CUDA, else float32)."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if self.device == "cuda" else torch.float32
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            torch_dtype=dtype,
        )
        self.pipe = _place_pipeline(pipe, self.device)

    def inpaint(self, image, mask, prompt: str = "background"):
        """Regenerate the masked region of *image* and return the result.

        Args:
            image: Array accepted by ``PIL.Image.fromarray``.
            mask: Array marking the pixels to replace.  Non-zero means
                "in-paint"; 0/1, boolean and 0/255 masks are accepted.
            prompt: Text prompt describing the desired fill.

        Returns:
            The pipeline output resized back to the input width and height,
            as a NumPy array.
        """
        pil_image = Image.fromarray(image).convert('RGB')
        # Grow the mask a little so the edges of the object are covered too.
        pil_mask = _mask_to_pil(self._dilate_mask(_normalize_mask(mask)))

        w, h = pil_image.size
        new_size = _scaled_size(w, h, self.TARGET_SIZE)
        resized_image = pil_image.resize(new_size, Image.LANCZOS)
        resized_mask = pil_mask.resize(new_size, Image.NEAREST)

        output = self.pipe(
            prompt=prompt,
            negative_prompt="artifacts, low quality, distortion, object",
            image=resized_image,
            mask_image=resized_mask,
            num_inference_steps=30,
            guidance_scale=7.5,
        ).images[0]

        result = output.resize((w, h), Image.LANCZOS)
        return np.array(result)

    def _dilate_mask(self, mask, kernel_size=9):
        """Dilate *mask* once with a square ``kernel_size`` kernel of ones."""
        import cv2

        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.dilate(mask, kernel, iterations=1)


class SDXLInpainter:
    """In-painting wrapper around ``StableDiffusionXLInpaintPipeline``."""

    # Length, in pixels, that the longer image side is resized to.
    TARGET_SIZE = 1024

    def __init__(self, model_id: str = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"):
        """Load the fp16-variant safetensors weights for *model_id*."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if self.device == "cuda" else torch.float32
        pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
            model_id,
            torch_dtype=dtype,
            variant="fp16",
            use_safetensors=True,
        )
        self.pipe = _place_pipeline(pipe, self.device)

    def inpaint(self, image, mask, prompt: str = ""):
        """Regenerate the masked region of *image* and return the result.

        An empty prompt, or the literal ``"background"``, selects a built-in
        background-fill prompt with a lower guidance scale; any other prompt
        is used as given.

        Args:
            image: Array accepted by ``PIL.Image.fromarray``.
            mask: Array marking the pixels to replace.  Non-zero means
                "in-paint"; 0/1, boolean and 0/255 masks are accepted.
            prompt: Optional text prompt describing the desired fill.

        Returns:
            The pipeline output resized back to the input width and height,
            as a NumPy array.
        """
        import cv2

        pil_image = Image.fromarray(image).convert('RGB')

        # Grow the mask, then smooth its outline before handing it over.
        mask = self._dilate_mask(_normalize_mask(mask), kernel_size=15)
        mask = cv2.GaussianBlur(mask, (21, 21), 0)
        pil_mask = _mask_to_pil(mask)

        w, h = pil_image.size
        new_size = _scaled_size(w, h, self.TARGET_SIZE)
        resized_image = pil_image.resize(new_size, Image.LANCZOS)
        resized_mask = pil_mask.resize(new_size, Image.NEAREST)

        if not prompt or prompt == "background":
            final_prompt = "clean background, empty space, seamless texture, high quality"
            guidance_scale = 4.5
        else:
            final_prompt = prompt
            guidance_scale = 7.5

        neg_prompt = (
            "object, subject, person, animal, cat, dog, "
            "glass, transparent, crystal, bottle, cup, reflection, "
            "complex, 3d render, artifacts, shadow, distortion, blur, watermark"
        )

        output = self.pipe(
            prompt=final_prompt,
            negative_prompt=neg_prompt,
            image=resized_image,
            mask_image=resized_mask,
            num_inference_steps=40,
            guidance_scale=guidance_scale,
            strength=0.99,
        ).images[0]

        result = output.resize((w, h), Image.LANCZOS)
        return np.array(result)

    def _dilate_mask(self, mask, kernel_size=15):
        """Dilate *mask* once with a square ``kernel_size`` kernel of ones."""
        import cv2

        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.dilate(mask, kernel, iterations=1)