Implement SAM2 and better inpainting
- src/matcher.py +44 -36
- src/painter.py +70 -1
- src/pipeline.py +43 -30
- src/segmenter.py +58 -1
src/matcher.py
CHANGED
@@ -3,62 +3,70 @@ from PIL import Image

from transformers import CLIPProcessor, CLIPModel

class CLIPMatcher:
    def __init__(self, model_name='openai/clip-vit-large-patch14'):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = CLIPModel.from_pretrained(model_name).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def get_top_k_segments(self, image, segments, text_query, k=5):
        """
        Returns top K segments based on CLIP score + Area Weight.
        """
        if not segments: return []

        # 1. Clean Text
        ignore = ['remove', 'delete', 'erase', 'the', 'a', 'an']
        words = [w for w in text_query.lower().split() if w not in ignore]
        clean_text = " ".join(words) if words else text_query

        pil_image = Image.fromarray(image)
        crops = []
        valid_segments = []

        # Prepare crops
        h, w = image.shape[:2]
        total_img_area = h * w

        for seg in segments:
            x1, y1, x2, y2 = seg['bbox'].astype(int)
            # Pad slightly
            pad = 10
            x1, y1 = max(0, x1 - pad), max(0, y1 - pad)
            x2, y2 = min(w, x2 + pad), min(h, y2 + pad)

            crops.append(pil_image.crop((x1, y1, x2, y2)))
            valid_segments.append(seg)

        if not crops: return []

        # 2. Inference
        inputs = self.processor(
            text=[clean_text], images=crops, return_tensors="pt", padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Standardize scores
            probs = outputs.logits_per_image.softmax(dim=0).cpu().numpy().flatten()

        # 3. Re-Scoring with Area Weight
        final_results = []
        for i, score in enumerate(probs):
            seg = valid_segments[i]
            area_ratio = seg['area'] / total_img_area

            # HEURISTIC: Boost score for larger objects.
            # If searching for general terms (bus, car, cat), bigger is usually better.
            # We add 20% of the area_ratio to the score.
            weighted_score = score + (area_ratio * 0.2)

            final_results.append({
                'mask': seg['mask'],
                'bbox': seg['bbox'],
                'original_score': float(score),
                'weighted_score': float(weighted_score)
            })

        # 4. Sort and take Top K
        final_results.sort(key=lambda x: x['weighted_score'], reverse=True)
        return final_results[:k]
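For reference, a minimal sketch of how the new get_top_k_segments method is driven. The segment dict below is a synthetic stand-in for segmenter output (it only carries the keys the matcher reads: 'mask', 'bbox', 'area'); the import path and image file name are assumptions, not part of this commit.

# Sketch only: ranking candidate segments with CLIPMatcher.
import numpy as np
from PIL import Image

from src.matcher import CLIPMatcher  # assumed module path for this repo layout

image = np.array(Image.open("example.jpg").convert("RGB"))  # placeholder input
h, w = image.shape[:2]

# One fake segment with the expected keys; real ones come from the segmenter.
mask = np.zeros((h, w), dtype=np.uint8)
mask[50:250, 50:250] = 1
segments = [{'mask': mask, 'bbox': np.array([50, 50, 250, 250]), 'area': int(mask.sum())}]

matcher = CLIPMatcher()
top = matcher.get_top_k_segments(image, segments, "remove the red car", k=5)
for rank, cand in enumerate(top, start=1):
    # weighted_score = CLIP softmax score + 0.2 * (segment area / image area)
    print(rank, round(cand['original_score'], 3), round(cand['weighted_score'], 3))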
src/painter.py
CHANGED
@@ -1,7 +1,7 @@

import torch
import numpy as np
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline, StableDiffusionXLInpaintPipeline

class SDInpainter:
    def __init__(self, model_id="runwayml/stable-diffusion-inpainting"):

@@ -48,6 +48,75 @@

    def _dilate_mask(self, mask, kernel_size=9):
        # Increased kernel size slightly for better blending
        import cv2
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.dilate(mask, kernel, iterations=1)


class SDXLInpainter:
    def __init__(self, model_id="diffusers/stable-diffusion-xl-1.0-inpainting-0.1"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Use float16
        self.pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            variant="fp16",  # Add variant for faster loading if available
            use_safetensors=True
        ).to(self.device)

        if self.device == "cuda":
            self.pipe.enable_model_cpu_offload()  # Saves VRAM effectively

    def inpaint(self, image, mask, prompt=""):  # Default prompt changed to empty
        pil_image = Image.fromarray(image).convert('RGB')

        # Increase kernel size to 15 or 20 to ensure no edge artifacts remain
        mask = self._dilate_mask(mask, kernel_size=15)

        # Blur the mask slightly to make the transition smoother
        import cv2
        mask = cv2.GaussianBlur(mask, (5, 5), 0)

        pil_mask = Image.fromarray((mask * 255).astype(np.uint8)).convert('L')

        w, h = pil_image.size
        target_size = 1024
        scale = target_size / max(w, h)
        new_w = int(w * scale) - (int(w * scale) % 8)
        new_h = int(h * scale) - (int(h * scale) % 8)

        resized_image = pil_image.resize((new_w, new_h), Image.LANCZOS)
        resized_mask = pil_mask.resize((new_w, new_h), Image.NEAREST)

        if not prompt or prompt == "background":
            final_prompt = "clean background, empty space, seamless texture, high quality"
            # Lower guidance scale for background filling to rely more on image context
            guidance_scale = 4.5
        else:
            final_prompt = prompt
            guidance_scale = 7.5

        neg_prompt = (
            "object, subject, person, animal, cat, dog, "
            "glass, transparent, crystal, bottle, cup, reflection, "
            "complex, 3d render, artifacts, shadow, distortion, blur, watermark"
        )

        output = self.pipe(
            prompt=final_prompt,
            negative_prompt=neg_prompt,
            image=resized_image,
            mask_image=resized_mask,
            num_inference_steps=40,
            guidance_scale=guidance_scale,  # Dynamic guidance
            strength=0.99,  # High strength to ensure removal
        ).images[0]

        result = output.resize((w, h), Image.LANCZOS)

        return np.array(result)

    def _dilate_mask(self, mask, kernel_size=15):
        import cv2
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.dilate(mask, kernel, iterations=1)
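A minimal sketch of using SDXLInpainter on its own, assuming an RGB uint8 image and a same-size binary mask (1 marks the region to remove), which is what the pipeline hands it; the import path and file names are placeholders.

# Sketch only: standalone SDXL inpainting call.
import numpy as np
from PIL import Image

from src.painter import SDXLInpainter  # assumed module path

image = np.array(Image.open("street.jpg").convert("RGB"))  # placeholder input
mask = np.zeros(image.shape[:2], dtype=np.uint8)
mask[100:300, 200:400] = 1                                  # region to erase

painter = SDXLInpainter()
result = painter.inpaint(image, mask)                        # empty prompt -> generic background fill
Image.fromarray(result).save("street_clean.jpg")

Internally the method dilates and blurs the mask, resizes the longer image side to roughly 1024 px (rounded down to a multiple of 8), and drops the guidance scale to 4.5 when no prompt is given so the fill leans on surrounding image context rather than the text prompt.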
src/pipeline.py
CHANGED
@@ -1,45 +1,58 @@

import numpy as np
import cv2
from .segmenter import SAM2Segmenter
from .matcher import CLIPMatcher
from .painter import SDXLInpainter
from .utils import visualize_mask

class ObjectRemovalPipeline:
    def __init__(self):
        print("Initializing models...")
        self.segmenter = SAM2Segmenter()
        self.matcher = CLIPMatcher()
        self.inpainter = SDXLInpainter()
        print("Pipeline ready.")

    def process(self, image, text_query, inpaint_prompt=""):
        """
        Main processing function for object removal.
        """
        # 1. Segment
        segments = self.segmenter.segment(image)
        if not segments:
            return image, None, "No segments found"

        # 2. Match with Top-K Strategy
        # We get top 5 candidates to handle "Part-Whole" ambiguity (e.g. tire vs car)
        candidates = self.matcher.get_top_k_segments(image, segments, text_query, k=5)
        if not candidates:
            return image, None, "No match found"

        # 3. Merge Masks (The "Cat Tail" Fix)
        best_candidate = candidates[0]
        final_mask = best_candidate['mask'].copy()

        print(f"Top Match Score: {best_candidate['weighted_score']:.3f}")

        # Merge other candidates if they are close in score or physically overlap
        for i in range(1, len(candidates)):
            cand = candidates[i]
            score_ratio = cand['weighted_score'] / best_candidate['weighted_score']

            # Check intersection
            intersection = np.logical_and(final_mask, cand['mask']).sum()

            # Rule: Merge if score is similar (>85%) OR if they overlap pixels
            if score_ratio > 0.85 or intersection > 0:
                print(f"Merging Rank {i+1} (Score ratio: {score_ratio:.2f}, Overlap: {intersection > 0})")
                final_mask = np.logical_or(final_mask, cand['mask'])

        # 4. Dilate Final Mask
        # Expands mask slightly to cover edges/seams
        kernel = np.ones((15, 15), np.uint8)
        final_mask = cv2.dilate(final_mask.astype(np.uint8), kernel, iterations=1)

        # 5. Inpaint
        result = self.inpainter.inpaint(image, final_mask, prompt=inpaint_prompt)

        return result, final_mask, "Success"
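A sketch of the end-to-end flow the pipeline now implements (segment with SAM2, rank with CLIP, merge masks, inpaint with SDXL); file names and the import path are placeholders.

# Sketch only: text-driven object removal, end to end.
import numpy as np
from PIL import Image

from src.pipeline import ObjectRemovalPipeline  # assumed module path

pipeline = ObjectRemovalPipeline()              # loads SAM2, CLIP and SDXL once

image = np.array(Image.open("living_room.jpg").convert("RGB"))
result, mask, status = pipeline.process(image, "remove the cat")

print(status)                                    # "Success", "No segments found", or "No match found"
if mask is not None:
    Image.fromarray(result).save("living_room_clean.jpg")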
src/segmenter.py
CHANGED
@@ -1,7 +1,10 @@

import torch
import numpy as np
import cv2

from ultralytics import YOLO
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator

class YOLOSegmenter:
    def __init__(self, model_name='yolov8x-seg.pt'):
        self.model = YOLO(model_name)

@@ -24,4 +27,58 @@

                'class_name': self.model.names[class_id]
            })

        return segments

class SAM2Segmenter:
    def __init__(self, model_cfg='sam2.1_hiera_l.yaml', checkpoint=''):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Load the Automatic Generator
        self.mask_generator = SAM2AutomaticMaskGenerator.from_pretrained(
            "facebook/sam2.1-hiera-large",
            points_per_side=32,
            pred_iou_thresh=0.80,
            stability_score_thresh=0.92,
            crop_n_layers=1,
            crop_n_points_downscale_factor=2,
            device=self.device
        )

    def segment(self, image):
        """
        Generates masks and filters out background-like huge segments.
        """
        if hasattr(self.mask_generator, 'generate'):
            masks = self.mask_generator.generate(image)
        else:
            masks = self.mask_generator.predict(image)

        segments = []
        img_h, img_w = image.shape[:2]
        total_area = img_h * img_w

        for m in masks:
            # SAM returns [x, y, w, h]
            x, y, w, h = m['bbox']

            # Convert to [x1, y1, x2, y2]
            x1, y1, x2, y2 = x, y, x + w, y + h

            # Ignore masks that are too large (> 75% of image)
            if m['area'] > total_area * 0.75:
                continue

            # Ignore masks that are too small (< 0.5% of image)
            if m['area'] < total_area * 0.005:
                continue

            segments.append({
                'mask': m['segmentation'].astype(np.uint8),
                'bbox': np.array([x1, y1, x2, y2]),
                'score': m.get('predicted_iou', 1.0),
                'area': m['area']
            })

        # Sort by area (smallest to largest) to prefer specific objects over containers
        segments.sort(key=lambda s: s['area'])

        return segments
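For a sense of what the area filters keep: on a 1024x768 input (786,432 px), only masks between roughly 3,932 px (0.5%) and 589,824 px (75%) survive. Below is a small sketch for inspecting that on a real image; the import path and file name are placeholders.

# Sketch only: inspecting SAM2Segmenter output after the size filters.
import numpy as np
from PIL import Image

from src.segmenter import SAM2Segmenter  # assumed module path

image = np.array(Image.open("kitchen.jpg").convert("RGB"))  # placeholder input
segments = SAM2Segmenter().segment(image)

total = image.shape[0] * image.shape[1]
for seg in segments[:10]:                # smallest first, since segments are sorted by area
    print(f"area={seg['area']} ({seg['area'] / total:.1%}), bbox={seg['bbox']}")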