implement second version
- .gitignore +2 -1
- app.py +74 -94
- src/__init__.py +3 -3
- src/matcher.py +51 -30
- src/painter.py +1 -1
- src/pipeline.py +90 -41
- src/segmenter.py +64 -72
.gitignore
CHANGED
@@ -206,4 +206,5 @@ marimo/_static/
 marimo/_lsp/
 __marimo__/
 
-models/yolov8
+models/yolov8
+rubrics.txt
app.py
CHANGED
@@ -1,129 +1,109 @@

import gradio as gr
import numpy as np
from src.pipeline import ObjectRemovalPipeline
from src.utils import visualize_mask

# Initialize pipeline once
pipeline = ObjectRemovalPipeline()

def ensure_uint8(image):
    if image is None: return None
    image = np.array(image)
    if image.dtype != np.uint8:
        if image.max() <= 1.0: image = image * 255.0
        image = np.clip(image, 0, 255).astype(np.uint8)
    return image

def step1_detect(image, text_query):
    if image is None or not text_query:
        return [], [], "Please upload image and enter text."

    # Calls the new method in pipeline.py
    candidates, msg = pipeline.get_candidates(image, text_query)

    if not candidates:
        return [], [], f"Error: {msg}"

    masks = [c['mask'] for c in candidates]

    # Generate visualization for gallery
    gallery_imgs = []
    for i, mask in enumerate(masks):
        viz = visualize_mask(image, mask)
        # Label with rank and score if available
        label = f"Option {i+1} (Score: {candidates[i].get('weighted_score', 0):.2f})"
        gallery_imgs.append((ensure_uint8(viz), label))

    return masks, gallery_imgs, "Select the best match below."

def on_select(evt: gr.SelectData):
    return evt.index

def step2_remove(image, masks, selected_idx, prompt, shadow_exp):
    if not masks or selected_idx is None:
        return None, "Please select an object first."

    target_mask = masks[selected_idx]

    # Calls the pipeline method
    result = pipeline.inpaint_selected(image, target_mask, prompt, shadow_expansion=shadow_exp)

    return ensure_uint8(result), "Success!"

# CSS for cleaner UI
css = """
.gradio-container {min-height: 0px !important}
/* Ensure images in gallery don't get cropped strictly */
button.gallery-item {object-fit: contain !important}
"""

with gr.Blocks(title="TextEraser", css=css, theme=gr.themes.Soft()) as demo:
    mask_state = gr.State([])
    idx_state = gr.State(0)

    gr.Markdown("## TextEraser: Interactive Object Removal")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Input Image", type="numpy", height=400)
            text_query = gr.Textbox(label="What to remove?", placeholder="e.g. 'bottle', 'shadow'")
            btn_detect = gr.Button("1. Detect Objects", variant="primary")

        with gr.Column(scale=1):
            # FIXED: object_fit="contain" prevents cropping
            # allow_preview=True lets you click to zoom
            gallery = gr.Gallery(
                label="Candidates (Select One)",
                columns=2,
                height=400,
                allow_preview=True,
                object_fit="contain"
            )
            status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        with gr.Column(scale=1):
            shadow_slider = gr.Slider(0, 40, value=10, label="Shadow Fix (Expand Mask Downwards)")
            inpaint_prompt = gr.Textbox(label="Background Description", value="background")
            btn_remove = gr.Button("2. Remove Selected", variant="stop")

        with gr.Column(scale=1):
            output_image = gr.Image(label="Final Result", height=400)

    # Event Wiring
    btn_detect.click(
        fn=step1_detect,
        inputs=[input_image, text_query],
        outputs=[mask_state, gallery, status]
    )

    gallery.select(fn=on_select, inputs=None, outputs=idx_state)

    btn_remove.click(
        fn=step2_remove,
        inputs=[input_image, mask_state, idx_state, inpaint_prompt, shadow_slider],
        outputs=[output_image, status]
    )

if __name__ == "__main__":
    demo.queue().launch(share=True)
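As an illustrative aside (not part of the commit), a minimal, self-contained sketch of the gallery-selection pattern the new app.py relies on, assuming only Gradio is installed and using placeholder image paths: the index delivered by gr.SelectData is parked in a gr.State, and a later button click reads it back.

import gradio as gr

with gr.Blocks() as demo:
    idx_state = gr.State(0)  # holds the most recently clicked gallery index
    gallery = gr.Gallery(value=["a.png", "b.png"], label="Pick one")  # placeholder paths
    picked = gr.Textbox(label="Picked index")
    use_btn = gr.Button("Use selection")

    def on_select(evt: gr.SelectData):
        return evt.index  # index of the clicked tile

    gallery.select(fn=on_select, inputs=None, outputs=idx_state)
    use_btn.click(fn=lambda i: str(i), inputs=idx_state, outputs=picked)

if __name__ == "__main__":
    demo.launch()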
src/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from .pipeline import ObjectRemovalPipeline
-from .segmenter import
+from .segmenter import SAM2Predictor
 from .matcher import CLIPMatcher
-from .painter import
+from .painter import SDXLInpainter
 
-__all__ = ['ObjectRemovalPipeline', '
+__all__ = ['ObjectRemovalPipeline', 'CLIPMatcher', 'SDXLInpainter', 'SAM2Predictor']
src/matcher.py
CHANGED
@@ -1,17 +1,17 @@

import torch
import numpy as np
import gc
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

class CLIPMatcher:
    def __init__(self, model_name='openai/clip-vit-large-patch14'):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Load directly to CPU first
        self.model = CLIPModel.from_pretrained(model_name).to("cpu")
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def get_top_k_segments(self, image, segments, text_query, k=5):
        if not segments: return []

        # 1. Clean Text

@@ -19,54 +19,75 @@ class CLIPMatcher:

        words = [w for w in text_query.lower().split() if w not in ignore]
        clean_text = " ".join(words) if words else text_query

        # 2. Crop (CPU)
        pil_image = Image.fromarray(image)
        crops = []
        valid_segments = []

        h_img, w_img = image.shape[:2]
        total_img_area = h_img * w_img

        for seg in segments:
            if 'bbox' not in seg: continue

            # Safe numpy cast
            bbox = np.array(seg['bbox']).astype(int)
            x1, y1, x2, y2 = bbox

            # Adaptive Context Padding (30%)
            w_box, h_box = x2 - x1, y2 - y1
            pad_x = int(w_box * 0.3)
            pad_y = int(h_box * 0.3)

            crop_x1 = max(0, x1 - pad_x)
            crop_y1 = max(0, y1 - pad_y)
            crop_x2 = min(w_img, x2 + pad_x)
            crop_y2 = min(h_img, y2 + pad_y)

            crops.append(pil_image.crop((crop_x1, crop_y1, crop_x2, crop_y2)))
            valid_segments.append(seg)

        if not crops: return []

        # 3. Inference (Brief GPU usage)
        try:
            self.model.to(self.device)
            inputs = self.processor(
                text=[clean_text], images=crops, return_tensors="pt", padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                # FIX: Use raw logits for meaningful scores.
                # (Softmax forces sum=1, concealing bad matches)
                probs = outputs.logits_per_image.cpu().numpy().flatten()
        except Exception as e:
            print(f"CLIP Error: {e}")
            return []
        finally:
            # Move back to CPU immediately
            self.model.to("cpu")

        # 4. Score & Sort
        final_results = []
        for i, score in enumerate(probs):
            seg = valid_segments[i]
            if 'area' in seg:
                area_ratio = seg['area'] / total_img_area
            else:
                w, h = seg['bbox'][2]-seg['bbox'][0], seg['bbox'][3]-seg['bbox'][1]
                area_ratio = (w*h) / total_img_area

            # Logits are roughly 15-30 range. Add small boost for area.
            weighted_score = float(score) + (area_ratio * 2.0)

            final_results.append({
                'mask': seg.get('mask', None),
                'bbox': seg['bbox'],
                'original_score': float(score),
                'weighted_score': weighted_score,
                'label': seg.get('label', 'object')
            })

        final_results.sort(key=lambda x: x['weighted_score'], reverse=True)
        return final_results[:k]
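A quick illustration (numbers invented, not model output) of why the matcher now reads raw logits_per_image instead of softmax probabilities: with several mediocre crops, softmax across the candidates still pushes one of them toward 1.0, while the raw logits stay on CLIP's absolute scale (roughly 15-30, per the comment above), where a low value is visibly a weak match.

import torch

# Three candidate crops scored against one text query (invented values).
# All are weak matches on CLIP's absolute scale, but softmax across the
# candidates makes the first one look near-certain.
logits = torch.tensor([16.0, 12.0, 11.5])

print(logits.softmax(dim=0))  # ~[0.97, 0.02, 0.01] -> misleadingly confident
print(logits)                 # raw scores keep their absolute meaning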
src/painter.py
CHANGED
@@ -75,7 +75,7 @@ class SDXLInpainter:
 
         # Blur the mask slightly to make the transition smoother
         import cv2
-        mask = cv2.GaussianBlur(mask, (
+        mask = cv2.GaussianBlur(mask, (21, 21), 0)
 
         pil_mask = Image.fromarray((mask * 255).astype(np.uint8)).convert('L')
 
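For context on the fixed line above, a standalone sketch of what the (21, 21) Gaussian blur does to a binary mask (synthetic data; only cv2 and numpy assumed). Kernel dimensions for cv2.GaussianBlur must be odd, and the blur feathers the hard 0/1 edge so the inpainted region blends into its surroundings.

import cv2
import numpy as np

mask = np.zeros((100, 100), dtype=np.float32)
mask[30:70, 30:70] = 1.0                    # hard-edged square mask

soft = cv2.GaussianBlur(mask, (21, 21), 0)  # kernel sizes must be odd

print(np.unique(mask).size)  # 2 distinct values: a hard edge
print(np.unique(soft).size)  # many intermediate values: a feathered edge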
src/pipeline.py
CHANGED
@@ -1,58 +1,107 @@

import numpy as np
import cv2
import torch
import gc
# Note: We import classes but DO NOT instantiate them globally
from .segmenter import YOLOWorldDetector, SAM2Predictor
from .matcher import CLIPMatcher
from .painter import SDXLInpainter

class ObjectRemovalPipeline:
    def __init__(self):
        print("Initializing Pipeline in LOW MEMORY mode...")
        # No models loaded at startup!
        pass

    def _clear_ram(self):
        """Helper to force clear RAM & VRAM"""
        gc.collect()
        torch.cuda.empty_cache()

    def get_candidates(self, image, text_query):
        """
        Step 1: Detect & Segment & Rank
        Strategy: Load one model at a time, use it, then delete it.
        """
        candidates = []
        box_candidates = []

        # --- PHASE 1: YOLO (Detect) ---
        print("Loading YOLO...")
        detector = YOLOWorldDetector()
        try:
            box_candidates = detector.detect(image, text_query)
        finally:
            del detector  # Delete model immediately
            self._clear_ram()

        if not box_candidates:
            return [], "No objects detected."

        # --- PHASE 2: SAM2 (Segment) ---
        print("Loading SAM2...")
        segmenter = SAM2Predictor()
        segments_to_score = []
        try:
            segmenter.set_image(image)
            # Process top 3 boxes -> up to 9 masks
            for cand in box_candidates[:3]:
                bbox = cand['bbox']
                mask_variations = segmenter.predict_from_box(bbox)
                for i, (mask, sam_score) in enumerate(mask_variations):
                    segments_to_score.append({
                        'mask': mask,
                        'bbox': bbox,
                        'area': mask.sum(),
                        'label': f"{cand['label']} (Var {i+1})"
                    })
        finally:
            # Critical cleanup for SAM2
            if hasattr(segmenter, 'clear_memory'):
                segmenter.clear_memory()
            del segmenter
            self._clear_ram()

        # --- PHASE 3: CLIP (Rank) ---
        print("Loading CLIP...")
        matcher = CLIPMatcher()
        ranked_candidates = []
        try:
            ranked_candidates = matcher.get_top_k_segments(
                image,
                segments_to_score,
                text_query,
                k=len(segments_to_score)
            )
        finally:
            del matcher
            self._clear_ram()

        return ranked_candidates, f"Found {len(ranked_candidates)} options."

    def inpaint_selected(self, image, selected_mask, inpaint_prompt="", shadow_expansion=0):
        """
        Step 2: Inpaint
        """
        # Shadow / Edge Logic (CPU ops)
        if shadow_expansion > 0:
            kernel_h = int(shadow_expansion * 1.5)
            kernel_w = int(shadow_expansion * 0.5)
            kernel = np.ones((kernel_h, kernel_w), np.uint8)
            selected_mask = cv2.dilate(selected_mask.astype(np.uint8), kernel, iterations=1)

        kernel = np.ones((10, 10), np.uint8)
        final_mask = cv2.dilate(selected_mask.astype(np.uint8), kernel, iterations=1)

        result = None

        # --- PHASE 4: SDXL (Inpaint) ---
        print("Loading SDXL...")
        inpainter = SDXLInpainter()
        try:
            result = inpainter.inpaint(image, final_mask, prompt=inpaint_prompt)
        finally:
            del inpainter
            self._clear_ram()

        return result
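A minimal usage sketch of the two new entry points outside Gradio (the image path is hypothetical; this mirrors what app.py does in step1_detect and step2_remove):

import numpy as np
from PIL import Image
from src.pipeline import ObjectRemovalPipeline

image = np.array(Image.open("examples/lab_bench.jpg").convert("RGB"))  # hypothetical path

pipeline = ObjectRemovalPipeline()

# Step 1: detect, segment and rank candidate masks for the text query
candidates, msg = pipeline.get_candidates(image, "bottle")
print(msg)

if candidates:
    # Step 2: inpaint the top-ranked mask
    result = pipeline.inpaint_selected(
        image,
        candidates[0]['mask'],
        inpaint_prompt="background",
        shadow_expansion=10,
    )
    # app.py passes the result through ensure_uint8() before display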
src/segmenter.py
CHANGED
@@ -1,84 +1,76 @@

import torch
import numpy as np
import gc
from ultralytics import YOLO
from sam2.sam2_image_predictor import SAM2ImagePredictor

class YOLOWorldDetector:
    def __init__(self, model_name='yolov8s-worldv2.pt'):
        # Initialize, but manage device carefully
        self.model = YOLO(model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def detect(self, image, text_query):
        clean_text = text_query.replace("remove", "").replace("delete", "").strip()
        if not clean_text: clean_text = "object"

        boxes = []
        try:
            # FIX: Force CPU for text encoding to prevent RuntimeError
            self.model.to('cpu')
            self.model.set_classes([clean_text])

            if self.device == 'cuda':
                self.model.to('cuda')

            results = self.model.predict(image, conf=0.05, iou=0.5, verbose=False)[0]

            if results.boxes:
                for box in results.boxes.data:
                    x1, y1, x2, y2 = box[:4].cpu().numpy()
                    conf = float(box[4])
                    boxes.append({
                        'bbox': [int(x1), int(y1), int(x2), int(y2)],
                        'score': conf,
                        'label': clean_text
                    })
        except Exception as e:
            print(f"YOLO Error: {e}")
        finally:
            # Always offload after use
            self.model.to('cpu')

        boxes.sort(key=lambda x: x['score'], reverse=True)
        return boxes

class SAM2Predictor:
    def __init__(self, checkpoint="facebook/sam2.1-hiera-large"):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        try:
            self.predictor = SAM2ImagePredictor.from_pretrained(checkpoint)
        except:
            self.predictor = SAM2ImagePredictor.from_pretrained(checkpoint, device='cpu')

    def set_image(self, image):
        self.predictor.model.to(self.device)
        self.predictor.set_image(image)

    def predict_from_box(self, bbox):
        box_input = np.array(bbox)[None, :]
        # Multimask = True for variety
        masks, scores, logits = self.predictor.predict(
            point_coords=None,
            point_labels=None,
            box=box_input,
            multimask_output=True
        )
        sorted_results = sorted(zip(masks, scores), key=lambda x: x[1], reverse=True)
        return [(m.astype(np.uint8), s) for m, s in sorted_results]

    def clear_memory(self):
        # Critical for preventing memory leaks
        self.predictor.reset_predictor()
        self.predictor.model.to('cpu')
        del self.predictor
        torch.cuda.empty_cache()
        gc.collect()