Mirko Trasciatti committed on
Commit
a9a341a
·
1 Parent(s): dc9a383

Match reference Space: process frames 0→N sequentially, SAM2 handles bidirectional propagation internally

Browse files
Files changed (1) hide show
  1. app.py +33 -56
app.py CHANGED
@@ -492,64 +492,41 @@ def segment_video_multi(video_file, objects_json):
492
  video_segments = {}
493
  confidence_scores = []
494
 
495
- print(f" Propagating masks bidirectionally from frame {init_frame}...")
 
496
 
497
  with torch.inference_mode():
498
- # STEP 1: Process BACKWARD from init_frame to 0
499
- if init_frame > 0:
500
- print(f" Backward: frames {init_frame} → 0")
501
- for frame_idx in range(init_frame, -1, -1):
502
- frame_pil = video_frames[frame_idx]
503
- pixel_values = None
504
- if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
505
- pixel_values = processor(images=frame_pil, device=device, return_tensors="pt").pixel_values[0]
506
-
507
- sam2_output = model(
508
- inference_session=inference_session,
509
- frame=pixel_values,
510
- frame_idx=frame_idx
511
- )
512
-
513
- H = inference_session.video_height
514
- W = inference_session.video_width
515
- pred_masks = sam2_output.pred_masks.detach().cpu()
516
- video_res_masks = processor.post_process_masks(
517
- [pred_masks],
518
- original_sizes=[[H, W]],
519
- binarize=False
520
- )[0]
521
-
522
- video_segments[frame_idx] = video_res_masks
523
- mask_float = video_res_masks.float() if video_res_masks.dtype == torch.bool else video_res_masks
524
- confidence_scores.append(float(mask_float.mean()))
525
-
526
- # STEP 2: Process FORWARD from init_frame+1 to end
527
- if init_frame < len(video_frames) - 1:
528
- print(f" Forward: frames {init_frame+1} → {len(video_frames)-1}")
529
- for frame_idx in range(init_frame + 1, len(video_frames)):
530
- frame_pil = video_frames[frame_idx]
531
- pixel_values = None
532
- if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
533
- pixel_values = processor(images=frame_pil, device=device, return_tensors="pt").pixel_values[0]
534
-
535
- sam2_output = model(
536
- inference_session=inference_session,
537
- frame=pixel_values,
538
- frame_idx=frame_idx
539
- )
540
-
541
- H = inference_session.video_height
542
- W = inference_session.video_width
543
- pred_masks = sam2_output.pred_masks.detach().cpu()
544
- video_res_masks = processor.post_process_masks(
545
- [pred_masks],
546
- original_sizes=[[H, W]],
547
- binarize=False
548
- )[0]
549
-
550
- video_segments[frame_idx] = video_res_masks
551
- mask_float = video_res_masks.float() if video_res_masks.dtype == torch.bool else video_res_masks
552
- confidence_scores.append(float(mask_float.mean()))
553
 
554
  print(f" ✅ Got masks for {len(video_segments)} frames (init_frame was {init_frame})")
555
 
 
492
  video_segments = {}
493
  confidence_scores = []
494
 
495
+ print(f" Propagating masks through all frames (0 → {len(video_frames)-1})...")
496
+ print(f" Annotation at frame {init_frame} will guide propagation")
497
 
498
  with torch.inference_mode():
499
+ # Process ALL frames in sequential order (0→N)
500
+ # SAM2's temporal model expects sequential processing
501
+ # The annotated frame (init_frame) is already in processed_frames
502
+ for frame_idx in range(len(video_frames)):
503
+ frame_pil = video_frames[frame_idx]
504
+ pixel_values = None
505
+
506
+ # Check if this frame was already processed (e.g., the annotated frame)
507
+ if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
508
+ pixel_values = processor(images=frame_pil, device=device, return_tensors="pt").pixel_values[0]
509
+
510
+ # Call model - it will use annotation if frame_idx == init_frame
511
+ sam2_output = model(
512
+ inference_session=inference_session,
513
+ frame=pixel_values,
514
+ frame_idx=frame_idx
515
+ )
516
+
517
+ # Post-process masks
518
+ H = inference_session.video_height
519
+ W = inference_session.video_width
520
+ pred_masks = sam2_output.pred_masks.detach().cpu()
521
+ video_res_masks = processor.post_process_masks(
522
+ [pred_masks],
523
+ original_sizes=[[H, W]],
524
+ binarize=False
525
+ )[0]
526
+
527
+ video_segments[frame_idx] = video_res_masks
528
+ mask_float = video_res_masks.float() if video_res_masks.dtype == torch.bool else video_res_masks
529
+ confidence_scores.append(float(mask_float.mean()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
 
531
  print(f" ✅ Got masks for {len(video_segments)} frames (init_frame was {init_frame})")
532