tiiuae
/

Falcon-Perception

Mask Generation

falcon_perception

text-generation

vision-language

open-vocabulary

Model card Files Files and versions

yasserDahou committed on Mar 29

Commit

9abbbe0

·

verified ·

1 Parent(s): 9b053e0

Update modeling_falcon_perception.py

Files changed (1) hide show

modeling_falcon_perception.py +30 -5

modeling_falcon_perception.py CHANGED Viewed

@@ -733,13 +733,38 @@ class FalconPerceptionForSegmentation(PreTrainedModel):
                 tokens_B1[should_stop_B, :] = self._pad_token_id
             padded_tokens[:, pos] = tokens_B1[:, -1]
-            # Decode coords
             coord_logits = self.decode_coords(h_BSD[:, -1:], tokens_B1)
-            xy_b2 = torch.argmax(coord_logits, dim=-1) / coord_logits.size(-1)
-            coord_preds = [{"x": xy[0].item(), "y": xy[1].item()} for xy in xy_b2]
             sample_w_coord = torch.where(tokens_B1 == self.config.coord_token_id)[0]
             for i, b in enumerate(sample_w_coord.tolist()):
-                aux_output_B[b].append(coord_preds[i])
             # Decode sizes
             size_logits = self.decode_sizes(h_BSD[:, -1:], tokens_B1)
@@ -847,4 +872,4 @@ class FalconPerceptionForSegmentation(PreTrainedModel):
                 "mask_rle": mask_rle,
             })
-        return detections

                 tokens_B1[should_stop_B, :] = self._pad_token_id
             padded_tokens[:, pos] = tokens_B1[:, -1]
+            # Decode coords (with deduplication to avoid repeating the same location)
             coord_logits = self.decode_coords(h_BSD[:, -1:], tokens_B1)
             sample_w_coord = torch.where(tokens_B1 == self.config.coord_token_id)[0]
+            num_bins = coord_logits.size(-1)
+            coord_repeat_threshold = 0.01  # coords within 1% of image size are considered duplicates
+            max_coord_attempts = 100
+            xy_b2 = torch.zeros(B, 2, device=device, dtype=self.dtype)
             for i, b in enumerate(sample_w_coord.tolist()):
+                logits_b = coord_logits[i].clone()  # (2, num_bins)
+                existing_coords = [
+                    item for item in aux_output_B[b]
+                    if isinstance(item, dict) and "x" in item and "y" in item
+                ]
+                pred_x, pred_y = 0.0, 0.0
+                for _ in range(max_coord_attempts):
+                    pred_bins = torch.argmax(logits_b, dim=-1)  # (2,)
+                    pred_x = pred_bins[0].item() / (num_bins - 1)
+                    pred_y = pred_bins[1].item() / (num_bins - 1)
+                    is_repeat = any(
+                        abs(ec["x"] - pred_x) < coord_repeat_threshold
+                        and abs(ec["y"] - pred_y) < coord_repeat_threshold
+                        for ec in existing_coords
+                    )
+                    if not is_repeat:
+                        break
+                    logits_b[0, pred_bins[0]] = float("-inf")
+                    logits_b[1, pred_bins[1]] = float("-inf")
+                xy_b2[b, 0] = pred_x
+                xy_b2[b, 1] = pred_y
+                aux_output_B[b].append({"x": pred_x, "y": pred_y})
             # Decode sizes
             size_logits = self.decode_sizes(h_BSD[:, -1:], tokens_B1)
                 "mask_rle": mask_rle,
             })
+        return detections