Spaces: Running on Zero
Upload 29 files
- .gitattributes +22 -0
- ckpts/dipo.ckpt +3 -0
- examples/1.png +3 -0
- examples/1_open_1.png +3 -0
- examples/1_open_2.png +3 -0
- examples/close1.png +3 -0
- examples/close10.png +3 -0
- examples/close2.png +3 -0
- examples/close3.png +3 -0
- examples/close4.png +3 -0
- examples/close5.png +3 -0
- examples/close6.png +0 -0
- examples/close7.png +3 -0
- examples/close8.png +3 -0
- examples/close9.jpg +3 -0
- examples/open1.png +3 -0
- examples/open10.png +3 -0
- examples/open2.png +3 -0
- examples/open3.png +3 -0
- examples/open4.png +3 -0
- examples/open5.png +3 -0
- examples/open6.png +3 -0
- examples/open7.png +3 -0
- examples/open8.png +3 -0
- examples/open9.jpg +3 -0
- systems/__init__.py +22 -0
- systems/base.py +286 -0
- systems/dino_dummy.npy +3 -0
- systems/plot.py +122 -0
- systems/system_origin.py +391 -0
.gitattributes
CHANGED
@@ -33,3 +33,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/1_open_1.png filter=lfs diff=lfs merge=lfs -text
+examples/1_open_2.png filter=lfs diff=lfs merge=lfs -text
+examples/1.png filter=lfs diff=lfs merge=lfs -text
+examples/close1.png filter=lfs diff=lfs merge=lfs -text
+examples/close10.png filter=lfs diff=lfs merge=lfs -text
+examples/close2.png filter=lfs diff=lfs merge=lfs -text
+examples/close3.png filter=lfs diff=lfs merge=lfs -text
+examples/close4.png filter=lfs diff=lfs merge=lfs -text
+examples/close5.png filter=lfs diff=lfs merge=lfs -text
+examples/close7.png filter=lfs diff=lfs merge=lfs -text
+examples/close8.png filter=lfs diff=lfs merge=lfs -text
+examples/close9.jpg filter=lfs diff=lfs merge=lfs -text
+examples/open1.png filter=lfs diff=lfs merge=lfs -text
+examples/open10.png filter=lfs diff=lfs merge=lfs -text
+examples/open2.png filter=lfs diff=lfs merge=lfs -text
+examples/open3.png filter=lfs diff=lfs merge=lfs -text
+examples/open4.png filter=lfs diff=lfs merge=lfs -text
+examples/open5.png filter=lfs diff=lfs merge=lfs -text
+examples/open6.png filter=lfs diff=lfs merge=lfs -text
+examples/open7.png filter=lfs diff=lfs merge=lfs -text
+examples/open8.png filter=lfs diff=lfs merge=lfs -text
+examples/open9.jpg filter=lfs diff=lfs merge=lfs -text
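These per-file LFS rules are what the Hub appends automatically when binary files are committed through its upload APIs. As a rough sketch only (an assumption, since the exact tooling behind this commit is not shown), a commit like "Upload 29 files" can be produced with huggingface_hub; the repo id below is hypothetical:

# Sketch: one way an "Upload 29 files" commit could be produced (assumption).
# The Hub decides LFS handling from .gitattributes and adds rules like the ones above.
from huggingface_hub import upload_folder

upload_folder(
    repo_id="your-username/your-space",  # hypothetical Space id
    repo_type="space",
    folder_path=".",                     # local copy containing ckpts/, examples/, systems/
    commit_message="Upload 29 files",
)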
ckpts/dipo.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:493f551499b95af57b5bb6e872d1107a9cf4056fbf151fc45f416f96a919dad6
size 24565754
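Only this LFS pointer lives in the Git tree; the 24.5 MB checkpoint itself sits in LFS storage. A minimal sketch for fetching and inspecting it with huggingface_hub (the repo id is hypothetical, and the checkpoint is assumed to be a standard Lightning checkpoint, as systems/__init__.py suggests):

# Sketch: resolve the LFS pointer to the real file and peek at its contents (assumption).
import torch
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="your-username/your-space",  # hypothetical Space id
    repo_type="space",
    filename="ckpts/dipo.ckpt",
)
ckpt = torch.load(ckpt_path, map_location="cpu")
print(ckpt.keys())  # a Lightning checkpoint typically exposes "state_dict", "hyper_parameters", ...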
examples/1.png ADDED (Git LFS)
examples/1_open_1.png ADDED (Git LFS)
examples/1_open_2.png ADDED (Git LFS)
examples/close1.png ADDED (Git LFS)
examples/close10.png ADDED (Git LFS)
examples/close2.png ADDED (Git LFS)
examples/close3.png ADDED (Git LFS)
examples/close4.png ADDED (Git LFS)
examples/close5.png ADDED (Git LFS)
examples/close6.png ADDED
examples/close7.png ADDED (Git LFS)
examples/close8.png ADDED (Git LFS)
examples/close9.jpg ADDED (Git LFS)
examples/open1.png ADDED (Git LFS)
examples/open10.png ADDED (Git LFS)
examples/open2.png ADDED (Git LFS)
examples/open3.png ADDED (Git LFS)
examples/open4.png ADDED (Git LFS)
examples/open5.png ADDED (Git LFS)
examples/open6.png ADDED (Git LFS)
examples/open7.png ADDED (Git LFS)
examples/open8.png ADDED (Git LFS)
examples/open9.jpg ADDED (Git LFS)
systems/__init__.py
ADDED
@@ -0,0 +1,22 @@
systems = {}


def register(name):
    def decorator(cls):
        systems[name] = cls
        return cls

    return decorator


def make(name, config, load_from_checkpoint=None):
    if load_from_checkpoint is None:
        system = systems[name](config)
    else:
        system = systems[name].load_from_checkpoint(
            load_from_checkpoint, strict=False, config=config
        )
    return system


from . import system_origin
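systems/__init__.py is a small name-to-class registry: @register adds a class to the systems dict, and make() either constructs it from a config or restores it from a Lightning checkpoint. A minimal usage sketch (the DemoSystem class and cfg dict are hypothetical, for illustration only):

# Sketch of the registry pattern above (hypothetical class and config).
import systems

@systems.register("demo")
class DemoSystem:
    def __init__(self, config):
        self.config = config

cfg = {"lr": 1e-4}                  # hypothetical config
system = systems.make("demo", cfg)  # looks up "demo" and calls DemoSystem(cfg)
# systems.make("sys_origin", cfg, load_from_checkpoint="ckpts/dipo.ckpt") would instead
# restore the registered SingapoSystem via load_from_checkpoint(..., strict=False, config=cfg).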
systems/base.py
ADDED
@@ -0,0 +1,286 @@
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
import json
import math
import numpy as np
import lightning.pytorch as pl
from metrics.iou_cdist import IoU_cDist
from my_utils.savermixins import SaverMixin
from my_utils.refs import sem_ref, joint_ref
from dataset.utils import convert_data_range, parse_tree
from my_utils.plot import viz_graph, make_grid, add_text
from my_utils.render import draw_boxes_axiss_anim, prepare_meshes
from PIL import Image

class BaseSystem(pl.LightningModule, SaverMixin):
    def __init__(self, hparams):
        super().__init__()
        self.hparams.update(hparams)

    def setup(self, stage: str):
        # configure the logger dir for images
        self.hparams.save_dir = os.path.join(self.hparams.exp_dir, 'output', stage)
        os.makedirs(self.hparams.save_dir, exist_ok=True)


    # --------------------------------- visualization ---------------------------------

    def convert_json(self, x, c, idx, prefix=''):
        out = {"meta": {}, "diffuse_tree": []}

        n_nodes = c[f"{prefix}n_nodes"][idx].item()
        par = c[f"{prefix}parents"][idx].cpu().numpy().tolist()
        adj = c[f"{prefix}adj"][idx].cpu().numpy()
        np.fill_diagonal(adj, 0)  # remove the self-loop for the root node
        if f"{prefix}obj_cat" in c:
            out["meta"]["obj_cat"] = c[f"{prefix}obj_cat"][idx]

        # convert the data to the original range
        data = convert_data_range(x.cpu().numpy())
        # parse the tree
        out["diffuse_tree"] = parse_tree(data, n_nodes, par, adj)
        return out

    # def save_val_img(self, pred, gt, cond):
    #     B = pred.shape[0]
    #     pred_imgs, gt_imgs, gt_graphs_view = [], [], []
    #     for b in range(B):
    #         print(b)
    #         # convert to human readable format json
    #         pred_json = self.convert_json(pred[b], cond, b)
    #         gt_json = self.convert_json(gt[b], cond, b)
    #         # visualize bbox and axis
    #         pred_meshes = prepare_meshes(pred_json)
    #         bbox_0, bbox_1, axiss = (
    #             pred_meshes["bbox_0"],
    #             pred_meshes["bbox_1"],
    #             pred_meshes["axiss"],
    #         )
    #         pred_img = draw_boxes_axiss_anim(
    #             bbox_0, bbox_1, axiss, mode="graph", resolution=128
    #         )
    #         gt_meshes = prepare_meshes(gt_json)
    #         bbox_0, bbox_1, axiss = (
    #             gt_meshes["bbox_0"],
    #             gt_meshes["bbox_1"],
    #             gt_meshes["axiss"],
    #         )
    #         gt_img = draw_boxes_axiss_anim(
    #             bbox_0, bbox_1, axiss, mode="graph", resolution=128
    #         )
    #         # visualize graph
    #         # gt_graph = viz_graph(gt_json, res=128)
    #         # gt_graph = add_text(cond["name"][b], gt_graph)
    #         # GT views
    #         rgb_view = cond["img"][b].cpu().numpy()

    #         pred_imgs.append(pred_img)
    #         gt_imgs.append(gt_img)
    #         gt_graphs_view.append(rgb_view)
    #         # gt_graphs_view.append(gt_graph)

    #     # save images for generated results
    #     epoch = str(self.current_epoch).zfill(5)
    #     # pred_thumbnails = np.concatenate(pred_imgs, axis=1)  # concat batch in width

    #     import ipdb
    #     ipdb.set_trace()
    #     # save images for ground truth
    #     for i in range(math.ceil(len(gt_graphs_view) / 8)):
    #         start = i * 8
    #         end = min((i + 1) * 8, len(gt_graphs_view))
    #         pred_thumbnails = np.concatenate(pred_imgs[start:end], axis=1)
    #         gt_graph_imgs = np.concatenate(gt_graphs_view[start:end], axis=1)
    #         gt_thumbnails = np.concatenate(gt_imgs[start:end], axis=1)  # concat batch in width
    #         grid = np.concatenate([gt_graph_imgs, gt_thumbnails, pred_thumbnails], axis=0)
    #         self.save_rgb_image(f"new_out_valid_{i}.png", grid)

    def save_test_step(self, pred, gt, cond, batch_idx, res=128):
        exp_name = self._get_exp_name()
        model_name = cond["name"][0].replace("/", '@')
        save_dir = f"{exp_name}/{str(batch_idx)}@{model_name}"

        # input image
        input_img = cond["img"][0].cpu().numpy()
        # GT recordings
        if not self.hparams.get('test_no_GT', False):
            gt_json = self.convert_json(gt[0], cond, 0)
            # gt_graph = viz_graph(gt_json, res=256)
            gt_meshes = prepare_meshes(gt_json)
            bbox_0, bbox_1, axiss = (
                gt_meshes["bbox_0"],
                gt_meshes["bbox_1"],
                gt_meshes["axiss"],
            )
            gt_img = draw_boxes_axiss_anim(bbox_0, bbox_1, axiss, mode="graph", resolution=res)
        else:
            # gt_graph = 255 * np.ones((res, res, 3), dtype=np.uint8)
            gt_img = 255 * np.ones((res, 2 * res, 3), dtype=np.uint8)
        gt_block = np.concatenate([input_img, gt_img], axis=1)

        # recordings for generated results
        img_blocks = []
        for b in range(pred.shape[0]):
            pred_json = self.convert_json(pred[b], cond, 0)
            # visualize bbox and axis
            pred_meshes = prepare_meshes(pred_json)
            bbox_0, bbox_1, axiss = (
                pred_meshes["bbox_0"],
                pred_meshes["bbox_1"],
                pred_meshes["axiss"],
            )
            pred_img = draw_boxes_axiss_anim(
                bbox_0, bbox_1, axiss, mode="graph", resolution=res
            )
            img_blocks.append(pred_img)
            self.save_json(f"{save_dir}/{b}/object.json", pred_json)
        # save images for generated results
        img_grid = make_grid(img_blocks, cols=5)
        # visualize the input graph
        # input_graph = viz_graph(pred_json, res=256)

        # save images
        # self.save_rgb_image(f"{save_dir}/gt_graph.png", gt_graph)
        self.save_rgb_image(f"{save_dir}/output.png", img_grid)
        self.save_rgb_image(f"{save_dir}/gt.png", gt_block)
        # self.save_rgb_image(f"{save_dir}/input_graph.png", input_graph)

    def _save_html_end(self):
        exp_name = self._get_exp_name()
        save_dir = self.get_save_path(exp_name)
        cases = sorted(os.listdir(save_dir), key=lambda x: int(x.split("@")[0]))
        html_head = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Test Image Results</title>
            <style>
                table {
                    width: 100%;
                    border-collapse: collapse;
                }
                th, td {
                    border: 1px solid black;
                    padding: 8px;
                    text-align: left;
                }
                .separator {
                    border-top: 2px solid black;
                }
            </style>
        </head>
        <body>
            <table>

        """
        total = len(cases)
        each = 200
        n_pages = total // each + 1
        for p in range(n_pages):
            html_content = html_head
            for i in range(p * each, min((p + 1) * each, total)):
                case = cases[i]
                if self.hparams.get("test_no_GT", False):
                    aid_iou = rid_iou = aid_cdist = rid_cdist = aid_cd = rid_cd = aor = "N/A"
                else:
                    with open(os.path.join(save_dir, case, "metrics.json"), "r") as f:
                        metrics = json.load(f)["avg"]
                    aid_iou = round(metrics["AS-IoU"], 4)
                    rid_iou = round(metrics["RS-IoU"], 4)
                    aid_cdist = round(metrics["AS-cDist"], 4)
                    rid_cdist = round(metrics["RS-cDist"], 4)
                    aid_cd = round(metrics["AS-CD"], 4)
                    rid_cd = round(metrics["RS-CD"], 4)
                    aor = metrics["AOR"]
                    if aor is not None:
                        aor = round(aor, 4)
                html_content += f"""
                <tr>
                    <th>Object ID</th>
                    <th>Metrics (avg) </th>
                    <th>Input image + GT object + GT graph</th>
                    <th>Input graph </th>
                </tr>
                <tr>
                    <td rowspan="3">{case}</td>
                    <td>
                        [AS-cDist] {aid_cdist}<br>
                        [RS-cDist] {rid_cdist}<br>
                        -----------------------<br>
                        [AS-IoU] {aid_iou}<br>
                        [RS-IoU] {rid_iou}<br>
                        -----------------------<br>
                        [RS-CD] {rid_cd}<br>
                        [AS-CD] {aid_cd}<br>
                        -----------------------<br>
                        [AOR] {aor}<br>
                    </td>
                    <td>
                        <img src="{exp_name}/{case}/gt.png" alt="GT Image" style="height: 128px; width: 3*128px;">
                        <img src="{exp_name}/{case}/gt_graph.png" alt="Graph Image" style="height: 128px; width: 3*128px;">
                    </td>
                    <td>
                        <img src="{exp_name}/{case}/input_graph.png" alt="Graph Image" style="height: 128px; width: 3*128px;">
                    </td>
                </tr>
                <tr><th colspan="3">Generated samples</th></tr>
                <tr>
                    <td colspan="3"><img src="{exp_name}/{case}/output.png" alt="Generated Image" style="height: 3*128px; width: 10*128px;"></td>
                </tr>
                <tr class="separator"><td colspan="4"></td></tr>
                """
            html_content += """</table></body></html>"""
            outfile = self.get_save_path(f"{exp_name}_page_{p+1}.html")
            with open(outfile, "w") as file:
                file.write(html_content)

    def val_compute_metrics(self, pred, gt, cond):
        loss_dict = {}
        B = pred.shape[0]
        as_ious = 0.0
        rs_ious = 0.0
        as_cdists = 0.0
        rs_cdists = 0.0
        for b in range(B):
            gt_json = self.convert_json(gt[b], cond, b)
            pred_json = self.convert_json(pred[b], cond, b)
            scores = IoU_cDist(
                pred_json,
                gt_json,
                num_states=5,
                compare_handles=True,
                iou_include_base=True,
            )
            as_ious += scores['AS-IoU']
            rs_ious += scores['RS-IoU']
            as_cdists += scores['AS-cDist']
            rs_cdists += scores['RS-cDist']

        as_ious /= B
        rs_ious /= B
        as_cdists /= B
        rs_cdists /= B

        loss_dict['val/AS-IoU'] = as_ious
        loss_dict['val/RS-IoU'] = rs_ious
        loss_dict['val/AS-cDist'] = as_cdists
        loss_dict['val/RS-cDist'] = rs_cdists

        return loss_dict

    def _get_exp_name(self):
        which_ds = self.hparams.get("test_which", 'pm')
        is_pred_G = self.hparams.get("test_pred_G", False)
        is_label_free = self.hparams.get("test_label_free", False)
        guidance_scaler = self.hparams.get("guidance_scaler", 0)
        # configure the saving directory
        exp_postfix = f"_w={guidance_scaler}_{which_ds}"
        if is_pred_G:
            exp_postfix += "_pred_G"
        if is_label_free:
            exp_postfix += "_label_free"

        exp_name = "epoch_" + str(self.current_epoch).zfill(3) + exp_postfix
        return exp_name
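BaseSystem mixes in SaverMixin from my_utils.savermixins, which is not part of this upload. A minimal stand-in consistent with how the class is called here (get_save_path, save_rgb_image, and save_json resolving against hparams.save_dir) might look like the sketch below; the actual implementation may differ:

# Assumed stand-in for my_utils.savermixins.SaverMixin; signatures are inferred
# from BaseSystem's calls only and are not the actual implementation.
import os, json
import numpy as np
from PIL import Image

class SaverMixin:
    def get_save_path(self, name):
        # resolve a file name under the per-stage output dir created in BaseSystem.setup()
        path = os.path.join(self.hparams.save_dir, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    def save_rgb_image(self, name, img):
        # img: (H, W, 3) array; cast to uint8 before writing the PNG
        Image.fromarray(np.asarray(img).astype(np.uint8)).save(self.get_save_path(name))

    def save_json(self, name, data):
        with open(self.get_save_path(name), "w") as f:
            json.dump(data, f, indent=2)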
systems/dino_dummy.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67b13dadf868704eb0e5a1b55355d54bce806b7f9d8d877cdf4142f759544bbd
size 1572992
systems/plot.py
ADDED
@@ -0,0 +1,122 @@
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
import matplotlib
matplotlib.use('Agg')
import numpy as np
import networkx as nx
from io import BytesIO
from PIL import Image, ImageDraw
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from singapo_utils.refs import graph_color_ref

def add_text(text, imgarr):
    '''
    Function to add text to an image

    Args:
    - text (str): text to add
    - imgarr (np.array): image array

    Returns:
    - img (np.array): image array with text
    '''
    img = Image.fromarray(imgarr)
    I = ImageDraw.Draw(img)
    I.text((10, 10), text, fill='black')
    return np.asarray(img)

def get_color(ref, n_nodes):
    '''
    Function to color the nodes

    Args:
    - ref (list): list of color references
    - n_nodes (int): number of nodes

    Returns:
    - colors (list): list of colors
    '''
    N = len(ref)
    colors = []
    for i in range(n_nodes):
        colors.append(np.array([[int(i) for i in ref[i%N][4:-1].split(',')]]) / 255.)
    return colors


def make_grid(images, cols=5):
    """
    Arrange a list of images into an N x cols grid.

    Args:
    - images (list): List of Numpy arrays representing the images.
    - cols (int): Number of columns for the grid.

    Returns:
    - grid (numpy array): Numpy array representing the image grid.
    """
    # Determine the dimensions of each image
    img_h, img_w, _ = images[0].shape
    rows = len(images) // cols

    # Initialize a blank canvas
    grid = np.zeros((rows * img_h, cols * img_w, 3), dtype=images[0].dtype)

    # Place each image onto the grid
    for idx, img in enumerate(images):
        y = (idx // cols) * img_h
        x = (idx % cols) * img_w
        grid[y: y + img_h, x: x + img_w] = img

    return grid

def viz_graph(info_dict, res=256):
    '''
    Function to plot the directed graph

    Args:
    - info_dict (dict): output json containing the graph information
    - res (int): resolution of the image

    Returns:
    - img_arr (np.array): image array
    '''
    # build the tree
    tree = info_dict['diffuse_tree']
    edges = []
    for node in tree:
        edges += [(node['id'], child) for child in node['children']]
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # plot the tree
    plt.figure(figsize=(res/100, res/100))

    colors = get_color(graph_color_ref, len(tree))
    pos = nx.nx_agraph.graphviz_layout(G, prog="twopi", args="")
    node_order = sorted(G.nodes())
    nx.draw(G, pos, node_color=colors, nodelist=node_order, edge_color='k', with_labels=False)

    buf = BytesIO()
    plt.savefig(buf, format="png", dpi=100)
    buf.seek(0)
    img = Image.open(buf)
    img_arr = np.asarray(img)
    buf.close()
    plt.clf()
    plt.close()
    return img_arr[:, :, :3]

def viz_patch_feat_pca(feat):
    pca = PCA(n_components=3)
    pca.fit(feat)
    feat_pca = pca.transform(feat)

    t = np.array(feat_pca)
    t_min = t.min(axis=0, keepdims=True)
    t_max = t.max(axis=0, keepdims=True)
    normalized_t = (t - t_min) / (t_max - t_min)

    array = (normalized_t * 255).astype(np.uint8)
    img_array = array.reshape(16, 16, 3)
    return img_array
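A short usage sketch for the helpers above, assuming this repository is on the Python path (the tiles are synthetic; viz_graph additionally needs pygraphviz for the twopi layout):

# Usage sketch: label ten synthetic tiles and arrange them into a 2 x 5 grid.
import numpy as np
from systems.plot import add_text, make_grid

tiles = [np.full((128, 128, 3), 25 * i, dtype=np.uint8) for i in range(10)]
labeled = [add_text(f"sample {i}", t) for i, t in enumerate(tiles)]
grid = make_grid(labeled, cols=5)   # -> (256, 640, 3) array, rows = len(images) // cols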
systems/system_origin.py
ADDED
@@ -0,0 +1,391 @@
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
import torch
import subprocess
import numpy as np
import models
import systems
import torch.nn.functional as F
from diffusers import DDPMScheduler
from systems.base import BaseSystem
from my_utils.lr_schedulers import LinearWarmupCosineAnnealingLR
from datetime import datetime
import logging

@systems.register("sys_origin")
class SingapoSystem(BaseSystem):
    """Trainer for the B9 model, incorporating classifier-free guidance for the image condition."""

    def __init__(self, hparams):
        super().__init__(hparams)
        self.model = models.make(hparams.model.name, hparams.model)
        # configure the scheduler of DDPM
        self.scheduler = DDPMScheduler(**self.hparams.scheduler.config)
        # load the dummy DINO features
        self.dummy_dino = np.load('systems/dino_dummy.npy').astype(np.float32)
        # use manual optimization
        self.automatic_optimization = False
        # save the hyperparameters
        self.save_hyperparameters()

        self.custom_logger = logging.getLogger(__name__)
        self.custom_logger.setLevel(logging.INFO)
        if self.global_rank == 0:
            self.custom_logger.addHandler(logging.StreamHandler())

    def load_cage_weights(self, pretrained_ckpt=None):
        ckpt = torch.load(pretrained_ckpt)
        state_dict = ckpt["state_dict"]
        # remove the "model." prefix from the keys
        state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
        # load the weights
        self.model.load_state_dict(state_dict, strict=False)
        # separate the weights of CAGE and our new modules
        print("[INFO] loaded model weights of the pretrained CAGE.")


    def fg_loss(self, all_attn_maps, loss_masks):
        """
        Excite the attention maps within the object regions, while weakening the attention outside the object regions.

        Args:
            all_attn_maps: cross-attention maps from all layers, shape (B*L, H, 160, 256)
            loss_masks: object seg mask on the image patches, shape (B, 160, 256)

        Returns:
            loss: loss on the attention maps
        """
        valid_mask = loss_masks['valid_nodes']
        fg_mask = loss_masks['fg']
        # get the number of layers and batch size
        L = self.hparams.model.n_layers
        H = all_attn_maps.shape[1]
        # Reshape all the masks to the shape of the attention maps
        valid_node = valid_mask[:, :, 0].unsqueeze(1).expand(-1, H, -1).unsqueeze(-1).expand(-1, -1, -1, 256).repeat(L, 1, 1, 1)
        obj_region = fg_mask.unsqueeze(1).expand(-1, H, -1, -1).repeat(L, 1, 1, 1)
        # construct masks for the object and non-object regions
        fg_region = torch.logical_and(valid_node, obj_region)
        bg_region = torch.logical_and(valid_node, ~obj_region)
        # loss to excite the foreground regions
        loss = 1. - all_attn_maps[fg_region].mean() + all_attn_maps[bg_region].mean()
        return loss

    def diffuse_process(self, inputs):
        x = inputs["x"]
        # Sample Gaussian noise
        noise = torch.randn(x.shape, device=self.device, dtype=x.dtype)
        # Sample a random timestep for each image
        timesteps = torch.randint(
            0,
            self.scheduler.config.num_train_timesteps,
            (x.shape[0],),
            device=self.device,
            dtype=torch.long,
        )
        # Add Gaussian noise to the input
        noisy_x = self.scheduler.add_noise(x, noise, timesteps)
        # update the inputs
        inputs["noise"] = noise
        inputs["timesteps"] = timesteps
        inputs["noisy_x"] = noisy_x

    def prepare_inputs(self, batch, mode='train', n_samples=1):
        x, c, f = batch

        cat = c["cat"]  # object category
        attr_mask = c["attr_mask"]  # attention mask for local self-attention (following CAGE)
        key_pad_mask = c["key_pad_mask"]  # key padding mask for global self-attention (following CAGE)
        graph_mask = c["adj_mask"]  # attention mask for graph relation self-attention (following CAGE)

        inputs = {}
        if mode == 'train':
            # the number of sampled timesteps per iteration
            n_repeat = self.hparams.n_time_samples
            # for sampling multiple timesteps
            x = x.repeat(n_repeat, 1, 1)
            cat = cat.repeat(n_repeat)
            f = f.repeat(n_repeat, 1, 1)
            key_pad_mask = key_pad_mask.repeat(n_repeat, 1, 1)
            graph_mask = graph_mask.repeat(n_repeat, 1, 1)
            attr_mask = attr_mask.repeat(n_repeat, 1, 1)
        elif mode == 'val':
            noisy_x = torch.randn(x.shape, device=x.device)
            dummy_f = torch.tensor(self.dummy_dino, device=self.device).unsqueeze(0).repeat(1, 2, 1).expand_as(f)
            inputs["noisy_x"] = noisy_x
            inputs["dummy_f"] = dummy_f
        elif mode == 'test':
            # for sampling multiple outputs
            x = x.repeat(n_samples, 1, 1)
            cat = cat.repeat(n_samples)
            f = f.repeat(n_samples, 1, 1)
            key_pad_mask = key_pad_mask.repeat(n_samples, 1, 1)
            graph_mask = graph_mask.repeat(n_samples, 1, 1)
            attr_mask = attr_mask.repeat(n_samples, 1, 1)
            noisy_x = torch.randn(x.shape, device=x.device)
            dummy_f = torch.tensor(self.dummy_dino, device=self.device).unsqueeze(0).repeat(1, 2, 1).expand_as(f)
            inputs["noisy_x"] = noisy_x
            inputs["dummy_f"] = dummy_f.repeat(1, 2, 1)
        else:
            raise ValueError(f"Invalid mode: {mode}")

        inputs["x"] = x
        inputs["f"] = f
        inputs["cat"] = cat
        inputs["key_pad_mask"] = key_pad_mask
        inputs["graph_mask"] = graph_mask
        inputs["attr_mask"] = attr_mask

        return inputs

    def prepare_loss_mask(self, batch):
        x, c, _ = batch
        n_repeat = self.hparams.n_time_samples  # the number of sampled timesteps per iteration

        # mask on the image patches for the foreground regions
        # mask_fg = c["img_obj_mask"]
        # if mask_fg is not None:
        #     mask_fg = mask_fg.repeat(n_repeat, 1, 1)

        # mask on the valid nodes
        index_tensor = torch.arange(x.shape[1], device=self.device, dtype=torch.int32).unsqueeze(0)  # (1, N)
        valid_nodes = index_tensor < (c['n_nodes'] * 5).unsqueeze(-1)
        mask_valid_nodes = valid_nodes.unsqueeze(-1).expand_as(x)
        mask_valid_nodes = mask_valid_nodes.repeat(n_repeat, 1, 1)

        return {"fg": None, "valid_nodes": mask_valid_nodes}

    def manage_cfg(self, inputs):
        '''
        Manage the classifier-free training for the image and graph conditions.
        The CFG for the object category is managed by the model (i.e. the CombinedTimestepLabelEmbeddings module in norm1 of each attention block).
        '''
        img_drop_prob = self.hparams.get("img_drop_prob", 0.0)
        graph_drop_prob = self.hparams.get("graph_drop_prob", 0.0)
        drop_img, drop_graph = False, False

        if img_drop_prob > 0.0:
            drop_img = torch.rand(1) < img_drop_prob
            if drop_img.item():
                dummy_batch = torch.tensor(self.dummy_dino, device=self.device).unsqueeze(0).repeat(1, 2, 1).expand_as(inputs['f'])
                inputs['f'] = dummy_batch  # use the dummy DINO features

        if graph_drop_prob > 0.0:
            if not drop_img:
                drop_graph = torch.rand(1) < graph_drop_prob
                if drop_graph.item():
                    inputs['graph_mask'] = None  # for verifying the model only; replace with the line below later and retrain the model
                    # inputs['graph_mask'] = inputs['key_pad_mask']  # use the key padding mask

    def compute_loss(self, batch, inputs, outputs):
        loss_dict = {}
        # loss_weight = self.hparams.get("loss_fg_weight", 1.0)

        # prepare the loss masks
        loss_masks = self.prepare_loss_mask(batch)

        # diffusion model loss: MSE on the residual noise
        loss_mse = F.mse_loss(outputs['noise_pred'] * loss_masks['valid_nodes'], inputs['noise'] * loss_masks['valid_nodes'])
        # attention mask loss: BCE loss on the attention maps
        # loss_fg = loss_weight * self.fg_loss(outputs['attn_maps'], loss_masks)

        # total loss
        loss = loss_mse

        # log the losses
        loss_dict["train/loss_mse"] = loss_mse
        loss_dict["train/loss_total"] = loss

        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        # prepare the inputs and GT
        inputs = self.prepare_inputs(batch, mode='train')

        # manage the classifier-free training
        self.manage_cfg(inputs)

        # forward: diffusion process
        self.diffuse_process(inputs)

        # reverse: denoising process
        outputs = self.model(
            x=inputs['noisy_x'],
            cat=inputs['cat'],
            timesteps=inputs['timesteps'],
            feat=inputs['f'],
            key_pad_mask=inputs['key_pad_mask'],
            graph_mask=inputs['graph_mask'],
            attr_mask=inputs['attr_mask'],
        )

        # compute the loss
        loss, loss_dict = self.compute_loss(batch, inputs, outputs)

        # manual backward
        opt1, opt2 = self.optimizers()
        opt1.zero_grad()
        opt2.zero_grad()
        self.manual_backward(loss)
        opt1.step()
        opt2.step()

        if batch_idx % 20 == 0 and self.global_rank == 0:
            now = datetime.now()
            now_str = now.strftime("%Y-%m-%d %H:%M:%S")
            loss_str = f'Epoch:{self.current_epoch} | Step:{batch_idx:03d} | '
            for key, value in loss_dict.items():
                loss_str += f"{key}: {value.item():.4f} | "
            self.custom_logger.info(now_str + ' | ' + loss_str)
        # logging
        # self.log_dict(loss_dict, sync_dist=True, on_step=True, on_epoch=False)

    def on_train_epoch_end(self):
        # step the lr schedulers every epoch
        sch1, sch2 = self.lr_schedulers()
        sch1.step()
        sch2.step()

    def inference(self, inputs, is_label_free=False):
        device = inputs['x'].device
        omega = self.hparams.get("guidance_scaler", 0)
        noisy_x = inputs['noisy_x']

        # set the scheduler to denoise in 100 steps
        self.scheduler.set_timesteps(100)
        # denoising process
        for t in self.scheduler.timesteps:
            timesteps = torch.tensor([t], device=device)
            outputs_cond = self.model(
                x=noisy_x,
                cat=inputs['cat'],
                timesteps=timesteps,
                feat=inputs['f'],
                key_pad_mask=inputs['key_pad_mask'],
                graph_mask=inputs['graph_mask'],
                attr_mask=inputs['attr_mask'],
                label_free=is_label_free,
            )  # take the conditional image features as input
            if omega != 0:
                outputs_free = self.model(
                    x=noisy_x,
                    cat=inputs['cat'],
                    timesteps=timesteps,
                    feat=inputs['dummy_f'],
                    key_pad_mask=inputs['key_pad_mask'],
                    graph_mask=inputs['graph_mask'],
                    attr_mask=inputs['attr_mask'],
                    label_free=is_label_free,
                )  # take the dummy DINO features for the condition-free mode
                noise_pred = (1 + omega) * outputs_cond['noise_pred'] - omega * outputs_free['noise_pred']
            else:
                noise_pred = outputs_cond['noise_pred']
            noisy_x = self.scheduler.step(noise_pred, t, noisy_x).prev_sample

        return noisy_x

    def validation_step(self, batch, batch_idx):
        # prepare the inputs and GT
        inputs = self.prepare_inputs(batch, mode='val')
        # denoising process for inference
        out = self.inference(inputs)
        # compute the metrics
        # new_out = torch.zeros_like(out).type_as(out).to(out.device)
        # for b in range(out.shape[0]):
        #     for k in range(32):
        #         if out[b][(k + 1) * 6 - 1].mean() > 0.5:
        #             new_out[b][k * 6: (k + 1) * 6] = out[b][k * 6: (k + 1) * 6]
        # zero center

        # rescale

        # ready
        # out = new_out
        # new_out = torch.zeros_like(out).type_as(out).to(out.device)
        # for b in range(out.shape[0]):
        #     for k in range(32):
        #         min_aabb_diff = 1e10
        #         min_index = k
        #         aabb_center = (out[b][k * 6][:3] + out[b][k * 6][3:]) / 2
        #         for k_gt in range(32):
        #             aabb_gt_center = (batch[1][b][k_gt * 6][:3] + batch[1][b][k_gt * 6][3:]) / 2
        #             aabb_diff = torch.norm(aabb_center - aabb_gt_center)
        #             if aabb_diff < min_aabb_diff:
        #                 min_aabb_diff = aabb_diff
        #                 min_index = k_gt
        #         new_out[b][min_index * 6: (min_index + 1) * 6] = out[b][k * 6: (k + 1) * 6]
        # out = new_out

        log_dict = self.val_compute_metrics(out, inputs['x'], batch[1])
        self.log_dict(log_dict, on_step=True)

        # visualize the first results
        # self.save_val_img(out[:16], inputs['x'][:16], batch[1])

    def test_step(self, batch, batch_idx):
        # exp_name = self._get_exp_name()
        # print(self.get_save_path(exp_name))
        # if batch_idx > 2:
        #     return
        # return
        is_label_free = self.hparams.get("test_label_free", False)
        exp_name = self._get_exp_name()
        model_name = batch[1]["name"][0].replace("/", '@')
        save_dir = f"{exp_name}/{str(batch_idx)}@{model_name}"
        print(save_dir)
        if os.path.exists(self.get_save_path(f"{save_dir}/output.png")):
            return
        # prepare the inputs and GT
        inputs = self.prepare_inputs(batch, mode='test', n_samples=5)
        # denoising process for inference
        out = self.inference(inputs, is_label_free)
        # save the results
        self.save_test_step(out, inputs['x'], batch[1], batch_idx)

    def on_test_end(self):
        # only run on a single GPU
        # if self.global_rank == 0:
        #     exp_name = self._get_exp_name()
        #     # retrieve parts
        #     subprocess.run(['python', 'scripts/mesh_retrieval/run_retrieve.py', '--src', self.get_save_path(exp_name), '--json_name', 'object.json', '--gt_data_root', '../singapo'])
        #     # save metrics
        #     if not self.hparams.get("test_no_GT", False):
        #         subprocess.run(['python', 'scripts/eval_metrics.py', '--exp_dir', self.get_save_path(exp_name), '--gt_root', '../acd_data/'])
        #     # save html
        #     self._save_html_end()
        pass

    def configure_optimizers(self):
        # keep two separate parameter groups: the adapter modules and the pretrained CAGE weights
        self.cage_params, self.adapter_params = [], []
        for name, param in self.model.named_parameters():
            if "img" in name or "norm5" in name or "norm6" in name:
                self.adapter_params.append(param)
            else:
                self.cage_params.append(param)
        optimizer_adapter = torch.optim.AdamW(
            self.adapter_params, **self.hparams.optimizer_adapter.args
        )
        lr_scheduler_adapter = LinearWarmupCosineAnnealingLR(
            optimizer_adapter,
            warmup_epochs=self.hparams.lr_scheduler_adapter.warmup_epochs,
            max_epochs=self.hparams.lr_scheduler_adapter.max_epochs,
            warmup_start_lr=self.hparams.lr_scheduler_adapter.warmup_start_lr,
            eta_min=self.hparams.lr_scheduler_adapter.eta_min,
        )

        optimizer_cage = torch.optim.AdamW(
            self.cage_params, **self.hparams.optimizer_cage.args
        )
        lr_scheduler_cage = LinearWarmupCosineAnnealingLR(
            optimizer_cage,
            warmup_epochs=self.hparams.lr_scheduler_cage.warmup_epochs,
            max_epochs=self.hparams.lr_scheduler_cage.max_epochs,
            warmup_start_lr=self.hparams.lr_scheduler_cage.warmup_start_lr,
            eta_min=self.hparams.lr_scheduler_cage.eta_min,
        )
        return (
            {"optimizer": optimizer_adapter, "lr_scheduler": lr_scheduler_adapter},
            {"optimizer": optimizer_cage, "lr_scheduler": lr_scheduler_cage},
        )
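The inference loop above applies classifier-free guidance by running the model twice per denoising step, once with the real DINO features and once with the dummy features, and extrapolating between the two predictions. The combination rule, isolated with toy tensors (the shapes are illustrative assumptions, and omega corresponds to hparams.guidance_scaler):

# Sketch of the guidance combination used in SingapoSystem.inference() (toy shapes).
import torch

def cfg_combine(noise_cond, noise_free, omega):
    # omega = 0 reduces to the purely conditional prediction
    return (1 + omega) * noise_cond - omega * noise_free

noise_cond = torch.randn(2, 160, 6)   # assumed (batch, 32 nodes x 5 tokens, attrs) layout
noise_free = torch.randn(2, 160, 6)
guided = cfg_combine(noise_cond, noise_free, omega=1.5)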