Try another approach for the final decoder: convolutional upsampling instead of a linear patch projection

2025-09-29 23:20:27 +09:00
parent 12a165e461
commit 0ccf1ff42d
14 changed files with 88 additions and 24 deletions
--- a/main.py
+++ b/main.py
@@ -72,8 +72,8 @@ for epoch in range(100):
            ):
                batch = test_dataset[i : i + batch_size]
                images = rf.sample(batch["x0"].to(device))
-                image = denormalize(images[-1]).clamp(0, 1) * 255
-                original = denormalize(batch["x1"]).clamp(0, 1) * 255
+                image = denormalize(images[-1]).clamp(0, 1)
+                original = denormalize(batch["x1"]).clamp(0, 1)

                psnr, ssim, lpips = benchmark(image.cpu(), original.cpu())
                psnr_sum += psnr.sum().item()
@@ -92,7 +92,6 @@ for epoch in range(100):
                "epoch": epoch + 1,
            }
        )
-
        rf.model.train()

        torch.save(
--- a/quick_eval.py
+++ b/quick_eval.py
@@ -41,13 +41,13 @@ with torch.no_grad():
        batch = test_dataset[i : i + batch_size]
        images = rf.sample(batch["x0"].to(device))

-        image = denormalize(images[-1]).clamp(0, 1) * 255
-        original = denormalize(batch["x1"]).clamp(0, 1) * 255
+        image = denormalize(images[-1]).clamp(0, 1)
+        original = denormalize(batch["x1"]).clamp(0, 1)

        if saved_count < max_save:
            for j in range(min(image.shape[0], max_save - saved_count)):
-                save_image(image[j] / 255, f"{save_dir}/pred_{saved_count}.png")
-                save_image(original[j] / 255, f"{save_dir}/gt_{saved_count}.png")
+                save_image(image[j], f"{save_dir}/pred_{saved_count}.png")
+                save_image(original[j], f"{save_dir}/gt_{saved_count}.png")
                save_image(
                    denormalize(batch["x0"][j]).clamp(0, 1),
                    f"{save_dir}/input_{saved_count}.png",
--- a/src/benchmark/init.py
+++ b/src/benchmark/init.py
@@ -4,9 +4,11 @@ from torchmetrics.image import (
    StructuralSimilarityIndexMeasure,
 )

-psnr = PeakSignalNoiseRatio(255.0, reduction="none")
-ssim = StructuralSimilarityIndexMeasure(reduction="none")
-lpips = LearnedPerceptualImagePatchSimilarity(net_type="alex", reduction="none")
+psnr = PeakSignalNoiseRatio(1.0, reduction="none")
+ssim = StructuralSimilarityIndexMeasure(data_range=1.0, reduction="none")
+lpips = LearnedPerceptualImagePatchSimilarity(
+    net_type="alex", reduction="none", normalize=True
+)


 def benchmark(image1, image2):
--- a/src/model/utransformer.py
+++ b/src/model/utransformer.py
@@ -137,40 +137,103 @@ class DinoConditionedLayer(DINOv3ViTLayer):
        return hidden_states


+# class DinoV3ViTDecoder(nn.Module):
+#     def __init__(self, config: DINOv3ViTConfig):
+#         super().__init__()
+#         self.config = config
+#         self.num_channels_out = config.num_channels
+
+#         self.projection = nn.Linear(
+#             config.hidden_size,
+#             self.num_channels_out * config.patch_size * config.patch_size,
+#             bias=True,
+#         )
+
+#     def forward(self, x: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
+#         batch_size = x.shape[0]
+
+#         num_special_tokens = 1 + self.config.num_register_tokens
+#         patch_tokens = x[:, num_special_tokens:, :]
+
+#         projected_tokens = self.projection(patch_tokens)
+
+#         p = self.config.patch_size
+#         c = self.num_channels_out
+#         h_grid = image_size[0] // p
+#         w_grid = image_size[1] // p
+
+#         assert patch_tokens.shape[1] == h_grid * w_grid, (
+#             "Number of patches does not match image size."
+#         )
+
+#         x_reshaped = projected_tokens.reshape(batch_size, h_grid, w_grid, p, p, c)
+
+#         x_permuted = torch.einsum("nhwpqc->nchpwq", x_reshaped)
+
+#         reconstructed_image = x_permuted.reshape(batch_size, c, h_grid * p, w_grid * p)
+
+#         return reconstructed_image
+
+# Let's try a convolutional decoder instead.
+
+
 class DinoV3ViTDecoder(nn.Module):
    def __init__(self, config: DINOv3ViTConfig):
        super().__init__()
        self.config = config
        self.num_channels_out = config.num_channels
+        hidden_dim = config.hidden_size
+        patch_size = config.patch_size

-        self.projection = nn.Linear(
-            config.hidden_size,
-            self.num_channels_out * config.patch_size * config.patch_size,
-            bias=True,
+        self.projection = nn.Linear(hidden_dim, hidden_dim)
+
+        if patch_size == 14:
+            final_upsample = 7
+        elif patch_size == 16:
+            final_upsample = 8
+        elif patch_size == 8:
+            final_upsample = 4
+        else:
+            raise ValueError("invalid")
+
+        self.decoder = nn.Sequential(
+            nn.Conv2d(hidden_dim, 256, kernel_size=3, padding=1),
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=True),
+            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
+            nn.Conv2d(256, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU(inplace=True),
+            nn.Upsample(
+                scale_factor=final_upsample, mode="bilinear", align_corners=False
+            ),
+            nn.Conv2d(128, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(64, 32, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(32, self.num_channels_out, kernel_size=1),
        )

    def forward(self, x: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
        batch_size = x.shape[0]

-        num_special_tokens = 1 + self.config.num_register_tokens
-        patch_tokens = x[:, num_special_tokens:, :]
+        patch_tokens = x[:, 1 + self.config.num_register_tokens :, :]

        projected_tokens = self.projection(patch_tokens)

        p = self.config.patch_size
-        c = self.num_channels_out
        h_grid = image_size[0] // p
        w_grid = image_size[1] // p

-        assert patch_tokens.shape[1] == h_grid * w_grid, (
-            "Number of patches does not match image size."
+        assert patch_tokens.shape[1] == h_grid * w_grid
+
+        x_spatial = projected_tokens.reshape(
+            batch_size, h_grid, w_grid, self.config.hidden_size
        )

-        x_reshaped = projected_tokens.reshape(batch_size, h_grid, w_grid, p, p, c)
-
-        x_permuted = torch.einsum("nhwpqc->nchpwq", x_reshaped)
-
-        reconstructed_image = x_permuted.reshape(batch_size, c, h_grid * p, w_grid * p)
+        x_spatial = x_spatial.permute(0, 3, 1, 2)
+        reconstructed_image = self.decoder(x_spatial)

        return reconstructed_image

--- a/test_images/pred_0.png
+++ b/test_images/pred_0.png
--- a/test_images/pred_1.png
+++ b/test_images/pred_1.png
--- a/test_images/pred_2.png
+++ b/test_images/pred_2.png
--- a/test_images/pred_3.png
+++ b/test_images/pred_3.png
--- a/test_images/pred_4.png
+++ b/test_images/pred_4.png
--- a/test_images/pred_5.png
+++ b/test_images/pred_5.png
--- a/test_images/pred_6.png
+++ b/test_images/pred_6.png
--- a/test_images/pred_7.png
+++ b/test_images/pred_7.png
--- a/test_images/pred_8.png
+++ b/test_images/pred_8.png
--- a/test_images/pred_9.png
+++ b/test_images/pred_9.png