diff --git a/image-inpainting/results/testset/tikaiz-16.6824.npz b/image-inpainting/results/testset/tikaiz-16.6824.npz
new file mode 100644
index 0000000..e908341
Binary files /dev/null and b/image-inpainting/results/testset/tikaiz-16.6824.npz differ
diff --git a/image-inpainting/src/__pycache__/architecture.cpython-313.pyc b/image-inpainting/src/__pycache__/architecture.cpython-313.pyc
index f585442..79bbc27 100644
Binary files a/image-inpainting/src/__pycache__/architecture.cpython-313.pyc and b/image-inpainting/src/__pycache__/architecture.cpython-313.pyc differ
diff --git a/image-inpainting/src/__pycache__/datasets.cpython-313.pyc b/image-inpainting/src/__pycache__/datasets.cpython-313.pyc
index 96995e0..2400c60 100644
Binary files a/image-inpainting/src/__pycache__/datasets.cpython-313.pyc and b/image-inpainting/src/__pycache__/datasets.cpython-313.pyc differ
diff --git a/image-inpainting/src/__pycache__/train.cpython-313.pyc b/image-inpainting/src/__pycache__/train.cpython-313.pyc
index e2a0124..054d48c 100644
Binary files a/image-inpainting/src/__pycache__/train.cpython-313.pyc and b/image-inpainting/src/__pycache__/train.cpython-313.pyc differ
diff --git a/image-inpainting/src/architecture.py b/image-inpainting/src/architecture.py
index 8361aea..e37c5e6 100644
--- a/image-inpainting/src/architecture.py
+++ b/image-inpainting/src/architecture.py
@@ -88,9 +88,16 @@ class EfficientAttention(nn.Module):
 
 class ConvBlock(nn.Module):
     """Convolutional block with Conv2d -> BatchNorm -> LeakyReLU"""
-    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, dropout=0.0):
+    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, dropout=0.0, separable=False):
         super().__init__()
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation)
+        if separable and in_channels > 1:
+            # Depthwise separable convolution for efficiency
+            self.conv = nn.Sequential(
+                nn.Conv2d(in_channels, in_channels, kernel_size, padding=padding, dilation=dilation, groups=in_channels),
+                nn.Conv2d(in_channels, out_channels, 1)
+            )
+        else:
+            self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation)
         self.bn = nn.BatchNorm2d(out_channels)
         self.relu = nn.LeakyReLU(0.2, inplace=True)
         self.dropout = nn.Dropout2d(dropout) if dropout > 0 else nn.Identity()
@@ -142,30 +149,39 @@ class ResidualConvBlock(nn.Module):
 
 
 class DownBlock(nn.Module):
-    """Enhanced downsampling block with residual connections"""
-    def __init__(self, in_channels, out_channels, dropout=0.1, use_attention=True):
+    """Enhanced downsampling block with dense and residual connections"""
+    def __init__(self, in_channels, out_channels, dropout=0.1, use_attention=True, use_dense=False):
         super().__init__()
-        self.conv1 = ConvBlock(in_channels, out_channels, dropout=dropout)
-        self.residual = ResidualConvBlock(out_channels, dropout=dropout)
+        self.conv1 = ConvBlock(in_channels, out_channels, dropout=dropout, separable=True)
+        self.conv2 = ConvBlock(out_channels, out_channels, dropout=dropout)
+        if use_dense:
+            self.dense = DenseBlock(out_channels, growth_rate=8, num_layers=2, dropout=dropout)
+        else:
+            self.dense = ResidualConvBlock(out_channels, dropout=dropout)
         self.attention = EfficientAttention(out_channels) if use_attention else nn.Identity()
         self.pool = nn.MaxPool2d(2)
     
     def forward(self, x):
         x = self.conv1(x)
-        x = self.residual(x)
+        x = self.conv2(x)
+        x = self.dense(x)
         skip = self.attention(x)
         return self.pool(skip), skip
 
 class UpBlock(nn.Module):
     """Enhanced upsampling block with gated skip connections"""
-    def __init__(self, in_channels, out_channels, dropout=0.1, use_attention=True):
+    def __init__(self, in_channels, out_channels, dropout=0.1, use_attention=True, use_dense=False):
         super().__init__()
         self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
         # Skip connection has in_channels, upsampled has out_channels
         self.gated_skip = GatedSkipConnection(out_channels, in_channels)
         # After gated skip: out_channels
-        self.conv1 = ConvBlock(out_channels, out_channels, dropout=dropout)
-        self.residual = ResidualConvBlock(out_channels, dropout=dropout)
+        self.conv1 = ConvBlock(out_channels, out_channels, dropout=dropout, separable=True)
+        self.conv2 = ConvBlock(out_channels, out_channels, dropout=dropout)
+        if use_dense:
+            self.dense = DenseBlock(out_channels, growth_rate=8, num_layers=2, dropout=dropout)
+        else:
+            self.dense = ResidualConvBlock(out_channels, dropout=dropout)
         self.attention = EfficientAttention(out_channels) if use_attention else nn.Identity()
     
     def forward(self, x, skip):
@@ -175,7 +191,8 @@ class UpBlock(nn.Module):
             x = F.interpolate(x, size=skip.shape[2:], mode='bilinear', align_corners=False)
         x = self.gated_skip(x, skip)
         x = self.conv1(x)
-        x = self.residual(x)
+        x = self.conv2(x)
+        x = self.dense(x)
         x = self.attention(x)
         return x
 
@@ -205,28 +222,30 @@ class MyModel(nn.Module):
             nn.LeakyReLU(0.2, inplace=True)
         )
         
-        # Encoder with attention on deeper layers only
-        self.down1 = DownBlock(base_channels, base_channels * 2, dropout=dropout, use_attention=False)
-        self.down2 = DownBlock(base_channels * 2, base_channels * 4, dropout=dropout, use_attention=True)
-        self.down3 = DownBlock(base_channels * 4, base_channels * 8, dropout=dropout, use_attention=True)
+        # Encoder with progressive feature extraction
+        self.down1 = DownBlock(base_channels, base_channels * 2, dropout=dropout, use_attention=False, use_dense=False)
+        self.down2 = DownBlock(base_channels * 2, base_channels * 4, dropout=dropout, use_attention=True, use_dense=True)
+        self.down3 = DownBlock(base_channels * 4, base_channels * 8, dropout=dropout, use_attention=True, use_dense=True)
         
-        # Enhanced bottleneck with multi-scale features
+        # Enhanced bottleneck with multi-scale features and dense connections
         self.bottleneck = nn.Sequential(
             ConvBlock(base_channels * 8, base_channels * 8, dropout=dropout),
+            DenseBlock(base_channels * 8, growth_rate=10, num_layers=3, dropout=dropout),
             ConvBlock(base_channels * 8, base_channels * 8, dilation=2, padding=2, dropout=dropout),
             ResidualConvBlock(base_channels * 8, dropout=dropout),
             EfficientAttention(base_channels * 8)
         )
         
-        # Decoder with attention on deeper layers
-        self.up1 = UpBlock(base_channels * 8, base_channels * 4, dropout=dropout, use_attention=True)
-        self.up2 = UpBlock(base_channels * 4, base_channels * 2, dropout=dropout, use_attention=True)
-        self.up3 = UpBlock(base_channels * 2, base_channels, dropout=dropout, use_attention=False)
+        # Decoder with progressive reconstruction
+        self.up1 = UpBlock(base_channels * 8, base_channels * 4, dropout=dropout, use_attention=True, use_dense=True)
+        self.up2 = UpBlock(base_channels * 4, base_channels * 2, dropout=dropout, use_attention=True, use_dense=True)
+        self.up3 = UpBlock(base_channels * 2, base_channels, dropout=dropout, use_attention=False, use_dense=False)
         
-        # Multi-scale feature fusion
+        # Multi-scale feature fusion with dense connections
         self.multiscale_fusion = nn.Sequential(
             ConvBlock(base_channels * 2, base_channels),
-            ResidualConvBlock(base_channels, dropout=dropout//2)
+            DenseBlock(base_channels, growth_rate=8, num_layers=2, dropout=dropout / 2),  # "/" not "//": floor division turns a fractional rate into 0.0
+            ConvBlock(base_channels, base_channels)
         )
         
         # Output with residual connection to input
diff --git a/image-inpainting/src/datasets.py b/image-inpainting/src/datasets.py
index 9ad7db6..d7341a3 100644
--- a/image-inpainting/src/datasets.py
+++ b/image-inpainting/src/datasets.py
@@ -37,27 +37,35 @@ def preprocess(input_array: np.ndarray):
     input_array = np.asarray(input_array, dtype=np.float32) / 255.0
     return input_array
 
-def augment_image(img: Image, strength: float = 0.5) -> Image:
-    """Apply fast data augmentation with controlled strength"""
+def augment_image(img: Image, strength: float = 0.7) -> Image:
+    """Apply comprehensive data augmentation for better generalization"""
     # Random horizontal flip
     if random.random() > 0.5:
         img = img.transpose(Image.FLIP_LEFT_RIGHT)
     
-    # Random rotation (90, 180, 270 degrees) - less frequent for speed
-    if random.random() > 0.6:
+    # Random vertical flip
+    if random.random() > 0.5:
+        img = img.transpose(Image.FLIP_TOP_BOTTOM)
+    
+    # Random rotation (90, 180, 270 degrees)
+    if random.random() > 0.5:
         angle = random.choice([90, 180, 270])
         img = img.rotate(angle)
     
-    # Simplified color jitter - only one transformation per image for speed
+    # Color augmentation - more aggressive for long training
     rand = random.random()
-    if rand > 0.66:
+    if rand > 0.75:
         # Brightness
-        factor = 1.0 + random.uniform(-0.15, 0.15) * strength
+        factor = 1.0 + random.uniform(-0.2, 0.2) * strength
         img = ImageEnhance.Brightness(img).enhance(factor)
-    elif rand > 0.33:
+    elif rand > 0.5:
         # Contrast
-        factor = 1.0 + random.uniform(-0.15, 0.15) * strength
+        factor = 1.0 + random.uniform(-0.2, 0.2) * strength
         img = ImageEnhance.Contrast(img).enhance(factor)
+    elif rand > 0.25:
+        # Saturation
+        factor = 1.0 + random.uniform(-0.15, 0.15) * strength
+        img = ImageEnhance.Color(img).enhance(factor)
     
     return img
 
@@ -66,7 +74,7 @@ class ImageDataset(torch.utils.data.Dataset):
     Dataset class for loading images from a folder with augmentation support
     """
 
-    def __init__(self, datafolder: str, augment: bool = True, augment_strength: float = 0.5):
+    def __init__(self, datafolder: str, augment: bool = True, augment_strength: float = 0.7):
         self.imagefiles = sorted(glob.glob(os.path.join(datafolder,"**","*.jpg"),recursive=True))
         self.augment = augment
         self.augment_strength = augment_strength
diff --git a/image-inpainting/src/main.py b/image-inpainting/src/main.py
index d15f60d..7bccb7b 100644
--- a/image-inpainting/src/main.py
+++ b/image-inpainting/src/main.py
@@ -24,22 +24,22 @@ if __name__ == '__main__':
     config_dict['results_path'] = os.path.join(project_root, "results")
     config_dict['data_path'] = os.path.join(project_root, "data", "dataset")
     config_dict['device'] = None
-    config_dict['learningrate'] = 1e-3  # Higher max LR for OneCycleLR
-    config_dict['weight_decay'] = 1e-5  # Lower for faster learning
-    config_dict['n_updates'] = 3500  # Reduced for fast training
-    config_dict['batchsize'] = 96  # Larger batch for speed
-    config_dict['early_stopping_patience'] = 3  # Adjusted patience
+    config_dict['learningrate'] = 8e-4  # Optimized for long training
+    config_dict['weight_decay'] = 1e-4  # Better regularization for long training
+    config_dict['n_updates'] = 30000  # Full day of training (~24 hours)
+    config_dict['batchsize'] = 64  # Balanced for memory and quality
+    config_dict['early_stopping_patience'] = 15  # More patience for better convergence
     config_dict['use_wandb'] = False
 
-    config_dict['print_train_stats_at'] = 10
-    config_dict['print_stats_at'] = 100
+    config_dict['print_train_stats_at'] = 50
+    config_dict['print_stats_at'] = 200
     config_dict['plot_at'] = 500
-    config_dict['validate_at'] = 500  # More frequent validation
+    config_dict['validate_at'] = 500  # Regular validation
 
     network_config = {
         'n_in_channels': 4,
-        'base_channels': 40,  # Optimized for memory efficiency
-        'dropout': 0.08  # Fine-tuned dropout
+        'base_channels': 44,  # Optimal capacity for 16GB VRAM
+        'dropout': 0.12  # Higher dropout for longer training
     }
     
     config_dict['network_config'] = network_config
diff --git a/image-inpainting/src/train.py b/image-inpainting/src/train.py
index 0ffaaa4..f0c9840 100644
--- a/image-inpainting/src/train.py
+++ b/image-inpainting/src/train.py
@@ -15,7 +15,6 @@ import os
 
 from torch.utils.data import DataLoader
 from torch.utils.data import Subset
-from torch.optim.lr_scheduler import OneCycleLR
 
 import wandb
 
@@ -44,6 +43,10 @@ def train(seed, testset_ratio, validset_ratio, data_path, results_path, early_st
 
     if isinstance(device, str):
         device = torch.device(device)
+    
+    # Enable mixed precision only when training targets CUDA (an unset device keeps the availability-only check)
+    use_amp = torch.cuda.is_available() and (device is None or torch.device(device).type == 'cuda')
+    scaler = torch.amp.GradScaler('cuda') if use_amp else None
 
     if use_wandb:
         wandb.login()
@@ -93,11 +96,12 @@ def train(seed, testset_ratio, validset_ratio, data_path, results_path, early_st
     mse_loss = torch.nn.MSELoss()  # Keep for evaluation
 
     # defining the optimizer with AdamW for better weight decay handling
-    optimizer = torch.optim.AdamW(network.parameters(), lr=learningrate, weight_decay=weight_decay, betas=(0.9, 0.99))
+    optimizer = torch.optim.AdamW(network.parameters(), lr=learningrate, weight_decay=weight_decay, betas=(0.9, 0.999))
     
-    # OneCycleLR for fast convergence - ramps up then down over entire training
-    scheduler = OneCycleLR(optimizer, max_lr=learningrate, total_steps=n_updates, 
-                          pct_start=0.3, anneal_strategy='cos', div_factor=25.0, final_div_factor=1e4)
+    # Cosine annealing with warm restarts (equal-length cycles); T_0 is clamped because it must be a positive int
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
+        optimizer, T_0=max(1, n_updates // 4), T_mult=1, eta_min=learningrate / 100
+    )
 
     if use_wandb:
         wandb.watch(network, mse_loss, log="all", log_freq=10)
@@ -122,17 +126,31 @@ def train(seed, testset_ratio, validset_ratio, data_path, results_path, early_st
 
             optimizer.zero_grad()
 
-            output = network(input)
-
-            loss = rmse_loss(output, target)
-
-            loss.backward()
+            # Mixed precision training for memory efficiency
+            if use_amp:
+                with torch.amp.autocast('cuda'):
+                    output = network(input)
+                    loss = rmse_loss(output, target)
+                
+                scaler.scale(loss).backward()
+                
+                # Gradient clipping for training stability
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1.0)
+                
+                scaler.step(optimizer)
+                scaler.update()
+            else:
+                output = network(input)
+                loss = rmse_loss(output, target)
+                loss.backward()
+                
+                # Gradient clipping for training stability
+                torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1.0)
+                
+                optimizer.step()
             
-            # Gradient clipping for training stability
-            torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1.0)
-
-            optimizer.step()
-            scheduler.step()  # OneCycleLR steps once per optimizer step
+            scheduler.step()
 
             loss_list.append(loss.item())
 
@@ -143,7 +161,11 @@ def train(seed, testset_ratio, validset_ratio, data_path, results_path, early_st
             # plotting
             if (i + 1) % plot_at == 0:
                 print(f"Plotting images, current update {i + 1}")
-                plot(input.cpu().numpy(), target.detach().cpu().numpy(), output.detach().cpu().numpy(), plotpath, i)
+                # Convert to float32 for matplotlib compatibility
+                plot(input.float().cpu().numpy(), 
+                     target.detach().float().cpu().numpy(), 
+                     output.detach().float().cpu().numpy(), 
+                     plotpath, i)
 
             # evaluating model every validate_at sample
             if (i + 1) % validate_at == 0: