diff --git a/ThermalSolver/data_module.py b/ThermalSolver/data_module.py
index 1a48b1d..b3a0a73 100644
--- a/ThermalSolver/data_module.py
+++ b/ThermalSolver/data_module.py
@@ -23,8 +23,9 @@ class GraphDataModule(LightningDataModule):
         super().__init__()
         self.hf_repo = hf_repo
         self.split_name = split_name
-        self.dataset = None
-        self.geometry = None
+        self.dataset_dict = {}
+        # self.geometry = None
+        self.geometry_dict = {}
         self.train_size = train_size
         self.val_size = val_size
         self.test_size = test_size
@@ -32,20 +33,30 @@ class GraphDataModule(LightningDataModule):
         self.remove_boundary_edges = remove_boundary_edges
 
     def prepare_data(self):
-        hf_dataset = load_dataset(self.hf_repo, name="snapshots")[
-            self.split_name
-        ]
-        self.geometry = load_dataset(self.hf_repo, name="geometry")[
-            self.split_name
-        ]
-        self.data = [
-            self._build_dataset(snapshot, geometry)
-            for snapshot, geometry in tqdm(
-                zip(hf_dataset, self.geometry),
-                desc="Building graphs",
-                total=len(hf_dataset),
-            )
-        ]
+        dataset = load_dataset(self.hf_repo, name="snapshots")[self.split_name]
+        geometry = load_dataset(self.hf_repo, name="geometry")[self.split_name]
+        # data = [
+        #     self._build_dataset(snapshot, geometry)
+        #     for snapshot, geometry in tqdm(
+        #         zip(hf_dataset, self.geometry),
+        #         desc="Building graphs",
+        #         total=len(hf_dataset),
+        #     )
+        # ]
+
+        total_len = len(dataset)
+        train_len = int(self.train_size * total_len)
+        valid_len = int(self.val_size * total_len)
+        self.dataset_dict = {
+            "train": dataset.select(range(0, train_len)),
+            "val": dataset.select(range(train_len, train_len + valid_len)),
+            "test": dataset.select(range(train_len + valid_len, total_len)),
+        }
+        self.geometry_dict = {
+            "train": geometry.select(range(0, train_len)),
+            "val": geometry.select(range(train_len, train_len + valid_len)),
+            "test": geometry.select(range(train_len + valid_len, total_len)),
+        }
 
     def _compute_boundary_mask(
         self, bottom_ids, right_ids, top_ids, left_ids, temperature
@@ -132,15 +143,34 @@ class GraphDataModule(LightningDataModule):
         )
 
     def setup(self, stage: str = None):
-        n = len(self.data)
-        train_end = int(n * self.train_size)
-        val_end = train_end + int(n * self.val_size)
+        """
+        Build the graph lists needed for ``stage``.
+
+        "fit" builds train_data and val_data, "test" builds test_data,
+        None builds all three.
+        """
+        # The split dicts are only filled by prepare_data(); run it here if
+        # they are still empty instead of failing with a KeyError below.
+        if not self.dataset_dict:
+            self.prepare_data()
 
         if stage == "fit" or stage is None:
-            self.train_data = self.data[:train_end]
-            self.val_data = self.data[train_end:val_end]
+            self.train_data = self._build_split("train")
+            self.val_data = self._build_split("val")
         if stage == "test" or stage is None:
-            self.test_data = self.data[val_end:]
+            self.test_data = self._build_split("test")
+
+    def _build_split(self, split):
+        """Build one graph per (snapshot, geometry) pair of ``split``."""
+        snapshots = self.dataset_dict[split]
+        return [
+            self._build_dataset(snap, geom)
+            for snap, geom in tqdm(
+                zip(snapshots, self.geometry_dict[split]),
+                desc=f"Building {split} graphs",
+                total=len(snapshots),
+            )
+        ]
 
     def train_dataloader(self):
         return DataLoader(
diff --git a/ThermalSolver/model/local_gno.py b/ThermalSolver/model/local_gno.py
index 0d097e9..efdb868 100644
--- a/ThermalSolver/model/local_gno.py
+++ b/ThermalSolver/model/local_gno.py
@@ -22,8 +22,9 @@ class EncX(nn.Module):
         super().__init__()
         self.net = nn.Sequential(
             nn.Linear(x_ch, hidden // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden // 2, hidden),
+            nn.GELU(),
         )
 
     def forward(self, x):
@@ -35,8 +36,9 @@ class EncC(nn.Module):
         super().__init__()
         self.net = nn.Sequential(
             nn.Linear(c_ch, hidden // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden // 2, hidden),
+            nn.GELU(),
         )
 
     def forward(self, c):
@@ -48,8 +50,9 @@ class DecX(nn.Module):
         super().__init__()
         self.net = nn.Sequential(
             nn.Linear(hidden, hidden // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden // 2, out_ch),
+            nn.GELU(),
         )
 
     def forward(self, x):
@@ -133,18 +136,18 @@ class ConditionalGNOBlock(MessagePassing):
         # Se edge_ch==0 useremo un coefficiente apprendibile globale
         self.edge_attr_net = nn.Sequential(
             nn.Linear(edge_ch, hidden_ch),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch, hidden_ch // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch // 2, 1),
             nn.Softplus(),
         )
         # gating dalla condizione c_ij (restituisce scalar in (0,1))
         self.c_ij_net = nn.Sequential(
             nn.Linear(hidden_ch, hidden_ch),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch, hidden_ch // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch // 2, 1),
             nn.Sigmoid(),
         )
@@ -152,13 +155,22 @@ class ConditionalGNOBlock(MessagePassing):
         # alpha per passo (clampato tramite sigmoid)
         self.alpha_net = nn.Sequential(
             nn.Linear(2 * hidden_ch, hidden_ch),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch, hidden_ch // 2),
-            nn.SiLU(),
+            nn.GELU(),
             nn.Linear(hidden_ch // 2, 1),
             nn.Sigmoid(),
         )
 
+        self.diff_net = nn.Sequential(
+            nn.Linear(hidden_ch, hidden_ch * 2),
+            nn.GELU(),
+            nn.Linear(hidden_ch * 2, hidden_ch**2),
+            nn.GELU(),
+            nn.Linear(hidden_ch**2, hidden_ch),
+            nn.GELU(),
+        )
+
         # self.norm = nn.LayerNorm(hidden_ch)
 
     def forward(self, x, c, edge_index, edge_attr=None):
@@ -171,43 +183,21 @@ class ConditionalGNOBlock(MessagePassing):
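-        m_ij = w_ij * (x_j - x_i) * c_gate_ij
-        dove w_ij = softplus(edge_attr_net(edge_attr)) >= 0
+        m_ij = (w_ij * diff_net(x_j - x_i) + (x_j - x_i)) * c_gate_ij
+        where w_ij = edge_attr_net(edge_attr) + 1e-8 > 0 (Softplus output)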
         """
-        # 1) calcola c_ij e gating da c
         c_ij = 0.5 * (c_i + c_j)  # [E, H]
         c_gate = self.c_ij_net(c_ij)  # [E, 1] in (0,1)
-
-        # 2) calcola peso scalare non-negativo per edge
         w_raw = self.edge_attr_net(edge_attr)  # [E,1]
-
-        # softplus -> peso >= 0; aggiungo epsilon per stabilità
-        w = w_raw + 1e-12  # [E,1]
-
-        # 3) messaggio base: differenza pesata
+        w = w_raw + 1e-8
         diff = x_j - x_i  # [E, H]
-        m = w * diff  # broadcast: [E,1] * [E,H] -> [E,H]
-
-        # 4) applica gating dalla condizione
+        m = w * self.diff_net(diff) + diff  # [E,H]
         m = m * c_gate  # [E,H]
-
-        # Restituisco anche w (sfruttabile in update) — ma MessagePassing non ritorna extra,
-        # così se vuoi degree-normalization devi calcolare i gradi prima di propagate.
-        # Qui ritorno solo m: la normalizzazione per grado la faccio in update usando 'mean' aggr
         return m
 
     def update(self, aggr_out, x):
         """
-        aggr_out:
-         - se aggr='sum': somma delle w_ij*(x_j-x_i) incoming
-         - se aggr='mean': già normalizzato sul numero di vicini (ma non per somma dei pesi)
-        Qui normalizziamo implicitamente dividendo per (1 + |aggr_out|_norm) per stabilità,
-        e applichiamo il passo alpha.
+        Residual step: x + alpha * aggr_out, alpha = alpha_net(cat(x, aggr_out)).
         """
-        # aggr_out = self.norm(aggr_out)  # stabilizza la scala
-
-        # alpha vettoriale/scalar: [N,1]
-        alpha = self.alpha_net(torch.cat([x, aggr_out], dim=-1))  # in (0,1)
-
+        alpha = self.alpha_net(torch.cat([x, aggr_out], dim=-1))
         x_new = x + alpha * aggr_out
-
         return x_new
 
 
@@ -250,10 +240,14 @@ class GatingGNO(nn.Module):
             x_ = self.dec(x)
             plot_results_fn(x_, pos, 0, batch=batch)
         for _ in range(1, unrolling_steps + 1):
-            for blk in self.blocks:
+            for i, blk in enumerate(self.blocks):
                 x = blk(x, c, edge_index, edge_attr=edge_attr)
-            if plot_results:
-                x_ = self.dec(x)
-                plot_results_fn(x_, pos, _, batch=batch)
+                if plot_results:
+                    # 1-based count of block applications so far. "i * _" was
+                    # 0 for every first block (same index as the initial
+                    # plot above) and repeated for swapped (i, _) pairs.
+                    plot_step = (_ - 1) * len(self.blocks) + i + 1
+                    x_ = self.dec(x)
+                    plot_results_fn(x_, pos, plot_step, batch=batch)
 
         return self.dec(x)
diff --git a/ThermalSolver/module.py b/ThermalSolver/module.py
index 5bfecce..0329db5 100644
--- a/ThermalSolver/module.py
+++ b/ThermalSolver/module.py
@@ -111,6 +111,7 @@ class GraphSolver(LightningModule):
             unrolling_steps=self.unrolling_steps,
             batch=batch.batch,
             pos=batch.pos,
+            plot_results=True,
         )
         loss = self._compute_loss(y_pred, y)
         self._log_loss(loss, batch, "test")