Improve conditions and refactor dataset classes (#475)

* Reimplement conditions

* Refactor datasets and implement LabelBatch

---------

Co-authored-by: Dario Coscia <dariocos99@gmail.com>
Authored by: Filippo Olivo, 2025-03-07 11:24:09 +01:00
Committed by: Nicola Demo
parent bdad144461
commit a0cbf1c44a
40 changed files with 943 additions and 550 deletions


@@ -285,7 +285,7 @@ class PinaDataModule(LightningDataModule):
     @staticmethod
     def _split_condition(condition_dict, splits_dict):
-        len_condition = len(condition_dict["input_points"])
+        len_condition = len(condition_dict["input"])
         lengths = [
             int(len_condition * length) for length in splits_dict.values()
         ]
@@ -343,7 +343,7 @@ class PinaDataModule(LightningDataModule):
             condition_name,
             condition_dict,
         ) in collector.data_collections.items():
-            len_data = len(condition_dict["input_points"])
+            len_data = len(condition_dict["input"])
             if self.shuffle:
                 _apply_shuffle(condition_dict, len_data)
             for key, data in self._split_condition(
@@ -390,12 +390,12 @@ class PinaDataModule(LightningDataModule):
         max_conditions_lengths = {}
         for k, v in self.collector_splits[split].items():
             if self.batch_size is None:
-                max_conditions_lengths[k] = len(v["input_points"])
+                max_conditions_lengths[k] = len(v["input"])
             elif self.repeat:
                 max_conditions_lengths[k] = self.batch_size
             else:
                 max_conditions_lengths[k] = min(
-                    len(v["input_points"]), self.batch_size
+                    len(v["input"]), self.batch_size
                 )
         return max_conditions_lengths
@@ -455,15 +455,15 @@ class PinaDataModule(LightningDataModule):
             raise ValueError("The sum of the splits must be 1")

     @property
-    def input_points(self):
+    def input(self):
         """
         # TODO
         """
         to_return = {}
         if hasattr(self, "train_dataset") and self.train_dataset is not None:
-            to_return["train"] = self.train_dataset.input_points
+            to_return["train"] = self.train_dataset.input
         if hasattr(self, "val_dataset") and self.val_dataset is not None:
-            to_return["val"] = self.val_dataset.input_points
+            to_return["val"] = self.val_dataset.input
         if hasattr(self, "test_dataset") and self.test_dataset is not None:
-            to_return = self.test_dataset.input_points
+            to_return["test"] = self.test_dataset.input
         return to_return
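Taken together, these hunks are a mechanical rename: every "input_points" key and property becomes "input". A minimal usage sketch of the renamed property, assuming an already-configured PinaDataModule instance named dm (the instance and split names are illustrative, not part of the diff):

    # Hypothetical sketch: querying split inputs after the rename.
    dm.setup(stage="fit")
    train_inputs = dm.input["train"]   # was: dm.input_points["train"]
    val_inputs = dm.input["val"]       # was: dm.input_points["val"]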


@@ -2,12 +2,10 @@
-This module provide basic data management functionalities
+This module provides basic data management functionalities
 """

-import functools
 import torch
-from torch.utils.data import Dataset
 from abc import abstractmethod
-from torch_geometric.data import Batch, Data
-from pina import LabelTensor
+from torch.utils.data import Dataset
+from torch_geometric.data import Data
+from ..graph import Graph, LabelBatch


 class PinaDatasetFactory:
@@ -19,25 +17,25 @@ class PinaDatasetFactory:
"""
def __new__(cls, conditions_dict, **kwargs):
# Check if conditions_dict is empty
if len(conditions_dict) == 0:
raise ValueError("No conditions provided")
if all(
[
isinstance(v["input_points"], torch.Tensor)
for v in conditions_dict.values()
]
):
return PinaTensorDataset(conditions_dict, **kwargs)
elif all(
[
isinstance(v["input_points"], list)
for v in conditions_dict.values()
]
):
# Check is a Graph is present in the conditions
is_graph = cls._is_graph_dataset(conditions_dict)
if is_graph:
# If a Graph is present, return a PinaGraphDataset
return PinaGraphDataset(conditions_dict, **kwargs)
raise ValueError(
"Conditions must be either torch.Tensor or list of Data " "objects."
)
# If no Graph is present, return a PinaTensorDataset
return PinaTensorDataset(conditions_dict, **kwargs)
@staticmethod
def _is_graph_dataset(conditions_dict):
for v in conditions_dict.values():
for cond in v.values():
if isinstance(cond, (Data, Graph, list)):
return True
return False
class PinaDataset(Dataset):
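The factory now dispatches on content rather than on the type of the "input" entry alone: any condition value that is a Data, Graph, or list selects PinaGraphDataset, otherwise PinaTensorDataset is built. A minimal sketch under that assumption (the condition name and keyword arguments are illustrative):

    # Hypothetical sketch: the factory picks the dataset class by
    # inspecting the values stored in each condition.
    import torch

    conditions = {"data": {"input": torch.rand(10, 2)}}
    dataset = PinaDatasetFactory(
        conditions,
        max_conditions_lengths={"data": 10},
        automatic_batching=False,
    )
    # -> PinaTensorDataset, since no Data/Graph/list value is present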
@@ -45,209 +43,140 @@ class PinaDataset(Dataset):
     Abstract class for the PINA dataset
     """

-    def __init__(self, conditions_dict, max_conditions_lengths):
+    def __init__(
+        self, conditions_dict, max_conditions_lengths, automatic_batching
+    ):
+        # Store the conditions dictionary
         self.conditions_dict = conditions_dict
+        # Store the maximum number of conditions to consider
         self.max_conditions_lengths = max_conditions_lengths
+        # Store the length of each condition
         self.conditions_length = {
-            k: len(v["input_points"]) for k, v in self.conditions_dict.items()
+            k: len(v["input"]) for k, v in self.conditions_dict.items()
         }
+        # Store the maximum length of the dataset
         self.length = max(self.conditions_length.values())
+        # Dynamically set the getitem function based on automatic batching
+        if automatic_batching:
+            self._getitem_func = self._getitem_int
+        else:
+            self._getitem_func = self._getitem_dummy

     def _get_max_len(self):
         """"""
         max_len = 0
         for condition in self.conditions_dict.values():
-            max_len = max(max_len, len(condition["input_points"]))
+            max_len = max(max_len, len(condition["input"]))
         return max_len

     def __len__(self):
         return self.length

+    def __getitem__(self, idx):
+        return self._getitem_func(idx)
+
+    def _getitem_dummy(self, idx):
+        # If automatic batching is disabled, return the index itself; the
+        # collate function fetches the data for the whole batch at once
+        return idx
+
+    def _getitem_int(self, idx):
+        # If automatic batching is enabled, return the data at the given index
+        return {
+            k: {k_data: v[k_data][idx % len(v["input"])] for k_data in v.keys()}
+            for k, v in self.conditions_dict.items()
+        }
+
+    def get_all_data(self):
+        """
+        Return all data in the dataset.
+
+        :return: All data in the dataset
+        :rtype: dict
+        """
+        index = list(range(len(self)))
+        return self.fetch_from_idx_list(index)
+
+    def fetch_from_idx_list(self, idx):
+        """
+        Return data from the dataset given a list of indices.
+
+        :param idx: List of indices
+        :type idx: list
+        :return: Data from the dataset
+        :rtype: dict
+        """
+        to_return_dict = {}
+        for condition, data in self.conditions_dict.items():
+            # Get the indices for the current condition
+            cond_idx = idx[: self.max_conditions_lengths[condition]]
+            # Get the length of the current condition
+            condition_len = self.conditions_length[condition]
+            # If the length of the dataset is greater than the length of the
+            # current condition, repeat the indices
+            if self.length > condition_len:
+                cond_idx = [idx % condition_len for idx in cond_idx]
+            # Retrieve the data from the current condition
+            to_return_dict[condition] = self._retrive_data(data, cond_idx)
+        return to_return_dict
+
     @abstractmethod
-    def __getitem__(self, item):
+    def _retrive_data(self, data, idx_list):
         pass
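With automatic batching disabled, __getitem__ hands back the raw index and the whole batch is assembled in a single fetch_from_idx_list call, shorter conditions wrapping around through the modulo on cond_idx. A sketch of one plausible wiring (not the DataLoader setup used by the library itself):

    # Hypothetical sketch: items are plain indices, and the collate
    # function fetches the whole batch in one shot.
    from torch.utils.data import DataLoader

    loader = DataLoader(
        dataset,
        batch_size=8,
        collate_fn=dataset.fetch_from_idx_list,  # list of indices -> batch dict
    )
    # A condition of length 3 asked for indices [0, 1, 2, 3, 4] repeats
    # its samples: [i % 3 for i in [0, 1, 2, 3, 4]] == [0, 1, 2, 0, 1]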
 class PinaTensorDataset(PinaDataset):
-    def __init__(
-        self, conditions_dict, max_conditions_lengths, automatic_batching
-    ):
-        super().__init__(conditions_dict, max_conditions_lengths)
+    """
+    Class for the PINA dataset with torch.Tensor data
+    """

-        if automatic_batching:
-            self._getitem_func = self._getitem_int
-        else:
-            self._getitem_func = self._getitem_dummy
-
-    def _getitem_int(self, idx):
-        return {
-            k: {
-                k_data: v[k_data][idx % len(v["input_points"])]
-                for k_data in v.keys()
-            }
-            for k, v in self.conditions_dict.items()
-        }
-
-    def fetch_from_idx_list(self, idx):
-        to_return_dict = {}
-        for condition, data in self.conditions_dict.items():
-            cond_idx = idx[: self.max_conditions_lengths[condition]]
-            condition_len = self.conditions_length[condition]
-            if self.length > condition_len:
-                cond_idx = [idx % condition_len for idx in cond_idx]
-            to_return_dict[condition] = {
-                k: v[cond_idx] for k, v in data.items()
-            }
-        return to_return_dict
-
-    @staticmethod
-    def _getitem_dummy(idx):
-        return idx
-
-    def get_all_data(self):
-        index = [i for i in range(len(self))]
-        return self.fetch_from_idx_list(index)
-
-    def __getitem__(self, idx):
-        return self._getitem_func(idx)
+    # Override _retrive_data method for torch.Tensor data
+    def _retrive_data(self, data, idx_list):
+        return {k: v[idx_list] for k, v in data.items()}

     @property
-    def input_points(self):
+    def input(self):
         """
         Method to return input points for training.
         """
-        return {k: v["input_points"] for k, v in self.conditions_dict.items()}
-
-
-class PinaBatch(Batch):
-    """
-    Add extract function to torch_geometric Batch object
-    """
-
-    def __init__(self):
-        super().__init__(self)
-
-    def extract(self, labels):
-        """
-        Perform extraction of labels on node features (x)
-
-        :param labels: Labels to extract
-        :type labels: list[str] | tuple[str] | str
-        :return: Batch object with extraction performed on x
-        :rtype: PinaBatch
-        """
-        self.x = self.x.extract(labels)
-        return self
+        return {k: v["input"] for k, v in self.conditions_dict.items()}
 class PinaGraphDataset(PinaDataset):
+    """
+    Class for the PINA dataset with torch_geometric.data.Data data
+    """

-    def __init__(
-        self, conditions_dict, max_conditions_lengths, automatic_batching
-    ):
-        super().__init__(conditions_dict, max_conditions_lengths)
-        self.in_labels = {}
-        self.out_labels = None
-
-        if automatic_batching:
-            self._getitem_func = self._getitem_int
-        else:
-            self._getitem_func = self._getitem_dummy
-
-        ex_data = conditions_dict[list(conditions_dict.keys())[0]][
-            "input_points"
-        ][0]
-        for name, attr in ex_data.items():
-            if isinstance(attr, LabelTensor):
-                self.in_labels[name] = attr.stored_labels
-        ex_data = conditions_dict[list(conditions_dict.keys())[0]][
-            "output_points"
-        ][0]
-        if isinstance(ex_data, LabelTensor):
-            self.out_labels = ex_data.labels
-
-        self._create_graph_batch_from_list = (
-            self._labelise_batch(self._base_create_graph_batch_from_list)
-            if self.in_labels
-            else self._base_create_graph_batch_from_list
-        )
-        self._create_output_batch = (
-            self._labelise_tensor(self._base_create_output_batch)
-            if self.out_labels is not None
-            else self._base_create_output_batch
-        )
-
-    def fetch_from_idx_list(self, idx):
-        to_return_dict = {}
-        for condition, data in self.conditions_dict.items():
-            cond_idx = idx[: self.max_conditions_lengths[condition]]
-            condition_len = self.conditions_length[condition]
-            if self.length > condition_len:
-                cond_idx = [idx % condition_len for idx in cond_idx]
-            to_return_dict[condition] = {
-                k: (
-                    self._create_graph_batch_from_list([v[i] for i in idx])
-                    if isinstance(v, list)
-                    else self._create_output_batch(v[idx])
-                )
-                for k, v in data.items()
-            }
-        return to_return_dict
-
-    def _base_create_graph_batch_from_list(self, data):
-        batch = PinaBatch.from_data_list(data)
+    def _create_graph_batch_from_list(self, data):
+        batch = LabelBatch.from_data_list(data)
         return batch

-    def _base_create_output_batch(self, data):
+    def _create_output_batch(self, data):
         out = data.reshape(-1, *data.shape[2:])
         return out

-    def _getitem_dummy(self, idx):
-        return idx
-
-    def _getitem_int(self, idx):
-        return {
-            k: {
-                k_data: v[k_data][idx % len(v["input_points"])]
-                for k_data in v.keys()
-            }
-            for k, v in self.conditions_dict.items()
-        }
-
-    def get_all_data(self):
-        index = [i for i in range(len(self))]
-        return self.fetch_from_idx_list(index)
-
-    def __getitem__(self, idx):
-        return self._getitem_func(idx)
-
-    def _labelise_batch(self, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            batch = func(*args, **kwargs)
-            for k, v in self.in_labels.items():
-                tmp = batch[k]
-                tmp.labels = v
-                batch[k] = tmp
-            return batch
-
-        return wrapper
-
-    def _labelise_tensor(self, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            out = func(*args, **kwargs)
-            if isinstance(out, LabelTensor):
-                out.labels = self.out_labels
-            return out
-
-        return wrapper
-
     def create_graph_batch(self, data):
         """
         # TODO
         Create a Batch object from a list of Data objects.

         :param data: List of Data objects
         :type data: list
         :return: Batch object
-        :rtype: Batch or PinaBatch
+        :rtype: Batch
         """
         if isinstance(data[0], Data):
             return self._create_graph_batch_from_list(data)
         return self._create_output_batch(data)
+
+    # Override _retrive_data method for graph handling
+    def _retrive_data(self, data, idx_list):
+        # Return the data for the current condition: list entries (graphs)
+        # are collated into a single Batch object, while stacked tensors
+        # are reshaped into a single torch.Tensor batch
+        return {
+            k: (
+                self._create_graph_batch_from_list([v[i] for i in idx_list])
+                if isinstance(v, list)
+                else self._create_output_batch(v[idx_list])
+            )
+            for k, v in data.items()
+        }
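List-valued entries are collated through the new LabelBatch imported from ..graph, which replaces the removed PinaBatch and, judging by the deleted _labelise_batch wrapper, carries LabelTensor labels through collation itself; stacked tensors are instead flattened to a single batch dimension. A minimal sketch (the graph contents are illustrative):

    # Hypothetical sketch of the two retrieval paths in _retrive_data.
    import torch
    from torch_geometric.data import Data

    graphs = [
        Data(x=torch.rand(4, 2), edge_index=torch.tensor([[0, 1], [1, 2]]))
        for _ in range(3)
    ]
    batch = LabelBatch.from_data_list(graphs)       # collate a list of graphs

    stacked = torch.rand(3, 4, 1)                   # e.g. per-graph outputs
    flat = stacked.reshape(-1, *stacked.shape[2:])  # -> shape [12, 1]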