Improve efficiency of the data module
This commit is contained in:
@@ -23,8 +23,9 @@ class GraphDataModule(LightningDataModule):
|
||||
super().__init__()
|
||||
self.hf_repo = hf_repo
|
||||
self.split_name = split_name
|
||||
self.dataset = None
|
||||
self.geometry = None
|
||||
self.dataset_dict = {}
|
||||
# self.geometry = None
|
||||
self.geometry_dict = {}
|
||||
self.train_size = train_size
|
||||
self.val_size = val_size
|
||||
self.test_size = test_size
|
||||
@@ -32,20 +33,30 @@ class GraphDataModule(LightningDataModule):
|
||||
self.remove_boundary_edges = remove_boundary_edges
|
||||
|
||||
def prepare_data(self):
|
||||
hf_dataset = load_dataset(self.hf_repo, name="snapshots")[
|
||||
self.split_name
|
||||
]
|
||||
self.geometry = load_dataset(self.hf_repo, name="geometry")[
|
||||
self.split_name
|
||||
]
|
||||
self.data = [
|
||||
self._build_dataset(snapshot, geometry)
|
||||
for snapshot, geometry in tqdm(
|
||||
zip(hf_dataset, self.geometry),
|
||||
desc="Building graphs",
|
||||
total=len(hf_dataset),
|
||||
)
|
||||
]
|
||||
dataset = load_dataset(self.hf_repo, name="snapshots")[self.split_name]
|
||||
geometry = load_dataset(self.hf_repo, name="geometry")[self.split_name]
|
||||
# data = [
|
||||
# self._build_dataset(snapshot, geometry)
|
||||
# for snapshot, geometry in tqdm(
|
||||
# zip(hf_dataset, self.geometry),
|
||||
# desc="Building graphs",
|
||||
# total=len(hf_dataset),
|
||||
# )
|
||||
# ]
|
||||
|
||||
total_len = len(dataset)
|
||||
train_len = int(self.train_size * total_len)
|
||||
valid_len = int(self.val_size * total_len)
|
||||
self.dataset_dict = {
|
||||
"train": dataset.select(range(0, train_len)),
|
||||
"val": dataset.select(range(train_len, train_len + valid_len)),
|
||||
"test": dataset.select(range(train_len + valid_len, total_len)),
|
||||
}
|
||||
self.geometry_dict = {
|
||||
"train": geometry.select(range(0, train_len)),
|
||||
"val": geometry.select(range(train_len, train_len + valid_len)),
|
||||
"test": geometry.select(range(train_len + valid_len, total_len)),
|
||||
}
|
||||
|
||||
def _compute_boundary_mask(
|
||||
self, bottom_ids, right_ids, top_ids, left_ids, temperature
|
||||
@@ -132,15 +143,36 @@ class GraphDataModule(LightningDataModule):
|
||||
)
|
||||
|
||||
def setup(self, stage: str = None):
|
||||
n = len(self.data)
|
||||
train_end = int(n * self.train_size)
|
||||
val_end = train_end + int(n * self.val_size)
|
||||
print(type(self.dataset_dict["train"]))
|
||||
|
||||
if stage == "fit" or stage is None:
|
||||
self.train_data = self.data[:train_end]
|
||||
self.val_data = self.data[train_end:val_end]
|
||||
self.train_data = [
|
||||
self._build_dataset(snap, geom)
|
||||
for snap, geom in tqdm(
|
||||
zip(
|
||||
self.dataset_dict["train"], self.geometry_dict["train"]
|
||||
),
|
||||
desc="Building train graphs",
|
||||
total=len(self.dataset_dict["train"]),
|
||||
)
|
||||
]
|
||||
self.val_data = [
|
||||
self._build_dataset(snap, geom)
|
||||
for snap, geom in tqdm(
|
||||
zip(self.dataset_dict["val"], self.geometry_dict["val"]),
|
||||
desc="Building val graphs",
|
||||
total=len(self.dataset_dict["val"]),
|
||||
)
|
||||
]
|
||||
if stage == "test" or stage is None:
|
||||
self.test_data = self.data[val_end:]
|
||||
self.test_data = [
|
||||
self._build_dataset(snap, geom)
|
||||
for snap, geom in tqdm(
|
||||
zip(self.dataset_dict["test"], self.geometry_dict["test"]),
|
||||
desc="Building test graphs",
|
||||
total=len(self.dataset_dict["test"]),
|
||||
)
|
||||
]
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(
|
||||
|
||||
Reference in New Issue
Block a user