""" Trainer module. """
|
|
import sys
|
|
import torch
|
|
import lightning
|
|
from .utils import check_consistency
|
|
from .data import PinaDataModule
|
|
from .solvers import SolverInterface, PINNInterface
|
|
|
|
|
|
class Trainer(lightning.pytorch.Trainer):

    def __init__(self,
                 solver,
                 batch_size=None,
                 train_size=.7,
                 test_size=.2,
                 val_size=.1,
                 predict_size=0.,
                 compile=None,
                 automatic_batching=None,
                 **kwargs):
        """
        PINA Trainer class for customizing every aspect of training via
        flags.

        :param solver: A :class:`SolverInterface` solver for the
            differential problem.
        :type solver: SolverInterface
        :param batch_size: How many samples per batch to load. If
            ``batch_size=None``, all samples are loaded and the data are
            not batched, defaults to None.
        :type batch_size: int | None
        :param train_size: Fraction of elements in the train dataset.
        :type train_size: float
        :param test_size: Fraction of elements in the test dataset.
        :type test_size: float
        :param val_size: Fraction of elements in the validation dataset.
        :type val_size: float
        :param predict_size: Fraction of elements in the predict dataset.
        :type predict_size: float
        :param compile: If True, the model is compiled before training,
            defaults to False. For Windows users compilation is always
            disabled.
        :type compile: bool
        :param automatic_batching: If True, automatic PyTorch batching is
            performed. Avoid automatic batching when ``batch_size`` is
            large, defaults to False.
        :type automatic_batching: bool

        :Keyword Arguments:
            The additional keyword arguments specify the training setup
            and can be chosen from the `pytorch-lightning
            Trainer API <https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api>`_
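
        :Example:
            A minimal usage sketch (illustrative only); it assumes
            ``solver`` wraps an already-discretised problem, e.g. a PINN.

            >>> trainer = Trainer(solver=solver,
            ...                   batch_size=None,
            ...                   train_size=0.8,
            ...                   test_size=0.1,
            ...                   val_size=0.1,
            ...                   max_epochs=1000)
            >>> trainer.train()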
        """
        # check consistency for init types
        check_consistency(solver, SolverInterface)
        check_consistency(train_size, float)
        check_consistency(test_size, float)
        check_consistency(val_size, float)
        check_consistency(predict_size, float)
        if automatic_batching is not None:
            check_consistency(automatic_batching, bool)
        if compile is not None:
            check_consistency(compile, bool)
        if train_size + test_size + val_size + predict_size > 1:
            raise ValueError('train_size, test_size, val_size and '
                             'predict_size must not sum to more than 1.')
        for size in [train_size, test_size, val_size, predict_size]:
            if size < 0 or size > 1:
                raise ValueError('splitting sizes for train, validation, '
                                 'test and prediction must lie in [0, 1].')
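        # Illustrative check of the constraint above: the defaults
        # .7 + .2 + .1 + 0. sum exactly to 1.0 and pass, whereas e.g.
        # train_size=.8 with test_size=.3 would sum to more than 1 and raise.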
        if batch_size is not None:
            check_consistency(batch_size, int)

        # inference mode set to False when validating/testing PINNs, otherwise
        # the gradient is not tracked and optimization_cycle fails
        if isinstance(solver, PINNInterface):
            kwargs['inference_mode'] = False

        # Logging depends on the batch size: when batch_size is None,
        # log_every_n_steps should be zero
        if batch_size is None:
            kwargs['log_every_n_steps'] = 0
        else:
            kwargs.setdefault('log_every_n_steps', 50)  # default for lightning

        # Setting default kwargs, overriding lightning defaults
        kwargs.setdefault('enable_progress_bar', True)
        kwargs.setdefault('logger', None)

        super().__init__(**kwargs)

        # checking compilation and automatic batching
        if compile is None or sys.platform == "win32":
            compile = False
        if automatic_batching is None:
            automatic_batching = False

        # set attributes
        self.compile = compile
        self.automatic_batching = automatic_batching
        self.train_size = train_size
        self.test_size = test_size
        self.val_size = val_size
        self.predict_size = predict_size
        self.solver = solver
        self.batch_size = batch_size
        self._move_to_device()
        self.data_module = None
        self._create_loader()

        # logging
        self.logging_kwargs = {
            'logger': bool(
                kwargs['logger'] is None or kwargs['logger'] is True),
            'sync_dist': bool(
                len(self._accelerator_connector._parallel_devices) > 1),
            'on_step': bool(kwargs['log_every_n_steps'] > 0),
            'prog_bar': bool(kwargs['enable_progress_bar']),
            'on_epoch': True
        }

    def _move_to_device(self):
        device = self._accelerator_connector._parallel_devices[0]
        # move parameters to device
        pb = self.solver.problem
        if hasattr(pb, "unknown_parameters"):
            for key in pb.unknown_parameters:
                pb.unknown_parameters[key] = torch.nn.Parameter(
                    pb.unknown_parameters[key].data.to(device))

    def _create_loader(self):
        """
        Build the data module. If resampling is needed during training,
        there is no need to touch the trainer dataloader: just call this
        method again.
        """
        if not self.solver.problem.are_all_domains_discretised:
            error_message = '\n'.join([
                f"""{" " * 13} ---> Domain {key} {
                "sampled" if key in self.solver.problem.discretised_domains else
                "not sampled"}""" for key in
                self.solver.problem.domains.keys()
            ])
            raise RuntimeError('Cannot create Trainer if not all domains '
                               'are sampled. The Trainer got the following:\n'
                               f'{error_message}')
        self.data_module = PinaDataModule(
            self.solver.problem,
            train_size=self.train_size,
            test_size=self.test_size,
            val_size=self.val_size,
            predict_size=self.predict_size,
            batch_size=self.batch_size,
            automatic_batching=self.automatic_batching)

    def train(self, **kwargs):
        """
        Train the solver.
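
        :Example:
            >>> # illustrative sketch: ``trainer`` built as in the class
            >>> # example above
            >>> trainer.train()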
        """
        return super().fit(self.solver, datamodule=self.data_module, **kwargs)

    def test(self, **kwargs):
        """
        Test the solver.
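
        :Example:
            >>> # illustrative sketch: evaluate on the held-out test split
            >>> # once training has finished
            >>> trainer.test()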
        """
        return super().test(self.solver, datamodule=self.data_module, **kwargs)

    @property
    def solver(self):
        """
        Returns the trainer's solver.
        """
        return self._solver

    @solver.setter
    def solver(self, solver):
        self._solver = solver