
Commit 05ac42c

Documenting hyperparameters in python

committed · 1 parent 4ef6a91 · commit 05ac42c

File tree

10 files changed: +582, -197 lines


docs/src/architectures/pet.rst

Lines changed: 68 additions & 95 deletions
@@ -25,108 +25,81 @@ This will install the PET model along with the ``metatrain`` package.
 Default Hyperparameters
 -----------------------
 
-The default hyperparameters for the PET model are:
+The description of all the hyperparameters used in PET is provided further
+down this page. However, here we provide you with a yaml file containing all
+the default hyperparameters, which might be convenient as a starting point to
+create your own hyperparameter files:
 
 .. literalinclude:: ../../../src/metatrain/pet/default-hypers.yaml
     :language: yaml
+    :lines: 2-
 
-Tuning Hyperparameters
+Tuning hyperparameters
 ----------------------
 
-PET offers a number of tuning knobs for flexibility across datasets:
-
 The default hyperparameters above will work well in most cases, but they
-may not be optimal for your specific dataset. In general, the most important
-hyperparameters to tune are (in decreasing order of importance):
-
-- ``cutoff``: This should be set to a value after which most of the interactions between
-  atoms is expected to be negligible. A lower cutoff will lead to faster models.
-- ``learning_rate``: The learning rate for the neural network. This hyperparameter
-  controls how much the weights of the network are updated at each step of the
-  optimization. A larger learning rate will lead to faster training, but might cause
-  instability and/or divergence.
-- ``batch_size``: The number of samples to use in each batch of training. This
-  hyperparameter controls the tradeoff between training speed and memory usage. In
-  general, larger batch sizes will lead to faster training, but might require more
-  memory.
-- ``d_pet``: This hyperparameters controls width of the neural network. In general,
-  increasing it might lead to better accuracy, especially on larger datasets, at the
-  cost of increased training and evaluation time.
-- ``d_node``: The dimension of the node features. Increasing this hyperparameter
-  might lead to better accuracy, with a relatively small increase in inference time.
-- ``num_gnn_layers``: The number of graph neural network layers. In general, decreasing
-  this hyperparameter to 1 will lead to much faster models, at the expense of accuracy.
-  Increasing it may or may not lead to better accuracy, depending on the dataset, at the
-  cost of increased training and evaluation time.
-- ``num_attention_layers``: The number of attention layers in each layer of the graph
-  neural network. Depending on the dataset, increasing this hyperparameter might lead to
-  better accuracy, at the cost of increased training and evaluation time.
-- ``loss``: This section describes the loss function to be used. See the
-  :ref:`loss-functions` for more details.
-- ``long_range``: In some systems and datasets, enabling long-range Coulomb interactions
-  might be beneficial for the accuracy of the model and/or its physical correctness.
-  See below for a breakdown of the long-range section of the model hyperparameters.
-
-All Hyperparameters
--------------------
-
-:param name: ``pet``
-
-model
-#####
-
-:param cutoff: Cutoff radius for neighbor search
-:param cutoff_width: Width of the smoothing function at the cutoff
-:param d_pet: Dimension of the edge features
-:param d_head: Dimension of the attention heads
-:param d_node: Dimension of the node features
-:param d_feedforward: Dimension of the feedforward network in the attention layer
-:param num_heads: Attention heads per attention layer
-:param num_attention_layers: Number of attention layers per GNN layer
-:param num_gnn_layers: Number of GNN layers
-:param normalization: Layer normalization type. Currently available options are
-  ``RMSNorm`` or ``LayerNorm``.
-:param activation: Activation function. Currently available options are ``SiLU``,
-  and ``SwiGLU``.
-:param transformer_type: The order in which the layer normalization and attention
-  are applied in a transformer block. Available options are ``PreLN``
-  (normalization before attention) and ``PostLN`` (normalization after attention).
-:param featurizer_type: Implementation of the featurizer of the model to use. Available
-  options are ``residual`` (the original featurizer from the PET paper, that uses
-  residual connections at each GNN layer for readout) and ``feedforward`` (a modern
-  version that uses the last representation after all GNN iterations for readout).
-  Additionally, the feedforward version uses bidirectional features flow during the
-  message passing iterations, that favors features flowing from atom ``i`` to atom
-  ``j`` to be not equal to the features flowing from atom ``j`` to atom ``i``.
-:param zbl: Use ZBL potential for short-range repulsion
-:param long_range: Long-range Coulomb interactions parameters:
-  - ``enable``: Toggle for enabling long-range interactions
-  - ``use_ewald``: Use Ewald summation. If False, P3M is used
-  - ``smearing``: Smearing width in Fourier space
-  - ``kspace_resolution``: Resolution of the reciprocal space grid
-  - ``interpolation_nodes``: Number of grid points for interpolation (for PME only)
-
-training
-########
-
-:param distributed: Whether to use distributed training
-:param distributed_port: Port for DDP communication
-:param batch_size: Training batch size
-:param num_epochs: Number of epochs
-:param warmup_fraction: Fraction of training steps used for learning rate warmup
-:param learning_rate: Learning rate
-:param log_interval: Interval to log metrics
-:param checkpoint_interval: Interval to save checkpoints
-:param scale_targets: Normalize targets to unit std during training
-:param fixed_composition_weights: Weights for atomic contributions
-:param per_structure_targets: Targets to calculate per-structure losses
-:param log_mae: Log MAE alongside RMSE
-:param log_separate_blocks: Log per-block error
-:param grad_clip_norm: Maximum gradient norm value, by default inf (no clipping)
-:param loss: Loss configuration (see above)
-:param best_model_metric: Metric used to select best checkpoint (e.g., ``rmse_prod``)
-:param num_workers: Number of workers for data loading. If not provided, it is set
-  automatically.
+may not be optimal for your specific dataset. There is a good number of
+parameters to tune, both for the :ref:`model <pet_model_hypers>` and the
+:ref:`trainer <pet_trainer_hypers>`. Since seeing them for the first time
+might be overwhelming, here we provide a list of the parameters that are,
+in general, the most important (in decreasing order of importance):
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.cutoff
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETTrainerHypers.learning_rate
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETTrainerHypers.batch_size
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.d_pet
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.d_node
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.num_gnn_layers
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.num_attention_layers
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETTrainerHypers.loss
+    :no-index:
+
+.. autoattribute:: metatrain.pet.hypers.PETHypers.long_range
+    :no-index:
+
+.. _pet_model_hypers:
+
+Model hyperparameters
+---------------------
+
+The parameters that go under the ``architecture.model`` section of the config file
+are the following:
+
+.. autoclass:: metatrain.pet.hypers.PETHypers
+    :members:
+    :undoc-members:
+
+with the long-range section being:
+
+.. autoclass:: metatrain.pet.hypers.LongRangeHypers
+    :members:
+    :undoc-members:
+
+.. _pet_trainer_hypers:
+
+Trainer hyperparameters
+-----------------------
+
+The parameters that go under the ``architecture.trainer`` section of the config file
+are the following:
+
+.. autoclass:: metatrain.pet.hypers.PETTrainerHypers
+    :members:
+    :undoc-members:
 
 References
 ----------
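
As a short, hypothetical illustration of the tuning advice above (not part of this commit), the following Python sketch builds a minimal options dictionary that overrides two of the model hyperparameters and writes it to a YAML file. The overall layout of a full metatrain options file (dataset and trainer sections, CLI usage) is assumed here and should be checked against the metatrain documentation.

import yaml  # PyYAML

# Override only what should differ from default-hypers.yaml; unspecified keys
# keep their defaults. The nesting below mirrors the default hypers file and
# is an assumption, not something taken from this commit.
options = {
    "architecture": {
        "name": "pet",
        "model": {
            "cutoff": 5.0,  # larger neighbor-list cutoff than the 4.5 default
            "d_pet": 256,   # wider edge features than the 128 default
        },
    },
}

with open("options.yaml", "w") as f:
    yaml.safe_dump(options, f, sort_keys=False)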

src/metatrain/pet/default-hypers.yaml

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+# This file is auto-generated. Do not edit directly.
 architecture:
   name: pet
   model:
@@ -27,7 +28,7 @@ architecture:
     batch_size: 16
     num_epochs: 1000
     warmup_fraction: 0.01
-    learning_rate: 1e-4
+    learning_rate: 0.0001
     weight_decay: null
     log_interval: 1
     checkpoint_interval: 100
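
A side note on the ``learning_rate`` value shown above (an observation, not something stated in the commit): PyYAML follows the YAML 1.1 float resolver, so ``1e-4`` without a decimal point loads as a string, while ``0.0001`` or ``1.0e-4`` load as floats. This may be one reason an auto-generated file writes the value out in decimal form. A minimal check in Python:

import yaml  # PyYAML

print(type(yaml.safe_load("lr: 1e-4")["lr"]))    # <class 'str'>   (no decimal point)
print(type(yaml.safe_load("lr: 0.0001")["lr"]))  # <class 'float'>
print(type(yaml.safe_load("lr: 1.0e-4")["lr"]))  # <class 'float'>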

src/metatrain/pet/hypers.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+from typing import Literal, Optional, TypedDict
+
+from metatrain.utils.hypers import (
+    CompositionWeightsDict,
+    LossDict,
+    ScalingWeightsDict,
+    init_with_defaults,
+)
+
+
+class LongRangeHypers(TypedDict):
+    """In some systems and datasets, enabling long-range Coulomb interactions
+    might be beneficial for the accuracy of the model and/or
+    its physical correctness."""
+
+    enable: bool = False
+    """Toggle for enabling long-range interactions"""
+    use_ewald: bool = False
+    """Use Ewald summation. If False, P3M is used"""
+    smearing: float = 1.4
+    """Smearing width in Fourier space"""
+    kspace_resolution: float = 1.33
+    """Resolution of the reciprocal space grid"""
+    interpolation_nodes: int = 5
+    """Number of grid points for interpolation (for PME only)"""
+
+
+class PETHypers(TypedDict):
+    """Hyperparameters for the PET model."""
+
+    cutoff: float = 4.5
+    """Cutoff radius for neighbor search.
+
+    This should be set to a value after which most of the interactions
+    between atoms are expected to be negligible. A lower cutoff will lead
+    to faster models.
+    """
+    cutoff_width: float = 0.2
+    """Width of the smoothing function at the cutoff"""
+    d_pet: int = 128
+    """Dimension of the edge features.
+
+    This hyperparameter controls the width of the neural network. In general,
+    increasing it might lead to better accuracy, especially on larger datasets, at the
+    cost of increased training and evaluation time.
+    """
+    d_head: int = 128
+    """Dimension of the attention heads."""
+    d_node: int = 256
+    """Dimension of the node features.
+
+    Increasing this hyperparameter might lead to better accuracy,
+    with a relatively small increase in inference time.
+    """
+    d_feedforward: int = 256
+    """Dimension of the feedforward network in the attention layer."""
+    num_heads: int = 8
+    """Attention heads per attention layer."""
+    num_attention_layers: int = 2
+    """The number of attention layers in each layer of the graph
+    neural network. Depending on the dataset, increasing this hyperparameter might
+    lead to better accuracy, at the cost of increased training and evaluation time.
+    """
+    num_gnn_layers: int = 2
+    """The number of graph neural network layers.
+
+    In general, decreasing this hyperparameter to 1 will lead to much faster models,
+    at the expense of accuracy. Increasing it may or may not lead to better accuracy,
+    depending on the dataset, at the cost of increased training and evaluation time.
+    """
+    normalization: Literal["RMSNorm", "LayerNorm"] = "RMSNorm"
+    """Layer normalization type."""
+    activation: Literal["SiLU", "SwiGLU"] = "SwiGLU"
+    """Activation function."""
+    transformer_type: Literal["PreLN", "PostLN"] = "PreLN"
+    """The order in which the layer normalization and attention
+    are applied in a transformer block. Available options are ``PreLN``
+    (normalization before attention) and ``PostLN`` (normalization after attention)."""
+    featurizer_type: Literal["residual", "feedforward"] = "feedforward"
+    """Implementation of the featurizer of the model to use. Available
+    options are ``residual`` (the original featurizer from the PET paper, which uses
+    residual connections at each GNN layer for readout) and ``feedforward`` (a modern
+    version that uses the last representation after all GNN iterations for readout).
+    Additionally, the feedforward version uses a bidirectional feature flow during the
+    message-passing iterations, which allows the features flowing from atom ``i`` to
+    atom ``j`` to differ from the features flowing from atom ``j`` to atom ``i``."""
+    zbl: bool = False
+    """Use ZBL potential for short-range repulsion"""
+    long_range: LongRangeHypers = init_with_defaults(LongRangeHypers)
+    """Long-range Coulomb interactions parameters."""
+
+
+class PETTrainerHypers(TypedDict):
+    """Hyperparameters for training PET models."""
+
+    distributed: bool = False
+    """Whether to use distributed training"""
+    distributed_port: int = 39591
+    """Port for DDP communication"""
+    batch_size: int = 16
+    """The number of samples to use in each batch of training. This
+    hyperparameter controls the tradeoff between training speed and memory usage. In
+    general, larger batch sizes will lead to faster training, but might require more
+    memory."""
+    num_epochs: int = 1000
+    """Number of epochs."""
+    warmup_fraction: float = 0.01
+    """Fraction of training steps used for learning rate warmup."""
+    learning_rate: float = 1e-4
+    """Learning rate."""
+    weight_decay: Optional[float] = None
+
+    log_interval: int = 1
+    """Interval to log metrics."""
+    checkpoint_interval: int = 100
+    """Interval to save checkpoints."""
+    scale_targets: bool = True
+    """Normalize targets to unit std during training."""
+    fixed_composition_weights: CompositionWeightsDict = {}
+    """Weights for atomic contributions."""
+    fixed_scaling_weights: ScalingWeightsDict = {}
+
+    per_structure_targets: list[str] = []
+    """Targets to calculate per-structure losses."""
+    num_workers: Optional[int] = None
+    """Number of workers for data loading. If not provided, it is set
+    automatically."""
+    log_mae: bool = True
+    """Log MAE alongside RMSE"""
+    log_separate_blocks: bool = False
+    """Log per-block error."""
+    best_model_metric: Literal["rmse_prod", "mae_prod", "loss"] = "mae_prod"
+    """Metric used to select best checkpoint (e.g., ``rmse_prod``)"""
+    grad_clip_norm: float = 1.0
+    """Maximum gradient norm for gradient clipping; set to ``inf`` to disable clipping."""
+    loss: str | LossDict = "mse"
+    """This section describes the loss function to be used. See the
+    :ref:`loss-functions` for more details."""
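
A hedged sketch of how these TypedDict-based hyperparameter classes might be consumed (not shown in the commit): based on the ``long_range`` default above, the assumption is that ``init_with_defaults`` returns a plain dict populated with the class-level defaults, which can then be read and updated like any other dict.

from metatrain.pet.hypers import PETHypers
from metatrain.utils.hypers import init_with_defaults

# Assumed behavior: materialize the documented defaults into a plain dict.
hypers: PETHypers = init_with_defaults(PETHypers)

hypers["cutoff"] = 5.0       # TypedDicts are ordinary dicts at runtime
print(hypers["long_range"])  # nested defaults taken from LongRangeHypers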

src/metatrain/pet/model.py

Lines changed: 5 additions & 3 deletions
@@ -1,4 +1,5 @@
 import logging
+import typing
 import warnings
 from math import prod
 from typing import Any, Dict, List, Literal, Optional, Tuple
@@ -26,16 +27,17 @@
 from metatrain.utils.sum_over_atoms import sum_over_atoms
 
 from . import checkpoints
+from .hypers import PETHypers
 from .modules.finetuning import apply_finetuning_strategy
 from .modules.structures import systems_to_batch
 from .modules.transformer import CartesianTransformer
 from .modules.utilities import cutoff_func
 
 
-AVAILABLE_FEATURIZERS = ["feedforward", "residual"]
+AVAILABLE_FEATURIZERS = typing.get_args(PETHypers.__annotations__["featurizer_type"])
 
 
-class PET(ModelInterface):
+class PET(ModelInterface[PETHypers]):
     """
     Metatrain-native implementation of the PET architecture.
 
@@ -56,7 +58,7 @@ class PET(ModelInterface):
     component_labels: Dict[str, List[List[Labels]]]
     NUM_FEATURE_TYPES: int = 2  # node + edge features
 
-    def __init__(self, hypers: Dict, dataset_info: DatasetInfo) -> None:
+    def __init__(self, hypers: PETHypers, dataset_info: DatasetInfo) -> None:
         super().__init__(hypers, dataset_info, self.__default_metadata__)
 
         # Cache frequently accessed hyperparameters
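
For readers unfamiliar with the ``AVAILABLE_FEATURIZERS`` change above, here is a self-contained illustration of the pattern (``ExampleHypers`` is a stand-in class, not part of metatrain): ``typing.get_args`` unpacks the options of a ``Literal`` annotation, so the list of valid values is defined in one place, the type hint.

import typing
from typing import Literal, TypedDict


class ExampleHypers(TypedDict):
    featurizer_type: Literal["residual", "feedforward"]


# Same pattern as in model.py: read the Literal options from the annotation.
AVAILABLE = typing.get_args(ExampleHypers.__annotations__["featurizer_type"])
print(AVAILABLE)  # ('residual', 'feedforward')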
