deepmodeling
diff --git a/‎.github/workflows/build_wheel.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/build_wheel.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/package_c.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/package_c.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 21 additions & 21 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 21 additions & 21 deletions
diff --git a/‎backend/find_tensorflow.py‎
Lines changed: 2 additions & 2 deletions b/‎backend/find_tensorflow.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎deepmd/common.py‎
Lines changed: 1 addition & 1 deletion b/‎deepmd/common.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎deepmd/pt/loss/property.py‎
Lines changed: 13 additions & 1 deletion b/‎deepmd/pt/loss/property.py‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎deepmd/pt/train/training.py‎
Lines changed: 111 additions & 25 deletions b/‎deepmd/pt/train/training.py‎
Lines changed: 111 additions & 25 deletions
diff --git a/‎deepmd/utils/argcheck.py‎
Lines changed: 10 additions & 0 deletions b/‎deepmd/utils/argcheck.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎source/api_cc/src/DeepPotPT.cc‎
Lines changed: 6 additions & 6 deletions b/‎source/api_cc/src/DeepPotPT.cc‎
Lines changed: 6 additions & 6 deletions
@@ -70,7 +70,7 @@ jobs:
  rm -rf .git
  if: matrix.dp_pkg_name == 'deepmd-kit-cu11'
  - name: Build wheels
- uses: pypa/cibuildwheel@v3.0
+ uses: pypa/cibuildwheel@v3.1
  env:
  CIBW_BUILD_VERBOSITY: 1
  CIBW_ARCHS: all
 
@@ -22,7 +22,7 @@ jobs:
  tensorflow_version: ""
  filename: libdeepmd_c.tar.gz
  - tensorflow_build_version: "2.14"
- tensorflow_version: ">=2.5.0rc0,<2.15"
+ tensorflow_version: ">=2.5.0,<2.15"
  filename: libdeepmd_c_cu11.tar.gz
  steps:
  - uses: actions/checkout@v4
 
@@ -29,7 +29,7 @@ repos:
  exclude: ^source/3rdparty
  - repo: https://github.com/astral-sh/ruff-pre-commit
  # Ruff version.
- rev: v0.12.3
+ rev: v0.12.7
  hooks:
  - id: ruff
  args: ["--fix"]
@@ -74,7 +74,7 @@ repos:
  # exclude: ^(source/3rdparty|\.github/workflows|\.clang-format)
  # Shell
  - repo: https://github.com/scop/pre-commit-shfmt
- rev: v3.12.0-1
+ rev: v3.12.0-2
  hooks:
  - id: shfmt
  # CMake
@@ -83,25 +83,25 @@ repos:
  hooks:
  - id: cmake-format
  #- id: cmake-lint
- # - repo: https://github.com/njzjz/mirrors-bibtex-tidy
- # rev: v1.13.0
- #  hooks:
- #  - id: bibtex-tidy
- #  args:
- #  - --curly
- #  - --numeric
- #  - --align=13
- #  - --blank-lines
- #  # disable sort: the order of keys and fields has explict meanings
- #  #- --sort=key
- #  - --duplicates=key,doi,citation,abstract
- #  - --merge=combine
- #  #- --sort-fields
- #  #- --strip-comments
- #  - --trailing-commas
- #  - --encode-urls
- #  - --remove-empty-fields
- #  - --wrap=80
+ - repo: https://github.com/njzjz/mirrors-bibtex-tidy
+  rev: v1.14.0
+ hooks:
+ - id: bibtex-tidy
+ args:
+ - --curly
+ - --numeric
+ - --align=13
+ - --blank-lines
+ # disable sort: the order of keys and fields has explicit meanings
+ #- --sort=key
+ - --duplicates=key,doi,citation,abstract
+ - --merge=combine
+ #- --sort-fields
+ #- --strip-comments
+ - --trailing-commas
+ - --encode-urls
+ - --remove-empty-fields
+ - --wrap=80
  # license header
  - repo: https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.5.5
 
@@ -88,14 +88,14 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]:
  # CUDA 12.2, cudnn 9
  requires.extend(
  [
- "tensorflow-cpu>=2.18.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'",
+ "tensorflow-cpu>=2.18.0; platform_machine=='x86_64' and platform_system == 'Linux'",
  ]
  )
  elif cuda_version in SpecifierSet(">=11,<12"):
  # CUDA 11.8, cudnn 8
  requires.extend(
  [
- "tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'",
+ "tensorflow-cpu>=2.5.0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'",
  ]
  )
  tf_version = "2.14.1"
 
@@ -44,7 +44,7 @@
  "select_idx_map",
 ]
 
-_PRECISION = Literal["default", "float16", "float32", "float64"]
+_PRECISION = Literal["default", "float16", "bfloat16", "float32", "float64"]
 _ACTIVATION = Literal[
  "relu",
  "relu6",
 
@@ -42,7 +42,7 @@ def __init__(
  var_name : str
  The atomic property to fit, 'energy', 'dipole', and 'polar'.
  loss_func : str
- The loss function, such as "smooth_mae", "mae", "rmse".
+ The loss function, such as "smooth_mae", "mae", "rmse", "mape".
  metric : list
  The metric such as mae, rmse which will be printed.
  beta : float
@@ -151,6 +151,12 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False
  reduction="mean",
  )
  )
+ elif self.loss_func == "mape":
+ loss += torch.mean(
+ torch.abs(
+ (label[var_name] - model_pred[var_name]) / (label[var_name] + 1e-3)
+ )
+ )
  else:
  raise RuntimeError(f"Unknown loss function : {self.loss_func}")
 
@@ -182,6 +188,12 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False
  reduction="mean",
  )
  ).detach()
+ if "mape" in self.metric:
+ more_loss["mape"] = torch.mean(
+ torch.abs(
+ (label[var_name] - model_pred[var_name]) / (label[var_name] + 1e-3)
+ )
+ ).detach()
 
  return model_pred, loss, more_loss
 
 
@@ -140,6 +140,7 @@ def __init__(
  self.num_steps = training_params["numb_steps"]
  self.disp_file = training_params.get("disp_file", "lcurve.out")
  self.disp_freq = training_params.get("disp_freq", 1000)
+ self.disp_avg = training_params.get("disp_avg", False)
  self.save_ckpt = training_params.get("save_ckpt", "model.ckpt")
  self.save_freq = training_params.get("save_freq", 1000)
  self.max_ckpt_keep = training_params.get("max_ckpt_keep", 5)
@@ -808,23 +809,75 @@ def fake_model():
  else:
  raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
 
+ if self.disp_avg:
+ # Accumulate loss for averaging over display interval
+ self.step_count_in_interval += 1
+ if not self.multi_task:
+ # Accumulate loss for single task
+ if not self.train_loss_accu:
+ # Initialize accumulator with current loss structure
+ for item in more_loss:
+ if "l2_" not in item:
+ self.train_loss_accu[item] = 0.0
+ for item in more_loss:
+ if "l2_" not in item:
+ self.train_loss_accu[item] += more_loss[item]
+ else:
+ # Accumulate loss for multi-task
+ if task_key not in self.train_loss_accu:
+ self.train_loss_accu[task_key] = {}
+ if task_key not in self.step_count_per_task:
+ self.step_count_per_task[task_key] = 0
+ self.step_count_per_task[task_key] += 1
+
+ for item in more_loss:
+ if "l2_" not in item:
+ if item not in self.train_loss_accu[task_key]:
+ self.train_loss_accu[task_key][item] = 0.0
+ self.train_loss_accu[task_key][item] += more_loss[item]
+
  # Log and persist
  display_step_id = _step_id + 1
  if self.display_in_training and (
  display_step_id % self.disp_freq == 0 or display_step_id == 1
  ):
  self.wrapper.eval() # Will set to train mode before finishing validation
 
- def log_loss_train(_loss, _more_loss, _task_key="Default"):
- results = {}
- rmse_val = {
- item: _more_loss[item]
- for item in _more_loss
- if "l2_" not in item
- }
- for item in sorted(rmse_val.keys()):
- results[item] = rmse_val[item]
- return results
+ if self.disp_avg:
+
+ def log_loss_train(_loss, _more_loss, _task_key="Default"):
+ results = {}
+ if not self.multi_task:
+ # Use accumulated average loss for single task
+ for item in self.train_loss_accu:
+ results[item] = (
+ self.train_loss_accu[item]
+ / self.step_count_in_interval
+ )
+ else:
+ # Use accumulated average loss for multi-task
+ if (
+ _task_key in self.train_loss_accu
+ and _task_key in self.step_count_per_task
+ ):
+ for item in self.train_loss_accu[_task_key]:
+ results[item] = (
+ self.train_loss_accu[_task_key][item]
+ / self.step_count_per_task[_task_key]
+ )
+ return results
+ else:
+
+ def log_loss_train(_loss, _more_loss, _task_key="Default"):
+ results = {}
+ rmse_val = {
+ item: _more_loss[item]
+ for item in _more_loss
+ if "l2_" not in item
+ }
+ for item in sorted(rmse_val.keys()):
+ results[item] = rmse_val[item]
+ return results
 
  def log_loss_valid(_task_key="Default"):
  single_results = {}
@@ -882,24 +935,31 @@ def log_loss_valid(_task_key="Default"):
  else:
  train_results = {_key: {} for _key in self.model_keys}
  valid_results = {_key: {} for _key in self.model_keys}
- train_results[task_key] = log_loss_train(
- loss, more_loss, _task_key=task_key
- )
- for _key in self.model_keys:
- if _key != task_key:
- self.optimizer.zero_grad()
- input_dict, label_dict, _ = self.get_data(
- is_train=True, task_key=_key
- )
- _, loss, more_loss = self.wrapper(
- **input_dict,
- cur_lr=pref_lr,
- label=label_dict,
- task_key=_key,
- )
+ if self.disp_avg:
+ # For multi-task, use accumulated average loss for all tasks
+ for _key in self.model_keys:
  train_results[_key] = log_loss_train(
  loss, more_loss, _task_key=_key
  )
+ else:
+ train_results[task_key] = log_loss_train(
+ loss, more_loss, _task_key=task_key
+ )
+ for _key in self.model_keys:
+ if _key != task_key:
+ self.optimizer.zero_grad()
+ input_dict, label_dict, _ = self.get_data(
+ is_train=True, task_key=_key
+ )
+ _, loss, more_loss = self.wrapper(
+ **input_dict,
+ cur_lr=pref_lr,
+ label=label_dict,
+ task_key=_key,
+ )
+ train_results[_key] = log_loss_train(
+ loss, more_loss, _task_key=_key
+ )
  valid_results[_key] = log_loss_valid(_task_key=_key)
  if self.rank == 0:
  log.info(
@@ -921,6 +981,21 @@ def log_loss_valid(_task_key="Default"):
  )
  self.wrapper.train()
 
+ if self.disp_avg:
+ # Reset loss accumulators after display
+ if not self.multi_task:
+ for item in self.train_loss_accu:
+ self.train_loss_accu[item] = 0.0
+ else:
+ for task_key in self.model_keys:
+ if task_key in self.train_loss_accu:
+ for item in self.train_loss_accu[task_key]:
+ self.train_loss_accu[task_key][item] = 0.0
+ if task_key in self.step_count_per_task:
+ self.step_count_per_task[task_key] = 0
+ self.step_count_in_interval = 0
+ self.last_display_step = display_step_id
+
  current_time = time.time()
  train_time = current_time - self.t0
  self.t0 = current_time
@@ -993,6 +1068,17 @@ def log_loss_valid(_task_key="Default"):
  self.t0 = time.time()
  self.total_train_time = 0.0
  self.timed_steps = 0
+
+ if self.disp_avg:
+ # Initialize loss accumulators
+ if not self.multi_task:
+ self.train_loss_accu = {}
+ else:
+ self.train_loss_accu = {key: {} for key in self.model_keys}
+ self.step_count_per_task = dict.fromkeys(self.model_keys, 0)
+ self.step_count_in_interval = 0
+ self.last_display_step = 0
+
  for step_id in range(self.start_step, self.num_steps):
  step(step_id)
  if JIT:
 
@@ -3137,6 +3137,9 @@ def training_args(
  )
  doc_disp_training = "Displaying verbose information during training."
  doc_time_training = "Timing during training."
+ doc_disp_avg = (
+ "Display the average loss over the display interval for training sets."
+ )
  doc_profiling = "Export the profiling results to the Chrome JSON file for performance analysis, driven by the legacy TensorFlow profiling API or PyTorch Profiler. The output file will be saved to `profiling_file`."
  doc_profiling_file = "Output file for profiling."
  doc_enable_profiler = "Export the profiling results to the TensorBoard log for performance analysis, driven by TensorFlow Profiler (available in TensorFlow 2.3) or PyTorch Profiler. The log will be saved to `tensorboard_log_dir`."
@@ -3213,6 +3216,13 @@ def training_args(
  Argument(
  "time_training", bool, optional=True, default=True, doc=doc_time_training
  ),
+ Argument(
+ "disp_avg",
+ bool,
+ optional=True,
+ default=False,
+ doc=doc_only_pt_supported + doc_disp_avg,
+ ),
  Argument(
  "profiling",
  bool,
 
@@ -90,7 +90,7 @@ test = [
 docs = [
  "sphinx>=3.1.1",
  "sphinx-book-theme",
- "myst-nb>=1.0.0rc0",
+ "myst-nb>=1.0.0",
  "myst-parser>=0.19.2",
  "sphinx-design",
  "breathe",
 
@@ -197,12 +197,12 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
  std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0);
  torch::Tensor sendlist_tensor =
  torch::from_blob(lmp_list.sendlist, {total_send}, int32_option);
- comm_dict.insert("send_list", sendlist_tensor);
- comm_dict.insert("send_proc", sendproc_tensor);
- comm_dict.insert("recv_proc", recvproc_tensor);
- comm_dict.insert("send_num", sendnum_tensor);
- comm_dict.insert("recv_num", recvnum_tensor);
- comm_dict.insert("communicator", communicator_tensor);
+ comm_dict.insert_or_assign("send_list", sendlist_tensor);
+ comm_dict.insert_or_assign("send_proc", sendproc_tensor);
+ comm_dict.insert_or_assign("recv_proc", recvproc_tensor);
+ comm_dict.insert_or_assign("send_num", sendnum_tensor);
+ comm_dict.insert_or_assign("recv_num", recvnum_tensor);
+ comm_dict.insert_or_assign("communicator", communicator_tensor);
  }
  if (lmp_list.mapping) {
  std::vector<std::int64_t> mapping(nall_real);
Original file line number	Diff line number	Diff line change
`@@ -88,14 +88,14 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]:`
`88`	`88`	`# CUDA 12.2, cudnn 9`
`89`	`89`	`requires.extend(`
`90`	`90`	`[`
`91`		`- "tensorflow-cpu>=2.18.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'",`
	`91`	`+ "tensorflow-cpu>=2.18.0; platform_machine=='x86_64' and platform_system == 'Linux'",`
`92`	`92`	`]`
`93`	`93`	`)`
`94`	`94`	`elif cuda_version in SpecifierSet(">=11,<12"):`
`95`	`95`	`# CUDA 11.8, cudnn 8`
`96`	`96`	`requires.extend(`
`97`	`97`	`[`
`98`		`- "tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'",`
	`98`	`+ "tensorflow-cpu>=2.5.0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'",`
`99`	`99`	`]`
`100`	`100`	`)`
`101`	`101`	`tf_version = "2.14.1"`
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@`
`44`	`44`	`"select_idx_map",`
`45`	`45`	`]`
`46`	`46`
`47`		`-_PRECISION = Literal["default", "float16", "float32", "float64"]`
	`47`	`+_PRECISION = Literal["default", "float16", "bfloat16", "float32", "float64"]`
`48`	`48`	`_ACTIVATION = Literal[`
`49`	`49`	`"relu",`
`50`	`50`	`"relu6",`