3 changes: 3 additions & 0 deletions colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -78,6 +78,9 @@ def __init__(
        self.require_grad_sync = True
        self.overlap_allgather = overlap_allgather
        self.use_fp8 = use_fp8
        self.param_origin_shape = {}
        for name, param in module.named_parameters():
            self.param_origin_shape[name] = param.shape

        shardformer = ShardFormer(shard_config)
        if custom_policy is not None:
238 changes: 238 additions & 0 deletions colossalai/checkpoint_io/distributed_checkpoint_utils.py
@@ -0,0 +1,238 @@
import json
import os
from typing import Dict

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.distributed_c10d import _get_default_group

from colossalai.interface import ModelWrapper
from colossalai.shardformer.layer.parallel_module import ParallelModule
from contextlib import contextmanager

from .utils import (
    load_state_dict,
    search_tp_partition_dim,
)

MODEL_META_PREFIX = "pytorch_model-meta-dist-"
MODEL_WEIGHT_PREFIX = "pytorch_model-dist-"
SHARD_META_SUFFIX = ".index.json"
UNSHARD_META_SUFFIX = ".json"


@contextmanager
def RestoreDefaultStateDictBehavior(model):
    original_methods = {}
    for name, module in model.named_modules():
        if isinstance(module, ParallelModule):
            original_methods[module] = (module._save_to_state_dict, module._load_from_state_dict)
            module._save_to_state_dict = nn.Module._save_to_state_dict.__get__(module, nn.Module)
            module._load_from_state_dict = nn.Module._load_from_state_dict.__get__(module, nn.Module)
    try:
        yield model
    finally:
        for module, original_method in original_methods.items():
            module._save_to_state_dict, module._load_from_state_dict = original_method
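A minimal usage sketch (illustrative, not part of the diff): while the context is active, ParallelModule submodules fall back to the default nn.Module state-dict hooks, so each rank serializes and loads only its local tensors instead of whatever the parallel overrides would produce.

# Illustrative sketch, assuming `model` is a ShardFormer-sharded module.
with RestoreDefaultStateDictBehavior(model):
    local_state_dict = model.state_dict()  # default nn.Module behavior: local shards only
# Outside the context the original ParallelModule state-dict overrides are restored.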


def save_metadata(model_metadata, metadata_file, checkpoint_file=None, total_size=None):
    metadata_dicts = {
        "checkpoint_version": "1.0",
        "total_size": total_size,
        "metadata": {},
    }
    for name, data in model_metadata.items():
        metadata_dicts["metadata"][name] = {}
        for k, v in data.items():
            if isinstance(v, torch.Tensor):
                v = v.tolist()
Contributor review comment on lines +49 to +50: This case won't occur now, right?
metadata_dicts["metadata"][name][k] = v
if checkpoint_file is not None:
metadata_dicts["metadata"][name]["file"] = checkpoint_file
metadata_dicts["metadata"][name]["rank"] = dist.get_rank(_get_default_group())
with open(metadata_file, "w") as json_file:
json.dump(metadata_dicts, json_file, indent=4)
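To make the on-disk format concrete: assuming each model_metadata entry carries the offsets, lengths, and global_shape that load_dist_model below expects, the metadata file written for one tensor-parallel rank would look roughly like this (parameter name, sizes, and total_size are made up; the file names follow get_dist_files_name and get_dist_meta_file_name defined at the end of this module):

{
    "checkpoint_version": "1.0",
    "total_size": 268435456,
    "metadata": {
        "embed_tokens.weight": {
            "offsets": [16384, 0],
            "lengths": [16384, 4096],
            "global_shape": [32768, 4096],
            "file": "pytorch_model-dist-00001-shard.safetensors",
            "rank": 1
        }
    }
}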


def load_metadata(checkpoint: str):
    metadata_dict = {}
    for filename in os.listdir(checkpoint):
        if filename.startswith(MODEL_META_PREFIX) and filename.endswith(".json"):
            file_path = os.path.join(checkpoint, filename)
            try:
                with open(file_path, "r") as f:
                    metadata_json = json.load(f)
                    for name, item in metadata_json["metadata"].items():
                        if name not in metadata_dict:
                            metadata_dict[name] = {}
                            metadata_dict[name]["global_shape"] = item["global_shape"]
                            metadata_dict[name]["shards"] = {}
                        else:
                            assert metadata_dict[name]["global_shape"] == item["global_shape"]
                        shard = {item["rank"]: {}}
                        for k, v in item.items():
                            if k == "rank":
                                continue
                            shard[item["rank"]][k] = v
                        metadata_dict[name]["shards"].update(shard)
            except (json.JSONDecodeError, IOError) as e:
                print(f"Unable to load file {file_path}: {e}")
    return metadata_dict


def find_covering_shards(shards, target_offsets, target_lengths):
    """
    Find the shards that cover the target range.

    Args:
        shards: A dict mapping each rank to its shard metadata ('offsets', 'lengths', 'global_shape').
        target_offsets: A 1D list giving the starting position of the target tensor in each dimension.
        target_lengths: A 1D list giving the length of the target tensor in each dimension.

    Returns:
        A dict (keyed by rank) of all shards that cover the target range.
    """
    target_start = target_offsets
    target_end = [start + length for start, length in zip(target_offsets, target_lengths)]

    covering_shards = {}

    global_shape = None
    total_lengths = None
    for rank, shard in shards.items():
        shard_start = shard["offsets"]
        shard_lengths = shard["lengths"]
        if global_shape is None:
            global_shape = shard["global_shape"]
            total_lengths = [0] * len(global_shape)
        shard_end = [start + length for start, length in zip(shard_start, shard_lengths)]

        overlap = any(
            not (target_end[dim] <= shard_start[dim] or target_start[dim] >= shard_end[dim])
            for dim in range(len(target_start))
        )
        if overlap:
            covering_shards.update({rank: shard})
        for dim in range(len(shard_start)):
            total_lengths[dim] = max(total_lengths[dim], shard_start[dim] + shard_lengths[dim])

    assert total_lengths == global_shape
    return covering_shards


def extract_weight_from_shard_partial(shard, target_offsets, target_lengths):
    """
    Extract the target range of weights from shard data, supporting partial overlap.

    Args:
        shard: A dict containing shard data, including 'offsets', 'lengths', and 'weight'.
        target_offsets: A 1D list giving the starting position of the target tensor in each dimension.
        target_lengths: A 1D list giving the length of the target tensor in each dimension.

    Returns:
        The extracted sub-tensor of the target weights and its position within the target range.
    """
    shard_offsets = shard["offsets"]
    shard_lengths = shard["lengths"]
    weight = shard["weight"]

    slices = []
    target_slices = []

    for dim, (t_offset, t_length, s_offset, s_length) in enumerate(
        zip(target_offsets, target_lengths, shard_offsets, shard_lengths)
    ):
        intersection_start = max(t_offset, s_offset)
        intersection_end = min(t_offset + t_length, s_offset + s_length)

        if intersection_start >= intersection_end:
            return None, None

        shard_slice_start = intersection_start - s_offset
        shard_slice_end = intersection_end - s_offset
        slices.append(slice(shard_slice_start, shard_slice_end))

        target_slice_start = intersection_start - t_offset
        target_slice_end = intersection_end - t_offset
        target_slices.append(slice(target_slice_start, target_slice_end))

    target_weight = weight[tuple(slices)]
    return target_weight, target_slices


def assemble_tensor_from_shards_partial(shards, target_offsets, target_lengths, dtype):
    target_tensor = torch.zeros(target_lengths, dtype=dtype)

    for rank, shard in shards.items():
        target_weight, target_slices = extract_weight_from_shard_partial(shard, target_offsets, target_lengths)

        if target_weight is not None and target_slices is not None:
            target_tensor[tuple(target_slices)] = target_weight

    return target_tensor
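As a sanity check of how these helpers combine (illustrative toy values, not part of the diff, assuming the module added by this PR is importable): a 4x4 tensor is split row-wise into two shards, and a slice that crosses the shard boundary is reassembled from them.

# Illustrative only: two "ranks" each hold half of a 4x4 tensor.
import torch
from colossalai.checkpoint_io.distributed_checkpoint_utils import (
    assemble_tensor_from_shards_partial,
    find_covering_shards,
)

full = torch.arange(16, dtype=torch.float32).reshape(4, 4)
shards = {
    0: {"offsets": [0, 0], "lengths": [2, 4], "global_shape": [4, 4], "weight": full[0:2]},
    1: {"offsets": [2, 0], "lengths": [2, 4], "global_shape": [4, 4], "weight": full[2:4]},
}

# Target: rows 1..2 (crossing the shard boundary), all columns.
offsets, lengths = [1, 0], [2, 4]
covering = find_covering_shards(shards, offsets, lengths)
rebuilt = assemble_tensor_from_shards_partial(covering, offsets, lengths, dtype=torch.float32)
assert torch.equal(rebuilt, full[1:3])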


def is_pytorch_model_meta_dist_file(checkpoint_index_file):
    if MODEL_META_PREFIX in str(checkpoint_index_file):
        return True
    return False


def load_dist_model(
    model_metadata: Dict,
    checkpoint: str,
):
    """
    Load a model state dict from a distributed checkpoint directory.

    Args:
        model_metadata (Dict): Metadata describing the target slices to assemble
            (the offsets, lengths, and global_shape of each parameter).
        checkpoint (str): Path to the checkpoint directory containing the distributed
            shard files and their metadata files.
    """
    metadata_loaded = load_metadata(checkpoint)

    load_files = {}
    covered_shards = {}
    for key, item in model_metadata.items():
        offsets = item["offsets"]
        lengths = item["lengths"]
        assert (
            item["global_shape"] == metadata_loaded[key]["global_shape"]
        ), f"{item['global_shape']}, {metadata_loaded[key]['global_shape']}"
        shards = metadata_loaded[key]["shards"]
        covering_shards = find_covering_shards(shards=shards, target_offsets=offsets, target_lengths=lengths)
        covered_shards[key] = covering_shards
        for rank, shard in covering_shards.items():
            if rank not in load_files:
                load_files[rank] = set()
            load_files[rank].add(shard["file"])

    dtype = None
    for rank, files in load_files.items():
        for file in files:
            file_path = os.path.join(checkpoint, file)
            state_dict_shard = load_state_dict(file_path)
            for key, weight in state_dict_shard.items():
                if key not in covered_shards or rank not in covered_shards[key]:
                    continue
                if dtype is None:
                    dtype = weight.dtype
                covered_shards[key][rank]["weight"] = weight

    state_dict = {}
    for key, shards in covered_shards.items():
        state = assemble_tensor_from_shards_partial(
            shards, model_metadata[key]["offsets"], model_metadata[key]["lengths"], dtype=dtype
        )
        state_dict[key] = state

    return state_dict


def get_dist_files_name(weights_name, dist_id):
    weights_name = weights_name.replace(".bin", f"-dist-{dist_id:05d}-shard.bin")
    weights_name = weights_name.replace(".safetensors", f"-dist-{dist_id:05d}-shard.safetensors")
    return weights_name


def get_dist_meta_file_name(checkpoint, dist_id, use_safetensors):
    if use_safetensors:
        return os.path.join(checkpoint, f"{MODEL_META_PREFIX}{dist_id:05d}{SHARD_META_SUFFIX}")
    return os.path.join(checkpoint, f"{MODEL_META_PREFIX}{dist_id:05d}{UNSHARD_META_SUFFIX}")
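For example (illustrative arguments), the two naming helpers produce:

get_dist_files_name("pytorch_model.safetensors", dist_id=3)
# -> "pytorch_model-dist-00003-shard.safetensors"
get_dist_meta_file_name("ckpt_dir", dist_id=3, use_safetensors=True)
# -> "ckpt_dir/pytorch_model-meta-dist-00003.index.json"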