diff --git a/evaluator.py b/evaluator.py
index 110ea02..b467ec3 100644
--- a/evaluator.py
+++ b/evaluator.py
@@ -51,4 +51,4 @@ def main(
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
\ No newline at end of file
+    fire.Fire(main)
diff --git a/moe_peft.py b/moe_peft.py
index fad929b..7a0ef41 100644
--- a/moe_peft.py
+++ b/moe_peft.py
@@ -176,7 +176,9 @@ def init_adapter_config(
         config_class.prompt_template = lora_config.get("prompt", None)
         config_list.append(config_class)
     elif args.evaluate:
-        config_list.extend(moe_peft.EvaluateConfig.from_config(lora_config))  # the config["lora"] part
+        config_list.extend(
+            moe_peft.EvaluateConfig.from_config(lora_config)
+        )  # the config["lora"] part
         # moe_flag?
     else:
         config_list.append(moe_peft.TrainConfig.from_config(lora_config))
@@ -206,7 +208,7 @@ def inference(
     for config in configs:
         config.prompts = [input_raw]
     callback = None if args.disable_log else inference_callback
-    outputs = moe_peft.generate( #the Genconfig is fully set up at this point; start tokenizing and progressively generate the sliced vectors
+    outputs = moe_peft.generate(  # the Genconfig is fully set up at this point; start tokenizing and progressively generate the sliced vectors
         model,
         tokenizer,
         configs,
diff --git a/moe_peft/__init__.py b/moe_peft/__init__.py
index f298574..d856b11 100644
--- a/moe_peft/__init__.py
+++ b/moe_peft/__init__.py
@@ -1,4 +1,6 @@
 from .adapters import adapter_factory
+from .analyst import process
+from .analysts import SVDProcessor
 from .common import (
     AdapterConfig,
     LLMBatchConfig,
@@ -19,8 +21,6 @@
 from .tokenizer import Tokenizer
 from .trainer import TrainConfig, train
 from .utils import is_package_available, setup_logging
 
-from .analyst import process
-from .analysts import SVDProcessor
 
 assert is_package_available("torch", "2.3.0"), "MoE-PEFT requires torch>=2.3.0"
 assert is_package_available(
@@ -54,5 +54,5 @@
     "setup_logging",
     "executor",
     "process",
-    "SVDProcessor"
+    "SVDProcessor",
 ]
diff --git a/moe_peft/adapters/mixlora/model.py b/moe_peft/adapters/mixlora/model.py
index 6ef8eff..8c37124 100644
--- a/moe_peft/adapters/mixlora/model.py
+++ b/moe_peft/adapters/mixlora/model.py
@@ -182,7 +182,9 @@ def forward(
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate_(hidden_states)  # each expert's routing score is computed here
-        routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)  # normalization
+        routing_weights = F.softmax(
+            router_logits, dim=1, dtype=self.dtype_
+        )  # normalization
         routing_weights, selected_experts = torch.topk(
             routing_weights, self.topk_, dim=-1
         )
diff --git a/moe_peft/analyst.py b/moe_peft/analyst.py
index 5fa9a35..e75e7d9 100644
--- a/moe_peft/analyst.py
+++ b/moe_peft/analyst.py
@@ -1,11 +1,11 @@
-import torch
 import logging
-import numpy as np
-from typing import Tuple, Dict
-from sklearn.metrics.pairwise import cosine_similarity
+from typing import Dict, Tuple
+
+import torch
 
 from .model import LLMModel
 
+
 def keys_extraction(config) -> list:
     result = []
@@ -16,6 +16,7 @@ def keys_extraction(config) -> list:
     return result  # redundant data structure, to be optimized
 
+
 def mapping(keys_list) -> list:
     mapping_dict = {
         "q_proj": "wq_",
@@ -24,7 +25,7 @@ def mapping(keys_list) -> list:
         "o_proj": "wo_",
         "gate_proj": "w1_",
         "down_proj": "w2_",
-        "up_proj": "w3_"
+        "up_proj": "w3_",
     }
 
     mapped_list = [
@@ -35,17 +36,19 @@ def mapping(keys_list) -> list:
 
     return mapped_list
 
+
 def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
     return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))
 
+
 def lora_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]:
-    attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
-    mlp_linears = ['w1_', 'w2_', 'w3_']
-    
+    attn_linears = ["wq_", "wk_", "wv_", "wo_"]
"wo_"] + mlp_linears = ["w1_", "w2_", "w3_"] + pretrained_layers_weights = [] tuned_layers_weights = [] - for layer in model.model_.layers_: + for layer in model.model_.layers_: pretrained_layer_weights = [] tuned_layer_weights = [] for item in target_linears_list: @@ -55,48 +58,53 @@ def lora_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: try: loras_dict = getattr(layer.self_attn_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: t_weight}) except AttributeError as e: - raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}") - + raise AttributeError( + f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}" + ) + elif linear in mlp_linears: try: loras_dict = getattr(layer.mlp_.mlp_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: t_weight}) except AttributeError as e: - raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}") - + raise AttributeError( + f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}" + ) + else: raise ValueError(f"Invalid linear name: {linear}") - + pretrained_layers_weights.append(pretrained_layer_weights) tuned_layers_weights.append(tuned_layer_weights) return pretrained_layers_weights, tuned_layers_weights + def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: - attn_linears = ['wq_', 'wk_', 'wv_', 'wo_'] - mlp_linears = ['w1_', 'w2_', 'w3_'] - + attn_linears = ["wq_", "wk_", "wv_", "wo_"] + mlp_linears = ["w1_", "w2_", "w3_"] + pretrained_layers_weights = [] tuned_layers_weights = [] @@ -110,28 +118,36 @@ def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: try: loras_dict = getattr(layer.self_attn_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: 
                             tuned_layer_weights.append({linear_key: t_weight})
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
-                    
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
+
                 elif linear in mlp_linears:
                     try:
                         loras_dict = getattr(layer.mlp_.mlp_, linear).loras_
-                        adapter = loras_dict.get(adapter_name, None)  # fetch the adapter by adapter_name
+                        adapter = loras_dict.get(
+                            adapter_name, None
+                        )  # fetch the adapter by adapter_name
                         if layer.mlp_.moes_:
-                            profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
+                            profile_matrix = layer.mlp_.moes_[
+                                adapter_name
+                            ].profiler_
                             expert_value_lists = loras_dict.values()
                             tuned_expert_value_lists = []
-                            total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
+                            total_base_layer = getattr(
+                                layer.mlp_.mlp_, linear
+                            ).base_layer_.weight
 
                             for value in expert_value_lists:
                                 p_weight = value.base_layer_.weight
@@ -140,104 +156,101 @@ def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]:
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
                                 tuned_expert_value_lists.append(t_weight)
 
-                            final_tuned_weights = moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
-                            linear_key = linear.rstrip('_')
-                            pretrained_layer_weights.append({linear_key: total_base_layer})  # this is the pretrained total weight of the MoE layer
-                            tuned_layer_weights.append({linear_key: final_tuned_weights})  # these are all the MoE-layer weights after finetuning and routing-weighted merging
+                            final_tuned_weights = moe_weight_caculate(
+                                profile_matrix, tuned_expert_value_lists
+                            )
+                            linear_key = linear.rstrip("_")
+                            pretrained_layer_weights.append(
+                                {linear_key: total_base_layer}
+                            )  # this is the pretrained total weight of the MoE layer
+                            tuned_layer_weights.append(
+                                {linear_key: final_tuned_weights}
+                            )  # these are all the MoE-layer weights after finetuning and routing-weighted merging
                         else:  # plain LoRA finetuning logic
                             if adapter is not None:
-                                p_weight = getattr(adapter, 'base_layer_').weight
-                                lora_a_weight = getattr(adapter, 'lora_a_').weight
-                                lora_b_weight = getattr(adapter, 'lora_b_').weight
+                                p_weight = getattr(adapter, "base_layer_").weight
+                                lora_a_weight = getattr(adapter, "lora_a_").weight
+                                lora_b_weight = getattr(adapter, "lora_b_").weight
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
-                                
-                                linear_key = linear.rstrip('_')
-                                pretrained_layer_weights.append({linear_key: p_weight})
+
+                                linear_key = linear.rstrip("_")
+                                pretrained_layer_weights.append(
+                                    {linear_key: p_weight}
+                                )
                                 tuned_layer_weights.append({linear_key: t_weight})
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
-                    
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
+
                 else:
                     raise ValueError(f"Invalid linear name: {linear}")
-        
+
         pretrained_layers_weights.append(pretrained_layer_weights)
         tuned_layers_weights.append(tuned_layer_weights)
-    
+
     return pretrained_layers_weights, tuned_layers_weights
 
-'''def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device = 'cuda:0'):
-    total_results = []
-    results = []
-    for idx, (single_p_layer, single_f_layer) in enumerate(zip(p_weights, f_weights)):  # 126 linears
-        logging.info(f"Processing layer {idx} for SVD analysis...")
-        for (p_linear, f_linear) in zip(single_p_layer, single_f_layer):
-            # layer_results = []
-            for key in p_linear.keys():
-                p_tensor = p_linear[key].to(device) if isinstance(p_linear[key], torch.Tensor) else torch.tensor(p_linear[key], device=device)
-                f_tensor = f_linear[key].to(device) if isinstance(f_linear[key], torch.Tensor) else torch.tensor(f_linear[key], device=device)
-
-                p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
-                f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
-
-                n_min = min(n, p_u.shape[1], f_u.shape[1])
-                p_top_n = p_u[:, :n_min]
-                f_top_n = f_u[:, :n_min]
-
-                similarity = torch.mm(p_top_n.T, f_top_n)  # dot product
-                p_norms = torch.norm(p_top_n.T, dim=1, keepdim=True)  # norms of p_top_n
-                f_norms = torch.norm(f_top_n, dim=0, keepdim=True)  # norms of f_top_n
-                similarity = similarity / (p_norms * f_norms)  # normalize to cosine similarity
-                avg_similarity = similarity.mean().item()  # convert to a Python scalar
-
-                # layer_results.append({key: avg_similarity})
-                results.append({key: avg_similarity})
-    total_results.append(results)
-
-    return total_results'''
-
-def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device='cuda:0'):
+def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device="cuda:0"):
     total_results = []
-    for idx, (single_p_layer, single_f_layer) in enumerate(zip(p_weights, f_weights)):  # iterate over each layer
+    for idx, (single_p_layer, single_f_layer) in enumerate(
+        zip(p_weights, f_weights)
+    ):  # iterate over each layer
         logging.info(f"Processing layer {idx} for SVD analysis...")
         layer_results = []
-        for p_linear, f_linear in zip(single_p_layer, single_f_layer):  # iterate over each layer's linear modules
+        for p_linear, f_linear in zip(
+            single_p_layer, single_f_layer
+        ):  # iterate over each layer's linear modules
            layer_linear_results = {}
-            
+
             for key in p_linear.keys():  # iterate over each weight group in the linear module
-                p_tensor = p_linear[key].to(device) if isinstance(p_linear[key], torch.Tensor) else torch.tensor(p_linear[key], device=device)
-                f_tensor = f_linear[key].to(device) if isinstance(f_linear[key], torch.Tensor) else torch.tensor(f_linear[key], device=device)
-                
+                p_tensor = (
+                    p_linear[key].to(device)
+                    if isinstance(p_linear[key], torch.Tensor)
+                    else torch.tensor(p_linear[key], device=device)
+                )
+                f_tensor = (
+                    f_linear[key].to(device)
+                    if isinstance(f_linear[key], torch.Tensor)
+                    else torch.tensor(f_linear[key], device=device)
+                )
+
                 # perform the SVD decomposition
                 p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
                 f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
-                
+
                 # take the top n singular vectors
                 n_min = min(n, p_u.shape[1], f_u.shape[1])
                 p_top_n = p_u[:, :n_min]
                 f_top_n = f_u[:, :n_min]
-                
+
                 # compute the cosine similarity
                 similarity = torch.mm(p_top_n.T, f_top_n)  # dot product
-                p_norms = torch.norm(p_top_n.T, dim=1, keepdim=True)  # norms of p_top_n
-                f_norms = torch.norm(f_top_n, dim=0, keepdim=True)  # norms of f_top_n
+                p_norms = torch.norm(
+                    p_top_n.T, dim=1, keepdim=True
+                )  # norms of p_top_n
+                f_norms = torch.norm(
+                    f_top_n, dim=0, keepdim=True
+                )  # norms of f_top_n
                 similarity = similarity / (p_norms * f_norms)  # normalize to cosine similarity
-                
+
                 # convert to a list of Python scalars
                 cos_similarities = similarity.diagonal().tolist()
-                
+
                 # store the results
                 layer_linear_results[key] = cos_similarities
-            
+
             layer_results.append(layer_linear_results)
-        
+
         total_results.append(layer_results)
-    
+
     return total_results
 
+
 def process(model: LLMModel, config):
     if config.moe_flag:
         weights = moe_weight_traverse(model, mapping(keys_extraction(config)))
diff --git a/moe_peft/analysts.py b/moe_peft/analysts.py
index 4e4b2b3..2a4f226 100644
--- a/moe_peft/analysts.py
+++ b/moe_peft/analysts.py
@@ -1,6 +1,7 @@
-import torch
 from typing import List, Tuple
 
+import torch
+
 
 class SVDProcessor:
     def __init__(self, model, config):
@@ -38,7 +39,7 @@ def mapping(keys_list: list) -> list:
             "o_proj": "wo_",
             "gate_proj": "w1_",
             "down_proj": "w2_",
-            "up_proj": "w3_"
+            "up_proj": "w3_",
         }
 
         return [
@@ -57,15 +58,17 @@ def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
         """
         return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))
 
-    def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[List, List]:
+    def weight_traverse(
+        self, target_linears_list, is_moe: bool = False
+    ) -> Tuple[List, List]:
         """
         Traverse the weights.
         :param target_linears_list: the list of extracted linear modules
         :param is_moe: whether MoE mode is enabled
         :return: (pretrained weights, finetuned weights)
         """
-        attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
-        mlp_linears = ['w1_', 'w2_', 'w3_']
+        attn_linears = ["wq_", "wk_", "wv_", "wo_"]
+        mlp_linears = ["w1_", "w2_", "w3_"]
 
         pretrained_layers_weights = []
         tuned_layers_weights = []
@@ -88,21 +91,29 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
                         adapter = loras_dict.get(adapter_name, None)
 
                         if adapter:
-                            p_weight = getattr(adapter, 'base_layer_').weight
-                            lora_a_weight = getattr(adapter, 'lora_a_').weight
-                            lora_b_weight = getattr(adapter, 'lora_b_').weight
+                            p_weight = getattr(adapter, "base_layer_").weight
+                            lora_a_weight = getattr(adapter, "lora_a_").weight
+                            lora_b_weight = getattr(adapter, "lora_b_").weight
                             t_weight = lora_b_weight @ lora_a_weight + p_weight
 
-                            linear_key = linear.rstrip('_')
+                            linear_key = linear.rstrip("_")
                             pretrained_layer_weights.append({linear_key: p_weight})
                             tuned_layer_weights.append({linear_key: t_weight})
 
                         # MoE-specific logic
-                        if is_moe and hasattr(layer.mlp_, 'moes_') and adapter_name in layer.mlp_.moes_:
-                            profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
+                        if (
+                            is_moe
+                            and hasattr(layer.mlp_, "moes_")
+                            and adapter_name in layer.mlp_.moes_
+                        ):
+                            profile_matrix = layer.mlp_.moes_[
+                                adapter_name
+                            ].profiler_
                             expert_value_lists = loras_dict.values()
                             tuned_expert_value_lists = []
-                            total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
+                            total_base_layer = getattr(
+                                layer.mlp_.mlp_, linear
+                            ).base_layer_.weight
 
                             for value in expert_value_lists:
                                 p_weight = value.base_layer_.weight
@@ -111,12 +122,20 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
                                 tuned_expert_value_lists.append(t_weight)
 
-                            final_tuned_weights = self.moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
-                            pretrained_layer_weights.append({linear_key: total_base_layer})
-                            tuned_layer_weights.append({linear_key: final_tuned_weights})
+                            final_tuned_weights = self.moe_weight_caculate(
+                                profile_matrix, tuned_expert_value_lists
+                            )
+                            pretrained_layer_weights.append(
+                                {linear_key: total_base_layer}
+                            )
+                            tuned_layer_weights.append(
+                                {linear_key: final_tuned_weights}
+                            )
 
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
 
             pretrained_layers_weights.append(pretrained_layer_weights)
             tuned_layers_weights.append(tuned_layer_weights)
@@ -124,7 +143,9 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
 
         return pretrained_layers_weights, tuned_layers_weights
 
     @staticmethod
-    def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cuda:0') -> List:
+    def svd_analysis(
+        p_weights: list, f_weights: list, n: int = 9, device: str = "cuda:0"
+    ) -> List:
         """
         Comparative analysis of the SVD-decomposed weights.
         :param p_weights: pretrained weights
@@ -138,8 +159,16 @@ def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cu
         for layer_idx, (p_layer, f_layer) in enumerate(zip(p_weights, f_weights)):
             layer_results = []
             for key in p_layer.keys():
-                p_tensor = p_layer[key].to(device) if isinstance(p_layer[key], torch.Tensor) else torch.tensor(p_layer[key], device=device)
-                f_tensor = f_layer[key].to(device) if isinstance(f_layer[key], torch.Tensor) else torch.tensor(f_layer[key], device=device)
+                p_tensor = (
+                    p_layer[key].to(device)
+                    if isinstance(p_layer[key], torch.Tensor)
+                    else torch.tensor(p_layer[key], device=device)
+                )
+                f_tensor = (
+                    f_layer[key].to(device)
+                    if isinstance(f_layer[key], torch.Tensor)
+                    else torch.tensor(f_layer[key], device=device)
+                )
 
                 p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
                 f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
@@ -172,4 +201,4 @@ def process(self) -> List:
         else:
             weights = self.weight_traverse(mapped_keys)
 
-        return self.svd_analysis(weights[0], weights[1])
\ No newline at end of file
+        return self.svd_analysis(weights[0], weights[1])
diff --git a/moe_peft/common/lora_linear.py b/moe_peft/common/lora_linear.py
index 1885a19..b5b7695 100644
--- a/moe_peft/common/lora_linear.py
+++ b/moe_peft/common/lora_linear.py
@@ -367,7 +367,9 @@ def init_lora_weight(
             self.device_,
         )
 
-        self.loras_[adapter_name].reset_parameters(lora_tensor)  # lora_tensor is the (lora_a, lora_b) tuple
+        self.loras_[adapter_name].reset_parameters(
+            lora_tensor
+        )  # lora_tensor is the (lora_a, lora_b) tuple
 
     def _appy_dora(
         self,
diff --git a/moe_peft/evaluator.py b/moe_peft/evaluator.py
index 6cf2d67..68fd136 100644
--- a/moe_peft/evaluator.py
+++ b/moe_peft/evaluator.py
@@ -2,16 +2,16 @@
 import logging
 import time
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 import torch
 
 from .adapters import MixLoraConfig
+from .analyst import process
 from .common import InputData, LLMBatchConfig, LLMModelInput, Prompt
 from .model import LLMModel
 from .tasks import BasicMetric, BasicTask, CommonSenseTask, task_dict
 from .tokenizer import Tokenizer
-from .analyst import process
 
 
 @dataclass
@@ -44,7 +44,9 @@ def _dataload_fn(self, tokenizer: Tokenizer, **tokenizer_kwargs):
         return data
 
     @staticmethod
-    def from_config(config: Dict[str, any]) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
+    def from_config(
+        config: Dict[str, any]
+    ) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
         adapter_name = config["name"]
         data_path = config.get("data", None)
         task_list = config.get("task_name", "casual").split(";")
@@ -249,7 +251,9 @@ def _compute_result(model, configs, save_file):
                     router_statistic_[idx] += val
                 if not config.svd_ana:
                     layer.mlp_.moes_[config.adapter_name].profiler_ = None
-                result["router_profile"] = list(val / 32 for val in router_statistic_)
+                result["router_profile"] = list(
+                    val / 32 for val in router_statistic_
+                )
             final_result = result
 
         results.append(final_result)
@@ -269,7 +273,7 @@ def evaluate(
     model: LLMModel,
     tokenizer: Tokenizer,
     configs: List[EvaluateConfig],  # possibly multiple config files 😋
-    max_concurrent_jobs: int = None, 
+    max_concurrent_jobs: int = None,
     retrying_steps: int = 20,
     max_seq_len: int = 512,
     save_file: str = None,
@@ -338,11 +342,13 @@ def evaluate(
         for config in configs:
             # call analyst process
             svd_result = process(model, config)
-            file = f"svd_result_{config.adapter_name}.json" if not save_file else save_file
+            file = (
+                f"svd_result_{config.adapter_name}.json" if not save_file else save_file
+            )
             with open(file, "w") as f:
                 json.dump(svd_result, f, indent=4)
             logging.info(f"saving svd_analysis result to {file}")
         return _compute_result(model, configs, save_file)
 
-    return _compute_result(model, configs, save_file)
\ No newline at end of file
+    return _compute_result(model, configs, save_file)
diff --git a/moe_peft/model.py b/moe_peft/model.py
index 1c1b37e..e7ad880 100644
--- a/moe_peft/model.py
+++ b/moe_peft/model.py
@@ -252,10 +252,14 @@ def init_lora_layer_weight(  # attach the LoRA weights to the different linear layers
         lora_a = None
         lora_b = None
     else:
-        lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)  # lora_weights is a parameter-container class; values are read with .get()
+        lora_a = lora_weights.get(
+            f"{module_name}.lora_A.weight", None
+        )  # lora_weights is a parameter-container class; values are read with .get()
         lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)
 
-    lora_linear.init_lora_weight(lora_config, (lora_a, lora_b))  # initialize lora_linear, passing the config and the (lora_a, lora_b) tuple
+    lora_linear.init_lora_weight(
+        lora_config, (lora_a, lora_b)
+    )  # initialize lora_linear, passing the config and the (lora_a, lora_b) tuple
 
 
 def get_lora_layer_weight(
@@ -471,13 +475,15 @@ def forward(
             hidden_states, cache_position.unsqueeze(0)
         )
 
-        hidden_states, all_router_logits = self._call_decoder_stack(  # now actually call the decoder stack
-            hidden_states,
-            input_args,
-            rotary_emb,
-            causal_mask,
-            cache_position,
-            past_key_values,
+        hidden_states, all_router_logits = (
+            self._call_decoder_stack(  # now actually call the decoder stack
+                hidden_states,
+                input_args,
+                rotary_emb,
+                causal_mask,
+                cache_position,
+                past_key_values,
+            )
         )
 
         # calculate loss
@@ -585,7 +591,11 @@ def from_pretrained(
         return LLMModel(model)
 
     def init_adapter(
-        self, config: AdapterConfig, weight: Optional[Dict[str, torch.Tensor]] = None  # for now, the weight passed in is the LoRA weight
+        self,
+        config: AdapterConfig,
+        weight: Optional[
+            Dict[str, torch.Tensor]
+        ] = None,  # for now, the weight passed in is the LoRA weight
     ):
         # Patch for MixLoRA
         if isinstance(config, MixLoraConfig) and config.act_fn_ is None:
@@ -596,7 +606,7 @@ def init_adapter(
         if config.task_name in task_dict and isinstance(
             task_dict[config.task_name], SequenceClassificationTask
         ):
-            output_layer = ClassificationOutputLayer( # output layer; contains the lora_weight
+            output_layer = ClassificationOutputLayer(  # output layer; contains the lora_weight
                 **task_dict[config.task_name].init_kwargs(),
                 hidden_size=self.config_.dim_,
                 pad_token_id=self.config_.pad_token_id_,
@@ -605,14 +615,17 @@ def init_adapter(
         else:
             output_layer = CasualOutputLayer(
-                vocab_size=self.config_.vocab_size_, weight=self.model_.lm_head_  # the pretrained weights are loaded here
+                vocab_size=self.config_.vocab_size_,
+                weight=self.model_.lm_head_,  # the pretrained weights are loaded here
             )
 
         self.output_.layers_[config.adapter_name] = output_layer
         if type(config) is not AdapterConfig:
             # init transformer layers: this loop walks every TransformerLayer and loads the finetuned parameters
             for transformer_layer in self.model_.layers_:
-                init_lora_layer_weight(transformer_layer, self.config_, config, weight)  # LoRA weight
+                init_lora_layer_weight(
+                    transformer_layer, self.config_, config, weight
+                )  # LoRA weight
         else:
             assert weight is None, "can not load basic adapter with weight"
@@ -672,7 +685,7 @@ def load_adapter(self, name_or_path: str, adapter_name: Optional[str] = None):
         ) as fp:
             lora_config = lora_config_factory(json.load(fp))
         lora_config.adapter_name = adapter_name
-        lora_weight = torch.load( # load the finetuned LoRA weights here
+        lora_weight = torch.load(  # load the finetuned LoRA weights here
            name_or_path + os.sep + "adapter_model.bin",
            map_location=self.device_,
            weights_only=False,
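
A minimal usage sketch of the entry points this patch wires up. The `model` and `config` construction is elided, and the shape of the result is an assumption read off the diff: `SVDProcessor` is built from a loaded `LLMModel` plus one adapter config, and `evaluate()` writes the same result to JSON when `config.svd_ana` is set.

```python
import json

import moe_peft

model = ...   # a loaded moe_peft.LLMModel with the LoRA/MixLoRA adapter attached (elided)
config = ...  # the EvaluateConfig for that adapter (elided)

# process() traverses the pretrained and finetuned weights (MoE-aware when the
# config says so) and returns, per layer and per projection, the cosine
# similarities between the top singular vectors of both weight matrices.
processor = moe_peft.SVDProcessor(model, config)
svd_result = processor.process()

# Same output path evaluate() takes when config.svd_ana is enabled.
with open(f"svd_result_{config.adapter_name}.json", "w") as f:
    json.dump(svd_result, f, indent=4)
```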