
Commit

fix ci errors
Pherenice1125 committed Dec 9, 2024
1 parent 0ac37de commit 3d07888
Showing 9 changed files with 216 additions and 149 deletions.
2 changes: 1 addition & 1 deletion evaluator.py
@@ -51,4 +51,4 @@ def main(


if __name__ == "__main__":
fire.Fire(main)
fire.Fire(main)
6 changes: 4 additions & 2 deletions moe_peft.py
@@ -176,7 +176,9 @@ def init_adapter_config(
config_class.prompt_template = lora_config.get("prompt", None)
config_list.append(config_class)
elif args.evaluate:
config_list.extend(moe_peft.EvaluateConfig.from_config(lora_config))  # the config["lora"] section
config_list.extend(
moe_peft.EvaluateConfig.from_config(lora_config)
)  # the config["lora"] section
# moe_flag?
else:
config_list.append(moe_peft.TrainConfig.from_config(lora_config))
@@ -206,7 +208,7 @@ def inference(
for config in configs:
config.prompts = [input_raw]
callback = None if args.disable_log else inference_callback
outputs = moe_peft.generate(  # GenConfig is already configured here; tokenization starts and the sliced output vectors are generated step by step
outputs = moe_peft.generate(  # GenConfig is already configured here; tokenization starts and the sliced output vectors are generated step by step
model,
tokenizer,
configs,
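Note that EvaluateConfig.from_config returns a list rather than a single config (the evaluator splits task_name on ";" into one entry per task), which is why the result is extended instead of appended above. A toy illustration of that expansion, using plain dicts and a made-up config in place of the real EvaluateConfig class:

# Hypothetical adapter config; only "name" and "task_name" matter for this illustration.
lora_config = {"name": "demo_adapter", "task_name": "arc-e;obqa;boolq"}

task_list = lora_config.get("task_name", "casual").split(";")
config_list = []
config_list.extend({"adapter_name": lora_config["name"], "task": task} for task in task_list)
print(len(config_list))  # 3: one evaluation config per task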
6 changes: 3 additions & 3 deletions moe_peft/__init__.py
@@ -1,4 +1,6 @@
from .adapters import adapter_factory
from .analyst import process
from .analysts import SVDProcessor
from .common import (
AdapterConfig,
LLMBatchConfig,
@@ -19,8 +21,6 @@
from .tokenizer import Tokenizer
from .trainer import TrainConfig, train
from .utils import is_package_available, setup_logging
from .analyst import process
from .analysts import SVDProcessor

assert is_package_available("torch", "2.3.0"), "MoE-PEFT requires torch>=2.3.0"
assert is_package_available(
@@ -54,5 +54,5 @@
"setup_logging",
"executor",
"process",
"SVDProcessor"
"SVDProcessor",
]
4 changes: 3 additions & 1 deletion moe_peft/adapters/mixlora/model.py
@@ -182,7 +182,9 @@ def forward(
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate_(hidden_states)  # compute each expert's score here

routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)  # normalization
routing_weights = F.softmax(
router_logits, dim=1, dtype=self.dtype_
)  # normalization
routing_weights, selected_experts = torch.topk(
routing_weights, self.topk_, dim=-1
)
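For reference, the routing step above follows the standard top-k mixture-of-experts pattern: softmax over the router logits, pick the top-k experts per token, then renormalize the selected weights. A minimal standalone sketch in plain PyTorch; the tensor sizes, and whether this repo renormalizes after topk, are assumptions:

import torch
import torch.nn.functional as F

tokens, n_experts, top_k = 6, 8, 2                      # toy dimensions
router_logits = torch.randn(tokens, n_experts)          # what gate_(hidden_states) would produce
routing_weights = F.softmax(router_logits, dim=1)       # normalize the expert scores per token
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)  # assumed renormalization
print(selected_experts.shape, routing_weights.shape)    # torch.Size([6, 2]) torch.Size([6, 2])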
213 changes: 113 additions & 100 deletions moe_peft/analyst.py

Large diffs are not rendered by default.

69 changes: 49 additions & 20 deletions moe_peft/analysts.py
@@ -1,6 +1,7 @@
import torch
from typing import List, Tuple

import torch


class SVDProcessor:
def __init__(self, model, config):
@@ -38,7 +39,7 @@ def mapping(keys_list: list) -> list:
"o_proj": "wo_",
"gate_proj": "w1_",
"down_proj": "w2_",
"up_proj": "w3_"
"up_proj": "w3_",
}

return [
@@ -57,15 +58,17 @@ def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
"""
return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))

def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[List, List]:
def weight_traverse(
self, target_linears_list, is_moe: bool = False
) -> Tuple[List, List]:
"""
Traverse the weights.
:param target_linears_list: list of extracted linear layers
:param is_moe: whether MoE mode is used
:return: (pretrained weights, fine-tuned weights)
"""
attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
mlp_linears = ['w1_', 'w2_', 'w3_']
attn_linears = ["wq_", "wk_", "wv_", "wo_"]
mlp_linears = ["w1_", "w2_", "w3_"]

pretrained_layers_weights = []
tuned_layers_weights = []
@@ -88,21 +91,29 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
adapter = loras_dict.get(adapter_name, None)

if adapter:
p_weight = getattr(adapter, 'base_layer_').weight
lora_a_weight = getattr(adapter, 'lora_a_').weight
lora_b_weight = getattr(adapter, 'lora_b_').weight
p_weight = getattr(adapter, "base_layer_").weight
lora_a_weight = getattr(adapter, "lora_a_").weight
lora_b_weight = getattr(adapter, "lora_b_").weight
t_weight = lora_b_weight @ lora_a_weight + p_weight

linear_key = linear.rstrip('_')
linear_key = linear.rstrip("_")
pretrained_layer_weights.append({linear_key: p_weight})
tuned_layer_weights.append({linear_key: t_weight})

# MoE-specific logic
if is_moe and hasattr(layer.mlp_, 'moes_') and adapter_name in layer.mlp_.moes_:
profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
if (
is_moe
and hasattr(layer.mlp_, "moes_")
and adapter_name in layer.mlp_.moes_
):
profile_matrix = layer.mlp_.moes_[
adapter_name
].profiler_
expert_value_lists = loras_dict.values()
tuned_expert_value_lists = []
total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
total_base_layer = getattr(
layer.mlp_.mlp_, linear
).base_layer_.weight

for value in expert_value_lists:
p_weight = value.base_layer_.weight
@@ -111,20 +122,30 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
t_weight = lora_b_weight @ lora_a_weight + p_weight
tuned_expert_value_lists.append(t_weight)

final_tuned_weights = self.moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
pretrained_layer_weights.append({linear_key: total_base_layer})
tuned_layer_weights.append({linear_key: final_tuned_weights})
final_tuned_weights = self.moe_weight_caculate(
profile_matrix, tuned_expert_value_lists
)
pretrained_layer_weights.append(
{linear_key: total_base_layer}
)
tuned_layer_weights.append(
{linear_key: final_tuned_weights}
)

except AttributeError as e:
raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
raise AttributeError(
f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
)

pretrained_layers_weights.append(pretrained_layer_weights)
tuned_layers_weights.append(tuned_layer_weights)

return pretrained_layers_weights, tuned_layers_weights

@staticmethod
def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cuda:0') -> List:
def svd_analysis(
p_weights: list, f_weights: list, n: int = 9, device: str = "cuda:0"
) -> List:
"""
Comparative analysis of the SVD-decomposed weights.
:param p_weights: pretrained weights
@@ -138,8 +159,16 @@ def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cu
for layer_idx, (p_layer, f_layer) in enumerate(zip(p_weights, f_weights)):
layer_results = []
for key in p_layer.keys():
p_tensor = p_layer[key].to(device) if isinstance(p_layer[key], torch.Tensor) else torch.tensor(p_layer[key], device=device)
f_tensor = f_layer[key].to(device) if isinstance(f_layer[key], torch.Tensor) else torch.tensor(f_layer[key], device=device)
p_tensor = (
p_layer[key].to(device)
if isinstance(p_layer[key], torch.Tensor)
else torch.tensor(p_layer[key], device=device)
)
f_tensor = (
f_layer[key].to(device)
if isinstance(f_layer[key], torch.Tensor)
else torch.tensor(f_layer[key], device=device)
)

p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
@@ -172,4 +201,4 @@ def process(self) -> List:
else:
weights = self.weight_traverse(mapped_keys)

return self.svd_analysis(weights[0], weights[1])
return self.svd_analysis(weights[0], weights[1])
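To make the analysis above concrete: weight_traverse reconstructs each fine-tuned matrix as lora_b @ lora_a + base_weight, and svd_analysis decomposes both the pretrained and the tuned matrix with torch.linalg.svd and compares their leading left singular vectors. The exact comparison metric is cut off in this diff, so the absolute cosine overlap below is only an assumed stand-in, and the shapes are illustrative:

import torch

def subspace_similarity(w_pre: torch.Tensor, w_tuned: torch.Tensor, n: int = 9) -> list:
    # Compare the top-n left singular vectors of two weight matrices.
    # The |cosine| metric here is an assumption; the repo's actual metric is not shown in this diff.
    p_u, _, _ = torch.linalg.svd(w_pre, full_matrices=False)
    f_u, _, _ = torch.linalg.svd(w_tuned, full_matrices=False)
    n = min(n, p_u.shape[1], f_u.shape[1])
    return [torch.abs(p_u[:, i] @ f_u[:, i]).item() for i in range(n)]

# Toy example: a base weight plus a rank-4 LoRA update, mirroring weight_traverse.
base = torch.randn(64, 64)
lora_a, lora_b = torch.randn(4, 64), torch.randn(64, 4)
tuned = lora_b @ lora_a + base
print(subspace_similarity(base, tuned))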
4 changes: 3 additions & 1 deletion moe_peft/common/lora_linear.py
@@ -367,7 +367,9 @@ def init_lora_weight(
self.device_,
)

self.loras_[adapter_name].reset_parameters(lora_tensor)  # lora_tensor is the (lora_a, lora_b) tuple
self.loras_[adapter_name].reset_parameters(
lora_tensor
)  # lora_tensor is the (lora_a, lora_b) tuple

def _appy_dora(
self,
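As context for the snippet above: once lora_a_ and lora_b_ are initialized, a LoRA-wrapped linear layer typically computes the frozen base projection plus a scaled low-rank update. The following is a generic sketch of that forward pass, not this repo's Linear implementation; the scaling factor alpha / r is an assumption:

import torch
import torch.nn as nn

class LoraLinearSketch(nn.Module):
    # Generic LoRA linear: y = W x + scaling * B(A(x)). Illustrative only.
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base_layer_ = base                      # frozen pretrained projection
        self.lora_a_ = nn.Linear(base.in_features, r, bias=False)
        self.lora_b_ = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b_.weight)          # so the adapter starts as a no-op
        self.scaling_ = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base_layer_(x) + self.scaling_ * self.lora_b_(self.lora_a_(x))

layer = LoraLinearSketch(nn.Linear(32, 32))
print(layer(torch.randn(2, 32)).shape)               # torch.Size([2, 32])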
20 changes: 13 additions & 7 deletions moe_peft/evaluator.py
@@ -2,16 +2,16 @@
import logging
import time
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Dict, List

import torch

from .adapters import MixLoraConfig
from .analyst import process
from .common import InputData, LLMBatchConfig, LLMModelInput, Prompt
from .model import LLMModel
from .tasks import BasicMetric, BasicTask, CommonSenseTask, task_dict
from .tokenizer import Tokenizer
from .analyst import process


@dataclass
@@ -44,7 +44,9 @@ def _dataload_fn(self, tokenizer: Tokenizer, **tokenizer_kwargs):
return data

@staticmethod
def from_config(config: Dict[str, any]) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
def from_config(
config: Dict[str, any]
) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
adapter_name = config["name"]
data_path = config.get("data", None)
task_list = config.get("task_name", "casual").split(";")
@@ -249,7 +251,9 @@ def _compute_result(model, configs, save_file):
router_statistic_[idx] += val
if not config.svd_ana:
layer.mlp_.moes_[config.adapter_name].profiler_ = None
result["router_profile"] = list(val / 32 for val in router_statistic_)
result["router_profile"] = list(
val / 32 for val in router_statistic_
)

final_result = result
results.append(final_result)
@@ -269,7 +273,7 @@ def evaluate(
model: LLMModel,
tokenizer: Tokenizer,
configs: List[EvaluateConfig],  # there may be multiple config files 😋
max_concurrent_jobs: int = None,
max_concurrent_jobs: int = None,
retrying_steps: int = 20,
max_seq_len: int = 512,
save_file: str = None,
@@ -338,11 +342,13 @@ def evaluate(
for config in configs: # call analyst process
svd_result = process(model, config)

file = f"svd_result_{config.adapter_name}.json" if not save_file else save_file
file = (
f"svd_result_{config.adapter_name}.json" if not save_file else save_file
)
with open(file, "w") as f:
json.dump(svd_result, f, indent=4)
logging.info(f"saving svd_analysis result to {file}")

return _compute_result(model, configs, save_file)

return _compute_result(model, configs, save_file)
return _compute_result(model, configs, save_file)
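One detail worth noting in _compute_result above: the per-expert routing counts from each layer's profiler_ are summed across decoder layers and divided by a hard-coded 32, presumably the model's layer count. A small self-contained sketch of that averaging; the layer count, expert count, and profiler shape are assumptions:

import torch

num_layers, num_experts = 32, 8
# Assumed shape of each layer's profiler_: one routing count per expert.
layer_profilers = [torch.randint(0, 100, (num_experts,)).float() for _ in range(num_layers)]

router_statistic_ = [0.0] * num_experts
for profiler in layer_profilers:
    for idx, val in enumerate(profiler.tolist()):
        router_statistic_[idx] += val

router_profile = [val / num_layers for val in router_statistic_]  # mean routing load per expert
print(router_profile)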
41 changes: 27 additions & 14 deletions moe_peft/model.py
@@ -252,10 +252,14 @@ def init_lora_layer_weight(  # attach the LoRA weights to the different linear layers
lora_a = None
lora_b = None
else:
lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)  # lora_weights is a parameter container; fetch entries with .get()
lora_a = lora_weights.get(
f"{module_name}.lora_A.weight", None
)  # lora_weights is a parameter container; fetch entries with .get()
lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)

lora_linear.init_lora_weight(lora_config, (lora_a, lora_b))  # initialize lora_linear with the config and the (lora_a, lora_b) tuple
lora_linear.init_lora_weight(
lora_config, (lora_a, lora_b)
)  # initialize lora_linear with the config and the (lora_a, lora_b) tuple


def get_lora_layer_weight(
@@ -471,13 +475,15 @@ def forward(
hidden_states, cache_position.unsqueeze(0)
)

hidden_states, all_router_logits = self._call_decoder_stack(  # actually call the decoder stack
hidden_states,
input_args,
rotary_emb,
causal_mask,
cache_position,
past_key_values,
hidden_states, all_router_logits = (
self._call_decoder_stack(  # actually call the decoder stack
hidden_states,
input_args,
rotary_emb,
causal_mask,
cache_position,
past_key_values,
)
)

# calculate loss
@@ -585,7 +591,11 @@ def from_pretrained(
return LLMModel(model)

def init_adapter(
self, config: AdapterConfig, weight: Optional[Dict[str, torch.Tensor]] = None  # for now, the weight passed in is the LoRA weight
self,
config: AdapterConfig,
weight: Optional[
Dict[str, torch.Tensor]
] = None,  # for now, the weight passed in is the LoRA weight
):
# Patch for MixLoRA
if isinstance(config, MixLoraConfig) and config.act_fn_ is None:
@@ -596,7 +606,7 @@ def init_adapter(
if config.task_name in task_dict and isinstance(
task_dict[config.task_name], SequenceClassificationTask
):
output_layer = ClassificationOutputLayer(  # output layer, contains the LoRA weight
output_layer = ClassificationOutputLayer(  # output layer, contains the LoRA weight
**task_dict[config.task_name].init_kwargs(),
hidden_size=self.config_.dim_,
pad_token_id=self.config_.pad_token_id_,
@@ -605,14 +615,17 @@
)
else:
output_layer = CasualOutputLayer(
vocab_size=self.config_.vocab_size_, weight=self.model_.lm_head_  # the pretrained weights are loaded here
vocab_size=self.config_.vocab_size_,
weight=self.model_.lm_head_,  # the pretrained weights are loaded here
)

self.output_.layers_[config.adapter_name] = output_layer
if type(config) is not AdapterConfig:
# init transformer layers: this loop iterates over all TransformerLayers and loads the fine-tuned parameters
for transformer_layer in self.model_.layers_:
init_lora_layer_weight(transformer_layer, self.config_, config, weight) # LoRA weight
init_lora_layer_weight(
transformer_layer, self.config_, config, weight
) # LoRA weight
else:
assert weight is None, "can not load basic adapter with weight"

@@ -672,7 +685,7 @@ def load_adapter(self, name_or_path: str, adapter_name: Optional[str] = None):
) as fp:
lora_config = lora_config_factory(json.load(fp))
lora_config.adapter_name = adapter_name
lora_weight = torch.load(  # load the fine-tuned LoRA weights here
lora_weight = torch.load(  # load the fine-tuned LoRA weights here
name_or_path + os.sep + "adapter_model.bin",
map_location=self.device_,
weights_only=False,
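For orientation: load_adapter reads adapter_model.bin into a plain dict of tensors with torch.load, and init_lora_layer_weight then looks up each module's update via keys of the form "{module_name}.lora_A.weight" / "{module_name}.lora_B.weight". A minimal sketch of that lookup; the key layout and module name below are examples, not necessarily what this repo produces:

import torch

# Stand-in for torch.load(name_or_path + os.sep + "adapter_model.bin", map_location="cpu").
lora_weights = {
    "base_model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.randn(8, 64),
    "base_model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.zeros(64, 8),
}

module_name = "base_model.model.layers.0.self_attn.q_proj"   # example name only
lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)
lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)

delta = lora_b @ lora_a          # low-rank update applied on top of the frozen base weight
print(delta.shape)               # torch.Size([64, 64])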
