diff --git a/evaluator.py b/evaluator.py
index 110ea02..b467ec3 100644
--- a/evaluator.py
+++ b/evaluator.py
@@ -51,4 +51,4 @@ def main(
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
\ No newline at end of file
+    fire.Fire(main)
diff --git a/moe_peft.py b/moe_peft.py
index fad929b..7a0ef41 100644
--- a/moe_peft.py
+++ b/moe_peft.py
@@ -176,7 +176,9 @@ def init_adapter_config(
         config_class.prompt_template = lora_config.get("prompt", None)
         config_list.append(config_class)
     elif args.evaluate:
-        config_list.extend(moe_peft.EvaluateConfig.from_config(lora_config))  # the config["lora"] part
+        config_list.extend(
+            moe_peft.EvaluateConfig.from_config(lora_config)
+        )  # the config["lora"] part
         # moe_flag?
     else:
         config_list.append(moe_peft.TrainConfig.from_config(lora_config))
@@ -206,7 +208,7 @@ def inference(
     for config in configs:
         config.prompts = [input_raw]
     callback = None if args.disable_log else inference_callback
-    outputs = moe_peft.generate( #the Genconfig is fully set up at this point; start tokenizing and progressively generate the sliced vectors
+    outputs = moe_peft.generate(  # the Genconfig is fully set up at this point; start tokenizing and progressively generate the sliced vectors
         model,
         tokenizer,
         configs,
diff --git a/moe_peft/__init__.py b/moe_peft/__init__.py
index f298574..d856b11 100644
--- a/moe_peft/__init__.py
+++ b/moe_peft/__init__.py
@@ -1,4 +1,6 @@
 from .adapters import adapter_factory
+from .analyst import process
+from .analysts import SVDProcessor
 from .common import (
     AdapterConfig,
     LLMBatchConfig,
@@ -19,8 +21,6 @@
 from .tokenizer import Tokenizer
 from .trainer import TrainConfig, train
 from .utils import is_package_available, setup_logging
 
-from .analyst import process
-from .analysts import SVDProcessor
 
 assert is_package_available("torch", "2.3.0"), "MoE-PEFT requires torch>=2.3.0"
 assert is_package_available(
@@ -54,5 +54,5 @@
     "setup_logging",
     "executor",
     "process",
-    "SVDProcessor"
+    "SVDProcessor",
 ]
diff --git a/moe_peft/adapters/mixlora/model.py b/moe_peft/adapters/mixlora/model.py
index 6ef8eff..8c37124 100644
--- a/moe_peft/adapters/mixlora/model.py
+++ b/moe_peft/adapters/mixlora/model.py
@@ -182,7 +182,9 @@ def forward(
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate_(hidden_states)  # each expert's routing score is computed here
-        routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)  # normalization
+        routing_weights = F.softmax(
+            router_logits, dim=1, dtype=self.dtype_
+        )  # normalization
         routing_weights, selected_experts = torch.topk(
             routing_weights, self.topk_, dim=-1
         )
diff --git a/moe_peft/analyst.py b/moe_peft/analyst.py
index 5fa9a35..e75e7d9 100644
--- a/moe_peft/analyst.py
+++ b/moe_peft/analyst.py
@@ -1,11 +1,11 @@
-import torch
 import logging
-import numpy as np
-from typing import Tuple, Dict
-from sklearn.metrics.pairwise import cosine_similarity
+from typing import Dict, Tuple
+
+import torch
 
 from .model import LLMModel
 
+
 def keys_extraction(config) -> list:
     result = []
@@ -16,6 +16,7 @@ def keys_extraction(config) -> list:
     return result  # redundant data structure, to be optimized
 
+
 def mapping(keys_list) -> list:
     mapping_dict = {
         "q_proj": "wq_",
@@ -24,7 +25,7 @@ def mapping(keys_list) -> list:
         "o_proj": "wo_",
         "gate_proj": "w1_",
         "down_proj": "w2_",
-        "up_proj": "w3_"
+        "up_proj": "w3_",
     }
 
     mapped_list = [
@@ -35,17 +36,19 @@ def mapping(keys_list) -> list:
 
     return mapped_list
 
+
 def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
     return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))
 
+
 def lora_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]:
-    attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
-    mlp_linears = ['w1_', 'w2_', 'w3_']
-    
+    attn_linears = ["wq_", "wk_", "wv_", "wo_"]
"wo_"] + mlp_linears = ["w1_", "w2_", "w3_"] + pretrained_layers_weights = [] tuned_layers_weights = [] - for layer in model.model_.layers_: + for layer in model.model_.layers_: pretrained_layer_weights = [] tuned_layer_weights = [] for item in target_linears_list: @@ -55,48 +58,53 @@ def lora_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: try: loras_dict = getattr(layer.self_attn_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: t_weight}) except AttributeError as e: - raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}") - + raise AttributeError( + f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}" + ) + elif linear in mlp_linears: try: loras_dict = getattr(layer.mlp_.mlp_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: t_weight}) except AttributeError as e: - raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}") - + raise AttributeError( + f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}" + ) + else: raise ValueError(f"Invalid linear name: {linear}") - + pretrained_layers_weights.append(pretrained_layer_weights) tuned_layers_weights.append(tuned_layer_weights) return pretrained_layers_weights, tuned_layers_weights + def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: - attn_linears = ['wq_', 'wk_', 'wv_', 'wo_'] - mlp_linears = ['w1_', 'w2_', 'w3_'] - + attn_linears = ["wq_", "wk_", "wv_", "wo_"] + mlp_linears = ["w1_", "w2_", "w3_"] + pretrained_layers_weights = [] tuned_layers_weights = [] @@ -110,28 +118,36 @@ def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]: try: loras_dict = getattr(layer.self_attn_, linear).loras_ adapter = loras_dict.get(adapter_name, None) - + if adapter is not None: - p_weight = getattr(adapter, 'base_layer_').weight - lora_a_weight = getattr(adapter, 'lora_a_').weight - lora_b_weight = getattr(adapter, 'lora_b_').weight + p_weight = getattr(adapter, "base_layer_").weight + lora_a_weight = getattr(adapter, "lora_a_").weight + lora_b_weight = getattr(adapter, "lora_b_").weight t_weight = lora_b_weight @ lora_a_weight + p_weight - - linear_key = linear.rstrip('_') + + linear_key = linear.rstrip("_") pretrained_layer_weights.append({linear_key: p_weight}) tuned_layer_weights.append({linear_key: 
                             tuned_layer_weights.append({linear_key: t_weight})
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
-                    
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
+
                 elif linear in mlp_linears:
                     try:
                         loras_dict = getattr(layer.mlp_.mlp_, linear).loras_
-                        adapter = loras_dict.get(adapter_name, None)  # fetch the adapter by adapter_name
+                        adapter = loras_dict.get(
+                            adapter_name, None
+                        )  # fetch the adapter by adapter_name
                         if layer.mlp_.moes_:
-                            profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
+                            profile_matrix = layer.mlp_.moes_[
+                                adapter_name
+                            ].profiler_
                             expert_value_lists = loras_dict.values()
                             tuned_expert_value_lists = []
-                            total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
+                            total_base_layer = getattr(
+                                layer.mlp_.mlp_, linear
+                            ).base_layer_.weight
 
                             for value in expert_value_lists:
                                 p_weight = value.base_layer_.weight
@@ -140,104 +156,101 @@ def moe_weight_traverse(model, target_linears_list) -> Tuple[Dict, Dict]:
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
                                 tuned_expert_value_lists.append(t_weight)
 
-                            final_tuned_weights = moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
-                            linear_key = linear.rstrip('_')
-                            pretrained_layer_weights.append({linear_key: total_base_layer})  # this is the pretrained total weight of the MoE layer
-                            tuned_layer_weights.append({linear_key: final_tuned_weights})  # these are all the MoE-layer weights after finetuning and routing-weighted merging
+                            final_tuned_weights = moe_weight_caculate(
+                                profile_matrix, tuned_expert_value_lists
+                            )
+                            linear_key = linear.rstrip("_")
+                            pretrained_layer_weights.append(
+                                {linear_key: total_base_layer}
+                            )  # this is the pretrained total weight of the MoE layer
+                            tuned_layer_weights.append(
+                                {linear_key: final_tuned_weights}
+                            )  # these are all the MoE-layer weights after finetuning and routing-weighted merging
                         else:  # plain LoRA finetuning logic
                             if adapter is not None:
-                                p_weight = getattr(adapter, 'base_layer_').weight
-                                lora_a_weight = getattr(adapter, 'lora_a_').weight
-                                lora_b_weight = getattr(adapter, 'lora_b_').weight
+                                p_weight = getattr(adapter, "base_layer_").weight
+                                lora_a_weight = getattr(adapter, "lora_a_").weight
+                                lora_b_weight = getattr(adapter, "lora_b_").weight
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
-                                
-                                linear_key = linear.rstrip('_')
-                                pretrained_layer_weights.append({linear_key: p_weight})
+
+                                linear_key = linear.rstrip("_")
+                                pretrained_layer_weights.append(
+                                    {linear_key: p_weight}
+                                )
                                 tuned_layer_weights.append({linear_key: t_weight})
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
-                    
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
+
                 else:
                     raise ValueError(f"Invalid linear name: {linear}")
-        
+
         pretrained_layers_weights.append(pretrained_layer_weights)
         tuned_layers_weights.append(tuned_layer_weights)
-    
+
     return pretrained_layers_weights, tuned_layers_weights
 
-'''def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device = 'cuda:0'):
-    total_results = []
-    results = []
-    for idx, (single_p_layer, single_f_layer) in enumerate(zip(p_weights, f_weights)):  # 126 linears
-        logging.info(f"Processing layer {idx} for SVD analysis...")
-        for (p_linear, f_linear) in zip(single_p_layer, single_f_layer):
-            # layer_results = []
-            for key in p_linear.keys():
-                p_tensor = p_linear[key].to(device) if isinstance(p_linear[key], torch.Tensor) else torch.tensor(p_linear[key], device=device)
-                f_tensor = f_linear[key].to(device) if isinstance(f_linear[key], torch.Tensor) else torch.tensor(f_linear[key], device=device)
-
-                p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
-                f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
-
-                n_min = min(n, p_u.shape[1], f_u.shape[1])
-                p_top_n = p_u[:, :n_min]
-                f_top_n = f_u[:, :n_min]
-
-                similarity = torch.mm(p_top_n.T, f_top_n)  # dot product
-                p_norms = torch.norm(p_top_n.T, dim=1, keepdim=True)  # norms of p_top_n
-                f_norms = torch.norm(f_top_n, dim=0, keepdim=True)  # norms of f_top_n
-                similarity = similarity / (p_norms * f_norms)  # normalize to cosine similarity
-                avg_similarity = similarity.mean().item()  # convert to a Python scalar
-
-                # layer_results.append({key: avg_similarity})
-                results.append({key: avg_similarity})
-    total_results.append(results)
-
-    return total_results'''
-
-def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device='cuda:0'):
+def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device="cuda:0"):
     total_results = []
-    for idx, (single_p_layer, single_f_layer) in enumerate(zip(p_weights, f_weights)):  # iterate over each layer
+    for idx, (single_p_layer, single_f_layer) in enumerate(
+        zip(p_weights, f_weights)
+    ):  # iterate over each layer
         logging.info(f"Processing layer {idx} for SVD analysis...")
         layer_results = []
-        for p_linear, f_linear in zip(single_p_layer, single_f_layer):  # iterate over each layer's linear modules
+        for p_linear, f_linear in zip(
+            single_p_layer, single_f_layer
+        ):  # iterate over each layer's linear modules
            layer_linear_results = {}
-            
+
             for key in p_linear.keys():  # iterate over each weight group in the linear module
-                p_tensor = p_linear[key].to(device) if isinstance(p_linear[key], torch.Tensor) else torch.tensor(p_linear[key], device=device)
-                f_tensor = f_linear[key].to(device) if isinstance(f_linear[key], torch.Tensor) else torch.tensor(f_linear[key], device=device)
-                
+                p_tensor = (
+                    p_linear[key].to(device)
+                    if isinstance(p_linear[key], torch.Tensor)
+                    else torch.tensor(p_linear[key], device=device)
+                )
+                f_tensor = (
+                    f_linear[key].to(device)
+                    if isinstance(f_linear[key], torch.Tensor)
+                    else torch.tensor(f_linear[key], device=device)
+                )
+
                 # perform the SVD decomposition
                 p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
                 f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
-                
+
                 # take the top n singular vectors
                 n_min = min(n, p_u.shape[1], f_u.shape[1])
                 p_top_n = p_u[:, :n_min]
                 f_top_n = f_u[:, :n_min]
-                
+
                 # compute the cosine similarity
                 similarity = torch.mm(p_top_n.T, f_top_n)  # dot product
-                p_norms = torch.norm(p_top_n.T, dim=1, keepdim=True)  # norms of p_top_n
-                f_norms = torch.norm(f_top_n, dim=0, keepdim=True)  # norms of f_top_n
+                p_norms = torch.norm(
+                    p_top_n.T, dim=1, keepdim=True
+                )  # norms of p_top_n
+                f_norms = torch.norm(
+                    f_top_n, dim=0, keepdim=True
+                )  # norms of f_top_n
                 similarity = similarity / (p_norms * f_norms)  # normalize to cosine similarity
-                
+
                 # convert to a list of Python scalars
                 cos_similarities = similarity.diagonal().tolist()
-                
+
                 # store the results
                 layer_linear_results[key] = cos_similarities
-            
+
             layer_results.append(layer_linear_results)
-        
+
         total_results.append(layer_results)
-    
+
     return total_results
 
+
 def process(model: LLMModel, config):
     if config.moe_flag:
         weights = moe_weight_traverse(model, mapping(keys_extraction(config)))
diff --git a/moe_peft/analysts.py b/moe_peft/analysts.py
index 4e4b2b3..2a4f226 100644
--- a/moe_peft/analysts.py
+++ b/moe_peft/analysts.py
@@ -1,6 +1,7 @@
-import torch
 from typing import List, Tuple
 
+import torch
+
 
 class SVDProcessor:
     def __init__(self, model, config):
@@ -38,7 +39,7 @@ def mapping(keys_list: list) -> list:
             "o_proj": "wo_",
             "gate_proj": "w1_",
             "down_proj": "w2_",
-            "up_proj": "w3_"
+            "up_proj": "w3_",
         }
 
         return [
@@ -57,15 +58,17 @@ def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
         """
         return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))
 
-    def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[List, List]:
+    def weight_traverse(
+        self, target_linears_list, is_moe: bool = False
+    ) -> Tuple[List, List]:
         """
         Traverse the weights.
         :param target_linears_list: the list of extracted linear modules
         :param is_moe: whether MoE mode is enabled
         :return: (pretrained weights, finetuned weights)
         """
-        attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
-        mlp_linears = ['w1_', 'w2_', 'w3_']
+        attn_linears = ["wq_", "wk_", "wv_", "wo_"]
+        mlp_linears = ["w1_", "w2_", "w3_"]
 
         pretrained_layers_weights = []
         tuned_layers_weights = []
@@ -88,21 +91,29 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
                         adapter = loras_dict.get(adapter_name, None)
 
                         if adapter:
-                            p_weight = getattr(adapter, 'base_layer_').weight
-                            lora_a_weight = getattr(adapter, 'lora_a_').weight
-                            lora_b_weight = getattr(adapter, 'lora_b_').weight
+                            p_weight = getattr(adapter, "base_layer_").weight
+                            lora_a_weight = getattr(adapter, "lora_a_").weight
+                            lora_b_weight = getattr(adapter, "lora_b_").weight
                             t_weight = lora_b_weight @ lora_a_weight + p_weight
 
-                            linear_key = linear.rstrip('_')
+                            linear_key = linear.rstrip("_")
                             pretrained_layer_weights.append({linear_key: p_weight})
                             tuned_layer_weights.append({linear_key: t_weight})
 
                         # MoE-specific logic
-                        if is_moe and hasattr(layer.mlp_, 'moes_') and adapter_name in layer.mlp_.moes_:
-                            profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
+                        if (
+                            is_moe
+                            and hasattr(layer.mlp_, "moes_")
+                            and adapter_name in layer.mlp_.moes_
+                        ):
+                            profile_matrix = layer.mlp_.moes_[
+                                adapter_name
+                            ].profiler_
                             expert_value_lists = loras_dict.values()
                             tuned_expert_value_lists = []
-                            total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
+                            total_base_layer = getattr(
+                                layer.mlp_.mlp_, linear
+                            ).base_layer_.weight
 
                             for value in expert_value_lists:
                                 p_weight = value.base_layer_.weight
@@ -111,12 +122,20 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
                                 t_weight = lora_b_weight @ lora_a_weight + p_weight
                                 tuned_expert_value_lists.append(t_weight)
 
-                            final_tuned_weights = self.moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
-                            pretrained_layer_weights.append({linear_key: total_base_layer})
-                            tuned_layer_weights.append({linear_key: final_tuned_weights})
+                            final_tuned_weights = self.moe_weight_caculate(
+                                profile_matrix, tuned_expert_value_lists
+                            )
+                            pretrained_layer_weights.append(
+                                {linear_key: total_base_layer}
+                            )
+                            tuned_layer_weights.append(
+                                {linear_key: final_tuned_weights}
+                            )
 
                     except AttributeError as e:
-                        raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
+                        raise AttributeError(
+                            f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
+                        )
 
             pretrained_layers_weights.append(pretrained_layer_weights)
             tuned_layers_weights.append(tuned_layer_weights)
@@ -124,7 +143,9 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
 
         return pretrained_layers_weights, tuned_layers_weights
 
     @staticmethod
-    def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cuda:0') -> List:
+    def svd_analysis(
+        p_weights: list, f_weights: list, n: int = 9, device: str = "cuda:0"
+    ) -> List:
         """
         Comparative analysis of the SVD-decomposed weights.
         :param p_weights: pretrained weights
@@ -138,8 +159,16 @@ def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cu
         for layer_idx, (p_layer, f_layer) in enumerate(zip(p_weights, f_weights)):
             layer_results = []
             for key in p_layer.keys():
-                p_tensor = p_layer[key].to(device) if isinstance(p_layer[key], torch.Tensor) else torch.tensor(p_layer[key], device=device)
-                f_tensor = f_layer[key].to(device) if isinstance(f_layer[key], torch.Tensor) else torch.tensor(f_layer[key], device=device)
+                p_tensor = (
+                    p_layer[key].to(device)
+                    if isinstance(p_layer[key], torch.Tensor)
+                    else torch.tensor(p_layer[key], device=device)
+                )
+                f_tensor = (
+                    f_layer[key].to(device)
+                    if isinstance(f_layer[key], torch.Tensor)
+                    else torch.tensor(f_layer[key], device=device)
+                )
 
                 p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
                 f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
@@ -172,4 +201,4 @@ def process(self) -> List:
         else:
             weights = self.weight_traverse(mapped_keys)
 
-        return self.svd_analysis(weights[0], weights[1])
\ No newline at end of file
+        return self.svd_analysis(weights[0], weights[1])
diff --git a/moe_peft/common/lora_linear.py b/moe_peft/common/lora_linear.py
index 1885a19..b5b7695 100644
--- a/moe_peft/common/lora_linear.py
+++ b/moe_peft/common/lora_linear.py
@@ -367,7 +367,9 @@ def init_lora_weight(
             self.device_,
         )
 
-        self.loras_[adapter_name].reset_parameters(lora_tensor)  # lora_tensor is the (lora_a, lora_b) tuple
+        self.loras_[adapter_name].reset_parameters(
+            lora_tensor
+        )  # lora_tensor is the (lora_a, lora_b) tuple
 
     def _appy_dora(
         self,
diff --git a/moe_peft/evaluator.py b/moe_peft/evaluator.py
index 6cf2d67..68fd136 100644
--- a/moe_peft/evaluator.py
+++ b/moe_peft/evaluator.py
@@ -2,16 +2,16 @@
 import logging
 import time
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 import torch
 
 from .adapters import MixLoraConfig
+from .analyst import process
 from .common import InputData, LLMBatchConfig, LLMModelInput, Prompt
 from .model import LLMModel
 from .tasks import BasicMetric, BasicTask, CommonSenseTask, task_dict
 from .tokenizer import Tokenizer
-from .analyst import process
 
 
 @dataclass
@@ -44,7 +44,9 @@ def _dataload_fn(self, tokenizer: Tokenizer, **tokenizer_kwargs):
         return data
 
     @staticmethod
-    def from_config(config: Dict[str, any]) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
+    def from_config(
+        config: Dict[str, any]
+    ) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
         adapter_name = config["name"]
         data_path = config.get("data", None)
         task_list = config.get("task_name", "casual").split(";")
@@ -249,7 +251,9 @@ def _compute_result(model, configs, save_file):
                     router_statistic_[idx] += val
                 if not config.svd_ana:
                     layer.mlp_.moes_[config.adapter_name].profiler_ = None
-                result["router_profile"] = list(val / 32 for val in router_statistic_)
+                result["router_profile"] = list(
+                    val / 32 for val in router_statistic_
+                )
             final_result = result
 
         results.append(final_result)
@@ -269,7 +273,7 @@ def evaluate(
     model: LLMModel,
     tokenizer: Tokenizer,
     configs: List[EvaluateConfig],  # possibly multiple config files 😋
-    max_concurrent_jobs: int = None, 
+    max_concurrent_jobs: int = None,
     retrying_steps: int = 20,
     max_seq_len: int = 512,
     save_file: str = None,
@@ -338,11 +342,13 @@ def evaluate(
         for config in configs:
             # call analyst process
             svd_result = process(model, config)
-            file = f"svd_result_{config.adapter_name}.json" if not save_file else save_file
+            file = (
+                f"svd_result_{config.adapter_name}.json" if not save_file else save_file
+            )
             with open(file, "w") as f:
                 json.dump(svd_result, f, indent=4)
             logging.info(f"saving svd_analysis result to {file}")
         return _compute_result(model, configs, save_file)
 
-    return _compute_result(model, configs, save_file)
\ No newline at end of file
+    return _compute_result(model, configs, save_file)
diff --git a/moe_peft/model.py b/moe_peft/model.py
index 1c1b37e..e7ad880 100644
--- a/moe_peft/model.py
+++ b/moe_peft/model.py
@@ -252,10 +252,14 @@ def init_lora_layer_weight(  # attach the LoRA weights to the different linear layers
         lora_a = None
         lora_b = None
     else:
-        lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)  # lora_weights is a parameter-container class; values are read with .get()
+        lora_a = lora_weights.get(
+            f"{module_name}.lora_A.weight", None
+        )  # lora_weights is a parameter-container class; values are read with .get()
         lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)
 
-    lora_linear.init_lora_weight(lora_config, (lora_a, lora_b))  # initialize lora_linear, passing the config and the (lora_a, lora_b) tuple
+    lora_linear.init_lora_weight(
+        lora_config, (lora_a, lora_b)
+    )  # initialize lora_linear, passing the config and the (lora_a, lora_b) tuple
 
 
 def get_lora_layer_weight(
@@ -471,13 +475,15 @@ def forward(
             hidden_states, cache_position.unsqueeze(0)
         )
 
-        hidden_states, all_router_logits = self._call_decoder_stack(  # now actually call the decoder stack
-            hidden_states,
-            input_args,
-            rotary_emb,
-            causal_mask,
-            cache_position,
-            past_key_values,
+        hidden_states, all_router_logits = (
+            self._call_decoder_stack(  # now actually call the decoder stack
+                hidden_states,
+                input_args,
+                rotary_emb,
+                causal_mask,
+                cache_position,
+                past_key_values,
+            )
         )
 
         # calculate loss
@@ -585,7 +591,11 @@ def from_pretrained(
         return LLMModel(model)
 
     def init_adapter(
-        self, config: AdapterConfig, weight: Optional[Dict[str, torch.Tensor]] = None  # for now, the weight passed in is the LoRA weight
+        self,
+        config: AdapterConfig,
+        weight: Optional[
+            Dict[str, torch.Tensor]
+        ] = None,  # for now, the weight passed in is the LoRA weight
     ):
         # Patch for MixLoRA
         if isinstance(config, MixLoraConfig) and config.act_fn_ is None:
@@ -596,7 +606,7 @@ def init_adapter(
         if config.task_name in task_dict and isinstance(
             task_dict[config.task_name], SequenceClassificationTask
         ):
-            output_layer = ClassificationOutputLayer( # output layer; contains the lora_weight
+            output_layer = ClassificationOutputLayer(  # output layer; contains the lora_weight
                 **task_dict[config.task_name].init_kwargs(),
                 hidden_size=self.config_.dim_,
                 pad_token_id=self.config_.pad_token_id_,
@@ -605,14 +615,17 @@ def init_adapter(
         else:
             output_layer = CasualOutputLayer(
-                vocab_size=self.config_.vocab_size_, weight=self.model_.lm_head_  # the pretrained weights are loaded here
+                vocab_size=self.config_.vocab_size_,
+                weight=self.model_.lm_head_,  # the pretrained weights are loaded here
             )
 
         self.output_.layers_[config.adapter_name] = output_layer
         if type(config) is not AdapterConfig:
             # init transformer layers: this loop walks every TransformerLayer and loads the finetuned parameters
             for transformer_layer in self.model_.layers_:
-                init_lora_layer_weight(transformer_layer, self.config_, config, weight)  # LoRA weight
+                init_lora_layer_weight(
+                    transformer_layer, self.config_, config, weight
+                )  # LoRA weight
         else:
             assert weight is None, "can not load basic adapter with weight"
@@ -672,7 +685,7 @@ def load_adapter(self, name_or_path: str, adapter_name: Optional[str] = None):
         ) as fp:
             lora_config = lora_config_factory(json.load(fp))
         lora_config.adapter_name = adapter_name
-        lora_weight = torch.load( # load the finetuned LoRA weights here
+        lora_weight = torch.load(  # load the finetuned LoRA weights here
            name_or_path + os.sep + "adapter_model.bin",
            map_location=self.device_,
            weights_only=False,
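
A minimal usage sketch of the entry points this patch wires up. The `model` and `config` construction is elided, and the shape of the result is an assumption read off the diff: `SVDProcessor` is built from a loaded `LLMModel` plus one adapter config, and `evaluate()` writes the same result to JSON when `config.svd_ana` is set.

```python
import json

import moe_peft

model = ...   # a loaded moe_peft.LLMModel with the LoRA/MixLoRA adapter attached (elided)
config = ...  # the EvaluateConfig for that adapter (elided)

# process() traverses the pretrained and finetuned weights (MoE-aware when the
# config says so) and returns, per layer and per projection, the cosine
# similarities between the top singular vectors of both weight matrices.
processor = moe_peft.SVDProcessor(model, config)
svd_result = processor.process()

# Same output path evaluate() takes when config.svd_ana is enabled.
with open(f"svd_result_{config.adapter_name}.json", "w") as f:
    json.dump(svd_result, f, indent=4)
```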