
Commit

fix ci errors
Pherenice1125 committed Dec 9, 2024
1 parent 0ac37de commit 3d07888
Showing 9 changed files with 216 additions and 149 deletions.
2 changes: 1 addition & 1 deletion evaluator.py
@@ -51,4 +51,4 @@ def main(


if __name__ == "__main__":
fire.Fire(main)
fire.Fire(main)
6 changes: 4 additions & 2 deletions moe_peft.py
@@ -176,7 +176,9 @@ def init_adapter_config(
config_class.prompt_template = lora_config.get("prompt", None)
config_list.append(config_class)
elif args.evaluate:
config_list.extend(moe_peft.EvaluateConfig.from_config(lora_config))  # the config["lora"] section
config_list.extend(
moe_peft.EvaluateConfig.from_config(lora_config)
)  # the config["lora"] section
# moe_flag?
else:
config_list.append(moe_peft.TrainConfig.from_config(lora_config))
@@ -206,7 +208,7 @@ def inference(
for config in configs:
config.prompts = [input_raw]
callback = None if args.disable_log else inference_callback
outputs = moe_peft.generate(  # GenConfig is already configured here; tokenization starts and the sliced output vectors are generated step by step
outputs = moe_peft.generate(  # GenConfig is already configured here; tokenization starts and the sliced output vectors are generated step by step
model,
tokenizer,
configs,
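Note that EvaluateConfig.from_config returns a list rather than a single config (the evaluator splits task_name on ";" into one entry per task), which is why the result is extended instead of appended above. A toy illustration of that expansion, using plain dicts and a made-up config in place of the real EvaluateConfig class:

# Hypothetical adapter config; only "name" and "task_name" matter for this illustration.
lora_config = {"name": "demo_adapter", "task_name": "arc-e;obqa;boolq"}

task_list = lora_config.get("task_name", "casual").split(";")
config_list = []
config_list.extend({"adapter_name": lora_config["name"], "task": task} for task in task_list)
print(len(config_list))  # 3: one evaluation config per task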
6 changes: 3 additions & 3 deletions moe_peft/__init__.py
@@ -1,4 +1,6 @@
from .adapters import adapter_factory
from .analyst import process
from .analysts import SVDProcessor
from .common import (
AdapterConfig,
LLMBatchConfig,
@@ -19,8 +21,6 @@
from .tokenizer import Tokenizer
from .trainer import TrainConfig, train
from .utils import is_package_available, setup_logging
from .analyst import process
from .analysts import SVDProcessor

assert is_package_available("torch", "2.3.0"), "MoE-PEFT requires torch>=2.3.0"
assert is_package_available(
@@ -54,5 +54,5 @@
"setup_logging",
"executor",
"process",
"SVDProcessor"
"SVDProcessor",
]
4 changes: 3 additions & 1 deletion moe_peft/adapters/mixlora/model.py
@@ -182,7 +182,9 @@ def forward(
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate_(hidden_states)  # compute each expert's score here

routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)  # normalization
routing_weights = F.softmax(
router_logits, dim=1, dtype=self.dtype_
)  # normalization
routing_weights, selected_experts = torch.topk(
routing_weights, self.topk_, dim=-1
)
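For reference, the routing step above follows the standard top-k mixture-of-experts pattern: softmax over the router logits, pick the top-k experts per token, then renormalize the selected weights. A minimal standalone sketch in plain PyTorch; the tensor sizes, and whether this repo renormalizes after topk, are assumptions:

import torch
import torch.nn.functional as F

tokens, n_experts, top_k = 6, 8, 2                      # toy dimensions
router_logits = torch.randn(tokens, n_experts)          # what gate_(hidden_states) would produce
routing_weights = F.softmax(router_logits, dim=1)       # normalize the expert scores per token
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)  # assumed renormalization
print(selected_experts.shape, routing_weights.shape)    # torch.Size([6, 2]) torch.Size([6, 2])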
213 changes: 113 additions & 100 deletions moe_peft/analyst.py

Large diffs are not rendered by default.

69 changes: 49 additions & 20 deletions moe_peft/analysts.py
@@ -1,6 +1,7 @@
import torch
from typing import List, Tuple

import torch


class SVDProcessor:
def __init__(self, model, config):
@@ -38,7 +39,7 @@ def mapping(keys_list: list) -> list:
"o_proj": "wo_",
"gate_proj": "w1_",
"down_proj": "w2_",
"up_proj": "w3_"
"up_proj": "w3_",
}

return [
@@ -57,15 +58,17 @@ def moe_weight_caculate(loading: list, lora_weights: list) -> torch.Tensor:
"""
return sum(weight * tensor for weight, tensor in zip(loading, lora_weights))

def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[List, List]:
def weight_traverse(
self, target_linears_list, is_moe: bool = False
) -> Tuple[List, List]:
"""
Traverse the weights.
:param target_linears_list: list of extracted linear layers
:param is_moe: whether MoE mode is used
:return: (pretrained weights, fine-tuned weights)
"""
attn_linears = ['wq_', 'wk_', 'wv_', 'wo_']
mlp_linears = ['w1_', 'w2_', 'w3_']
attn_linears = ["wq_", "wk_", "wv_", "wo_"]
mlp_linears = ["w1_", "w2_", "w3_"]

pretrained_layers_weights = []
tuned_layers_weights = []
@@ -88,21 +91,29 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
adapter = loras_dict.get(adapter_name, None)

if adapter:
p_weight = getattr(adapter, 'base_layer_').weight
lora_a_weight = getattr(adapter, 'lora_a_').weight
lora_b_weight = getattr(adapter, 'lora_b_').weight
p_weight = getattr(adapter, "base_layer_").weight
lora_a_weight = getattr(adapter, "lora_a_").weight
lora_b_weight = getattr(adapter, "lora_b_").weight
t_weight = lora_b_weight @ lora_a_weight + p_weight

linear_key = linear.rstrip('_')
linear_key = linear.rstrip("_")
pretrained_layer_weights.append({linear_key: p_weight})
tuned_layer_weights.append({linear_key: t_weight})

# MoE-specific logic
if is_moe and hasattr(layer.mlp_, 'moes_') and adapter_name in layer.mlp_.moes_:
profile_matrix = layer.mlp_.moes_[adapter_name].profiler_
if (
is_moe
and hasattr(layer.mlp_, "moes_")
and adapter_name in layer.mlp_.moes_
):
profile_matrix = layer.mlp_.moes_[
adapter_name
].profiler_
expert_value_lists = loras_dict.values()
tuned_expert_value_lists = []
total_base_layer = getattr(layer.mlp_.mlp_, linear).base_layer_.weight
total_base_layer = getattr(
layer.mlp_.mlp_, linear
).base_layer_.weight

for value in expert_value_lists:
p_weight = value.base_layer_.weight
@@ -111,20 +122,30 @@ def weight_traverse(self, target_linears_list, is_moe: bool = False) -> Tuple[Li
t_weight = lora_b_weight @ lora_a_weight + p_weight
tuned_expert_value_lists.append(t_weight)

final_tuned_weights = self.moe_weight_caculate(profile_matrix, tuned_expert_value_lists)
pretrained_layer_weights.append({linear_key: total_base_layer})
tuned_layer_weights.append({linear_key: final_tuned_weights})
final_tuned_weights = self.moe_weight_caculate(
profile_matrix, tuned_expert_value_lists
)
pretrained_layer_weights.append(
{linear_key: total_base_layer}
)
tuned_layer_weights.append(
{linear_key: final_tuned_weights}
)

except AttributeError as e:
raise AttributeError(f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}")
raise AttributeError(
f"Error accessing attributes for linear '{linear}' in adapter '{adapter_name}': {e}"
)

pretrained_layers_weights.append(pretrained_layer_weights)
tuned_layers_weights.append(tuned_layer_weights)

return pretrained_layers_weights, tuned_layers_weights

@staticmethod
def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cuda:0') -> List:
def svd_analysis(
p_weights: list, f_weights: list, n: int = 9, device: str = "cuda:0"
) -> List:
"""
Comparative analysis of the SVD-decomposed weights.
:param p_weights: pretrained weights
@@ -138,8 +159,16 @@ def svd_analysis(p_weights: list, f_weights: list, n: int = 9, device: str = 'cu
for layer_idx, (p_layer, f_layer) in enumerate(zip(p_weights, f_weights)):
layer_results = []
for key in p_layer.keys():
p_tensor = p_layer[key].to(device) if isinstance(p_layer[key], torch.Tensor) else torch.tensor(p_layer[key], device=device)
f_tensor = f_layer[key].to(device) if isinstance(f_layer[key], torch.Tensor) else torch.tensor(f_layer[key], device=device)
p_tensor = (
p_layer[key].to(device)
if isinstance(p_layer[key], torch.Tensor)
else torch.tensor(p_layer[key], device=device)
)
f_tensor = (
f_layer[key].to(device)
if isinstance(f_layer[key], torch.Tensor)
else torch.tensor(f_layer[key], device=device)
)

p_u, _, _ = torch.linalg.svd(p_tensor, full_matrices=False)
f_u, _, _ = torch.linalg.svd(f_tensor, full_matrices=False)
@@ -172,4 +201,4 @@ def process(self) -> List:
else:
weights = self.weight_traverse(mapped_keys)

return self.svd_analysis(weights[0], weights[1])
return self.svd_analysis(weights[0], weights[1])
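To make the analysis above concrete: weight_traverse reconstructs each fine-tuned matrix as lora_b @ lora_a + base_weight, and svd_analysis decomposes both the pretrained and the tuned matrix with torch.linalg.svd and compares their leading left singular vectors. The exact comparison metric is cut off in this diff, so the absolute cosine overlap below is only an assumed stand-in, and the shapes are illustrative:

import torch

def subspace_similarity(w_pre: torch.Tensor, w_tuned: torch.Tensor, n: int = 9) -> list:
    # Compare the top-n left singular vectors of two weight matrices.
    # The |cosine| metric here is an assumption; the repo's actual metric is not shown in this diff.
    p_u, _, _ = torch.linalg.svd(w_pre, full_matrices=False)
    f_u, _, _ = torch.linalg.svd(w_tuned, full_matrices=False)
    n = min(n, p_u.shape[1], f_u.shape[1])
    return [torch.abs(p_u[:, i] @ f_u[:, i]).item() for i in range(n)]

# Toy example: a base weight plus a rank-4 LoRA update, mirroring weight_traverse.
base = torch.randn(64, 64)
lora_a, lora_b = torch.randn(4, 64), torch.randn(64, 4)
tuned = lora_b @ lora_a + base
print(subspace_similarity(base, tuned))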
4 changes: 3 additions & 1 deletion moe_peft/common/lora_linear.py
@@ -367,7 +367,9 @@ def init_lora_weight(
self.device_,
)

self.loras_[adapter_name].reset_parameters(lora_tensor)  # lora_tensor is the (lora_a, lora_b) tuple
self.loras_[adapter_name].reset_parameters(
lora_tensor
)  # lora_tensor is the (lora_a, lora_b) tuple

def _appy_dora(
self,
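As context for the snippet above: once lora_a_ and lora_b_ are initialized, a LoRA-wrapped linear layer typically computes the frozen base projection plus a scaled low-rank update. The following is a generic sketch of that forward pass, not this repo's Linear implementation; the scaling factor alpha / r is an assumption:

import torch
import torch.nn as nn

class LoraLinearSketch(nn.Module):
    # Generic LoRA linear: y = W x + scaling * B(A(x)). Illustrative only.
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base_layer_ = base                      # frozen pretrained projection
        self.lora_a_ = nn.Linear(base.in_features, r, bias=False)
        self.lora_b_ = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b_.weight)          # so the adapter starts as a no-op
        self.scaling_ = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base_layer_(x) + self.scaling_ * self.lora_b_(self.lora_a_(x))

layer = LoraLinearSketch(nn.Linear(32, 32))
print(layer(torch.randn(2, 32)).shape)               # torch.Size([2, 32])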
20 changes: 13 additions & 7 deletions moe_peft/evaluator.py
@@ -2,16 +2,16 @@
import logging
import time
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Dict, List

import torch

from .adapters import MixLoraConfig
from .analyst import process
from .common import InputData, LLMBatchConfig, LLMModelInput, Prompt
from .model import LLMModel
from .tasks import BasicMetric, BasicTask, CommonSenseTask, task_dict
from .tokenizer import Tokenizer
from .analyst import process


@dataclass
@@ -44,7 +44,9 @@ def _dataload_fn(self, tokenizer: Tokenizer, **tokenizer_kwargs):
return data

@staticmethod
def from_config(config: Dict[str, any]) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
def from_config(
config: Dict[str, any]
) -> List["EvaluateConfig"]:  # all config-related settings can be adjusted here
adapter_name = config["name"]
data_path = config.get("data", None)
task_list = config.get("task_name", "casual").split(";")
@@ -249,7 +251,9 @@ def _compute_result(model, configs, save_file):
router_statistic_[idx] += val
if not config.svd_ana:
layer.mlp_.moes_[config.adapter_name].profiler_ = None
result["router_profile"] = list(val / 32 for val in router_statistic_)
result["router_profile"] = list(
val / 32 for val in router_statistic_
)

final_result = result
results.append(final_result)
@@ -269,7 +273,7 @@ def evaluate(
model: LLMModel,
tokenizer: Tokenizer,
configs: List[EvaluateConfig],  # there may be multiple config files 😋
max_concurrent_jobs: int = None,
max_concurrent_jobs: int = None,
retrying_steps: int = 20,
max_seq_len: int = 512,
save_file: str = None,
@@ -338,11 +342,13 @@ def evaluate(
for config in configs: # call analyst process
svd_result = process(model, config)

file = f"svd_result_{config.adapter_name}.json" if not save_file else save_file
file = (
f"svd_result_{config.adapter_name}.json" if not save_file else save_file
)
with open(file, "w") as f:
json.dump(svd_result, f, indent=4)
logging.info(f"saving svd_analysis result to {file}")

return _compute_result(model, configs, save_file)

return _compute_result(model, configs, save_file)
return _compute_result(model, configs, save_file)
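One detail worth noting in _compute_result above: the per-expert routing counts from each layer's profiler_ are summed across decoder layers and divided by a hard-coded 32, presumably the model's layer count. A small self-contained sketch of that averaging; the layer count, expert count, and profiler shape are assumptions:

import torch

num_layers, num_experts = 32, 8
# Assumed shape of each layer's profiler_: one routing count per expert.
layer_profilers = [torch.randint(0, 100, (num_experts,)).float() for _ in range(num_layers)]

router_statistic_ = [0.0] * num_experts
for profiler in layer_profilers:
    for idx, val in enumerate(profiler.tolist()):
        router_statistic_[idx] += val

router_profile = [val / num_layers for val in router_statistic_]  # mean routing load per expert
print(router_profile)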
41 changes: 27 additions & 14 deletions moe_peft/model.py
@@ -252,10 +252,14 @@ def init_lora_layer_weight(  # attach the LoRA weights to the different linear layers
lora_a = None
lora_b = None
else:
lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)  # lora_weights is a parameter container; fetch entries with .get()
lora_a = lora_weights.get(
f"{module_name}.lora_A.weight", None
)  # lora_weights is a parameter container; fetch entries with .get()
lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)

lora_linear.init_lora_weight(lora_config, (lora_a, lora_b))  # initialize lora_linear with the config and the (lora_a, lora_b) tuple
lora_linear.init_lora_weight(
lora_config, (lora_a, lora_b)
)  # initialize lora_linear with the config and the (lora_a, lora_b) tuple


def get_lora_layer_weight(
@@ -471,13 +475,15 @@ def forward(
hidden_states, cache_position.unsqueeze(0)
)

hidden_states, all_router_logits = self._call_decoder_stack(  # actually call the decoder stack
hidden_states,
input_args,
rotary_emb,
causal_mask,
cache_position,
past_key_values,
hidden_states, all_router_logits = (
self._call_decoder_stack(  # actually call the decoder stack
hidden_states,
input_args,
rotary_emb,
causal_mask,
cache_position,
past_key_values,
)
)

# calculate loss
@@ -585,7 +591,11 @@ def from_pretrained(
return LLMModel(model)

def init_adapter(
self, config: AdapterConfig, weight: Optional[Dict[str, torch.Tensor]] = None  # for now, the weight passed in is the LoRA weight
self,
config: AdapterConfig,
weight: Optional[
Dict[str, torch.Tensor]
] = None,  # for now, the weight passed in is the LoRA weight
):
# Patch for MixLoRA
if isinstance(config, MixLoraConfig) and config.act_fn_ is None:
@@ -596,7 +606,7 @@ def init_adapter(
if config.task_name in task_dict and isinstance(
task_dict[config.task_name], SequenceClassificationTask
):
output_layer = ClassificationOutputLayer(  # output layer, contains the LoRA weight
output_layer = ClassificationOutputLayer(  # output layer, contains the LoRA weight
**task_dict[config.task_name].init_kwargs(),
hidden_size=self.config_.dim_,
pad_token_id=self.config_.pad_token_id_,
@@ -605,14 +615,17 @@
)
else:
output_layer = CasualOutputLayer(
vocab_size=self.config_.vocab_size_, weight=self.model_.lm_head_  # the pretrained weights are loaded here
vocab_size=self.config_.vocab_size_,
weight=self.model_.lm_head_,  # the pretrained weights are loaded here
)

self.output_.layers_[config.adapter_name] = output_layer
if type(config) is not AdapterConfig:
# init transformer layers: this loop iterates over all TransformerLayers and loads the fine-tuned parameters
for transformer_layer in self.model_.layers_:
init_lora_layer_weight(transformer_layer, self.config_, config, weight) # LoRA weight
init_lora_layer_weight(
transformer_layer, self.config_, config, weight
) # LoRA weight
else:
assert weight is None, "can not load basic adapter with weight"

@@ -672,7 +685,7 @@ def load_adapter(self, name_or_path: str, adapter_name: Optional[str] = None):
) as fp:
lora_config = lora_config_factory(json.load(fp))
lora_config.adapter_name = adapter_name
lora_weight = torch.load(  # load the fine-tuned LoRA weights here
lora_weight = torch.load(  # load the fine-tuned LoRA weights here
name_or_path + os.sep + "adapter_model.bin",
map_location=self.device_,
weights_only=False,
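For orientation: load_adapter reads adapter_model.bin into a plain dict of tensors with torch.load, and init_lora_layer_weight then looks up each module's update via keys of the form "{module_name}.lora_A.weight" / "{module_name}.lora_B.weight". A minimal sketch of that lookup; the key layout and module name below are examples, not necessarily what this repo produces:

import torch

# Stand-in for torch.load(name_or_path + os.sep + "adapter_model.bin", map_location="cpu").
lora_weights = {
    "base_model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.randn(8, 64),
    "base_model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.zeros(64, 8),
}

module_name = "base_model.model.layers.0.self_attn.q_proj"   # example name only
lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)
lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)

delta = lora_b @ lora_a          # low-rank update applied on top of the frozen base weight
print(delta.shape)               # torch.Size([64, 64])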
