diff --git a/doc/tutorial/pipeline/pipeline_hetero_secureboost.py b/doc/tutorial/pipeline/pipeline_hetero_secureboost.py
new file mode 100644
index 0000000000..10e55e09d1
--- /dev/null
+++ b/doc/tutorial/pipeline/pipeline_hetero_secureboost.py
@@ -0,0 +1,83 @@
+# connect to FATE Flow
+# !pipeline init --ip 127.0.0.1 --port 9380
+from pipeline.backend.pipeline import PipeLine
+from pipeline.component import Reader, DataTransform, Intersection, HeteroSecureBoost, Evaluation
+from pipeline.interface import Data
+
+
+def train_pipeline():
+    # create a pipeline instance
+    pipeline = PipeLine() \
+        .set_initiator(role='guest', party_id=9999) \
+        .set_roles(guest=9999, host=10000)
+    # load data
+    reader_0 = Reader(name="reader_0")
+    # set guest parameter
+    reader_0.get_party_instance(role='guest', party_id=9999).component_param(
+        table={"name": "breast_hetero_guest", "namespace": "experiment"})
+    # set host parameter
+    reader_0.get_party_instance(role='host', party_id=10000).component_param(
+        table={"name": "breast_hetero_host", "namespace": "experiment"})
+    # add a DataTransform component to parse raw data into data instances
+    data_transform_0 = DataTransform(name="data_transform_0")
+    # set guest parameter
+    data_transform_0.get_party_instance(role='guest', party_id=9999).component_param(
+        with_label=True)
+    data_transform_0.get_party_instance(role='host', party_id=[10000]).component_param(
+        with_label=False)
+    # add an Intersection component to run PSI for the hetero (vertical) scenario
+    intersect_0 = Intersection(name="intersect_0")
+    # define the HeteroSecureBoost component; the parameters below are set for all involved parties
+    hetero_secureboost_0 = HeteroSecureBoost(name="hetero_secureboost_0",
+                                             num_trees=5,
+                                             bin_num=16,
+                                             task_type="classification",
+                                             objective_param={"objective": "cross_entropy"},
+                                             encrypt_param={"method": "paillier"},
+                                             tree_param={"max_depth": 3})
+    # model evaluation component
+    evaluation_0 = Evaluation(name="evaluation_0", eval_type="binary")
+    # add components to the pipeline, in order of execution:
+    # - data_transform_0 consumes reader_0's output data
+    # - intersect_0 consumes data_transform_0's output data
+    # - hetero_secureboost_0 consumes intersect_0's output data
+    # - evaluation_0 consumes hetero_secureboost_0's prediction results on the training data
+    pipeline.add_component(reader_0)
+    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
+    pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data))
+    pipeline.add_component(hetero_secureboost_0, data=Data(train_data=intersect_0.output.data))
+    pipeline.add_component(evaluation_0, data=Data(data=hetero_secureboost_0.output.data))
+    pipeline.compile()
+    # submit pipeline
+    pipeline.fit()
+    # Once training finishes, the trained model can be used for prediction.
+    # Optionally, save the trained pipeline for future use.
+    pipeline.dump("hetero_secureboost_pipeline_saved.pkl")
+
+
+def predict_pipeline():
+    # model prediction
+    # first, deploy the needed components from the training pipeline
+    pipeline = PipeLine.load_model_from_file('hetero_secureboost_pipeline_saved.pkl')
+    pipeline.deploy_component([pipeline.data_transform_0, pipeline.intersect_0, pipeline.hetero_secureboost_0])
+    # define a new Reader component to read the data to predict on
+    reader_1 = Reader(name="reader_1")
+    reader_1.get_party_instance(role="guest", party_id=9999).component_param(
+        table={"name": "breast_hetero_guest", "namespace": "experiment"})
+    reader_1.get_party_instance(role="host", party_id=10000).component_param(
+        table={"name": "breast_hetero_host", "namespace": "experiment"})
+    # optionally, define a new Evaluation component
+    evaluation_0 = Evaluation(name="evaluation_0", eval_type="binary")
+    # add components to the predict pipeline, in order of execution
+    predict_pipeline = PipeLine()
+    predict_pipeline.add_component(reader_1)\
+        .add_component(pipeline,
+                       data=Data(predict_input={pipeline.data_transform_0.input.data: reader_1.output.data}))\
+        .add_component(evaluation_0, data=Data(data=pipeline.hetero_secureboost_0.output.data))
+    # run prediction
+    predict_pipeline.predict()
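+    # Not in the original tutorial: a minimal sketch of inspecting outputs after
+    # prediction; get_component()/get_summary() assume the FATE 1.x pipeline API.
+    # print(predict_pipeline.get_component("evaluation_0").get_summary())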
+
+
+if __name__ == '__main__':
+    # train_pipeline()
+    predict_pipeline()
diff --git a/doc/tutorial/pipeline_upload_csv.py b/doc/tutorial/pipeline_upload_csv.py
new file mode 100644
index 0000000000..fe5a52c775
--- /dev/null
+++ b/doc/tutorial/pipeline_upload_csv.py
@@ -0,0 +1,30 @@
+# connect to FATE Flow
+# !pipeline init --ip 127.0.0.1 --port 9380
+import os
+
+from pipeline.backend.pipeline import PipeLine
+
+# create a pipeline instance
+pipeline_upload = PipeLine().set_initiator(role='guest', party_id=9999).set_roles(guest=9999)
+partition = 4
+# define the table names and namespaces to be used in the FATE job configuration
+dense_data_guest = {"name": "breast_hetero_guest", "namespace": "experiment"}
+dense_data_host = {"name": "breast_hetero_host", "namespace": "experiment"}
+tag_data = {"name": "breast_hetero_host", "namespace": "experiment"}
+# add the data to upload
+data_base = "/root/PycharmProjects/FATE/"
+pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_guest.csv"),
+                                table_name=dense_data_guest["name"],      # table name
+                                namespace=dense_data_guest["namespace"],  # namespace
+                                head=1, partition=partition)              # data info
+
+pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_host.csv"),
+                                table_name=dense_data_host["name"],
+                                namespace=dense_data_host["namespace"],
+                                head=1, partition=partition)
+
+pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_host.csv"),
+                                table_name=tag_data["name"],
+                                namespace=tag_data["namespace"],
+                                head=1, partition=partition)
+# upload data
+pipeline_upload.upload(drop=1)
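+# To check that the upload succeeded, one can query the table through FATE Flow,
+# e.g. with the CLI (flag names may vary across FATE versions):
+# !flow table info -t breast_hetero_guest -n experiment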
diff --git a/python/fate_client/pipeline/param/consts.py b/python/fate_client/pipeline/param/consts.py
index 21fee8a3f0..3f2b4e039d 100644
--- a/python/fate_client/pipeline/param/consts.py
+++ b/python/fate_client/pipeline/param/consts.py
@@ -344,7 +344,7 @@
 
 # curve names
 CURVE25519 = 'curve25519'
-
+SM2 = 'sm2'
 # positive unlabeled
 PROBABILITY = "probability"
 QUANTITY = "quantity"
diff --git a/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/__init__.py b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/__init__.py
new file mode 100644
index 0000000000..fa29f4c09b
--- /dev/null
+++ b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+'''
+====================
+@File : __init__.py
+author: circle
+Time: 2023/12/19
+@Desc:
+=====================
+'''
diff --git a/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_guest.py b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_guest.py
new file mode 100644
index 0000000000..757782f9c8
--- /dev/null
+++ b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_guest.py
@@ -0,0 +1,509 @@
+import numpy as np
+from operator import itemgetter
+from federatedml.util import consts
+from federatedml.util import LOGGER
+from federatedml.ensemble.boosting import HeteroBoostingGuest
+from federatedml.param.boosting_param import HeteroSecureBoostParam, DecisionTreeParam
+from federatedml.util.io_check import assert_io_num_rows_equal
+from federatedml.util.anonymous_generator_util import Anonymous
+from federatedml.statistic.data_overview import with_weight, get_max_sample_weight
+from federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_importance import FeatureImportance
+from federatedml.transfer_variable.transfer_class.hetero_secure_boosting_predict_transfer_variable import \
+    HeteroSecureBoostTransferVariable
+from federatedml.ensemble.basic_algorithms.decision_tree.tree_core import tree_plan as plan
+from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta
+from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import ObjectiveMeta
+from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import QuantileMeta
+from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam
+from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import FeatureImportanceInfo
+from federatedml.ensemble.secureboost.secureboost_util.tree_model_io import load_hetero_tree_learner, \
+    produce_hetero_tree_learner
+from federatedml.ensemble.secureboost.secureboost_util.boosting_tree_predict import sbt_guest_predict, \
+    mix_sbt_guest_predict, EINI_guest_predict
+from federatedml.ensemble.secureboost.secureboost_util.subsample import goss_sampling
+
+
+class HeteroSecureBoostingTreeGuest(HeteroBoostingGuest):
+
+    def __init__(self):
+        super(HeteroSecureBoostingTreeGuest, self).__init__()
+
+        self.tree_param = DecisionTreeParam()  # decision tree param
+        self.use_missing = False
+        self.zero_as_missing = False
+        self.cur_epoch_idx = -1
+        self.grad_and_hess = None
+        self.feature_importances_ = {}
+        self.model_param = HeteroSecureBoostParam()
+        self.complete_secure = False
+        self.data_alignment_map = {}
+        self.hetero_sbt_transfer_variable = HeteroSecureBoostTransferVariable()
+        self.model_name = 'HeteroSecureBoost'
+        self.max_sample_weight = 1
+        self.max_sample_weight_computed = False
+        self.re_compute_goss_sample_weight = False
+        self.cipher_compressing = False
+
+        self.enable_goss = False  # GOSS
+        self.top_rate = None
+        self.other_rate = None
+        self.new_ver = True
+
+        self.boosting_strategy = consts.STD_TREE  # default work mode is std
+
+        # fast sbt param
+        self.tree_num_per_party = 1
+        self.guest_depth = 0
+        self.host_depth = 0
+        self.init_tree_plan = False
+        self.tree_plan = []
+
+        # multi-classification mode
+        self.multi_mode = consts.SINGLE_OUTPUT
+
+        # EINI predict param
+        self.EINI_inference = False
+        self.EINI_random_mask = False
+
+    def _init_model(self, param: HeteroSecureBoostParam):
+
+        super(HeteroSecureBoostingTreeGuest, self)._init_model(param)
+        self.tree_param = param.tree_param
+        self.use_missing = param.use_missing
+        self.zero_as_missing = param.zero_as_missing
+        self.complete_secure = param.complete_secure
+        self.enable_goss = param.run_goss  # gradient-based one-side sampling (GOSS)
+        self.top_rate = param.top_rate
+        self.other_rate = param.other_rate
+        self.cipher_compressing = param.cipher_compress
+        self.new_ver = param.new_ver
+        self.EINI_inference = param.EINI_inference
+        self.EINI_random_mask = param.EINI_random_mask
+
+        # fast sbt param
+        self.tree_num_per_party = param.tree_num_per_party
+        self.boosting_strategy = param.boosting_strategy
+        self.guest_depth = param.guest_depth
+        self.host_depth = param.host_depth
+
+        if self.use_missing:
+            self.tree_param.use_missing = self.use_missing
+            self.tree_param.zero_as_missing = self.zero_as_missing
+
+        self.multi_mode = param.multi_mode
+
+    def process_sample_weights(self, grad_and_hess, data_with_sample_weight=None):
+        """
+        Process sample weights: a sample weight expresses how strongly a sample
+        influences model updates, so gradients/hessians are multiplied by it.
+        @param grad_and_hess: table of per-sample (gradient, hessian)
+        @param data_with_sample_weight: data instances that may carry weights
+        @return: weighted grad_and_hess
+        """
+        # add sample weights to gradient and hessian
+        if data_with_sample_weight is not None:
+            if with_weight(data_with_sample_weight):
+                LOGGER.info('weighted sample detected, multiply g/h by weights')
+                grad_and_hess = grad_and_hess.join(data_with_sample_weight,
+                                                   lambda v1, v2: (v1[0] * v2.weight, v1[1] * v2.weight))
+                if not self.max_sample_weight_computed:
+                    self.max_sample_weight = get_max_sample_weight(data_with_sample_weight)
+                    LOGGER.info('max sample weight is {}'.format(self.max_sample_weight))
+                    self.max_sample_weight_computed = True
+
+        return grad_and_hess
+
+    def get_tree_plan(self, idx):
+
+        if not self.init_tree_plan:
+            tree_plan = plan.create_tree_plan(self.boosting_strategy, k=self.tree_num_per_party,
+                                              tree_num=self.boosting_round,
+                                              host_list=self.component_properties.host_party_idlist,
+                                              complete_secure=self.complete_secure)
+            self.tree_plan += tree_plan
+            self.init_tree_plan = True
+
+        LOGGER.info('tree plan is {}'.format(self.tree_plan))
+        return self.tree_plan[idx]
+
+    def check_host_number(self, tree_type):
+        host_num = len(self.component_properties.host_party_idlist)
+        LOGGER.info('host number is {}'.format(host_num))
+        if tree_type == plan.tree_type_dict['layered_tree']:
+            assert host_num == 1, 'only 1 host party is allowed in layered mode'
+
+    def compute_grad_and_hess(self, y_hat, y, data_with_sample_weight=None):
+
+        LOGGER.info("compute grad and hess")
+        loss_method = self.loss
+        if self.task_type == consts.CLASSIFICATION:
+            grad_and_hess = y.join(y_hat, lambda y, f_val:
+                                   (loss_method.compute_grad(y, loss_method.predict(f_val)),
+                                    loss_method.compute_hess(y, loss_method.predict(f_val))))
+        else:
+            grad_and_hess = y.join(y_hat, lambda y, f_val:
+                                   (loss_method.compute_grad(y, f_val),
+                                    loss_method.compute_hess(y, f_val)))
+
+        grad_and_hess = self.process_sample_weights(grad_and_hess, data_with_sample_weight)
+
+        return grad_and_hess
+
+    @staticmethod
+    def get_grad_and_hess(g_h, dim=0):
+        LOGGER.info("get grad and hess of tree {}".format(dim))
+        grad_and_hess_subtree = g_h.mapValues(
+            lambda grad_and_hess: (grad_and_hess[0][dim], grad_and_hess[1][dim]))
+        return grad_and_hess_subtree
+
+    def update_feature_importance(self, tree_feature_importance):
+        for fid in tree_feature_importance:
+            if fid not in self.feature_importances_:
+                self.feature_importances_[fid] = tree_feature_importance[fid]
+            else:
+                self.feature_importances_[fid] += tree_feature_importance[fid]
+        LOGGER.debug('cur feature importance {}'.format(self.feature_importances_))
+
+    def align_feature_importance_guest(self, suffix):
+        """
+        receive feature importance from host to update global feature importance
+        """
+        host_feature_importance_list = self.hetero_sbt_transfer_variable.host_feature_importance.get(
+            idx=-1, suffix=suffix)
+        # remove host importance, make sure host importance is latest when host anonymous features are updated
+        pop_key = []
+        for key in self.feature_importances_:
+            sitename, fid = key
+            if consts.GUEST not in sitename:
+                pop_key.append(key)
+
+        for k in pop_key:
+            self.feature_importances_.pop(k)
+
+        for i in host_feature_importance_list:
+            self.feature_importances_.update(i)
+
+    def goss_sample(self):
+
+        sampled_gh = goss_sampling(self.grad_and_hess, self.top_rate, self.other_rate)
+        return sampled_gh
+
+    def on_epoch_prepare(self, epoch_idx):
+        """
+        Prepare g, h, sample weights and sampling at the beginning of every epoch.
+
+        Parameters
+        ----------
+        epoch_idx: current epoch index
+
+        Returns None
+        -------
+        """
+        if self.cur_epoch_idx != epoch_idx:
+            self.grad_and_hess = self.compute_grad_and_hess(self.y_hat, self.y, self.data_inst)
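+            # g/h are recomputed once per epoch and cached for reuse across booster
+            # dims; with GOSS enabled below, all large-gradient samples (top_rate)
+            # plus a random other_rate share of the rest are kept, and
+            # max_sample_weight is scaled by (1 - top_rate) / other_rate, the
+            # standard GOSS compensation factor that keeps the loss estimate unbiased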
+            self.cur_epoch_idx = epoch_idx
+        # goss sampling
+        if self.enable_goss:
+            if not self.re_compute_goss_sample_weight:
+                self.max_sample_weight = self.max_sample_weight * ((1 - self.top_rate) / self.other_rate)
+            self.grad_and_hess = self.goss_sample()
+
+    def preprocess(self):
+        if self.multi_mode == consts.MULTI_OUTPUT:
+            # re-set dimension
+            self.booster_dim = 1
+
+    def postprocess(self):
+        pass
+
+    def fit_a_learner(self, epoch_idx: int, booster_dim: int):
+
+        self.on_epoch_prepare(epoch_idx)
+
+        if self.multi_mode == consts.MULTI_OUTPUT:
+            g_h = self.grad_and_hess
+        else:
+            g_h = self.get_grad_and_hess(self.grad_and_hess, booster_dim)
+
+        flow_id = self.generate_flowid(epoch_idx, booster_dim)
+        complete_secure = True if (epoch_idx == 0 and self.complete_secure) else False
+
+        tree_type, target_host_id = None, None
+        fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE)
+        if fast_sbt:
+            tree_type, target_host_id = self.get_tree_plan(epoch_idx)
+            self.check_host_number(tree_type)
+
+        tree = produce_hetero_tree_learner(role=self.role, tree_param=self.tree_param, flow_id=flow_id,
+                                           data_bin=self.data_bin, bin_split_points=self.bin_split_points,
+                                           bin_sparse_points=self.bin_sparse_points, task_type=self.task_type,
+                                           valid_features=self.sample_valid_features(),
+                                           host_party_list=self.component_properties.host_party_idlist,
+                                           runtime_idx=self.component_properties.local_partyid,
+                                           cipher_compress=self.cipher_compressing,
+                                           g_h=g_h, encrypter=self.encrypter,
+                                           goss_subsample=self.enable_goss,
+                                           objective=self.objective_param.objective,
+                                           complete_secure=complete_secure, max_sample_weights=self.max_sample_weight,
+                                           fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id,
+                                           guest_depth=self.guest_depth, host_depth=self.host_depth,
+                                           mo_tree=(self.multi_mode == consts.MULTI_OUTPUT),
+                                           class_num=len(self.classes_) if len(self.classes_) > 2 else 1  # mo parameter
+                                           )
+
+        tree.fit()
+        self.update_feature_importance(tree.get_feature_importance())
+        self.align_feature_importance_guest(suffix=(epoch_idx, booster_dim))
+        return tree
+
+    def load_learner(self, model_meta, model_param, epoch_idx, booster_idx):
+
+        flow_id = self.generate_flowid(epoch_idx, booster_idx)
+        runtime_idx = self.component_properties.local_partyid
+        host_list = self.component_properties.host_party_idlist
+        fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE)
+        tree_type, target_host_id = None, None
+
+        if fast_sbt:
+            tree_type, target_host_id = self.get_tree_plan(epoch_idx)
+
+        tree = load_hetero_tree_learner(role=self.role, tree_param=self.tree_param, model_meta=model_meta,
+                                        model_param=model_param,
+                                        flow_id=flow_id, runtime_idx=runtime_idx, host_party_list=host_list,
+                                        fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id)
+
+        return tree
+
+    def generate_summary(self) -> dict:
+
+        summary = {'loss_history': self.history_loss,
+                   'best_iteration': self.callback_variables.best_iteration,
+                   'feature_importance': self.make_readable_feature_importance(self.feature_name_fid_mapping,
+                                                                               self.feature_importances_),
+                   'validation_metrics': self.callback_variables.validation_summary,
+                   'is_converged': self.is_converged}
+
+        return summary
+
+    @staticmethod
+    def make_readable_feature_importance(fid_mapping, feature_importances):
+        """
+        replace feature id by real feature name
+        """
+        new_fi = {}
+        for id_ in feature_importances:
+            if isinstance(id_, tuple):
+                if consts.GUEST in id_[0]:
+                    new_fi[fid_mapping[id_[1]]] = feature_importances[id_].importance
+                else:
+                    new_fi[id_[0] + '_' + str(id_[1])] = feature_importances[id_].importance
+            else:
+                new_fi[fid_mapping[id_]] = feature_importances[id_].importance
+
+        return new_fi
+
+    @assert_io_num_rows_equal
+    def predict(self, data_inst, ret_format='std'):
+
+        # standard format, leaf indices, raw score
+        assert ret_format in ['std', 'leaf', 'raw'], 'illegal ret format'
+
+        LOGGER.info('running prediction')
+        cache_dataset_key = self.predict_data_cache.get_data_key(data_inst)
+
+        processed_data = self.data_and_header_alignment(data_inst)
+
+        # sync feature importance if host anonymous change in model migration
+        if not self.on_training:
+            self.align_feature_importance_guest('predict')
+
+        last_round = self.predict_data_cache.predict_data_last_round(cache_dataset_key)
+
+        self.sync_predict_round(last_round)
+
+        rounds = len(self.boosting_model_list) // self.booster_dim
+        trees = []
+        LOGGER.debug('round involved in prediction {}, last round is {}, data key {}'
+                     .format(list(range(last_round, rounds)), last_round, cache_dataset_key))
+
+        for idx in range(last_round, rounds):
+            for booster_idx in range(self.booster_dim):
+                tree = self.load_learner(self.booster_meta,
+                                         self.boosting_model_list[idx * self.booster_dim + booster_idx],
+                                         idx, booster_idx)
+                trees.append(tree)
+
+        predict_cache = None
+        tree_num = len(trees)
+
+        if last_round != 0:
+            predict_cache = self.predict_data_cache.predict_data_at(cache_dataset_key, min(rounds, last_round))
+            LOGGER.info('load predict cache of round {}'.format(min(rounds, last_round)))
+
+        if tree_num == 0 and predict_cache is not None and not (ret_format == 'leaf'):
+            return self.score_to_predict_result(data_inst, predict_cache)
+
+        if self.boosting_strategy == consts.MIX_TREE:
+            predict_rs = mix_sbt_guest_predict(
+                processed_data,
+                self.hetero_sbt_transfer_variable,
+                trees,
+                self.learning_rate,
+                self.init_score,
+                self.booster_dim,
+                predict_cache,
+                pred_leaf=(ret_format == 'leaf'))
+        else:
+            if self.EINI_inference and not self.on_training:  # EINI is for inference stage
+                sitename = self.role + ':' + str(self.component_properties.local_partyid)
+                predict_rs = EINI_guest_predict(
+                    processed_data,
+                    trees,
+                    self.learning_rate,
+                    self.init_score,
+                    self.booster_dim,
+                    self.encrypt_param.key_length,
+                    self.hetero_sbt_transfer_variable,
+                    sitename,
+                    self.component_properties.host_party_idlist,
+                    predict_cache,
+                    False)
+            else:
+                predict_rs = sbt_guest_predict(
+                    processed_data,
+                    self.hetero_sbt_transfer_variable,
+                    trees,
+                    self.learning_rate,
+                    self.init_score,
+                    self.booster_dim,
+                    predict_cache,
+                    pred_leaf=(ret_format == 'leaf'))
+
+        if ret_format == 'leaf':
+            return predict_rs  # predict result is leaf position
+
+        self.predict_data_cache.add_data(cache_dataset_key, predict_rs, cur_boosting_round=rounds)
+        LOGGER.debug('adding predict rs {}'.format(predict_rs))
+        LOGGER.debug('last round is {}'.format(self.predict_data_cache.predict_data_last_round(cache_dataset_key)))
+
+        if ret_format == 'raw':
+            return predict_rs
+        else:
+            return self.score_to_predict_result(data_inst, predict_rs)
+
+    def load_feature_importance(self, feat_importance_param):
+        param = list(feat_importance_param)
+        rs_dict = {}
+        for fp in param:
+            if consts.GUEST in fp.sitename:
+                key = (fp.sitename.replace(':', '_'), fp.fid)  # guest format
+            else:
+                sitename = fp.sitename.replace(':', '_')
+                anonymous_feat_name = Anonymous().get_suffix_from_anonymous_column(fp.fullname)
+                key = (sitename, anonymous_feat_name)
+            importance = FeatureImportance()
+            importance.from_protobuf(fp)
+            rs_dict[key] = importance
+
+        self.feature_importances_ = rs_dict
+
+    def get_model_meta(self):
+        model_meta = BoostingTreeModelMeta()
+        model_meta.tree_meta.CopyFrom(self.booster_meta)
+        model_meta.learning_rate = self.learning_rate
+        model_meta.num_trees = self.boosting_round
+        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
+        model_meta.objective_meta.CopyFrom(ObjectiveMeta(objective=self.objective_param.objective,
+                                                         param=self.objective_param.params))
+        model_meta.use_missing = self.use_missing
+        model_meta.zero_as_missing = self.zero_as_missing
+        model_meta.task_type = self.task_type
+        model_meta.n_iter_no_change = self.n_iter_no_change
+        model_meta.tol = self.tol
+        model_meta.boosting_strategy = self.boosting_strategy
+        model_meta.module = "HeteroSecureBoost"
+        meta_name = consts.HETERO_SBT_GUEST_MODEL + "Meta"
+
+        return meta_name, model_meta
+
+    def get_model_param(self):
+
+        model_param = BoostingTreeModelParam()
+        model_param.tree_num = len(self.boosting_model_list)
+        model_param.tree_dim = self.booster_dim
+        model_param.trees_.extend(self.boosting_model_list)
+        model_param.init_score.extend(self.init_score)
+        model_param.losses.extend(self.history_loss)
+        model_param.classes_.extend(map(str, self.classes_))
+        model_param.num_classes = self.num_classes
+        if self.boosting_strategy == consts.STD_TREE:
+            model_param.model_name = consts.HETERO_SBT
+        elif self.boosting_strategy == consts.LAYERED_TREE:
+            model_param.model_name = consts.HETERO_FAST_SBT_LAYERED
+        elif self.boosting_strategy == consts.MIX_TREE:
+            model_param.model_name = consts.HETERO_FAST_SBT_MIX
+        model_param.best_iteration = self.callback_variables.best_iteration
+
+        feature_importances = list(self.feature_importances_.items())
+        feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True)
+        feature_importance_param = []
+
+        for (sitename, fid), importance in feature_importances:
+            if consts.GUEST in sitename:
+                fullname = self.feature_name_fid_mapping[fid]
+            else:
+                fullname = sitename + '_' + str(fid)
+                sitename = sitename.replace('_', ':')
+                fid = None
+            feature_importance_param.append(FeatureImportanceInfo(sitename=sitename,  # sitename to distinguish sites
+                                                                  fid=fid,
+                                                                  importance=importance.importance,
+                                                                  fullname=fullname,
+                                                                  importance2=importance.importance_2,
+                                                                  main=importance.main_type
+                                                                  ))
+        model_param.feature_importances.extend(feature_importance_param)
+        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)
+        model_param.tree_plan.extend(plan.encode_plan(self.tree_plan))
+        param_name = consts.HETERO_SBT_GUEST_MODEL + "Param"
+
+        return param_name, model_param
+
+    def set_model_meta(self, model_meta):
+
+        if not self.is_warm_start:
+            # these hyper parameters are not needed in warm start setting
+            self.boosting_round = model_meta.num_trees
+            self.tol = model_meta.tol
+            self.n_iter_no_change = model_meta.n_iter_no_change
+            self.bin_num = model_meta.quantile_meta.bin_num
+
+        self.learning_rate = model_meta.learning_rate
+        self.booster_meta = model_meta.tree_meta
+        self.objective_param.objective = model_meta.objective_meta.objective
+        self.objective_param.params = list(model_meta.objective_meta.param)
+        self.task_type = model_meta.task_type
+        self.boosting_strategy = model_meta.boosting_strategy
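+
+    # set_model_param restores the trained boosting state (trees, init score, classes,
+    # loss history, feature importance and tree plan) when a saved model is loaded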
+    def set_model_param(self, model_param):
+
+        self.boosting_model_list = list(model_param.trees_)
+        self.init_score = np.array(list(model_param.init_score))
+        self.history_loss = list(model_param.losses)
+        self.classes_ = list(map(int, model_param.classes_))
+        self.booster_dim = model_param.tree_dim
+        self.num_classes = model_param.num_classes
+        self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping)
+        self.load_feature_importance(model_param.feature_importances)
+        # initialize loss function
+        self.loss = self.get_loss_function()
+        # init model tree plan if it exists
+        self.tree_plan = plan.decode_plan(model_param.tree_plan)
+
+
+if __name__ == '__main__':
+    HeteroSecureBoostingTreeGuest()
diff --git a/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_host.py b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_host.py
new file mode 100644
index 0000000000..f85117869f
--- /dev/null
+++ b/python/federatedml/ensemble/secureboost/stateless_hetero_secureboost/stateless_hetero_secureboost_host.py
@@ -0,0 +1,293 @@
+from operator import itemgetter
+from federatedml.util import LOGGER
+from federatedml.util import consts
+from federatedml.util.io_check import assert_io_num_rows_equal
+from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import FeatureImportanceInfo
+from federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_importance import FeatureImportance
+from federatedml.ensemble.boosting import HeteroBoostingHost
+from federatedml.param.boosting_param import HeteroSecureBoostParam, DecisionTreeParam
+from federatedml.transfer_variable.transfer_class.hetero_secure_boosting_predict_transfer_variable import \
+    HeteroSecureBoostTransferVariable
+from federatedml.ensemble.secureboost.secureboost_util.tree_model_io import produce_hetero_tree_learner, \
+    load_hetero_tree_learner
+from federatedml.ensemble.secureboost.secureboost_util.boosting_tree_predict import sbt_host_predict, \
+    mix_sbt_host_predict, EINI_host_predict
+from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta
+from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import QuantileMeta
+from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam
+from federatedml.ensemble.basic_algorithms.decision_tree.tree_core import tree_plan as plan
+from federatedml.util.anonymous_generator_util import Anonymous
+
+
+class HeteroSecureBoostingTreeHost(HeteroBoostingHost):
+
+    def __init__(self):
+        super(HeteroSecureBoostingTreeHost, self).__init__()
+        self.use_missing = False
+        self.zero_as_missing = False
+        self.grad_and_hess = None
+        self.tree_param = DecisionTreeParam()  # decision tree param
+        self.model_param = HeteroSecureBoostParam()
+        self.complete_secure = False
+        self.model_name = 'HeteroSecureBoost'
+        self.enable_goss = False
+        self.cipher_compressing = False
+        self.max_sample_weight = None
+        self.round_decimal = None
+        self.new_ver = True
+        self.feature_importance_aligned = False
+
+        self.boosting_strategy = consts.STD_TREE
+
+        # fast sbt param
+        self.tree_num_per_party = 1
+        self.guest_depth = 0
+        self.host_depth = 0
+        self.init_tree_plan = False
+        self.tree_plan = []
+        self.feature_importances_ = {}
+
+        # EINI predict param
+        self.EINI_inference = False
+        self.EINI_random_mask = False
+        self.EINI_complexity_check = False
+
+        self.multi_mode = consts.SINGLE_OUTPUT
+
+        self.hetero_sbt_transfer_variable = HeteroSecureBoostTransferVariable()
+
+    def _init_model(self, param: HeteroSecureBoostParam):
+
+        super(HeteroSecureBoostingTreeHost, self)._init_model(param)
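+        # host-side mirror of the guest settings; note top_rate/other_rate are absent
+        # here because gradients/hessians, and hence GOSS sampling, live on the guest side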
+        self.tree_param = param.tree_param
+        self.use_missing = param.use_missing
+        self.enable_goss = param.run_goss
+        self.zero_as_missing = param.zero_as_missing
+        self.complete_secure = param.complete_secure
+        self.sparse_opt_para = param.sparse_optimization
+        self.cipher_compressing = param.cipher_compress
+        self.new_ver = param.new_ver
+
+        self.tree_num_per_party = param.tree_num_per_party
+        self.boosting_strategy = param.boosting_strategy
+        self.guest_depth = param.guest_depth
+        self.host_depth = param.host_depth
+        self.multi_mode = param.multi_mode
+        self.EINI_inference = param.EINI_inference
+        self.EINI_random_mask = param.EINI_random_mask
+        self.EINI_complexity_check = param.EINI_complexity_check
+
+        if self.use_missing:
+            self.tree_param.use_missing = self.use_missing
+            self.tree_param.zero_as_missing = self.zero_as_missing
+
+    def get_tree_plan(self, idx):
+
+        if not self.init_tree_plan:
+            tree_plan = plan.create_tree_plan(self.boosting_strategy, k=self.tree_num_per_party,
+                                              tree_num=self.boosting_round,
+                                              host_list=self.component_properties.host_party_idlist,
+                                              complete_secure=self.complete_secure)
+            self.tree_plan += tree_plan
+            self.init_tree_plan = True
+
+        LOGGER.info('tree plan is {}'.format(self.tree_plan))
+        return self.tree_plan[idx]
+
+    def update_feature_importance(self, tree_feature_importance):
+        for fid in tree_feature_importance:
+            if fid not in self.feature_importances_:
+                self.feature_importances_[fid] = tree_feature_importance[fid]
+            else:
+                self.feature_importances_[fid] += tree_feature_importance[fid]
+
+    def load_feature_importance(self, feat_importance_param):
+        param = list(feat_importance_param)
+        rs_dict = {}
+        for fp in param:
+            key = fp.fid
+            importance = FeatureImportance()
+            importance.from_protobuf(fp)
+            rs_dict[key] = importance
+
+        self.feature_importances_ = rs_dict
+
+    def get_anonymous_importance(self):
+        new_feat_importance = {}
+        for key in self.feature_importances_:
+            anonymous_name = self.anonymous_header[self.feature_name_fid_mapping[key]]
+            party = Anonymous.get_role_from_anonymous_column(anonymous_name)
+            party_id = Anonymous.get_party_id_from_anonymous_column(anonymous_name)
+            anonymous_feat = Anonymous.get_suffix_from_anonymous_column(anonymous_name)
+            new_feat_importance[(party + '_' + party_id, anonymous_feat)] = self.feature_importances_[key]
+        return new_feat_importance
+
+    def align_feature_importance_host(self, suffix):
+        """
+        send feature importance to guest to update global feature importance
+        """
+        new_feat_importance = self.get_anonymous_importance()
+        self.hetero_sbt_transfer_variable.host_feature_importance.remote(new_feat_importance, suffix=suffix)
+
+    def preprocess(self):
+        if self.multi_mode == consts.MULTI_OUTPUT:
+            self.booster_dim = 1
+
+    def postprocess(self):
+        pass
+
+    def fit_a_learner(self, epoch_idx: int, booster_dim: int):
+
+        flow_id = self.generate_flowid(epoch_idx, booster_dim)
+        complete_secure = True if (epoch_idx == 0 and self.complete_secure) else False
+        fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE)
+
+        tree_type, target_host_id = None, None
+        if fast_sbt:
+            tree_type, target_host_id = self.get_tree_plan(epoch_idx)
+
+        tree = produce_hetero_tree_learner(role=self.role, tree_param=self.tree_param, flow_id=flow_id,
+                                           data_bin=self.data_bin, bin_split_points=self.bin_split_points,
+                                           bin_sparse_points=self.bin_sparse_points, task_type=self.task_type,
+                                           valid_features=self.sample_valid_features(),
+                                           host_party_list=self.component_properties.host_party_idlist,
+                                           runtime_idx=self.component_properties.local_partyid,
+                                           cipher_compress=self.cipher_compressing,
+                                           complete_secure=complete_secure,
+                                           fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id,
+                                           guest_depth=self.guest_depth, host_depth=self.host_depth,
+                                           mo_tree=(self.multi_mode == consts.MULTI_OUTPUT), bin_num=self.bin_num
+                                           )
+        tree.fit()
+        self.update_feature_importance(tree.get_feature_importance())
+        self.align_feature_importance_host(suffix=(epoch_idx, booster_dim))
+        return tree
+
+    def load_learner(self, model_meta, model_param, epoch_idx, booster_idx):
+
+        flow_id = self.generate_flowid(epoch_idx, booster_idx)
+        runtime_idx = self.component_properties.local_partyid
+        fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE)
+        tree_type, target_host_id = None, None
+
+        if fast_sbt:
+            tree_type, target_host_id = self.get_tree_plan(epoch_idx)
+
+        tree = load_hetero_tree_learner(self.role, self.tree_param, model_meta, model_param, flow_id,
+                                        runtime_idx,
+                                        fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id)
+
+        return tree
+
+    def generate_summary(self) -> dict:
+
+        summary = {'best_iteration': self.callback_variables.best_iteration, 'is_converged': self.is_converged}
+        LOGGER.debug('summary is {}'.format(summary))
+
+        return summary
+
+    @assert_io_num_rows_equal
+    def predict(self, data_inst):
+
+        LOGGER.info('running prediction')
+
+        processed_data = self.data_and_header_alignment(data_inst)
+
+        self.set_anonymous_header(processed_data)
+
+        # sync feature importance if host anonymous change in model migration
+        if not self.on_training:
+            self.align_feature_importance_host('predict')
+
+        predict_start_round = self.sync_predict_start_round()
+
+        rounds = len(self.boosting_model_list) // self.booster_dim
+        trees = []
+        for idx in range(predict_start_round, rounds):
+            for booster_idx in range(self.booster_dim):
+                tree = self.load_learner(self.booster_meta,
+                                         self.boosting_model_list[idx * self.booster_dim + booster_idx],
+                                         idx, booster_idx)
+                trees.append(tree)
+
+        if len(trees) == 0:
+            LOGGER.info('no tree for predicting, prediction done')
+            return
+
+        if self.boosting_strategy == consts.MIX_TREE:
+            mix_sbt_host_predict(processed_data, self.hetero_sbt_transfer_variable, trees)
+        else:
+            if self.EINI_inference and not self.on_training:
+                sitename = self.role + ':' + str(self.component_properties.local_partyid)
+                EINI_host_predict(processed_data, trees, sitename, self.component_properties.local_partyid,
+                                  self.component_properties.host_party_idlist, self.booster_dim,
+                                  self.hetero_sbt_transfer_variable, self.EINI_complexity_check,
+                                  self.EINI_random_mask)
+            else:
+                sbt_host_predict(processed_data, self.hetero_sbt_transfer_variable, trees)
+
+    def get_model_meta(self):
+        model_meta = BoostingTreeModelMeta()
+        model_meta.tree_meta.CopyFrom(self.booster_meta)
+        model_meta.num_trees = self.boosting_round
+        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
+        model_meta.boosting_strategy = self.boosting_strategy
+        model_meta.module = "HeteroSecureBoost"
+        meta_name = "HeteroSecureBoostingTreeHostMeta"
+        return meta_name, model_meta
+
+    def get_model_param(self):
+
+        model_param = BoostingTreeModelParam()
+        model_param.tree_num = len(self.boosting_model_list)
+        model_param.tree_dim = self.booster_dim
+        model_param.trees_.extend(self.boosting_model_list)
+
+        anonymous_name_mapping = {}
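+        # keyed by anonymous feature name, valued by the real local feature name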
+        for fid, name in self.feature_name_fid_mapping.items():
+            anonymous_name_mapping[self.anonymous_header[name]] = name
+
+        model_param.anonymous_name_mapping.update(anonymous_name_mapping)
+        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)
+        if self.boosting_strategy == consts.STD_TREE:
+            model_param.model_name = consts.HETERO_SBT
+        elif self.boosting_strategy == consts.LAYERED_TREE:
+            model_param.model_name = consts.HETERO_FAST_SBT_LAYERED
+        elif self.boosting_strategy == consts.MIX_TREE:
+            model_param.model_name = consts.HETERO_FAST_SBT_MIX
+        model_param.best_iteration = self.callback_variables.best_iteration
+        model_param.tree_plan.extend(plan.encode_plan(self.tree_plan))
+
+        feature_importances = list(self.feature_importances_.items())
+        feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True)
+        feature_importance_param = []
+        for fid, importance in feature_importances:
+            feature_importance_param.append(FeatureImportanceInfo(sitename=consts.HOST_LOCAL,
+                                                                  fid=fid,
+                                                                  importance=importance.importance,
+                                                                  fullname=self.feature_name_fid_mapping[fid],
+                                                                  main=importance.main_type
+                                                                  ))
+        model_param.feature_importances.extend(feature_importance_param)
+
+        param_name = "HeteroSecureBoostingTreeHostParam"
+
+        return param_name, model_param
+
+    def set_model_meta(self, model_meta):
+        if not self.is_warm_start:
+            self.boosting_round = model_meta.num_trees
+        self.booster_meta = model_meta.tree_meta
+        self.bin_num = model_meta.quantile_meta.bin_num
+        self.boosting_strategy = model_meta.boosting_strategy
+
+    def set_model_param(self, model_param):
+        self.boosting_model_list = list(model_param.trees_)
+        self.booster_dim = model_param.tree_dim
+        self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping)
+        self.tree_plan = plan.decode_plan(model_param.tree_plan)
+        self.load_feature_importance(model_param.feature_importances)
+
+    # implement abstract function
+    def check_label(self, *args):
+        pass
diff --git a/python/federatedml/param/intersect_param.py b/python/federatedml/param/intersect_param.py
index 7b59ccc96b..dd3cf8c887 100644
--- a/python/federatedml/param/intersect_param.py
+++ b/python/federatedml/param/intersect_param.py
@@ -251,7 +251,7 @@ def check(self):
                                                          consts.SM3], f"{descr}hash_method")
 
-        self.curve = self.check_and_change_lower(self.curve, [consts.CURVE25519], f"{descr}curve")
+        self.curve = self.check_and_change_lower(self.curve, [consts.CURVE25519, consts.SM2], f"{descr}curve")
 
         LOGGER.debug("Finish ECDHParam parameter check!")
 
         return True
diff --git a/python/federatedml/secureprotol/ecc/__init__.py b/python/federatedml/secureprotol/ecc/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/python/federatedml/secureprotol/ecc/__init__.py
@@ -0,0 +1 @@
+
diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py
new file mode 100644
index 0000000000..e55cde6653
--- /dev/null
+++ b/python/federatedml/secureprotol/ecc/sm2.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+'''
+from federatedml.util import LOGGER
+import tinyec.ec as ec
+
+LOGGER.setLevel('DEBUG')
+
+# recommended curve parameters for the SM2 elliptic-curve public-key algorithm,
+# published by the State Cryptography Administration of China
+SM2_p = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF
+SM2_a = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFC
+SM2_b = 0x28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93
+SM2_n = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123
+SM2_Gx = 0x32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7
+SM2_Gy = 0xBC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0
+PARA_SIZE = 32  # parameter length in bytes
+
+
+# convert an int to bytes; size is the byte length and may be omitted
+def to_byte(x, size=None):
+    if isinstance(x, int):
+        if size is None:  # compute a suitable byte length
+            size = 0
+            tmp = x >> 64
+            while tmp:
+                size += 8
+                tmp >>= 64
+            tmp = x >> (size << 3)
+            while tmp:
+                size += 1
+                tmp >>= 8
+        elif x >> (size << 3):  # if the given length is too small, keep only the low-order bits
+            x &= (1 << (size << 3)) - 1
+    return x.to_bytes(size, byteorder='big')
+
+
+class SM2:
+    def __init__(self, p=SM2_p, a=SM2_a, b=SM2_b, n=SM2_n, G=(SM2_Gx, SM2_Gy), h=1, curve_key=None):
+        field = ec.SubGroup(p, G, n, h)
+        self.curve = ec.Curve(a, b, field)
+        self.G = ec.Point(self.curve, G[0], G[1])
+        keypair = ec.make_keypair(self.curve)
+        self.private_key = curve_key or keypair.priv
+        self.p, self.a, self.b, self.n, self.h = p, a, b, n, h
+
+    def mult_point(self, k):
+        return k * self.G
+
+    def encrypt(self, hashvalue):
+        LOGGER.info('sm2 encrypt start')
+        int_hashvalue = int.from_bytes(hashvalue, byteorder='big')
+        hash_to_curve = self.mult_point(int_hashvalue)
+        c = (hash_to_curve.x + hash_to_curve.y) * self.private_key
+        LOGGER.info('sm2 encrypt finish')
+        return to_byte(c)
+
+    def diffie_hellman(self, ciphertext):
+        ciphertext = int.from_bytes(ciphertext, byteorder='big')
+        c2 = ciphertext * self.private_key
+        return to_byte(c2)
+
+    def get_private_key(self):
+        return self.private_key
+'''
diff --git a/python/federatedml/secureprotol/elliptic_curve_encryption.py b/python/federatedml/secureprotol/elliptic_curve_encryption.py
index 4eaea5f581..1bb097f537 100644
--- a/python/federatedml/secureprotol/elliptic_curve_encryption.py
+++ b/python/federatedml/secureprotol/elliptic_curve_encryption.py
@@ -19,7 +19,7 @@
 
 from fate_crypto.psi import Curve25519
 
-
+# from federatedml.secureprotol.ecc.sm2 import SM2
 
 class EllipticCurve(object):
     """
@@ -31,9 +31,15 @@ def __init__(self, curve_name, curve_key=None):
 
     @staticmethod
     def __get_curve_instance(curve_name, curve_key):
-        if curve_key is None:
-            return Curve25519()
-        return Curve25519(curve_key)
+        if curve_name == 'curve25519':
+            if curve_key is None:
+                return Curve25519()
+            return Curve25519(curve_key)
+        # if curve_name == 'sm2':
+        #     if curve_key is None:
+        #         return SM2()
+        #     return SM2(curve_key)
+        raise ValueError('unsupported curve: {}'.format(curve_name))
 
     def get_curve_key(self):
         return self.curve.get_private_key()
diff --git a/python/federatedml/util/consts.py b/python/federatedml/util/consts.py
index d3685a8e52..357fa8905c 100644
--- a/python/federatedml/util/consts.py
+++ b/python/federatedml/util/consts.py
@@ -349,6 +349,7 @@
 pytorch_backend = 'pytorch'
 keras_backend = 'keras'
 CURVE25519 = 'curve25519'
+SM2 = 'sm2'
 
 # HOMO NN Framework
 FEDAVG_TRAINER = 'fedavg_trainer'
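
Note: with consts.SM2 and the widened ECDHParam curve check above, an ECDH intersection
job could select the new curve from its runtime conf, along the lines of the sketch
below (the exact key layout follows ECDHParam and may differ across FATE versions).
The SM2 path itself remains disabled until sm2.py and the SM2 branch in
elliptic_curve_encryption.py are uncommented.

    "intersect_0": {
        "intersect_method": "ecdh",
        "ecdh_param": {"hash_method": "sm3", "curve": "sm2"}
    }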