From 8bb9a27707d44e77cc8ebaebe41d173a9c0c25e2 Mon Sep 17 00:00:00 2001
From: Bo Yuan
Date: Thu, 23 Apr 2020 15:41:41 -0400
Subject: [PATCH] Add support for tf.data.Dataset

---
 configs/debugger.cfg.json     |  7 ++--
 configs/debugger.compile.json | 11 +++--
 pertbio/pertbio/dataset.py    | 76 ++++++++++++++++++++++++++---------
 pertbio/pertbio/model.py      | 22 +++++-----
 pertbio/pertbio/train.py      | 65 +++++++++-------------------
 pertbio/pertbio/version.py    |  4 ++
 scripts/main.py               | 17 ++++----
 7 files changed, 112 insertions(+), 90 deletions(-)

diff --git a/configs/debugger.cfg.json b/configs/debugger.cfg.json
index 9c35e9d..a43a66f 100644
--- a/configs/debugger.cfg.json
+++ b/configs/debugger.cfg.json
@@ -2,6 +2,7 @@
     "experiment_id": "Debugging",
     "experiment_type": "random partition",
     "model": "CellBox",
+    "sparse_data": false,
     "pert_file": "data/pert.csv",
     "expr_file": "data/expr.csv",
     "node_index_file": "data/node_Index.csv",
@@ -12,8 +13,7 @@
     "n_x" : 99,
     "trainset_ratio": 0.7,
     "validset_ratio": 0.8,
-    "dropout_percent": 0.8,
-    "batchsize": 40,
+    "batchsize": 4,

     "envelop_form": "hill",
     "dT": 0.1,
@@ -23,7 +23,8 @@
     "ode_solver": "heun",
     "ode_last_steps": 20,

-    "n_iter": 1000,
+    "n_epoch": 1000,
+    "n_iter": 50000,
     "n_iter_buffer":5,
     "n_iter_patience":50,
diff --git a/configs/debugger.compile.json b/configs/debugger.compile.json
index 1a34c66..d5c8801 100644
--- a/configs/debugger.compile.json
+++ b/configs/debugger.compile.json
@@ -2,6 +2,7 @@
     "experiment_id": "Debugging",
     "experiment_type": "random partition",
     "model": "CellBox",
+    "sparse_data": false,
     "pert_file": "data/pert.csv",
     "expr_file": "data/expr.csv",
     "node_index_file": "data/node_Index.csv",
@@ -22,15 +23,19 @@
     "ode_solver": "rk4",
     "ode_last_steps": 2,

-    "n_epoch": 10000,
-    "n_iter": 10000,
+    "n_epoch": 10,
+    "n_iter": 1000,
     "n_iter_buffer":5,
     "n_iter_patience":10,

     "stages":[{
         "nT": 10,
         "sub_stages":[
-            {"lr_val": 0.01,"l1lamda": 0.01}
+            {"lr_val": 0.1,"l1lamda": 0.01}
+        ]}, {
+        "nT": 20,
+        "sub_stages":[
+            {"lr_val": 0.1,"l1lamda": 0.01}
         ]}],

     "export_verbose": 3,
diff --git a/pertbio/pertbio/dataset.py b/pertbio/pertbio/dataset.py
index 431c2bb..406d97c 100644
--- a/pertbio/pertbio/dataset.py
+++ b/pertbio/pertbio/dataset.py
@@ -1,24 +1,58 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+import os


 def factory(cfg):
+    cfg.pert_in = tf.compat.v1.placeholder(tf.float32, [None, cfg.n_x], name='pert_in')
+    cfg.expr_out = tf.compat.v1.placeholder(tf.float32, [None, cfg.n_x], name='expr_out')
+    cfg.pert = pd.read_csv(os.path.join(cfg.root_dir, cfg.pert_file), header=None, dtype=np.float32)
+    cfg.expr = pd.read_csv(os.path.join(cfg.root_dir, cfg.expr_file), header=None, dtype=np.float32)
+    cfg.l1_lambda = tf.compat.v1.placeholder(tf.float32, name='lambda')
+    cfg.lr = tf.compat.v1.placeholder(tf.float32, name='lr')
+
+    # Prepare dataset iterators (shuffle examples before batching so batch composition varies)
+    dataset = tf.data.Dataset.from_tensor_slices((cfg.pert_in, cfg.expr_out))
+    cfg.iter_train = tf.compat.v1.data.make_initializable_iterator(
+        dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(cfg.batchsize))
+    cfg.iter_monitor = tf.compat.v1.data.make_initializable_iterator(
+        dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).repeat().batch(cfg.batchsize))
+    cfg.iter_eval = tf.compat.v1.data.make_initializable_iterator(dataset.batch(cfg.batchsize))
+
+    # Data partition
     if cfg.experiment_type == 'random partition' or cfg.experiment_type == 'full data':
-        return random_partition(cfg)
-
-    if cfg.experiment_type == 'leave one out (w/o single)':
-        return loo(cfg, singles=False)
-
-    if cfg.experiment_type == 'leave one out (w/ single)':
-        return loo(cfg, singles=True)
-
-    if cfg.experiment_type == 'single to combo':
-        return s2c(cfg)
-
-    raise Exception('Invalid experiment type. \nValid options: [random partition, leave one out (w/o single), '
-                    'leave one out (w/ single), full data, single to combo]')
+        cfg.dataset = random_partition(cfg)
+
+    elif cfg.experiment_type == 'leave one out (w/o single)':
+        cfg.dataset = loo(cfg, singles=False)
+
+    elif cfg.experiment_type == 'leave one out (w/ single)':
+        cfg.dataset = loo(cfg, singles=True)
+
+    elif cfg.experiment_type == 'single to combo':
+        cfg.dataset = s2c(cfg)
+    else:
+        raise Exception('Invalid experiment type. \nValid options: [random partition, leave one out (w/o single), '
+                        'leave one out (w/ single), full data, single to combo]')
+
+    # Prepare feed_dicts
+    cfg.feed_dicts = {
+        'train_set': {
+            cfg.pert_in: cfg.dataset['pert_train'],
+            cfg.expr_out: cfg.dataset['expr_train'],
+        },
+        'valid_set': {
+            cfg.pert_in: cfg.dataset['pert_valid'],
+            cfg.expr_out: cfg.dataset['expr_valid'],
+        },
+        'test_set': {
+            cfg.pert_in: cfg.dataset['pert_test'],
+            cfg.expr_out: cfg.dataset['expr_test']
+        }
+    }
+    return cfg


 def s2c(cfg):
@@ -84,6 +118,7 @@ def loo(cfg, singles):


 def random_partition(cfg):
+
     nexp, n_x = cfg.pert.shape
     nvalid = int(nexp * cfg.trainset_ratio)
     ntrain = int(nvalid * cfg.validset_ratio)
@@ -95,16 +130,19 @@
     dataset = {
         "node_index": cfg.node_index,
-        "pert_train": cfg.pert.iloc[random_pos[:ntrain], :],
-        "pert_valid": cfg.pert.iloc[random_pos[ntrain:nvalid], :],
-        "pert_test": cfg.pert.iloc[random_pos[nvalid:], :],
         "pert_full": cfg.pert,
-        "expr_train": cfg.expr.iloc[random_pos[:ntrain], :],
-        "expr_valid": cfg.expr.iloc[random_pos[ntrain:nvalid], :],
-        "expr_test": cfg.expr.iloc[random_pos[nvalid:], :],
         "train_pos": random_pos[:ntrain],
         "valid_pos": random_pos[ntrain:nvalid],
         "test_pos": random_pos[nvalid:]
     }
+    dataset.update({
+        "pert_train": cfg.pert.iloc[random_pos[:ntrain], :].values,
+        "pert_valid": cfg.pert.iloc[random_pos[ntrain:nvalid], :].values,
+        "pert_test": cfg.pert.iloc[random_pos[nvalid:], :].values,
+        "expr_train": cfg.expr.iloc[random_pos[:ntrain], :].values,
+        "expr_valid": cfg.expr.iloc[random_pos[ntrain:nvalid], :].values,
+        "expr_test": cfg.expr.iloc[random_pos[nvalid:], :].values
+    })
+
     return dataset
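Review note on pertbio/pertbio/dataset.py: the factory now builds the input
pipeline around initializable tf.data iterators. The placeholders are fed once,
when an iterator is initialized, and batches then come from get_next() until
the iterator raises OutOfRangeError. A minimal standalone sketch of that
pattern, with illustrative names, not part of this patch, assuming the
TF 1.15+/2.x compat APIs used throughout the diff:

    import numpy as np
    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()  # required on TF 2.x, no-op on TF 1.15

    x_in = tf.compat.v1.placeholder(tf.float32, [None, 4], name='x_in')
    dataset = tf.data.Dataset.from_tensor_slices(x_in)
    # Shuffle individual examples, then batch (the order used by iter_train above).
    iterator = tf.compat.v1.data.make_initializable_iterator(
        dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(2))
    next_batch = iterator.get_next()

    with tf.compat.v1.Session() as sess:
        # Data is bound once per initialization, not on every sess.run call.
        sess.run(iterator.initializer,
                 feed_dict={x_in: np.random.rand(10, 4).astype(np.float32)})
        while True:
            try:
                print(sess.run(next_batch).shape)  # (2, 4) batches, last may be smaller
            except tf.errors.OutOfRangeError:  # one full pass over the data
                break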
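Review note on random_partition: the split bounds are easy to misread because
nvalid is computed first and ntrain is a fraction of it. A worked example with
the ratios from configs/debugger.cfg.json (trainset_ratio 0.7, validset_ratio
0.8) and a hypothetical 100 perturbation conditions:

    nexp = 100               # total number of conditions
    nvalid = int(100 * 0.7)  # 70: end of the combined train+valid block
    ntrain = int(70 * 0.8)   # 56: end of the train block
    # train = random_pos[:56], valid = random_pos[56:70], test = random_pos[70:]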
diff --git a/pertbio/pertbio/model.py b/pertbio/pertbio/model.py
index 372c485..302c383 100644
--- a/pertbio/pertbio/model.py
+++ b/pertbio/pertbio/model.py
@@ -30,30 +30,25 @@ class PertBio:
     def __init__(self, args):
         self.args = args
         self.n_x = args.n_x
-        self.pert_in = tf.compat.v1.placeholder(tf.float32, [None, self.n_x], name='pert_in')
-        self.expr_out = tf.compat.v1.placeholder(tf.float32, [None, self.n_x], name='expr_out')
-
-        # Prepare datasets
-        dataset = tf.data.Dataset.from_tensor_slices((self.pert_in, self.expr_out))
-        self.iter = tf.compat.v1.data.make_initializable_iterator(dataset
-                                                                  .shuffle(buffer_size=1024).batch(args.batchsize))
-        self.train_x, self.train_y = self.iter.get_next()
-        self.iter_eval = tf.compat.v1.data.make_initializable_iterator(dataset
-                                                                       .shuffle(buffer_size=1024).batch(args.batchsize))
+        self.pert_in, self.expr_out = args.pert_in, args.expr_out
+        self.iter_train, self.iter_monitor, self.iter_eval = args.iter_train, args.iter_monitor, args.iter_eval
+        self.train_x, self.train_y = self.iter_train.get_next()
+        self.monitor_x, self.monitor_y = self.iter_monitor.get_next()
         self.eval_x, self.eval_y = self.iter_eval.get_next()
+        self.l1_lambda = self.args.l1_lambda
+        self.lr = self.args.lr

     def get_ops(self):
-        self.l1_lambda = tf.compat.v1.placeholder(tf.float32, name='lambda')
         self.train_loss, self.train_mse_loss = loss(self.train_y, self.train_yhat, self.l1_lambda, self.params['W'])
+        self.monitor_loss, self.monitor_mse_loss = loss(self.monitor_y, self.monitor_yhat, self.l1_lambda, self.params['W'])
         self.eval_loss, self.eval_mse_loss = loss(self.eval_y, self.eval_yhat, self.l1_lambda, self.params['W'])
-
-        self.lr = tf.compat.v1.placeholder(tf.float32, name='lr')
         self.op_optimize = optimize(self.train_loss, self.lr)

     def build(self):
         self.params = {}
         self.get_variables()
         self.train_yhat = self.forward(self.train_x)
+        self.monitor_yhat = self.forward(self.monitor_x)
         self.eval_yhat = self.forward(self.eval_x)
         self.get_ops()
         return self
@@ -195,6 +190,7 @@ def build(self):
         self.ode_solver = pertbio.kernel.get_ode_solver(self.args)
         self._dxdt = pertbio.kernel.get_dxdt(self.args, self.params)
         self.convergence_metric_train, self.train_yhat = self.forward(self.train_x)
+        self.convergence_metric_monitor, self.monitor_yhat = self.forward(self.monitor_x)
         self.convergence_metric_eval, self.eval_yhat = self.forward(self.eval_x)
         self.get_ops()
         return self
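Review note on pertbio/pertbio/model.py: get_ops() now reuses the l1_lambda and
lr placeholders created in dataset.factory() instead of defining its own, and
applies the shared loss() helper to the train, monitor, and eval streams. The
loss() helper itself is not part of this diff; the sketch below is only a
plausible reading of its call sites (MSE plus an L1 penalty on W), and the real
implementation may differ:

    def loss(y, yhat, l1_lambda, W):
        # Mean squared error on the predicted expression profile.
        mse = tf.reduce_mean(tf.square(y - yhat))
        # L1 penalty encourages a sparse interaction matrix W.
        total = mse + l1_lambda * tf.reduce_sum(tf.abs(W))
        return total, mse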
diff --git a/pertbio/pertbio/train.py b/pertbio/pertbio/train.py
index b120016..2dcd8a2 100644
--- a/pertbio/pertbio/train.py
+++ b/pertbio/pertbio/train.py
@@ -36,40 +36,25 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,
     n_unchanged = 0
     idx_iter = 0
-    train_set = {
-        model.pert_in: args.dataset['pert_train'].values,
-        model.expr_out: args.dataset['expr_train'].values,
-        model.lr: lr_val,
-        model.l1_lambda: l1lamda
-    }
-    valid_set = {
-        model.pert_in: args.dataset['pert_valid'].values,
-        model.expr_out: args.dataset['expr_valid'].values,
-        model.l1_lambda: l1lamda
-    }
-    test_set = {
-        model.pert_in: args.dataset['pert_test'].values,
-        model.expr_out: args.dataset['expr_test'].values
-    }
+    for key in args.feed_dicts:
+        args.feed_dicts[key].update({model.lr: lr_val, model.l1_lambda: l1lamda})
+    sess.run(model.iter_monitor.initializer, feed_dict=args.feed_dicts['valid_set'])

     for idx_epoch in range(n_epoch):
-
-        sess.run(model.iter.initializer, feed_dict=train_set)
+        sess.run(model.iter_train.initializer, feed_dict=args.feed_dicts['train_set'])
         while True:
             if idx_iter > n_iter or n_unchanged > n_iter_patience:
                 break
-
-            # training step
             t0 = time.clock()
             try:
-                _, loss_train_i, loss_train_mse_i = sess.run((model.op_optimize, model.train_loss,
-                                                              model.train_mse_loss), feed_dict=train_set)
-            except OutOfRangeError:
+                _, loss_train_i, loss_train_mse_i = sess.run(
+                    (model.op_optimize, model.train_loss, model.train_mse_loss), feed_dict=args.feed_dicts['train_set'])
+            except OutOfRangeError:  # iter_train is exhausted: the epoch is finished
                 break

             # record training
-            sess.run(model.iter_eval.initializer, feed_dict=valid_set)
-            loss_valid_i, loss_valid_mse_i = sess.run((model.eval_loss, model.eval_mse_loss), feed_dict=valid_set)
+            loss_valid_i, loss_valid_mse_i = sess.run(
+                (model.monitor_loss, model.monitor_mse_loss), feed_dict=args.feed_dicts['valid_set'])
             new_loss = best_params.avg_n_iters_loss(loss_valid_i)
             if args.export_verbose >= 3:
                 print("Epoch:{}/{}\tIteration: {}/{}\tnew_loss:{}\tbuffer_loss:{}\tbest:{}\tTolerance: {}/{}".format(
@@ -79,7 +64,6 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,
             append_record("record_eval.csv", [idx_epoch, idx_iter, loss_train_i, loss_valid_i,
                                               loss_train_mse_i, loss_valid_mse_i, None, time.clock() - t0])
-
             # early stopping
             idx_iter += 1
             if new_loss < best_params.loss_min:
@@ -89,18 +73,20 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,
         else:
             n_unchanged += 1

-        # Evaluation on valid set
-        t0 = time.clock()
-        loss_valid_i, loss_valid_mse_i = eval_model(sess, model.iter_eval, (model.eval_loss, model.eval_mse_loss),
-                                                    valid_set)
-        append_record("record_eval.csv", [idx_epoch, None, None, loss_valid_i, None, loss_valid_mse_i,
-                                          None, time.clock() - t0])
-        if idx_iter > n_iter or n_unchanged > n_iter_patience:
-            break
+        if idx_iter > n_iter or n_unchanged > n_iter_patience:
+            break  # also leave the epoch loop once early stopping has triggered
+
+    # Evaluation on valid set
+    t0 = time.clock()
+    sess.run(model.iter_eval.initializer, feed_dict=args.feed_dicts['valid_set'])
+    loss_valid_i, loss_valid_mse_i = eval_model(sess, model.iter_eval, (model.eval_loss, model.eval_mse_loss),
+                                                args.feed_dicts['valid_set'])
+    append_record("record_eval.csv", [-1, None, None, loss_valid_i, None, loss_valid_mse_i, None, time.clock() - t0])

     # Evaluation on test set
     t0 = time.clock()
-    loss_test_mse = eval_model(sess, model.iter_eval, model.eval_mse_loss, test_set)
+    sess.run(model.iter_eval.initializer, feed_dict=args.feed_dicts['test_set'])
+    loss_test_mse = eval_model(sess, model.iter_eval, model.eval_mse_loss, args.feed_dicts['test_set'])
     append_record("record_eval.csv", [-1, None, None, None, None, None, loss_test_mse, time.clock() - t0])

     best_params.save()
@@ -126,12 +112,9 @@ def eval_model(sess, eval_iter, obj_fn, eval_dict):
     return np.mean(np.array(eval_results), axis=0)


-def train_model(args):
+def train_model(model, args):
     args.logger = TimeLogger(time_logger_step=1, hierachy=2)

-    # Constructing model
-    model = pertbio.model.factory(args)
-
     # Check if all variables in scope
     # TODO: put variables under appropriate scopes
     for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='initialization'):
@@ -245,10 +228,8 @@ def screenshot(self, sess, model, substage_i, node_index, loss_min, args):
             self.update(params)

         if self.export_verbose > 1 or self.export_verbose == -1:  # no params but y_hat
-            test_set = {model.pert_in: args.dataset['pert_test'].values,
-                        model.expr_out: args.dataset['expr_test'].values}
-            sess.run(model.iter_eval.initializer, feed_dict=test_set)
-            y_hat = sess.run(model.eval_yhat, feed_dict=test_set)
+            sess.run(model.iter_eval.initializer, feed_dict=model.args.feed_dicts['test_set'])
+            y_hat = sess.run(model.eval_yhat, feed_dict=model.args.feed_dicts['test_set'])
             y_hat = pd.DataFrame(y_hat, columns=node_index[0])
             self.update({'y_hat': y_hat})
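Review note on pertbio/pertbio/train.py: the early-stopping bookkeeping is
unchanged in spirit. The validation loss is smoothed over a buffer of recent
evaluations (best_params.avg_n_iters_loss, not shown in this diff; its name
suggests an average over the last n_iter_buffer values) and training stops once
the smoothed loss has failed to improve for more than n_iter_patience
consecutive checks. The bookkeeping in isolation, as plain Python with stand-in
numbers:

    from collections import deque

    n_iter_buffer, n_iter_patience = 5, 3
    buffer = deque(maxlen=n_iter_buffer)  # keeps only the most recent losses
    best, n_unchanged = float('inf'), 0

    for loss_valid_i in [0.9, 0.7, 0.8, 0.75, 0.74, 0.76, 0.77, 0.78]:
        buffer.append(loss_valid_i)
        new_loss = sum(buffer) / len(buffer)  # smoothed validation loss
        if new_loss < best:
            best, n_unchanged = new_loss, 0   # improved: reset patience
        else:
            n_unchanged += 1                  # no improvement on smoothed loss
        if n_unchanged > n_iter_patience:
            break                             # patience exceeded: stop training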
diff --git a/pertbio/pertbio/version.py b/pertbio/pertbio/version.py
index 5c6c96c..69d9fc6 100644
--- a/pertbio/pertbio/version.py
+++ b/pertbio/pertbio/version.py
@@ -62,6 +62,10 @@ def get_msg():
         -- Apr 5, 2020 --
         * Reformat for better code style
         * Revise docs
+
+        version 0.2.2
+        -- Apr 23, 2020 --
+        * Add support for tf.data.Dataset
     """
     print(changelog)
diff --git a/scripts/main.py b/scripts/main.py
index 0ff2278..3bfc88f 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -1,4 +1,3 @@
-import sys
 import pertbio
 import os
 import numpy as np
@@ -22,8 +21,7 @@ def set_seed(in_seed):

 def prepare_workdir(in_cfg):
     # Read Data
-    in_cfg.pert = pd.read_csv(in_cfg.pert_file, header=None, dtype=np.float32)
-    in_cfg.expr = pd.read_csv(in_cfg.expr_file, header=None, dtype=np.float32)
+    in_cfg.root_dir = os.getcwd()
     in_cfg.node_index = pd.read_csv(in_cfg.node_index_file, header=None, names=None)
     in_cfg.loo = pd.read_csv("data/loo_label.csv", header=None)
@@ -50,14 +48,12 @@ def prepare_workdir(in_cfg):
         pass
     os.makedirs(in_cfg.working_index)
     os.chdir(in_cfg.working_index)
+
     with open("record_eval.csv", 'w') as f:
         f.write("epoch,iter,train_loss,valid_loss,train_mse,valid_mse,test_mse,time_elapsed\n")

-    # Load dataset
-    dataset = pertbio.dataset.factory(cfg)
-
     print('Working directory is ready at {}.'.format(experiment_path))
-    return dataset
+    return 0


 if __name__ == '__main__':
@@ -71,12 +67,13 @@ def prepare_workdir(in_cfg):
     set_seed(cfg.seed)
     print(vars(cfg))

-    cfg.dataset = prepare_workdir(cfg)
-
+    prepare_workdir(cfg)
     logger = pertbio.utils.TimeLogger(time_logger_step=1, hierachy=3)
     args = cfg
     for i, stage in enumerate(cfg.stages):
+        cfg = pertbio.dataset.factory(cfg)
         logger.log("Training on stage {}/{} ...".format(i + 1, len(cfg.stages)))
         args.sub_stages = stage['sub_stages']
         args.n_T = stage['nT']
-        pertbio.train.train_model(args)
+        model = pertbio.model.factory(args)
+        pertbio.train.train_model(model, args)
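Review note on scripts/main.py: the net effect on the entry point is that
prepare_workdir() no longer builds the dataset; each stage rebuilds the input
pipeline and the model before training, and train_model() now receives the
model instead of constructing it. In outline, condensed from the diff above
(not runnable on its own):

    for i, stage in enumerate(cfg.stages):
        cfg = pertbio.dataset.factory(cfg)   # placeholders, iterators, feed_dicts
        args.sub_stages = stage['sub_stages']
        args.n_T = stage['nT']
        model = pertbio.model.factory(args)  # model reads the iterators off args
        pertbio.train.train_model(model, args)

One thing worth double-checking: dataset.factory() creates fresh placeholders
and iterators on every stage, so configurations with many stages will keep
adding duplicate nodes to the default TF graph.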