Add support to tf.Datasets
DesmondYuan committed Apr 23, 2020
1 parent 7aae05a commit 8bb9a27
Showing 7 changed files with 109 additions and 90 deletions.
7 changes: 4 additions & 3 deletions configs/debugger.cfg.json
@@ -2,6 +2,7 @@
"experiment_id": "Debugging",
"experiment_type": "random partition",
"model": "CellBox",
"sparse_data": false,
"pert_file": "data/pert.csv",
"expr_file": "data/expr.csv",
"node_index_file": "data/node_Index.csv",
@@ -12,8 +13,7 @@
"n_x" : 99,
"trainset_ratio": 0.7,
"validset_ratio": 0.8,
"dropout_percent": 0.8,
"batchsize": 40,
"batchsize": 4,

"envelop_form": "hill",
"dT": 0.1,
@@ -23,7 +23,8 @@
"ode_solver": "heun",
"ode_last_steps": 20,

"n_iter": 1000,
"n_epoch": 1000,
"n_iter": 50000,
"n_iter_buffer":5,
"n_iter_patience":50,

11 changes: 8 additions & 3 deletions configs/debugger.compile.json
@@ -2,6 +2,7 @@
"experiment_id": "Debugging",
"experiment_type": "random partition",
"model": "CellBox",
"sparse_data": false,
"pert_file": "data/pert.csv",
"expr_file": "data/expr.csv",
"node_index_file": "data/node_Index.csv",
@@ -22,15 +23,19 @@
"ode_solver": "rk4",
"ode_last_steps": 2,

"n_epoch": 10000,
"n_iter": 10000,
"n_epoch": 10,
"n_iter": 1000,
"n_iter_buffer":5,
"n_iter_patience":10,

"stages":[{
"nT": 10,
"sub_stages":[
{"lr_val": 0.01,"l1lamda": 0.01}
{"lr_val": 0.1,"l1lamda": 0.01}
]}, {
"nT": 20,
"sub_stages":[
{"lr_val": 0.1,"l1lamda": 0.01}
]}],

"export_verbose": 3,
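Both debug configs pick up a "sparse_data" flag and split the training budget into "n_epoch" (outer passes over the data) and "n_iter" (an overall iteration cap used together with "n_iter_buffer"/"n_iter_patience" for early stopping), and debugger.compile.json now exercises two stages. A small illustrative reader for the new keys follows; the file path and printout are assumptions for illustration, not project code.

    # Illustrative only: inspect the new config keys added in this commit.
    import json

    with open("configs/debugger.compile.json") as f:
        cfg = json.load(f)

    print(cfg["sparse_data"], cfg["n_epoch"], cfg["n_iter"])   # new flag + split training budget
    for stage in cfg["stages"]:                                # two stages in the debug compile config
        for sub in stage["sub_stages"]:
            print(stage["nT"], sub["lr_val"], sub["l1lamda"])  # per-stage ODE steps, lr, L1 weight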
76 changes: 57 additions & 19 deletions pertbio/pertbio/dataset.py
@@ -1,24 +1,58 @@
import numpy as np
import pandas as pd
import tensorflow as tf
import os


def factory(cfg):

cfg.pert_in = tf.compat.v1.placeholder(tf.float32, [None, cfg.n_x], name='pert_in')
cfg.expr_out = tf.compat.v1.placeholder(tf.float32, [None, cfg.n_x], name='expr_out')
cfg.pert = pd.read_csv(os.path.join(cfg.root_dir, cfg.pert_file), header=None, dtype=np.float32)
cfg.expr = pd.read_csv(os.path.join(cfg.root_dir, cfg.expr_file), header=None, dtype=np.float32)
cfg.l1_lambda = tf.compat.v1.placeholder(tf.float32, name='lambda')
cfg.lr = tf.compat.v1.placeholder(tf.float32, name='lr')

# Prepare dataset iterators
dataset = tf.data.Dataset.from_tensor_slices((cfg.pert_in, cfg.expr_out))
cfg.iter_train = tf.compat.v1.data.make_initializable_iterator(
dataset.batch(cfg.batchsize).shuffle(buffer_size=1024, reshuffle_each_iteration=True))
cfg.iter_monitor = tf.compat.v1.data.make_initializable_iterator(
dataset.repeat().batch(cfg.batchsize).shuffle(buffer_size=1024, reshuffle_each_iteration=True))
cfg.iter_eval = tf.compat.v1.data.make_initializable_iterator(dataset.batch(cfg.batchsize))

# Data partition
if cfg.experiment_type == 'random partition' or cfg.experiment_type == 'full data':
return random_partition(cfg)

if cfg.experiment_type == 'leave one out (w/o single)':
return loo(cfg, singles=False)

if cfg.experiment_type == 'leave one out (w/ single)':
return loo(cfg, singles=True)

if cfg.experiment_type == 'single to combo':
return s2c(cfg)

raise Exception('Invalid experiment type. \nValid options: [random partition, leave one out (w/o single), '
'leave one out (w/ single), full data, single to combo]')
cfg.dataset = random_partition(cfg)

elif cfg.experiment_type == 'leave one out (w/o single)':
cfg.dataset = loo(cfg, singles=False)

elif cfg.experiment_type == 'leave one out (w/ single)':
cfg.dataset = loo(cfg, singles=True)

elif cfg.experiment_type == 'single to combo':
cfg.dataset = s2c(cfg)
else:
raise Exception('Invalid experiment type. \nValid options: [random partition, leave one out (w/o single), '
'leave one out (w/ single), full data, single to combo]')

# Prepare feed_dicts
cfg.feed_dicts = {
'train_set' : {
cfg.pert_in: cfg.dataset['pert_train'],
cfg.expr_out: cfg.dataset['expr_train'],
},
'valid_set': {
cfg.pert_in: cfg.dataset['pert_valid'],
cfg.expr_out: cfg.dataset['expr_valid'],
},
'test_set':{
cfg.pert_in: cfg.dataset['pert_test'],
cfg.expr_out: cfg.dataset['expr_test']
}
}
return cfg


def s2c(cfg):
@@ -84,6 +118,7 @@ def loo(cfg, singles):


def random_partition(cfg):

nexp, n_x = cfg.pert.shape
nvalid = int(nexp * cfg.trainset_ratio)
ntrain = int(nvalid * cfg.validset_ratio)
@@ -95,16 +130,19 @@ def random_partition(cfg):

dataset = {
"node_index": cfg.node_index,
"pert_train": cfg.pert.iloc[random_pos[:ntrain], :],
"pert_valid": cfg.pert.iloc[random_pos[ntrain:nvalid], :],
"pert_test": cfg.pert.iloc[random_pos[nvalid:], :],
"pert_full": cfg.pert,
"expr_train": cfg.expr.iloc[random_pos[:ntrain], :],
"expr_valid": cfg.expr.iloc[random_pos[ntrain:nvalid], :],
"expr_test": cfg.expr.iloc[random_pos[nvalid:], :],
"train_pos": random_pos[:ntrain],
"valid_pos": random_pos[ntrain:nvalid],
"test_pos": random_pos[nvalid:]
}

dataset.update({
"pert_train": cfg.pert.iloc[random_pos[:ntrain], :].values,
"pert_valid": cfg.pert.iloc[random_pos[ntrain:nvalid], :].values,
"pert_test": cfg.pert.iloc[random_pos[nvalid:], :].values,
"expr_train": cfg.expr.iloc[random_pos[:ntrain], :].values,
"expr_valid": cfg.expr.iloc[random_pos[ntrain:nvalid], :].values,
"expr_test": cfg.expr.iloc[random_pos[nvalid:], :].values
})

return dataset
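The reworked factory no longer returns a bare dataset dict: it attaches the pert_in/expr_out placeholders, a tf.data.Dataset, three initializable iterators (iter_train, iter_monitor, iter_eval) and the per-split feed_dicts to the config object. Below is a standalone sketch of the underlying iterator pattern with toy shapes and random data; it is an assumption-level illustration, not the project's exact code.

    # Standalone sketch of the placeholder-backed tf.data pattern introduced here.
    import numpy as np
    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()  # only needed when running under TF2

    n_x = 4
    pert_in = tf.compat.v1.placeholder(tf.float32, [None, n_x], name='pert_in')
    expr_out = tf.compat.v1.placeholder(tf.float32, [None, n_x], name='expr_out')

    dataset = tf.data.Dataset.from_tensor_slices((pert_in, expr_out)).batch(2)
    iter_train = tf.compat.v1.data.make_initializable_iterator(dataset)
    next_x, next_y = iter_train.get_next()

    pert = np.random.rand(6, n_x).astype(np.float32)
    expr = np.random.rand(6, n_x).astype(np.float32)

    with tf.compat.v1.Session() as sess:
        # The split is chosen at initialization time by feeding the placeholders,
        # mirroring cfg.feed_dicts['train_set'] / ['valid_set'] / ['test_set'].
        sess.run(iter_train.initializer, feed_dict={pert_in: pert, expr_out: expr})
        try:
            while True:
                xb, yb = sess.run((next_x, next_y))   # one batch per run
        except tf.errors.OutOfRangeError:
            pass                                       # end of one pass over the fed split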
22 changes: 9 additions & 13 deletions pertbio/pertbio/model.py
@@ -30,30 +30,25 @@ class PertBio:
def __init__(self, args):
self.args = args
self.n_x = args.n_x
self.pert_in = tf.compat.v1.placeholder(tf.float32, [None, self.n_x], name='pert_in')
self.expr_out = tf.compat.v1.placeholder(tf.float32, [None, self.n_x], name='expr_out')

# Prepare datasets
dataset = tf.data.Dataset.from_tensor_slices((self.pert_in, self.expr_out))
self.iter = tf.compat.v1.data.make_initializable_iterator(dataset
.shuffle(buffer_size=1024).batch(args.batchsize))
self.train_x, self.train_y = self.iter.get_next()
self.iter_eval = tf.compat.v1.data.make_initializable_iterator(dataset
.shuffle(buffer_size=1024).batch(args.batchsize))
self.pert_in, self.expr_out = args.pert_in, args.expr_out
self.iter_train, self.iter_monitor, self.iter_eval = args.iter_train, args.iter_monitor, args.iter_eval
self.train_x, self.train_y = self.iter_train.get_next()
self.monitor_x, self.monitor_y = self.iter_monitor.get_next()
self.eval_x, self.eval_y = self.iter_eval.get_next()
self.l1_lambda = self.args.l1_lambda
self.lr = self.args.lr

def get_ops(self):
self.l1_lambda = tf.compat.v1.placeholder(tf.float32, name='lambda')
self.train_loss, self.train_mse_loss = loss(self.train_y, self.train_yhat, self.l1_lambda, self.params['W'])
self.monitor_loss, self.monitor_mse_loss = loss(self.monitor_y, self.monitor_yhat, self.l1_lambda, self.params['W'])
self.eval_loss, self.eval_mse_loss = loss(self.eval_y, self.eval_yhat, self.l1_lambda, self.params['W'])

self.lr = tf.compat.v1.placeholder(tf.float32, name='lr')
self.op_optimize = optimize(self.train_loss, self.lr)

def build(self):
self.params = {}
self.get_variables()
self.train_yhat = self.forward(self.train_x)
self.monitor_yhat = self.forward(self.monitor_x)
self.eval_yhat = self.forward(self.eval_x)
self.get_ops()
return self
@@ -195,6 +190,7 @@ def build(self):
self.ode_solver = pertbio.kernel.get_ode_solver(self.args)
self._dxdt = pertbio.kernel.get_dxdt(self.args, self.params)
self.convergence_metric_train, self.train_yhat = self.forward(self.train_x)
self.convergence_metric_monitor, self.monitor_yhat = self.forward(self.monitor_x)
self.convergence_metric_eval, self.eval_yhat = self.forward(self.eval_x)
self.get_ops()
return self
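With the pipeline moved into pertbio.dataset.factory, PertBio now pulls the placeholders, iterators and hyper-parameter tensors off the shared args object instead of building its own. The following is a minimal stand-in class illustrating that constructor contract only; the real forward pass is the ODE machinery in model.py, and this sketch should not be read as the project's implementation.

    # Minimal stand-in for the new constructor contract (not the real model).
    import tensorflow as tf

    class TinyPertBio:
        def __init__(self, args):
            self.args = args
            self.n_x = args.n_x
            # Placeholders, iterators and hyper-parameter tensors are shared via args,
            # rather than being rebuilt inside every model as before.
            self.pert_in, self.expr_out = args.pert_in, args.expr_out
            self.l1_lambda, self.lr = args.l1_lambda, args.lr
            self.train_x, self.train_y = args.iter_train.get_next()
            self.monitor_x, self.monitor_y = args.iter_monitor.get_next()
            self.eval_x, self.eval_y = args.iter_eval.get_next()

        def forward(self, x):
            return tf.matmul(x, self.params['W'])    # stand-in for the real ODE solver

        def build(self):
            self.params = {'W': tf.compat.v1.get_variable('W', shape=[self.n_x, self.n_x])}
            self.train_yhat = self.forward(self.train_x)      # one head per data stream,
            self.monitor_yhat = self.forward(self.monitor_x)  # all sharing the same parameters
            self.eval_yhat = self.forward(self.eval_x)
            return self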
62 changes: 20 additions & 42 deletions pertbio/pertbio/train.py
@@ -36,40 +36,25 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,

n_unchanged = 0
idx_iter = 0
train_set = {
model.pert_in: args.dataset['pert_train'].values,
model.expr_out: args.dataset['expr_train'].values,
model.lr: lr_val,
model.l1_lambda: l1lamda
}
valid_set = {
model.pert_in: args.dataset['pert_valid'].values,
model.expr_out: args.dataset['expr_valid'].values,
model.l1_lambda: l1lamda
}
test_set = {
model.pert_in: args.dataset['pert_test'].values,
model.expr_out: args.dataset['expr_test'].values
}
for key in args.feed_dicts:
args.feed_dicts[key].update({model.lr: lr_val, model.l1_lambda: l1lamda})

sess.run(model.iter_monitor.initializer, feed_dict=args.feed_dicts['valid_set'])
for idx_epoch in range(n_epoch):

sess.run(model.iter.initializer, feed_dict=train_set)
sess.run(model.iter_train.initializer, feed_dict=args.feed_dicts['train_set'])
while True:
if idx_iter > n_iter or n_unchanged > n_iter_patience:
break

# training step
t0 = time.clock()
try:
_, loss_train_i, loss_train_mse_i = sess.run((model.op_optimize, model.train_loss,
model.train_mse_loss), feed_dict=train_set)
except OutOfRangeError:
_, loss_train_i, loss_train_mse_i = sess.run(
(model.op_optimize, model.train_loss, model.train_mse_loss), feed_dict=args.feed_dicts['train_set'])
except OutOfRangeError: # for iter_train
break

# record training
sess.run(model.iter_eval.initializer, feed_dict=valid_set)
loss_valid_i, loss_valid_mse_i = sess.run((model.eval_loss, model.eval_mse_loss), feed_dict=valid_set)
loss_valid_i, loss_valid_mse_i = sess.run(
(model.monitor_loss, model.monitor_mse_loss), feed_dict=args.feed_dicts['valid_set'])
new_loss = best_params.avg_n_iters_loss(loss_valid_i)
if args.export_verbose >= 3:
print("Epoch:{}/{}\tIteration: {}/{}\tnew_loss:{}\tbuffer_loss:{}\tbest:{}\tTolerance: {}/{}".format(
@@ -79,7 +64,6 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,
append_record("record_eval.csv",
[idx_epoch, idx_iter, loss_train_i, loss_valid_i, loss_train_mse_i,
loss_valid_mse_i, None, time.clock() - t0])

# early stopping
idx_iter += 1
if new_loss < best_params.loss_min:
@@ -89,18 +73,17 @@ def train_substage(model, sess, lr_val, l1lamda, n_epoch, n_iter, n_iter_buffer,
else:
n_unchanged += 1

# Evaluation on valid set
t0 = time.clock()
loss_valid_i, loss_valid_mse_i = eval_model(sess, model.iter_eval, (model.eval_loss, model.eval_mse_loss),
valid_set)
append_record("record_eval.csv", [idx_epoch, None, None, loss_valid_i, None, loss_valid_mse_i,
None, time.clock() - t0])
if idx_iter > n_iter or n_unchanged > n_iter_patience:
break
# Evaluation on valid set
t0 = time.clock()
sess.run(model.iter_eval.initializer, feed_dict=args.feed_dicts['valid_set'])
loss_valid_i, loss_valid_mse_i = eval_model(sess, model.iter_eval, (model.eval_loss, model.eval_mse_loss),
args.feed_dicts['valid_set'])
append_record("record_eval.csv", [-1, None, None, loss_valid_i, None, loss_valid_mse_i, None, time.clock() - t0])

# Evaluation on test set
t0 = time.clock()
loss_test_mse = eval_model(sess, model.iter_eval, model.eval_mse_loss, test_set)
sess.run(model.iter_eval.initializer, feed_dict=args.feed_dicts['test_set'])
loss_test_mse = eval_model(sess, model.iter_eval, model.eval_mse_loss, args.feed_dicts['test_set'])
append_record("record_eval.csv", [-1, None, None, None, None, None, loss_test_mse, time.clock() - t0])

best_params.save()
Expand All @@ -126,12 +109,9 @@ def eval_model(sess, eval_iter, obj_fn, eval_dict):
return np.mean(np.array(eval_results), axis=0)


def train_model(args):
def train_model(model, args):
args.logger = TimeLogger(time_logger_step=1, hierachy=2)

# Constructing model
model = pertbio.model.factory(args)

# Check if all variables in scope
# TODO: put variables under appropriate scopes
for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='initialization'):
@@ -245,10 +225,8 @@ def screenshot(self, sess, model, substage_i, node_index, loss_min, args):
self.update(params)

if self.export_verbose > 1 or self.export_verbose == -1: # no params but y_hat
test_set = {model.pert_in: args.dataset['pert_test'].values,
model.expr_out: args.dataset['expr_test'].values}
sess.run(model.iter_eval.initializer, feed_dict=test_set)
y_hat = sess.run(model.eval_yhat, feed_dict=test_set)
sess.run(model.iter_eval.initializer, feed_dict=model.args.feed_dicts['test_set'])
y_hat = sess.run(model.eval_yhat, feed_dict=model.args.feed_dicts['test_set'])
y_hat = pd.DataFrame(y_hat, columns=node_index[0])
self.update({'y_hat': y_hat})

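train_substage now drives the shared iterators: the stage's lr_val/l1lamda are pushed into every feed_dict, iter_monitor is initialized once on the validation split, and iter_train is re-initialized each epoch and consumed until OutOfRangeError, the iteration cap, or the patience counter stops training. A stripped-down sketch of that loop shape follows; logging, timing and best_params bookkeeping are omitted, names follow the diff, and the simplified early-stopping check is an illustrative stand-in.

    # Stripped-down loop sketch (not the full train_substage; see pertbio/train.py for the real thing).
    import tensorflow as tf

    def train_substage_sketch(model, sess, feed_dicts, lr_val, l1lamda, n_epoch, n_iter, n_iter_patience):
        # Stage-specific learning rate and L1 weight are pushed into every feed_dict.
        for fd in feed_dicts.values():
            fd.update({model.lr: lr_val, model.l1_lambda: l1lamda})

        # iter_monitor runs over the validation split for per-iteration monitoring.
        sess.run(model.iter_monitor.initializer, feed_dict=feed_dicts['valid_set'])
        best, n_unchanged, idx_iter = float('inf'), 0, 0
        for _ in range(n_epoch):
            sess.run(model.iter_train.initializer, feed_dict=feed_dicts['train_set'])
            while idx_iter <= n_iter and n_unchanged <= n_iter_patience:
                try:
                    _, loss_train = sess.run((model.op_optimize, model.train_loss),
                                             feed_dict=feed_dicts['train_set'])
                except tf.errors.OutOfRangeError:   # training split exhausted for this epoch
                    break
                loss_valid = sess.run(model.monitor_loss, feed_dict=feed_dicts['valid_set'])
                idx_iter += 1
                if loss_valid < best:               # crude stand-in for best_params buffering
                    best, n_unchanged = loss_valid, 0
                else:
                    n_unchanged += 1
            if idx_iter > n_iter or n_unchanged > n_iter_patience:
                break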
4 changes: 4 additions & 0 deletions pertbio/pertbio/version.py
@@ -62,6 +62,10 @@ def get_msg():
-- Apr 5, 2020 --
* Reformat for better code style
* Revise docs
version 0.2.2
-- Apr 23, 2020 --
* Add support to tf.Datasets
"""

print(changelog)
17 changes: 7 additions & 10 deletions scripts/main.py
@@ -1,4 +1,3 @@
import sys
import pertbio
import os
import numpy as np
@@ -22,8 +21,7 @@ def set_seed(in_seed):

def prepare_workdir(in_cfg):
# Read Data
in_cfg.pert = pd.read_csv(in_cfg.pert_file, header=None, dtype=np.float32)
in_cfg.expr = pd.read_csv(in_cfg.expr_file, header=None, dtype=np.float32)
in_cfg.root_dir = os.getcwd()
in_cfg.node_index = pd.read_csv(in_cfg.node_index_file, header=None, names=None)
in_cfg.loo = pd.read_csv("data/loo_label.csv", header=None)

@@ -50,14 +48,12 @@ def prepare_workdir(in_cfg):
pass
os.makedirs(in_cfg.working_index)
os.chdir(in_cfg.working_index)

with open("record_eval.csv", 'w') as f:
f.write("epoch,iter,train_loss,valid_loss,train_mse,valid_mse,test_mse,time_elapsed\n")

# Load dataset
dataset = pertbio.dataset.factory(cfg)

print('Working directory is ready at {}.'.format(experiment_path))
return dataset
return 0


if __name__ == '__main__':
@@ -71,12 +67,13 @@ def prepare_workdir(in_cfg):
set_seed(cfg.seed)
print(vars(cfg))

cfg.dataset = prepare_workdir(cfg)

prepare_workdir(cfg)
logger = pertbio.utils.TimeLogger(time_logger_step=1, hierachy=3)
args = cfg
for i, stage in enumerate(cfg.stages):
cfg = pertbio.dataset.factory(cfg)
logger.log("Training on stage {}/{} ...".format(i + 1, len(cfg.stages)))
args.sub_stages = stage['sub_stages']
args.n_T = stage['nT']
pertbio.train.train_model(args)
model = pertbio.model.factory(args)
pertbio.train.train_model(model, args)
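main.py no longer loads the perturbation/expression tables in prepare_workdir; instead each stage rebuilds the dataset pipeline and the model and passes the model into train_model. In outline, the per-stage flow appears to be as sketched below (simplified, not verbatim; cfg and prepare_workdir come from earlier in the script).

    # Simplified outline of the updated scripts/main.py flow (not verbatim project code).
    import pertbio

    prepare_workdir(cfg)                         # creates working dir + record_eval.csv, no longer loads data
    for i, stage in enumerate(cfg.stages):
        cfg = pertbio.dataset.factory(cfg)       # placeholders, tf.data iterators, feed_dicts on cfg
        cfg.sub_stages = stage['sub_stages']
        cfg.n_T = stage['nT']
        model = pertbio.model.factory(cfg)       # model built against the shared pipeline
        pertbio.train.train_model(model, cfg)    # train_model now takes the model explicitly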
