Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Towards a 2D RG-PG-SGD #491

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
33 changes: 28 additions & 5 deletions src/algorithms/path_sgd_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ namespace odgi {
const bool &snapshot,
const std::string &snapshot_prefix,
std::vector<std::atomic<double>> &X,
std::vector<std::atomic<double>> &Y) {
std::vector<std::atomic<double>> &Y,
const bool &p_sgd_target_paths,
const std::vector<bool> &is_ref) {
#ifdef debug_path_sgd
std::cerr << "iter_max: " << iter_max << std::endl;
std::cerr << "min_term_updates: " << min_term_updates << std::endl;
Expand Down Expand Up @@ -244,6 +246,23 @@ namespace odgi {
uint64_t term_i_length = graph.get_length(term_i);
uint64_t term_j_length = graph.get_length(term_j);

bool update_term_i = true;
bool update_term_j = true;

if (p_sgd_target_paths) {
if (is_ref[graph.get_id(term_i) - 1]) {
update_term_i = false;
}
if (is_ref[graph.get_id(term_j) - 1]) {
update_term_j = false;
}
}
if (!update_term_j && !update_term_i) {
// we also have to update the number of terms here, because else we will over sample and the sorting will take much longer
term_updates_local++;
continue;
}

// adjust the positions to the node starts
size_t pos_in_path_a = path_index.get_position_of_step(step_a);
size_t pos_in_path_b = path_index.get_position_of_step(step_b);
Expand Down Expand Up @@ -357,10 +376,14 @@ namespace odgi {
#ifdef debug_path_sgd
std::cerr << "before X[i] " << X[i].load() << " X[j] " << X[j].load() << std::endl;
#endif
X[2 * i + offset_i].store(X[2 * i + offset_i].load() - r_x);
Y[2 * i + offset_i].store(Y[2 * i + offset_i].load() - r_y);
X[2 * j + offset_j].store(X[2 * j + offset_j].load() + r_x);
Y[2 * j + offset_j].store(Y[2 * j + offset_j].load() + r_y);
// Move term i by (-r_x, -r_y), but only if it is allowed to move; update_term_i is cleared
// earlier in this loop body for nodes flagged in is_ref when target paths are in use.
// NOTE(review): each coordinate is read with load() and written back with store(), so the
// update is not a single atomic read-modify-write.
if (update_term_i) {
X[2 * i + offset_i].store(X[2 * i + offset_i].load() - r_x);
Y[2 * i + offset_i].store(Y[2 * i + offset_i].load() - r_y);
}
// Term j receives the opposite displacement (+r_x, +r_y) under the same condition.
if (update_term_j) {
Y[2 * j + offset_j].store(Y[2 * j + offset_j].load() + r_y);
X[2 * j + offset_j].store(X[2 * j + offset_j].load() + r_x);
}
#ifdef debug_path_sgd
std::cerr << "after X[i] " << X[i].load() << " X[j] " << X[j].load() << std::endl;
#endif
Expand Down
4 changes: 3 additions & 1 deletion src/algorithms/path_sgd_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ namespace odgi {
const bool &snapshot,
const std::string &snapshot_prefix,
std::vector<std::atomic<double>> &X,
std::vector<std::atomic<double>> &Y);
std::vector<std::atomic<double>> &Y,
const bool &p_sgd_target_paths,
const std::vector<bool> &is_ref);

/// our learning schedule
std::vector<double> path_linear_sgd_layout_schedule(const double &w_min,
Expand Down
193 changes: 146 additions & 47 deletions src/subcommand/layout_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ int main_layout(int argc, char **argv) {
args::ValueFlag<std::string> p_sgd_snapshot(pg_sgd_opts, "STRING",
"Set the prefix to which each snapshot layout of a path guided 2D SGD iteration should be written to (default: NONE).",
{'u', "path-sgd-snapshot"});
args::Group threading_opts(parser, "[ Threading ]");
// Optional file with the names of the target (reference) paths. When given, the paths are
// loaded and the layout is initialized with their nodes flagged in is_ref.
args::ValueFlag<std::string> _p_sgd_target_paths(pg_sgd_opts, "FILE", "Read the paths that should be considered as target paths (references) from this *FILE*. PG-SGD will keep the nodes of the given paths fixed. A path's rank determines its weight for decision making and is given by its position in the given *FILE*.", {'H', "target-paths"});
args::Group threading_opts(parser, "[ Threading ]");
args::ValueFlag<uint64_t> nthreads(threading_opts, "N",
"Number of threads to use for parallel operations.",
{'t', "threads"});
Expand Down Expand Up @@ -168,6 +169,42 @@ int main_layout(int argc, char **argv) {
}
return max_path_step_count;
};

// TODO We have this function here, in untangle, maybe somewhere else? Refactor!
// path loading
// Read path names, one per line, from `path_names_file` and return the handles of those
// names that exist in `graph`, in file order.
//  - Empty lines are skipped and not counted.
//  - Names that are not paths of the graph are counted but otherwise ignored.
//  - The process exits with status 1 if the file cannot be opened, if a path name is
//    listed twice, or if no listed path exists in the graph.
auto load_paths = [&](const std::string& path_names_file) {
    std::ifstream path_names_in(path_names_file);
    if (!path_names_in.is_open()) {
        // Report the real cause instead of falling through to "no path to consider".
        std::cerr << "[odgi::layout] error: unable to open the path list file '"
                  << path_names_file << "'." << std::endl;
        exit(1);
    }

    uint64_t num_of_paths_in_file = 0;
    // NOTE(review): indexed by as_integer(path) - 1, which presumably lies in
    // [0, path_count) - confirm for graphs whose paths have been deleted.
    std::vector<bool> path_already_seen(graph.get_path_count(), false);
    std::vector<path_handle_t> paths;
    std::string line;
    while (std::getline(path_names_in, line)) {
        if (line.empty()) {
            continue;
        }
        ++num_of_paths_in_file;
        if (!graph.has_path(line)) {
            continue;
        }
        const path_handle_t path = graph.get_path_handle(line);
        const uint64_t path_rank = as_integer(path) - 1;
        if (path_already_seen[path_rank]) {
            std::cerr << "[odgi::layout] error: in the path list there are duplicated path names."
                      << std::endl;
            exit(1);
        }
        path_already_seen[path_rank] = true;
        paths.push_back(path);
    }

    std::cerr << "[odgi::layout] found " << paths.size() << "/" << num_of_paths_in_file
              << " paths to consider." << std::endl;
    if (paths.empty()) {
        std::cerr << "[odgi::layout] error: no path to consider." << std::endl;
        exit(1);
    }
    return paths;
};
// default parameters
/* We don't do this, yet.
std::string path_sgd_seed;
Expand All @@ -183,6 +220,13 @@ int main_layout(int argc, char **argv) {
path_sgd_seed = "pangenomic!";
}
*/

// is_ref stays empty unless target paths were requested; it is later filled with one flag
// per node (indexed by node id - 1) and handed to path_linear_sgd_layout.
std::vector<bool> is_ref;
// Handles of the target (reference) paths, in the order they appear in the given file.
std::vector<path_handle_t> target_paths;
if (_p_sgd_target_paths) {
target_paths = load_paths(args::get(_p_sgd_target_paths));
}

if (p_sgd_min_term_updates_paths && p_sgd_min_term_updates_num_nodes) {
std::cerr
<< "[odgi::layout] error: there can only be one argument provided for the minimum number of term updates in the path guided 1D SGD."
Expand Down Expand Up @@ -279,50 +323,104 @@ int main_layout(int argc, char **argv) {

uint64_t square_space = graph.get_node_count() * 2;
uint64_t x, y;
graph.for_each_handle([&](const handle_t &h) {
uint64_t pos = 2 * number_bool_packing::unpack_number(h);
switch (layout_initialization) {
case 'g': {
graph_X[pos].store(gaussian_noise(rng));
graph_Y[pos].store(gaussian_noise(rng));
graph_X[pos + 1].store(gaussian_noise(rng));
graph_Y[pos + 1].store(gaussian_noise(rng));
break;
}
case 'u': {
graph_X[pos].store(len);
graph_Y[pos].store(uniform_noise(rng));
len += graph.get_length(h);
graph_X[pos + 1].store(len);
graph_Y[pos + 1].store(uniform_noise(rng));
break;
}
case 'r': {
graph_X[pos].store(uniform_noise_in_length(rng));
graph_Y[pos].store(uniform_noise_in_length(rng));
graph_X[pos + 1].store(uniform_noise_in_length(rng));
graph_Y[pos + 1].store(uniform_noise_in_length(rng));
break;
}
case 'h': {
d2xy(square_space, pos, &x, &y);
graph_X[pos].store(x);
graph_Y[pos].store(y);
d2xy(square_space, pos + 1, &x, &y);
graph_X[pos + 1].store(x);
graph_Y[pos + 1].store(y);
break;
}
default: {
graph_X[pos].store(len);
graph_Y[pos].store(gaussian_noise(rng));
len += graph.get_length(h);
graph_X[pos + 1].store(len);
graph_Y[pos + 1].store(gaussian_noise(rng));
}
}
//std::cerr << pos << ": " << graph_X[pos] << "," << graph_Y[pos] << " ------ " << graph_X[pos + 1] << "," << graph_Y[pos + 1] << std::endl;
});
if (_p_sgd_target_paths) {
    // Target path mode: nodes on the target paths are laid out first, along X in path step
    // order and all at the same Y value, and flagged in is_ref. Every remaining node is
    // then appended along X with gaussian noise in Y.
    const double target_path_y = 100;

    std::unique_ptr<odgi::algorithms::progress_meter::ProgressMeter> target_paths_progress;
    if (args::get(progress)) {
        std::string banner = "[odgi::layout] preparing target path vectors:";
        target_paths_progress = std::make_unique<odgi::algorithms::progress_meter::ProgressMeter>(target_paths.size(), banner);
    }

    // One flag per node, indexed by node id - 1.
    is_ref.assign(graph.get_node_count(), false);
    for (auto target_path : target_paths) {
        graph.for_each_step_in_path(
            target_path,
            [&](const step_handle_t &step) {
                handle_t h = graph.get_handle_of_step(step);
                uint64_t pos = 2 * number_bool_packing::unpack_number(h);
                // A node visited by several steps keeps the coordinates of its last visit,
                // while len still advances on every visit.
                graph_X[pos].store(len);
                graph_Y[pos].store(target_path_y);
                len += graph.get_length(h);
                graph_X[pos + 1].store(len);
                graph_Y[pos + 1].store(target_path_y);
                is_ref[graph.get_id(h) - 1] = true;
            });
        if (target_paths_progress) {
            target_paths_progress->increment(1);
        }
    }
    if (target_paths_progress) {
        target_paths_progress->finish();
    }

    // Place all nodes that are not on any target path after the target path nodes.
    for (uint64_t i = 0; i < is_ref.size(); i++) {
        if (is_ref[i]) {
            continue;
        }
        handle_t h = graph.get_handle(i + 1);
        uint64_t pos = 2 * number_bool_packing::unpack_number(h);
        graph_X[pos].store(len);
        graph_Y[pos].store(gaussian_noise(rng));
        len += graph.get_length(h);
        graph_X[pos + 1].store(len);
        graph_Y[pos + 1].store(gaussian_noise(rng));
    }
} else {
    // No target paths: initialize every node according to layout_initialization.
    graph.for_each_handle([&](const handle_t &h) {
        uint64_t pos = 2 * number_bool_packing::unpack_number(h);
        switch (layout_initialization) {
            case 'g': {
                // gaussian noise in both dimensions
                graph_X[pos].store(gaussian_noise(rng));
                graph_Y[pos].store(gaussian_noise(rng));
                graph_X[pos + 1].store(gaussian_noise(rng));
                graph_Y[pos + 1].store(gaussian_noise(rng));
                break;
            }
            case 'u': {
                // running length along X, uniform noise in Y
                graph_X[pos].store(len);
                graph_Y[pos].store(uniform_noise(rng));
                len += graph.get_length(h);
                graph_X[pos + 1].store(len);
                graph_Y[pos + 1].store(uniform_noise(rng));
                break;
            }
            case 'r': {
                // uniform_noise_in_length in both dimensions
                graph_X[pos].store(uniform_noise_in_length(rng));
                graph_Y[pos].store(uniform_noise_in_length(rng));
                graph_X[pos + 1].store(uniform_noise_in_length(rng));
                graph_Y[pos + 1].store(uniform_noise_in_length(rng));
                break;
            }
            case 'h': {
                // coordinates computed by d2xy from the node end index
                d2xy(square_space, pos, &x, &y);
                graph_X[pos].store(x);
                graph_Y[pos].store(y);
                d2xy(square_space, pos + 1, &x, &y);
                graph_X[pos + 1].store(x);
                graph_Y[pos + 1].store(y);
                break;
            }
            default: {
                // running length along X, gaussian noise in Y
                graph_X[pos].store(len);
                graph_Y[pos].store(gaussian_noise(rng));
                len += graph.get_length(h);
                graph_X[pos + 1].store(len);
                graph_Y[pos + 1].store(gaussian_noise(rng));
            }
        }
        //std::cerr << pos << ": " << graph_X[pos] << "," << graph_Y[pos] << " ------ " << graph_X[pos + 1] << "," << graph_Y[pos + 1] << std::endl;
    });
}

//double max_x = 0;
algorithms::path_linear_sgd_layout(
Expand All @@ -345,8 +443,9 @@ int main_layout(int argc, char **argv) {
snapshot,
snapshot_prefix,
graph_X,
graph_Y
);
graph_Y,
_p_sgd_target_paths,
is_ref);

// drop out of atomic stuff... maybe not the best way to do this
// TODO: use directly the atomic vector?
Expand Down
Loading