diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 458b944a6c..4114601829 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -50,8 +50,6 @@ void Device::RunGraph(bool serial) {
   bool previous_state = graph_enabled_;
   graph_enabled_ = false;
 
-  // graph_->Debug();
-
   if (serial) {
     // sequential execution
     graph_->RunInSerial();
@@ -60,6 +58,8 @@ void Device::RunGraph(bool serial) {
     graph_->RunGraph();
   }
 
+  // graph_->Debug();
+
   graph_enabled_ = previous_state;
 }
 
diff --git a/src/core/scheduler/scheduler.cc b/src/core/scheduler/scheduler.cc
index deecb682b7..2f0bb34c23 100644
--- a/src/core/scheduler/scheduler.cc
+++ b/src/core/scheduler/scheduler.cc
@@ -18,7 +18,10 @@
 
 #include "singa/core/scheduler.h"
 
+#include <algorithm>
 #include <functional>
+#include <iomanip>
+#include <sstream>
 #include <thread>
 #include <unordered_set>
 
@@ -99,39 +102,97 @@ void Graph::Reset() {
 }
 
 void Graph::Debug() {
+  if (dirty_) Analysis();
+
+  size_t max_in_num = 0, max_out_num = 0, max_next_num = 0, max_free_num = 0;
+  for (auto &it : nodes_) {
+    max_in_num = std::max(max_in_num, it->in_edges_.size());
+    max_out_num = std::max(max_out_num, it->out_edges_.size());
+  }
+
+  for (auto &it : next_nodes_) {
+    max_next_num = std::max(max_next_num, it.size());
+  }
+
+  for (auto &it : free_blocks_) {
+    max_free_num = std::max(max_free_num, it.size());
+  }
+
+  int w = 2;
+  std::stringstream ss;
+  ss << "begin nodes:[";
+  for (size_t i = 0; i < begin_nodes_.size(); ++i) {
+    ss << begin_nodes_[i]->id_ << " ";
+  }
+  ss << "]" << std::endl;
+
+  size_t size = 0;
   for (size_t i = 0; i < nodes_.size(); ++i) {
-    printf("OP[%2lu]: ", i);
-    printf("Inputs: ");
+    ss << "OP[" << std::setw(w) << i;
     auto node = nodes_[i];
-    for (size_t j = 0; j < node->in_edges_.size(); ++j) {
-      printf("%d\t", blocks_[node->in_edges_[j]->blk_]->id_);
+
+    ss << "] Inputs:[";
+    size = node->in_edges_.size();
+    for (size_t j = 0; j < max_in_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[node->in_edges_[j]->blk_]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    for (size_t j = node->in_edges_.size(); j < 3; ++j) {
-      printf("\t");
+
+    ss << "] Outputs:[";
+    size = node->out_edges_.size();
+    for (size_t j = 0; j < max_out_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[node->out_edges_[j]->blk_]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    printf("Outputs: ");
-    for (size_t j = 0; j < node->out_edges_.size(); ++j) {
-      printf("%d\t", blocks_[node->out_edges_[j]->blk_]->id_);
+
+    ss << "] Next nodes:[";
+    size = next_nodes_[i].size();
+    for (size_t j = 0; j < max_next_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << next_nodes_[i][j]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
+    }
+
+    ss << "] Free blocks:[";
+    size = free_blocks_[i].size();
+    for (size_t j = 0; j < max_free_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[free_blocks_[i][j]]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    printf("\n");
+    ss << "]" << std::endl;
  }
 
+  std::vector<BlkInfo *> blkInfos;
+  blkInfos.resize(blocks_.size());
+
   for (auto it : blocks_) {
-    auto blkInfo = it.second;
-    printf("Block[%2d]: addr[%p] graph_ref[%d] ref_count[%d] ", blkInfo->id_,
-           blkInfo->blk_, blkInfo->graph_ref_, it.first->ref_count());
+    blkInfos[it.second->id_] = it.second;
+  }
+
+  for (auto it : blkInfos) {
+    auto blkInfo = it;
+    ss << "Block[" << std::setw(w) << blkInfo->id_ << "] addr[" << std::setw(w)
+       << blkInfo->blk_ << "] graph_ref[" << std::setw(w) << blkInfo->graph_ref_
+       << "] ref_count[" << std::setw(w) << blkInfo->blk_->ref_count() << "] ";
     switch (blkInfo->type_) {
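      case BlockType::kInput:
-        printf("type[input] ");
+        ss << "type[input] ";
         break;
       case BlockType::kParam:
-        printf("type[param] ");
+        ss << "type[param] ";
         break;
       case BlockType::kInter:
-        printf("type[inter] ");
+        ss << "type[inter] ";
         break;
       case BlockType::kEnd:
-        printf("type[_end_] ");
+        ss << "type[_end_] ";
         break;
       default:
         break;
@@ -140,14 +201,15 @@ void Graph::Debug() {
     if (blkInfo->write_node_) {
       id = blkInfo->write_node_->id_;
     }
-    printf(" write_node[%2d]", id);
+    ss << " write_node[" << std::setw(w) << id << "]";
     id = -1;
     if (blkInfo->last_node_) {
       id = blkInfo->last_node_->id_;
     }
-    printf(" last_node[%2d]", id);
-    printf("\n");
+    ss << " last_node[" << std::setw(w) << id << "]" << std::endl;
   }
+
+  printf("%s", ss.str().c_str());
 }
 
 void Graph::RunGraph() {
@@ -358,6 +420,8 @@ void Graph::Analysis() {
   }
 
   dirty_ = false;
+
+  // Debug();
 }
 
 void Graph::FreeLoop() {

The rewritten Graph::Debug() above buffers the whole report in a stringstream and emits it with a single printf, padding every Inputs/Outputs/Next nodes/Free blocks column out to the widest row so the table stays aligned. A minimal standalone sketch of that padding idiom (illustrative code only, not part of the patch):

    // sketch_debug_padding.cc -- illustrative only, not part of the patch.
    // Mirrors the idiom in Graph::Debug(): every row prints a fixed number of
    // slots, real entries through setw(w), absent entries as setw(w + 1) blanks.
    #include <algorithm>
    #include <cstdio>
    #include <iomanip>
    #include <sstream>
    #include <vector>

    int main() {
      std::vector<std::vector<int>> inputs = {{0, 1}, {2}, {}};
      size_t max_in_num = 0;
      for (auto &row : inputs) max_in_num = std::max(max_in_num, row.size());

      int w = 2;
      std::stringstream ss;
      for (size_t i = 0; i < inputs.size(); ++i) {
        ss << "OP[" << std::setw(w) << i << "] Inputs:[";
        for (size_t j = 0; j < max_in_num; ++j) {
          if (j < inputs[i].size())
            ss << std::setw(w) << inputs[i][j] << " ";
          else
            ss << std::setw(w + 1) << " ";  // blank slot keeps columns aligned
        }
        ss << "]" << std::endl;
      }
      printf("%s", ss.str().c_str());  // single write, like the patched Debug()
      return 0;
    }

Buffering first also means one write to stdout instead of many small printf calls, so a dump of a large graph comes out as one contiguous block.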
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 0e55776b30..2dfee71514 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -161,6 +161,9 @@ Tensor Resize(const Tensor &in, const Shape &shape) {
 
 // return new tensor
 Tensor Tensor::AsType(const DataType type) {
+  CHECK(block() && block()->initialized() == true)
+      << "the data of the tensor needs to be initialized before casting to "
+         "another type";
   if (data_type_ != type) {
     Tensor &thisRef = *this;
     Tensor ret(shape_, device_, type);
@@ -1466,8 +1469,12 @@ void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
 template <typename SType>
 void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
           Tensor *C) {
+  Tensor fakeC;
   vector<Block *> read_blocks = {A.block(), B.block()};
-  // if (beta) read_blocks.push_back(C->block());
+  if (beta) {
+    fakeC = *C;
+    read_blocks.push_back(C->block());
+  }
   if (B.nDim() == 1u) {
     CHECK_EQ(A.shape().size(), 2u);
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
@@ -1475,7 +1482,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto b = TypeCast<SType, DType>(beta);
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A, b, B, CRef](Context *ctx) mutable {
+          [a, A, b, B, CRef, fakeC](Context *ctx) mutable {
            GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
@@ -1488,7 +1495,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto b = TypeCast<SType, DType>(beta);
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A, b, B, CRef](Context *ctx) mutable {
+          [a, A, b, B, CRef, fakeC](Context *ctx) mutable {
            GEMM<DType, Lang>(a, A, B, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
@@ -1523,7 +1530,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
 
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A_tmp, b, B_tmp, CRef](Context *ctx) mutable {
+          [a, A_tmp, b, B_tmp, CRef, fakeC](Context *ctx) mutable {
            GEMMBatched<DType, Lang>(a, A_tmp, B_tmp, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
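With a nonzero beta, Mult computes C = alpha * A * B + beta * C, so the previous contents of C are an input to the kernel. The change above therefore re-enables C->block() as a read dependency and captures fakeC, a by-value Tensor copy that shares C's block, in each deferred lambda, so the graph's reference counting treats the old buffer as live until the GEMV/GEMM actually runs. A standalone sketch of that keep-alive-by-capture idiom (shared_ptr stands in for Tensor's block sharing; this is not SINGA code):

    // sketch_keep_alive.cc -- illustrative only, not the SINGA scheduler.
    // Shows the idiom behind fakeC: a by-value capture of a shared handle
    // pins the buffer until the deferred operation has run.
    #include <functional>
    #include <iostream>
    #include <memory>
    #include <vector>

    int main() {
      auto C_data = std::make_shared<std::vector<float>>(4, 1.0f);

      // Deferred op, standing in for the closure passed to Device::Exec():
      // capturing C_data by value bumps its reference count, so the old
      // contents of C survive until the op actually executes.
      std::function<void()> op = [C_data]() {
        std::cout << "C[0] read inside the op: " << (*C_data)[0] << "\n";
      };

      C_data.reset();  // caller drops its handle; the lambda still owns one
      op();            // beta * C can still be read here
      return 0;
    }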
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index cd7e7bafcf..aa48e6f6b8 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -42,8 +42,10 @@ class TestSoftmaxCrossEntropy : public ::testing::Test {
 
 TEST_F(TestSoftmaxCrossEntropy, CppForward) {
   p.CopyDataFromHostPtr(pdat, 8);
-  t.AsType(singa::kInt);
+  EXPECT_TRUE(p.block()->initialized());
   t.CopyDataFromHostPtr(tdat, 2);
+  t.AsType(singa::kInt);
+
   singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(singa::kEval, p, t);
 
@@ -56,8 +58,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppForwardAryTarget) {
   p.CopyDataFromHostPtr(pdat, 8);
-  ta.AsType(singa::kInt);
   ta.CopyDataFromHostPtr(tary, 8);
+  ta.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(singa::kEval, p, ta);
@@ -70,8 +72,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppForwardAryTarget) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
   p.CopyDataFromHostPtr(pdat, 8);
-  t.AsType(singa::kInt);
   t.CopyDataFromHostPtr(tdat, 2);
+  t.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(singa::kTrain, p, t);
@@ -90,8 +92,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppBackwardAryTarget) {
   p.CopyDataFromHostPtr(pdat, 8);
-  ta.AsType(singa::kInt);
   ta.CopyDataFromHostPtr(tary, 8);
+  ta.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(singa::kTrain, p, ta);
diff --git a/test/singa/test_platform.cc b/test/singa/test_platform.cc
index c38ef37e52..fce5f347ab 100644
--- a/test/singa/test_platform.cc
+++ b/test/singa/test_platform.cc
@@ -28,8 +28,10 @@ using singa::Platform;
 TEST(Platform, CreateMultDevice) {
   int n = Platform::GetNumGPUs();
   auto devs = Platform::CreateCudaGPUs(n);
-  for (int i = 0; i < devs.size(); i++) {
+  for (size_t i = 0; i < devs.size(); i++) {
     auto b = devs[i]->NewBlock(512 + 512 * (2 - i));
+    // for lazy allocation
+    b->mutable_data();
     EXPECT_EQ(512 + 512 * (2 - i), devs[i]->GetAllocatedMem());
     devs[i]->FreeBlock(b);
   }
@@ -54,6 +56,8 @@ TEST(Platform, CreateDevice) {
   size_t size[] = {128, 256, 3, 24};
   {
     auto ptr = dev->NewBlock(size[0]);
+    // for lazy allocation
+    ptr->mutable_data();
     auto allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0], allocated);
     dev->FreeBlock(ptr);
@@ -63,9 +67,13 @@ TEST(Platform, CreateDevice) {
     auto ptr0 = dev->NewBlock(size[0]);
     auto ptr1 = dev->NewBlock(size[1]);
     auto ptr2 = dev->NewBlock(size[2]);
+    ptr0->mutable_data();
+    ptr1->mutable_data();
+    ptr2->mutable_data();
     auto allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0] + size[1] + size[2], allocated);
     auto ptr3 = dev->NewBlock(size[3]);
+    ptr3->mutable_data();
     allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0] + size[1] + size[2] + size[3], allocated);
     dev->FreeBlock(ptr0);
diff --git a/test/singa/test_snapshot.cc b/test/singa/test_snapshot.cc
index 43c879c7cc..ab1a69ff31 100644
--- a/test/singa/test_snapshot.cc
+++ b/test/singa/test_snapshot.cc
@@ -79,8 +79,8 @@ TEST(Snapshot, ReadIntTest) {
   singa::Snapshot int_snapshot_write(prefix + ".int", singa::Snapshot::kWrite);
   singa::Tensor int_param(singa::Shape{4});
-  int_param.AsType(singa::kInt);
   int_param.CopyDataFromHostPtr(int_data, 4);
+  int_param.AsType(singa::kInt);
   int_snapshot_write.Write("IntParam", int_param);
 }
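Two conventions drive the test updates: blocks are now allocated lazily, so a test must touch mutable_data() before GetAllocatedMem() reflects the block, and AsType() now CHECKs that the block is initialized, so casts have to follow CopyDataFromHostPtr(). A sketch combining both conventions (the include paths and the 256-byte size are assumptions; the API calls mirror the tests above):

    // sketch_test_conventions.cc -- illustrative only; include paths and the
    // device setup are assumptions, the calls mirror the patched tests.
    #include "gtest/gtest.h"
    #include "singa/core/device.h"
    #include "singa/core/tensor.h"

    TEST(LazyAllocSketch, AllocateOnFirstAccess) {
      if (singa::Platform::GetNumGPUs() < 1) return;  // needs at least one GPU
      auto dev = singa::Platform::CreateCudaGPUs(1)[0];
      auto blk = dev->NewBlock(256);
      // NewBlock is lazy: without this touch, GetAllocatedMem() sees nothing.
      blk->mutable_data();
      EXPECT_LE(256u, dev->GetAllocatedMem());
      dev->FreeBlock(blk);
    }

    TEST(AsTypeSketch, CopyBeforeCast) {
      const float fdat[2] = {0.1f, 2.0f};
      singa::Tensor t(singa::Shape{2});
      t.CopyDataFromHostPtr(fdat, 2);  // initialize the block first...
      t.AsType(singa::kInt);           // ...then cast; AsType CHECKs initialized()
    }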