diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 458b944a6c..4114601829 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -50,8 +50,6 @@ void Device::RunGraph(bool serial) {
   bool previous_state = graph_enabled_;
   graph_enabled_ = false;
 
-  // graph_->Debug();
-
   if (serial) {
     // sequential execution
     graph_->RunInSerial();
@@ -60,6 +58,8 @@ void Device::RunGraph(bool serial) {
     graph_->RunGraph();
   }
 
+  // graph_->Debug();
+
   graph_enabled_ = previous_state;
 }
 
diff --git a/src/core/scheduler/scheduler.cc b/src/core/scheduler/scheduler.cc
index deecb682b7..2f0bb34c23 100644
--- a/src/core/scheduler/scheduler.cc
+++ b/src/core/scheduler/scheduler.cc
@@ -18,7 +18,10 @@
 
 #include "singa/core/scheduler.h"
 
+#include <algorithm>
 #include <functional>
+#include <iomanip>
+#include <sstream>
 #include <thread>
 #include <unordered_set>
 
@@ -99,39 +102,97 @@ void Graph::Reset() {
 }
 
 void Graph::Debug() {
+  if (dirty_) Analysis();
+
+  size_t max_in_num = 0, max_out_num = 0, max_next_num = 0, max_free_num = 0;
+  for (auto &it : nodes_) {
+    max_in_num = std::max(max_in_num, it->in_edges_.size());
+    max_out_num = std::max(max_out_num, it->out_edges_.size());
+  }
+
+  for (auto &it : next_nodes_) {
+    max_next_num = std::max(max_next_num, it.size());
+  }
+
+  for (auto &it : free_blocks_) {
+    max_free_num = std::max(max_free_num, it.size());
+  }
+
+  int w = 2;
+  std::stringstream ss;
+  ss << "begin nodes:[";
+  for (size_t i = 0; i < begin_nodes_.size(); ++i) {
+    ss << begin_nodes_[i]->id_ << " ";
+  }
+  ss << "]" << std::endl;
+
+  size_t size = 0;
   for (size_t i = 0; i < nodes_.size(); ++i) {
-    printf("OP[%2lu]: ", i);
-    printf("Inputs: ");
+    ss << "OP[" << std::setw(w) << i;
     auto node = nodes_[i];
-    for (size_t j = 0; j < node->in_edges_.size(); ++j) {
-      printf("%d\t", blocks_[node->in_edges_[j]->blk_]->id_);
+
+    ss << "] Inputs:[";
+    size = node->in_edges_.size();
+    for (size_t j = 0; j < max_in_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[node->in_edges_[j]->blk_]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    for (size_t j = node->in_edges_.size(); j < 3; ++j) {
-      printf("\t");
+
+    ss << "] Outputs:[";
+    size = node->out_edges_.size();
+    for (size_t j = 0; j < max_out_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[node->out_edges_[j]->blk_]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    printf("Outputs: ");
-    for (size_t j = 0; j < node->out_edges_.size(); ++j) {
-      printf("%d\t", blocks_[node->out_edges_[j]->blk_]->id_);
+
+    ss << "] Next nodes:[";
+    size = next_nodes_[i].size();
+    for (size_t j = 0; j < max_next_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << next_nodes_[i][j]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
+    }
+
+    ss << "] Free blocks:[";
+    size = free_blocks_[i].size();
+    for (size_t j = 0; j < max_free_num; ++j) {
+      if (j < size)
+        ss << std::setw(w) << blocks_[free_blocks_[i][j]]->id_ << " ";
+      else
+        ss << std::setw(w + 1) << " ";
     }
-    printf("\n");
+    ss << "]" << std::endl;
  }
 
+  std::vector<BlkInfo *> blkInfos;
+  blkInfos.resize(blocks_.size());
+
   for (auto it : blocks_) {
-    auto blkInfo = it.second;
-    printf("Block[%2d]: addr[%p] graph_ref[%d] ref_count[%d] ", blkInfo->id_,
-           blkInfo->blk_, blkInfo->graph_ref_, it.first->ref_count());
+    blkInfos[it.second->id_] = it.second;
+  }
+
+  for (auto it : blkInfos) {
+    auto blkInfo = it;
+    ss << "Block[" << std::setw(w) << blkInfo->id_ << "] addr[" << std::setw(w)
+       << blkInfo->blk_ << "] graph_ref[" << std::setw(w) << blkInfo->graph_ref_
+       << "] ref_count[" << std::setw(w) << blkInfo->blk_->ref_count() << "] ";
     switch (blkInfo->type_) {
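      case BlockType::kInput:
-        printf("type[input] ");
+        ss << "type[input] ";
         break;
       case BlockType::kParam:
-        printf("type[param] ");
+        ss << "type[param] ";
         break;
       case BlockType::kInter:
-        printf("type[inter] ");
+        ss << "type[inter] ";
         break;
       case BlockType::kEnd:
-        printf("type[_end_] ");
+        ss << "type[_end_] ";
         break;
       default:
         break;
@@ -140,14 +201,15 @@ void Graph::Debug() {
     if (blkInfo->write_node_) {
       id = blkInfo->write_node_->id_;
     }
-    printf(" write_node[%2d]", id);
+    ss << " write_node[" << std::setw(w) << id << "]";
     id = -1;
     if (blkInfo->last_node_) {
       id = blkInfo->last_node_->id_;
     }
-    printf(" last_node[%2d]", id);
-    printf("\n");
+    ss << " last_node[" << std::setw(w) << id << "]" << std::endl;
   }
+
+  printf("%s", ss.str().c_str());
 }
 
 void Graph::RunGraph() {
@@ -358,6 +420,8 @@ void Graph::Analysis() {
   }
 
   dirty_ = false;
+
+  // Debug();
 }
 
 void Graph::FreeLoop() {

The rewritten Graph::Debug() above buffers the whole report in a stringstream and emits it with a single printf, padding every Inputs/Outputs/Next nodes/Free blocks column out to the widest row so the table stays aligned. A minimal standalone sketch of that padding idiom (illustrative code only, not part of the patch):

    // sketch_debug_padding.cc -- illustrative only, not part of the patch.
    // Mirrors the idiom in Graph::Debug(): every row prints a fixed number of
    // slots, real entries through setw(w), absent entries as setw(w + 1) blanks.
    #include <algorithm>
    #include <cstdio>
    #include <iomanip>
    #include <sstream>
    #include <vector>

    int main() {
      std::vector<std::vector<int>> inputs = {{0, 1}, {2}, {}};
      size_t max_in_num = 0;
      for (auto &row : inputs) max_in_num = std::max(max_in_num, row.size());

      int w = 2;
      std::stringstream ss;
      for (size_t i = 0; i < inputs.size(); ++i) {
        ss << "OP[" << std::setw(w) << i << "] Inputs:[";
        for (size_t j = 0; j < max_in_num; ++j) {
          if (j < inputs[i].size())
            ss << std::setw(w) << inputs[i][j] << " ";
          else
            ss << std::setw(w + 1) << " ";  // blank slot keeps columns aligned
        }
        ss << "]" << std::endl;
      }
      printf("%s", ss.str().c_str());  // single write, like the patched Debug()
      return 0;
    }

Buffering first also means one write to stdout instead of many small printf calls, so a dump of a large graph comes out as one contiguous block.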
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 0e55776b30..2dfee71514 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -161,6 +161,9 @@ Tensor Resize(const Tensor &in, const Shape &shape) {
 
 // return new tensor
 Tensor Tensor::AsType(const DataType type) {
+  CHECK(block() && block()->initialized() == true)
+      << "the data of the tensor needs to be initialized before casting to "
+         "another type";
   if (data_type_ != type) {
     Tensor &thisRef = *this;
     Tensor ret(shape_, device_, type);
@@ -1466,8 +1469,12 @@ void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
 template <typename SType>
 void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
           Tensor *C) {
+  Tensor fakeC;
   vector<Block *> read_blocks = {A.block(), B.block()};
-  // if (beta) read_blocks.push_back(C->block());
+  if (beta) {
+    fakeC = *C;
+    read_blocks.push_back(C->block());
+  }
   if (B.nDim() == 1u) {
     CHECK_EQ(A.shape().size(), 2u);
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
@@ -1475,7 +1482,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto b = TypeCast<SType, DType>(beta);
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A, b, B, CRef](Context *ctx) mutable {
+          [a, A, b, B, CRef, fakeC](Context *ctx) mutable {
            GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
@@ -1488,7 +1495,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto b = TypeCast<SType, DType>(beta);
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A, b, B, CRef](Context *ctx) mutable {
+          [a, A, b, B, CRef, fakeC](Context *ctx) mutable {
            GEMM<DType, Lang>(a, A, B, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
@@ -1523,7 +1530,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
 
       Tensor &CRef = *C;
       C->device()->Exec(
-          [a, A_tmp, b, B_tmp, CRef](Context *ctx) mutable {
+          [a, A_tmp, b, B_tmp, CRef, fakeC](Context *ctx) mutable {
            GEMMBatched<DType, Lang>(a, A_tmp, B_tmp, b, &CRef, ctx);
          },
          read_blocks, {C->block()});
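With a nonzero beta, Mult computes C = alpha * A * B + beta * C, so the previous contents of C are an input to the kernel. The change above therefore re-enables C->block() as a read dependency and captures fakeC, a by-value Tensor copy that shares C's block, in each deferred lambda, so the graph's reference counting treats the old buffer as live until the GEMV/GEMM actually runs. A standalone sketch of that keep-alive-by-capture idiom (shared_ptr stands in for Tensor's block sharing; this is not SINGA code):

    // sketch_keep_alive.cc -- illustrative only, not the SINGA scheduler.
    // Shows the idiom behind fakeC: a by-value capture of a shared handle
    // pins the buffer until the deferred operation has run.
    #include <functional>
    #include <iostream>
    #include <memory>
    #include <vector>

    int main() {
      auto C_data = std::make_shared<std::vector<float>>(4, 1.0f);

      // Deferred op, standing in for the closure passed to Device::Exec():
      // capturing C_data by value bumps its reference count, so the old
      // contents of C survive until the op actually executes.
      std::function<void()> op = [C_data]() {
        std::cout << "C[0] read inside the op: " << (*C_data)[0] << "\n";
      };

      C_data.reset();  // caller drops its handle; the lambda still owns one
      op();            // beta * C can still be read here
      return 0;
    }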
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index cd7e7bafcf..aa48e6f6b8 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -42,8 +42,10 @@ class TestSoftmaxCrossEntropy : public ::testing::Test {
 
 TEST_F(TestSoftmaxCrossEntropy, CppForward) {
   p.CopyDataFromHostPtr(pdat, 8);
-  t.AsType(singa::kInt);
+  EXPECT_TRUE(p.block()->initialized());
   t.CopyDataFromHostPtr(tdat, 2);
+  t.AsType(singa::kInt);
+
   singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(singa::kEval, p, t);
 
@@ -56,8 +58,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppForwardAryTarget) {
   p.CopyDataFromHostPtr(pdat, 8);
-  ta.AsType(singa::kInt);
   ta.CopyDataFromHostPtr(tary, 8);
+  ta.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(singa::kEval, p, ta);
@@ -70,8 +72,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppForwardAryTarget) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
   p.CopyDataFromHostPtr(pdat, 8);
-  t.AsType(singa::kInt);
   t.CopyDataFromHostPtr(tdat, 2);
+  t.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(singa::kTrain, p, t);
@@ -90,8 +92,8 @@ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
 
 TEST_F(TestSoftmaxCrossEntropy, CppBackwardAryTarget) {
   p.CopyDataFromHostPtr(pdat, 8);
-  ta.AsType(singa::kInt);
   ta.CopyDataFromHostPtr(tary, 8);
+  ta.AsType(singa::kInt);
 
   singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(singa::kTrain, p, ta);
diff --git a/test/singa/test_platform.cc b/test/singa/test_platform.cc
index c38ef37e52..fce5f347ab 100644
--- a/test/singa/test_platform.cc
+++ b/test/singa/test_platform.cc
@@ -28,8 +28,10 @@ using singa::Platform;
 TEST(Platform, CreateMultDevice) {
   int n = Platform::GetNumGPUs();
   auto devs = Platform::CreateCudaGPUs(n);
-  for (int i = 0; i < devs.size(); i++) {
+  for (size_t i = 0; i < devs.size(); i++) {
     auto b = devs[i]->NewBlock(512 + 512 * (2 - i));
+    // for lazy allocation
+    b->mutable_data();
     EXPECT_EQ(512 + 512 * (2 - i), devs[i]->GetAllocatedMem());
     devs[i]->FreeBlock(b);
   }
@@ -54,6 +56,8 @@ TEST(Platform, CreateDevice) {
   size_t size[] = {128, 256, 3, 24};
   {
     auto ptr = dev->NewBlock(size[0]);
+    // for lazy allocation
+    ptr->mutable_data();
     auto allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0], allocated);
     dev->FreeBlock(ptr);
@@ -63,9 +67,13 @@ TEST(Platform, CreateDevice) {
     auto ptr0 = dev->NewBlock(size[0]);
     auto ptr1 = dev->NewBlock(size[1]);
     auto ptr2 = dev->NewBlock(size[2]);
+    ptr0->mutable_data();
+    ptr1->mutable_data();
+    ptr2->mutable_data();
     auto allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0] + size[1] + size[2], allocated);
     auto ptr3 = dev->NewBlock(size[3]);
+    ptr3->mutable_data();
     allocated = dev->GetAllocatedMem();
     EXPECT_LE(size[0] + size[1] + size[2] + size[3], allocated);
     dev->FreeBlock(ptr0);
diff --git a/test/singa/test_snapshot.cc b/test/singa/test_snapshot.cc
index 43c879c7cc..ab1a69ff31 100644
--- a/test/singa/test_snapshot.cc
+++ b/test/singa/test_snapshot.cc
@@ -79,8 +79,8 @@ TEST(Snapshot, ReadIntTest) {
   singa::Snapshot int_snapshot_write(prefix + ".int", singa::Snapshot::kWrite);
   singa::Tensor int_param(singa::Shape{4});
-  int_param.AsType(singa::kInt);
   int_param.CopyDataFromHostPtr(int_data, 4);
+  int_param.AsType(singa::kInt);
   int_snapshot_write.Write("IntParam", int_param);
 }
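Two conventions drive the test updates: blocks are now allocated lazily, so a test must touch mutable_data() before GetAllocatedMem() reflects the block, and AsType() now CHECKs that the block is initialized, so casts have to follow CopyDataFromHostPtr(). A sketch combining both conventions (the include paths and the 256-byte size are assumptions; the API calls mirror the tests above):

    // sketch_test_conventions.cc -- illustrative only; include paths and the
    // device setup are assumptions, the calls mirror the patched tests.
    #include "gtest/gtest.h"
    #include "singa/core/device.h"
    #include "singa/core/tensor.h"

    TEST(LazyAllocSketch, AllocateOnFirstAccess) {
      if (singa::Platform::GetNumGPUs() < 1) return;  // needs at least one GPU
      auto dev = singa::Platform::CreateCudaGPUs(1)[0];
      auto blk = dev->NewBlock(256);
      // NewBlock is lazy: without this touch, GetAllocatedMem() sees nothing.
      blk->mutable_data();
      EXPECT_LE(256u, dev->GetAllocatedMem());
      dev->FreeBlock(blk);
    }

    TEST(AsTypeSketch, CopyBeforeCast) {
      const float fdat[2] = {0.1f, 2.0f};
      singa::Tensor t(singa::Shape{2});
      t.CopyDataFromHostPtr(fdat, 2);  // initialize the block first...
      t.AsType(singa::kInt);           // ...then cast; AsType CHECKs initialized()
    }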