From 73603a0aae7e94a8dd40214a2b871063e24eb9c3 Mon Sep 17 00:00:00 2001
From: Shenghang Tsai
Date: Mon, 7 Sep 2020 22:25:40 +0800
Subject: [PATCH] upgrade XRT XLA to TF 2.3.0 (#3531)

* compile tf 2.3.0 with gcc 7.3

* fix oneflow eigen

* minor fix

* fix include

* update protobuf if xla is on

* update path of tf proto generated cpp files

* fix path in script

* add .clangd to git ignore

* update xla ifs

* update scripts

* update path in script for clangd

* add gitignore

* add cmake flag XRT_TF_URL

* rm comment

* check in changes

* bash tricks to enable gcc 7.3

* use arg to control tuna

* bumpversion

* fix build wheel

* use real path

* add dir for cpu

* fix unwanted yum update cublas

* uncomment all

* rm suffix of wheelhouse_dir

* add log info

Co-authored-by: tsai
Co-authored-by: tsai

Former-commit-id: da12e8db4f52d3c5351f0e43f3677dd948d3801d
---
 .gitignore                                 |  2 +
 cmake/third_party.cmake                    |  1 +
 cmake/third_party/eigen.cmake              |  6 +-
 cmake/third_party/protobuf.cmake           |  2 +-
 cmake/third_party/tensorflow.cmake         | 67 ++++++++++++++++------
 docker/package/manylinux/Dockerfile        | 20 +++++--
 docker/package/manylinux/build_wheel.sh    | 13 ++++-
 docker/package/manylinux/build_xla.sh      | 23 ++++++++
 docker/package/manylinux/launch.sh         |  4 ++
 docker/package/manylinux/make_release.sh   | 48 +++++++++++++---
 oneflow/python/test/ops/test_optimizers.py |  4 +-
 oneflow/python/version.py                  |  2 +-
 oneflow/xrt/xla/xla_allocator.cpp          |  3 +-
 oneflow/xrt/xla/xla_allocator.h            | 11 +++-
 oneflow/xrt/xla/xla_graph_compiler.cpp     |  7 ++-
 15 files changed, 167 insertions(+), 46 deletions(-)
 create mode 100644 docker/package/manylinux/build_xla.sh
 create mode 100644 docker/package/manylinux/launch.sh

diff --git a/.gitignore b/.gitignore
index 81f4241c588..b4b03a602cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,5 @@ wheelhouse*
 /oneflow/python/__export_symbols__.py
 /oneflow/python/compatibility.py
 /oneflow/python/framework/sysconfig_gen.py
+.clangd
+compile_commands.json
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index a908ecb0bc8..886c7400adb 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -214,6 +214,7 @@ include_directories(${ONEFLOW_INCLUDE_SRC_DIRS})
 
 if(WITH_XLA)
   list(APPEND oneflow_third_party_dependencies tensorflow_copy_libs_to_destination)
+  list(APPEND oneflow_third_party_dependencies tensorflow_symlink_headers)
   list(APPEND oneflow_third_party_libs ${TENSORFLOW_XLA_LIBRARIES})
 endif()
diff --git a/cmake/third_party/eigen.cmake b/cmake/third_party/eigen.cmake
index 178b11f8f7e..4302429c25f 100644
--- a/cmake/third_party/eigen.cmake
+++ b/cmake/third_party/eigen.cmake
@@ -4,8 +4,8 @@ set(EIGEN_INCLUDE_DIR ${THIRD_PARTY_DIR}/eigen/include/eigen3)
 set(EIGEN_INSTALL_DIR ${THIRD_PARTY_DIR}/eigen)
 
 if(WITH_XLA)
-  #set(EIGEN_URL "https://storage.googleapis.com/mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/8071cda5714d.tar.gz")
-  set(EIGEN_URL "https://bitbucket.org/eigen/eigen/get/8071cda5714d.tar.gz")
+  #set(EIGEN_URL "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz")
+  set(EIGEN_URL "https://gitlab.com/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz")
 else()
   set(EIGEN_URL ${THIRD_PARTY_SUBMODULE_DIR}/eigen/src/eigen)
 endif()
@@ -17,7 +17,7 @@ endif()
 #add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING -DEIGEN_NO_MALLOC -DEIGEN_USE_GPU)
 
 if (THIRD_PARTY)
-
+
 ExternalProject_Add(eigen
    PREFIX eigen
    URL ${EIGEN_URL}
diff --git a/cmake/third_party/protobuf.cmake b/cmake/third_party/protobuf.cmake
index 023ae812de1..2d4dfcde9d0 100644
--- a/cmake/third_party/protobuf.cmake
+++ b/cmake/third_party/protobuf.cmake
@@ -6,7 +6,7 @@ set(PROTOBUF_BINARY_DIR ${THIRD_PARTY_DIR}/protobuf/bin)
 set(PROTOBUF_SRC_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 
 if(WITH_XLA)
-  set(PROTOBUF_URL "https://storage.googleapis.com/mirror.tensorflow.org/github.com/protocolbuffers/protobuf/archive/310ba5ee72661c081129eb878c1bbcec936b20f0.tar.gz")
+  set(PROTOBUF_URL "https://github.com/protocolbuffers/protobuf/archive/v3.9.2.zip")
 else()
   set(PROTOBUF_URL ${THIRD_PARTY_SUBMODULE_DIR}/protobuf/src/protobuf)
 endif()
diff --git a/cmake/third_party/tensorflow.cmake b/cmake/third_party/tensorflow.cmake
index ac1868a1803..997b9ef8dd5 100644
--- a/cmake/third_party/tensorflow.cmake
+++ b/cmake/third_party/tensorflow.cmake
@@ -36,8 +36,7 @@ set(TENSORFLOW_INSTALL_DIR ${THIRD_PARTY_DIR}/tensorflow)
 set(PATCHES_DIR ${PROJECT_SOURCE_DIR}/oneflow/xrt/patches)
 
 set(TENSORFLOW_JIT_DIR ${TENSORFLOW_SRCS_DIR}/tensorflow/compiler/jit)
-
-set(TENSORFLOW_GEN_DIR ${TENSORFLOW_SRCS_DIR}/bazel-out/${TENSORFLOW_GENFILE_DIR}/genfiles)
+set(TENSORFLOW_GEN_DIR ${TENSORFLOW_SRCS_DIR}/bazel-out/${TENSORFLOW_GENFILE_DIR}/bin)
 set(TENSORFLOW_EXTERNAL_DIR ${TENSORFLOW_SRCS_DIR}/bazel-tensorflow/external)
 set(THIRD_ABSL_DIR ${TENSORFLOW_EXTERNAL_DIR}/com_google_absl)
 set(THIRD_PROTOBUF_DIR ${TENSORFLOW_EXTERNAL_DIR}/com_google_protobuf/src)
@@ -54,36 +53,68 @@ list(APPEND TENSORFLOW_XLA_INCLUDE_DIR
   ${THIRD_SNAPPY_DIR}
   ${THIRD_RE2_DIR}
 )
-include_directories(${TENSORFLOW_XLA_INCLUDE_DIR})
+
+list(APPEND TENSORFLOW_XLA_INCLUDE_INSTALL_DIR
+  "${TENSORFLOW_INSTALL_DIR}/include/tensorflow_inc"
+  "${TENSORFLOW_INSTALL_DIR}/include/tensorflow_gen"
+  "${TENSORFLOW_INSTALL_DIR}/include/absl"
+  "${TENSORFLOW_INSTALL_DIR}/include/protobuf"
+  "${TENSORFLOW_INSTALL_DIR}/include/boringssl"
+  "${TENSORFLOW_INSTALL_DIR}/include/snappy"
+  "${TENSORFLOW_INSTALL_DIR}/include/re2"
+)
+
 list(APPEND TENSORFLOW_XLA_LIBRARIES libtensorflow_framework.so.1)
 list(APPEND TENSORFLOW_XLA_LIBRARIES libxla_core.so)
 link_directories(${TENSORFLOW_INSTALL_DIR}/lib)
 
+if(NOT XRT_TF_URL)
+  set(XRT_TF_URL https://github.com/Oneflow-Inc/tensorflow/archive/1f_dep_v2.3.0r4.zip)
+endif()
 if (THIRD_PARTY)
 
 ExternalProject_Add(${TENSORFLOW_PROJECT}
     PREFIX ${TENSORFLOW_SOURCES_DIR}
-    GIT_REPOSITORY ${TENSORFLOW_GIT_URL}
-    GIT_TAG ${TENSORFLOW_GIT_TAG}
+    URL ${XRT_TF_URL}
     CONFIGURE_COMMAND ""
    BUILD_COMMAND cd ${TENSORFLOW_SRCS_DIR} &&
-                  bazel build ${TENSORFLOW_BUILD_CMD} -j 20 //tensorflow/compiler/jit/xla_lib:libxla_core.so
+                  bazel build ${TENSORFLOW_BUILD_CMD} -j HOST_CPUS //tensorflow/compiler/jit/xla_lib:libxla_core.so
     INSTALL_COMMAND ""
 )
 
-set(TENSORFLOW_XLA_FRAMEWORK_LIB ${TENSORFLOW_SRCS_DIR}/bazel-bin/tensorflow/libtensorflow_framework.so.1)
-set(TENSORFLOW_XLA_CORE_LIB ${TENSORFLOW_SRCS_DIR}/bazel-bin/tensorflow/compiler/jit/xla_lib/libxla_core.so)
+  set(TENSORFLOW_XLA_FRAMEWORK_LIB ${TENSORFLOW_SRCS_DIR}/bazel-bin/tensorflow/libtensorflow_framework.so.2)
+  set(TENSORFLOW_XLA_CORE_LIB ${TENSORFLOW_SRCS_DIR}/bazel-bin/tensorflow/compiler/jit/xla_lib/libxla_core.so)
 
-add_custom_target(tensorflow_create_library_dir
-  COMMAND ${CMAKE_COMMAND} -E make_directory ${TENSORFLOW_INSTALL_DIR}/lib
-  DEPENDS ${TENSORFLOW_PROJECT})
+  add_custom_target(tensorflow_create_library_dir
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${TENSORFLOW_INSTALL_DIR}/lib
+    DEPENDS ${TENSORFLOW_PROJECT})
+
+  add_custom_target(tensorflow_copy_libs_to_destination
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+      ${TENSORFLOW_XLA_FRAMEWORK_LIB} ${TENSORFLOW_XLA_CORE_LIB} ${TENSORFLOW_INSTALL_DIR}/lib
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+      ${TENSORFLOW_INSTALL_DIR}/lib/libtensorflow_framework.so.2
+      ${TENSORFLOW_INSTALL_DIR}/lib/libtensorflow_framework.so
+    DEPENDS tensorflow_create_library_dir)
+
+  add_custom_target(tensorflow_create_include_dir
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${TENSORFLOW_INSTALL_DIR}/include
+    DEPENDS ${TENSORFLOW_PROJECT})
+
+  add_custom_target(tensorflow_symlink_headers
+    DEPENDS tensorflow_create_include_dir)
+
+  foreach(src_dst_pair IN ZIP_LISTS TENSORFLOW_XLA_INCLUDE_DIR TENSORFLOW_XLA_INCLUDE_INSTALL_DIR)
+    set(src ${src_dst_pair_0})
+    set(dst ${src_dst_pair_1})
+    add_custom_command(TARGET tensorflow_symlink_headers
+      COMMAND ${CMAKE_COMMAND} -E create_symlink
+        ${src}
+        ${dst}
+    )
+  endforeach()
 
-add_custom_target(tensorflow_copy_libs_to_destination
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different
-    ${TENSORFLOW_XLA_FRAMEWORK_LIB} ${TENSORFLOW_XLA_CORE_LIB} ${TENSORFLOW_INSTALL_DIR}/lib
-  COMMAND ${CMAKE_COMMAND} -E create_symlink
-    ${TENSORFLOW_INSTALL_DIR}/lib/libtensorflow_framework.so.1
-    ${TENSORFLOW_INSTALL_DIR}/lib/libtensorflow_framework.so
-  DEPENDS tensorflow_create_library_dir)
 endif(THIRD_PARTY)
 
+include_directories(${TENSORFLOW_XLA_INCLUDE_INSTALL_DIR})
+
 endif(WITH_XLA)
diff --git a/docker/package/manylinux/Dockerfile b/docker/package/manylinux/Dockerfile
index 6c1544a709e..f29b60d9ce4 100644
--- a/docker/package/manylinux/Dockerfile
+++ b/docker/package/manylinux/Dockerfile
@@ -1,4 +1,6 @@
 ARG from
+ARG use_tuna_yum=1
+ARG pip_args="-i https://pypi.tuna.tsinghua.edu.cn/simple"
 FROM ${from}
 LABEL maintainer="OneFlow Maintainers"
 
@@ -13,9 +15,11 @@ ENV LD_LIBRARY_PATH /usr/local/lib64:/usr/local/lib
 ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
 
 # use tuna mirror
-COPY docker/package/manylinux/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo
-RUN yum makecache
+COPY docker/package/manylinux/CentOS-Base.repo /tmp/CentOS-Base.repo
+RUN if [ "${use_tuna}" = "1" ]; then mv /tmp/CentOS-Base.repo /etc/yum.repos.d/ && yum makecache ; fi
 
+# in 10.1, cuda yum repo will update cublas to 10.2 and breaks build
+RUN yum-config-manager --disable cuda
 ARG MANYLINUX_SHA=f5da004
 RUN yum -y install unzip && curl -L -o manylinux.zip https://github.com/pypa/manylinux/archive/${MANYLINUX_SHA}.zip && unzip manylinux.zip -d tmp && cp -r tmp/*/docker/build_scripts /build_scripts && bash build_scripts/build.sh && rm -r build_scripts tmp manylinux.zip
@@ -25,10 +29,10 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 RUN yum-config-manager --add-repo https://yum.repos.intel.com/setup/intelproducts.repo && \
     rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB && \
     yum update -y && yum install -y epel-release && \
-    yum install -y intel-mkl-64bit-2020.0-088 nasm swig rdma-core-devel
+    yum -y install centos-release-scl && \
+    yum install -y intel-mkl-64bit-2020.0-088 nasm swig rdma-core-devel devtoolset-7-gcc*
 
-ENV TUNA_INDEX="-i https://pypi.tuna.tsinghua.edu.cn/simple"
-RUN /opt/python/cp35-cp35m/bin/pip install $TUNA_INDEX -U cmake && ln -s /opt/_internal/cpython-3.5.9/bin/cmake /usr/bin/cmake
+RUN /opt/python/cp35-cp35m/bin/pip install $pip_args -U cmake && ln -s /opt/_internal/cpython-3.5.9/bin/cmake /usr/bin/cmake
 # overwrite patchelf to fix "maximum size exceed" problem
 RUN mkdir -p /tmp && curl -L -o 0.11.zip https://github.com/NixOS/patchelf/archive/0.11.zip && unzip 0.11.zip && cd patchelf-0.11 && sed -i 's/32/64/g' src/patchelf.cc && ./bootstrap.sh && ./configure && make -j`nproc` && make install && cd .. && rm -rf patchelf-0.11 0.11.zip
@@ -40,4 +44,10 @@ RUN /opt/python/cp35-cp35m/bin/pip install $TUNA_INDEX -r /tmp/dev-requirements.
     && /opt/python/cp38-cp38/bin/pip install $TUNA_INDEX -r /tmp/dev-requirements.txt --user \
     && rm /tmp/dev-requirements.txt
 
+RUN curl -L https://github.com/bazelbuild/bazel/releases/download/3.4.1/bazel-3.4.1-linux-x86_64 -o /usr/local/bin/bazel \
+    && chmod +x /usr/local/bin/bazel \
+    && bazel
+
+RUN echo "source scl_source enable devtoolset-7" >> ~/.bashrc
+
 CMD ["/oneflow-src/docker/package/manylinux/build_wheel.sh"]
diff --git a/docker/package/manylinux/build_wheel.sh b/docker/package/manylinux/build_wheel.sh
index d0ed5847a2b..c3ed5d9d991 100755
--- a/docker/package/manylinux/build_wheel.sh
+++ b/docker/package/manylinux/build_wheel.sh
@@ -48,6 +48,14 @@ fi
 
 cd $ONEFLOW_SRC_DIR
 
+# TF requires py3 to build
+export PATH=/opt/python/cp37-cp37m/bin:$PATH
+python --version
+gcc --version
+
+# specify a mounted dir as bazel cache dir
+export TEST_TMPDIR=$CACHE_DIR/bazel_cache
+
 THIRD_PARTY_BUILD_DIR=$CACHE_DIR/build-third-party
 THIRD_PARTY_INSTALL_DIR=$CACHE_DIR/build-third-party-install
 COMMON_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DBUILD_RDMA=ON -DTHIRD_PARTY_DIR=$THIRD_PARTY_INSTALL_DIR"
@@ -86,9 +94,10 @@ do
     rm -rf $ONEFLOW_BUILD_DIR/python_scripts/oneflow/*.so
     rm -rf $ONEFLOW_SRC_DIR/build/bdist.linux-x86_64
     rm -rf $ONEFLOW_SRC_DIR/build/lib
-    cmake -DTHIRD_PARTY=OFF -DONEFLOW=ON\
+    cmake -DTHIRD_PARTY=OFF -DONEFLOW=ON \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
         $COMMON_CMAKE_ARGS \
-        -DPython3_ROOT_DIR=$PY_ROOT \
+        -DPython3_EXECUTABLE=${PY_BIN} \
         $EXTRA_ONEFLOW_CMAKE_ARGS \
         $ONEFLOW_SRC_DIR
     cmake --build . -j `nproc`
diff --git a/docker/package/manylinux/build_xla.sh b/docker/package/manylinux/build_xla.sh
new file mode 100644
index 00000000000..f203061a8dc
--- /dev/null
+++ b/docker/package/manylinux/build_xla.sh
@@ -0,0 +1,23 @@
+set -ex
+ONEFLOW_SRC_DIR=${ONEFLOW_SRC_DIR:-${PWD}}
+wheelhouse_dir=${ONEFLOW_SRC_DIR}/wheelhouse-xla
+
+# TF requires py3 to build
+PY_ROOT=/opt/python/cp37-cp37m
+PY_BIN=${PY_ROOT}/bin
+export PATH=$PY_BIN:$PATH
+python --version
+
+source scl_source enable devtoolset-7
+
+cache_dir=$ONEFLOW_SRC_DIR/manylinux2014-build-cache-cuda-10.2-xla
+cache_dir=$ONEFLOW_SRC_DIR/manylinux2014-build-cache-cuda-11.0-xla
+export TEST_TMPDIR=$cache_dir/bazel_cache
+gcc --version
+
+bash docker/package/manylinux/build_wheel.sh \
+    --python3.6 \
+    --cache-dir $cache_dir \
+    --house-dir $wheelhouse_dir \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+    -DWITH_XLA=ON
diff --git a/docker/package/manylinux/launch.sh b/docker/package/manylinux/launch.sh
new file mode 100644
index 00000000000..b0f540d5ed7
--- /dev/null
+++ b/docker/package/manylinux/launch.sh
@@ -0,0 +1,4 @@
+set -ex
+docker run --rm -it \
+    -v `pwd`:`pwd` \
+    -w `pwd` oneflow:rel-manylinux2014-cuda-11.0 bash
diff --git a/docker/package/manylinux/make_release.sh b/docker/package/manylinux/make_release.sh
index 3663fc55c04..61cd91c0306 100644
--- a/docker/package/manylinux/make_release.sh
+++ b/docker/package/manylinux/make_release.sh
@@ -1,9 +1,12 @@
 set -ex
 
-wheelhouse_dir=/oneflow-src/wheelhouse
+wheelhouse_dir=`pwd`/wheelhouse
 package_name=oneflow
 
+tuna_build_args=""
+tuna_build_args="--build-arg use_tuna_yum=0 --build-arg pip_args="""
+
 function release() {
     set -ex
     docker_tag=oneflow:rel-manylinux2014-cuda-$1
@@ -12,22 +15,41 @@ function release() {
     else
        cudnn_version=7
     fi
-    docker build --build-arg from=nvidia/cuda:$1-cudnn${cudnn_version}-devel-centos7 -f docker/package/manylinux/Dockerfile -t $docker_tag .
-    docker run --rm -it -v `pwd`:/oneflow-src -w /oneflow-src $docker_tag \
-        /oneflow-src/docker/package/manylinux/build_wheel.sh --cache-dir /oneflow-src/manylinux2014-build-cache-cuda-$1 \
-        --house-dir $wheelhouse_dir \
+    docker build --build-arg from=nvidia/cuda:$1-cudnn${cudnn_version}-devel-centos7 \
+        ${tuna_build_args} \
+        -f docker/package/manylinux/Dockerfile -t $docker_tag .
+    docker run --rm -it -v `pwd`:`pwd` -w `pwd` $docker_tag \
+        docker/package/manylinux/build_wheel.sh --cache-dir `pwd`/manylinux2014-build-cache-cuda-$1 \
+        --house-dir ${wheelhouse_dir} \
         --package-name ${package_name}_cu`echo $1 | tr -d .`
 }
 
 function release_cpu() {
-    docker run --rm -it -v `pwd`:/oneflow-src -w /oneflow-src oneflow:rel-manylinux2014-cuda-10.2 \
-        /oneflow-src/docker/package/manylinux/build_wheel.sh --cache-dir /oneflow-src/manylinux2014-build-cache-cpu \
-        --house-dir $wheelhouse_dir \
+    docker run --rm -it -v `pwd`:`pwd` -w `pwd` oneflow:rel-manylinux2014-cuda-10.2 \
+        docker/package/manylinux/build_wheel.sh --cache-dir `pwd`/manylinux2014-build-cache-cpu \
+        --house-dir ${wheelhouse_dir} \
         -DBUILD_CUDA=OFF \
         --package-name "${package_name}_cpu"
 }
 
-release_cpu
+function release_xla() {
+    set -ex
+    docker_tag=oneflow:rel-manylinux2014-cuda-$1
+    if [ "$1" == "11.0" ]; then
+        cudnn_version=8
+    else
+        cudnn_version=7
+    fi
+    docker build --build-arg from=nvidia/cuda:$1-cudnn${cudnn_version}-devel-centos7 \
+        ${tuna_build_args} \
+        -f docker/package/manylinux/Dockerfile -t $docker_tag .
+    docker run --rm -it -v `pwd`:`pwd` -w `pwd` $docker_tag \
+        bash -l docker/package/manylinux/build_wheel.sh --cache-dir `pwd`/manylinux2014-build-cache-cuda-$1-xla \
+        --house-dir ${wheelhouse_dir} \
+        --package-name ${package_name}_cu`echo $1 | tr -d .`_xla \
+        -DWITH_XLA=ON
+}
+
 release 11.0
 release 10.2
 release 10.1
 release 10.0
 release 9.2
 release 9.1
 release 9.0
+
+release_cpu
+
+release_xla 11.0
+release_xla 10.2
+release_xla 10.1
+release_xla 10.0
+# failed to build XLA with CUDA 9.X
diff --git a/oneflow/python/test/ops/test_optimizers.py b/oneflow/python/test/ops/test_optimizers.py
index 0e2b2372c0d..f07ee9dc3fe 100644
--- a/oneflow/python/test/ops/test_optimizers.py
+++ b/oneflow/python/test/ops/test_optimizers.py
@@ -87,7 +87,9 @@ def testRmsprop(
         gradients = tape.gradient(loss, var)
         opt.apply_gradients(zip([gradients], [var]))
 
-    assert np.allclose(x.flatten(), var.numpy().flatten(), rtol=1e-3, atol=1e-3,)
+    assert np.allclose(x.flatten(), var.numpy().flatten(), rtol=1e-3, atol=1e-3,), (
+        x.flatten() - var.numpy().flatten()
+    )
 
 
 def compare_with_tensorflow_adam(
diff --git a/oneflow/python/version.py b/oneflow/python/version.py
index 639f0c49dbf..0a201297351 100644
--- a/oneflow/python/version.py
+++ b/oneflow/python/version.py
@@ -13,4 +13,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-__version__ = "0.1.10"
+__version__ = "0.1.11b1"
diff --git a/oneflow/xrt/xla/xla_allocator.cpp b/oneflow/xrt/xla/xla_allocator.cpp
index 24694a5fbae..19ab7143b99 100644
--- a/oneflow/xrt/xla/xla_allocator.cpp
+++ b/oneflow/xrt/xla/xla_allocator.cpp
@@ -38,7 +38,8 @@ XlaAllocator::XlaAllocator(const se::Platform *platform, DeviceBufferAllocator *
 XlaAllocator::~XlaAllocator() {}
 
 xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate(int device_ordinal, uint64 size,
-                                                             bool retry_on_failure) {
+                                                             bool retry_on_failure,
+                                                             int64 /*memory_space*/) {
   se::DeviceMemoryBase memory_base;
   if (allocate_index_ < populated_buffers_.size() &&
       populated_buffers_[allocate_index_].populated) {
diff --git a/oneflow/xrt/xla/xla_allocator.h b/oneflow/xrt/xla/xla_allocator.h
index e33fa9b7b9e..f746bc10d87 100644
--- a/oneflow/xrt/xla/xla_allocator.h
+++ b/oneflow/xrt/xla/xla_allocator.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef ONEFLOW_XRT_XLA_XLA_ALLOCATOR_H_
 #define ONEFLOW_XRT_XLA_XLA_ALLOCATOR_H_
 
+#include "oneflow/core/common/util.h"
 #include "oneflow/xrt/xla/memory/device_buffer_allocator.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
@@ -28,14 +29,16 @@ namespace mola {
 namespace se = tensorflow::se;
 using uint64 = tensorflow::uint64;
+using int64 = tensorflow::int64;
 
 class XlaAllocator : public se::DeviceMemoryAllocator {
  public:
  explicit XlaAllocator(const se::Platform *platform, DeviceBufferAllocator *allocator);
  virtual ~XlaAllocator();
-
+  using se::DeviceMemoryAllocator::Allocate;
  xla::StatusOr<se::OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                                 bool retry_on_failure) override;
+                                                 bool retry_on_failure,
+                                                 int64 /*memory_space*/) override;
  tensorflow::Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
  bool AllowsAsynchronousDeallocation() const override { return true; }
@@ -47,6 +50,10 @@ class XlaAllocator : public se::DeviceMemoryAllocator {
 
  void PopulateDeviceMemory(const std::vector<se::DeviceMemoryBase> &device_buffers,
                            const std::vector<int64_t> &allocation_indices);
+  stream_executor::port::StatusOr<stream_executor::Stream *> GetStream(
+      int device_ordinal) override {
+    UNIMPLEMENTED();
+  };
 
  private:
  DeviceBufferAllocator *allocator_;
diff --git a/oneflow/xrt/xla/xla_graph_compiler.cpp b/oneflow/xrt/xla/xla_graph_compiler.cpp
index 767ea91aacf..883e406c28c 100644
--- a/oneflow/xrt/xla/xla_graph_compiler.cpp
+++ b/oneflow/xrt/xla/xla_graph_compiler.cpp
@@ -137,10 +137,11 @@ std::shared_ptr<Executable> XlaGraphCompiler::BuildExecutable(
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(this->device_ordinal_);
   build_options.set_result_layout(xla_output_shape);
-  MOLA_CHECK_AND_ASSIGN(auto executable,
+  MOLA_CHECK_AND_ASSIGN(auto executables,
                         client->Compile(computation, argument_layouts, build_options));
-  return std::make_shared<XlaExecutable>(builder_->name(), this->device_, xla_input_shapes, xla_output_shape,
-                                         std::move(executable));
+  CHECK(executables.size() == 1);
+  return std::make_shared<XlaExecutable>(builder_->name(), this->device_, xla_input_shapes,
+                                         xla_output_shape, std::move(executables.at(0)));
 }
 
 void XlaGraphCompiler::BuildEntryParameters(const std::vector<Parameter> &entry_params,